From d4a1a6536c3d99d4128206c52e672b8d564a1536 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 28 Feb 2022 21:32:12 -0500 Subject: [PATCH 001/154] move np models to np directory --- hyperion/bin/apply-mvn-select-frames.py | 4 +- hyperion/bin/compute-energy-vad.py | 2 +- hyperion/bin/compute-mfcc-feats.py | 2 +- hyperion/bin/eval-cos-1vs1.py | 2 +- hyperion/bin/eval-linear-gbe-up.py | 4 +- hyperion/bin/eval-linear-gbe.py | 4 +- hyperion/bin/eval-linear-svmc.py | 4 +- hyperion/bin/eval-logistic-regression.py | 4 +- hyperion/bin/eval-plda-1vs1.py | 2 +- hyperion/bin/eval-plda-nvs1.py | 2 +- hyperion/bin/plot-vector-hist.py | 2 +- hyperion/bin/plot-vector-tsne.py | 2 +- hyperion/bin/torch-compute-mfcc-feats.py | 1 - hyperion/bin/torch-eval-vae.py | 2 +- ...osine-scoring-from-adv-test-wav-wavegan.py | 2 +- ...l-xvec-cosine-scoring-from-adv-test-wav.py | 2 +- ...l-xvec-cosine-scoring-from-art-test-wav.py | 2 +- ...-eval-xvec-cosine-scoring-from-test-wav.py | 2 +- ...sine-scoring-from-transfer-adv-test-wav.py | 2 +- ...sine-scoring-from-transfer-art-test-wav.py | 2 +- .../bin/torch-eval-xvec-logits-from-wav.py | 2 +- ...rch-extract-xvectors-from-wav-with-rttm.py | 2 +- .../bin/torch-extract-xvectors-from-wav.py | 2 +- ...torch-extract-xvectors-slidwin-from-wav.py | 2 +- .../bin/torch-extract-xvectors-slidwin.py | 2 +- .../bin/torch-extract-xvectors-vae-preproc.py | 2 +- hyperion/bin/torch-extract-xvectors.py | 2 +- ...orch-generate-adv-attacks-xvector-verif.py | 2 +- hyperion/bin/train-cw-up.py | 4 +- hyperion/bin/train-cw.py | 4 +- hyperion/bin/train-gaussianizer.py | 4 +- hyperion/bin/train-lda.py | 2 +- hyperion/bin/train-linear-gbe-up.py | 4 +- hyperion/bin/train-linear-gbe.py | 4 +- hyperion/bin/train-linear-svmc.py | 4 +- hyperion/bin/train-logistic-regression.py | 4 +- hyperion/bin/train-mvn.py | 4 +- hyperion/bin/train-nda.py | 2 +- hyperion/bin/train-pca.py | 2 +- hyperion/bin/train-plda.py | 2 +- hyperion/helpers/__init__.py | 4 - hyperion/helpers/classif_trial_data_reader.py | 2 +- .../helpers/multi_test_trial_data_reader.py | 2 +- .../multi_test_trial_data_reader_v2.py | 2 +- hyperion/helpers/plda_factory.py | 2 +- hyperion/helpers/tracking_data_reader.py | 2 +- hyperion/helpers/trial_data_reader.py | 2 +- hyperion/helpers/vector_class_reader.py | 2 +- hyperion/helpers/vector_reader.py | 2 +- hyperion/np/__init__.py | 7 + hyperion/{ => np}/augment/__init__.py | 0 hyperion/{ => np}/augment/noise_augment.py | 4 +- hyperion/{ => np}/augment/reverb_augment.py | 2 +- hyperion/{ => np}/augment/speech_augment.py | 2 +- hyperion/{ => np}/augment/speed_augment.py | 2 +- hyperion/{ => np}/calibration/__init__.py | 0 .../{ => np}/calibration/gauss_calibration.py | 4 +- .../calibration/unsup_gauss_calibration.py | 0 hyperion/{ => np}/classifiers/__init__.py | 0 .../classifiers/binary_logistic_regression.py | 0 .../{ => np}/classifiers/greedy_fusion.py | 6 +- hyperion/{ => np}/classifiers/linear_gbe.py | 8 +- hyperion/{ => np}/classifiers/linear_gbe1.py | 8 +- .../{ => np}/classifiers/linear_gbe_up.py | 6 +- hyperion/{ => np}/classifiers/linear_svmc.py | 8 +- .../classifiers/logistic_regression.py | 8 +- .../classifiers/q_scoring_homo_gbe.py | 8 +- hyperion/{ => np}/clustering/__init__.py | 0 hyperion/{ => np}/clustering/ahc.py | 6 +- hyperion/{ => np}/clustering/kmeans.py | 6 +- hyperion/{ => np}/diarization/__init__.py | 0 .../{ => np}/diarization/diar_ahc_plda.py | 0 hyperion/{ => np}/feats/__init__.py | 0 hyperion/{ => np}/feats/energy_vad.py | 4 +- .../{ => 
np}/feats/feature_normalization.py | 2 +- hyperion/{ => np}/feats/feature_windows.py | 2 +- hyperion/{ => np}/feats/filter_banks.py | 3 +- hyperion/{ => np}/feats/frame_selector.py | 0 hyperion/{ => np}/feats/mfcc.py | 4 +- hyperion/{ => np}/feats/stft.py | 2 +- hyperion/{ => np}/metrics/__init__.py | 0 hyperion/{ => np}/metrics/acc.py | 0 hyperion/{ => np}/metrics/cllr.py | 0 hyperion/{ => np}/metrics/confidence.py | 0 hyperion/{ => np}/metrics/confusion_matrix.py | 0 hyperion/{ => np}/metrics/dcf.py | 0 hyperion/{ => np}/metrics/dcf_plot.py | 0 hyperion/{ => np}/metrics/det_plot.py | 0 hyperion/{ => np}/metrics/eer.py | 0 hyperion/{ => np}/metrics/roc.py | 0 hyperion/{ => np}/metrics/utils.py | 2 +- .../metrics/verification_evaluator.py | 6 +- hyperion/{hyp_model.py => np/np_model.py} | 4 +- .../np_model_loader.py} | 9 +- hyperion/{ => np}/pdfs/__init__.py | 0 hyperion/{ => np}/pdfs/core/__init__.py | 0 hyperion/{ => np}/pdfs/core/exp_family.py | 0 hyperion/{ => np}/pdfs/core/normal.py | 6 +- .../{ => np}/pdfs/core/normal_diag_cov.py | 4 +- hyperion/{ => np}/pdfs/core/pdf.py | 4 +- hyperion/{ => np}/pdfs/hmm/__init__.py | 0 hyperion/{ => np}/pdfs/hmm/hmm.py | 4 +- hyperion/{ => np}/pdfs/jfa/__init__.py | 0 hyperion/{ => np}/pdfs/jfa/jfa_total.py | 4 +- hyperion/{ => np}/pdfs/mixtures/__init__.py | 0 .../pdfs/mixtures/exp_family_mixture.py | 6 +- hyperion/{ => np}/pdfs/mixtures/gmm.py | 6 +- .../{ => np}/pdfs/mixtures/gmm_diag_cov.py | 6 +- .../pdfs/mixtures/gmm_tied_diag_cov.py | 6 +- hyperion/{ => np}/pdfs/plda/__init__.py | 0 hyperion/{ => np}/pdfs/plda/frplda.py | 4 +- hyperion/{ => np}/pdfs/plda/plda.py | 4 +- hyperion/{ => np}/pdfs/plda/plda_base.py | 2 +- hyperion/{ => np}/pdfs/plda/splda.py | 4 +- hyperion/{ => np}/score_norm/__init__.py | 0 hyperion/{ => np}/score_norm/adapt_s_norm.py | 0 hyperion/{ => np}/score_norm/s_norm.py | 0 hyperion/{ => np}/score_norm/score_norm.py | 4 +- hyperion/{ => np}/score_norm/t_norm.py | 0 hyperion/{ => np}/score_norm/tz_norm.py | 0 hyperion/{ => np}/score_norm/z_norm.py | 0 hyperion/{ => np}/score_norm/zt_norm.py | 0 hyperion/{ => np}/transforms/__init__.py | 0 hyperion/{ => np}/transforms/cent_whiten.py | 4 +- .../{ => np}/transforms/cent_whiten_up.py | 2 +- hyperion/{ => np}/transforms/coral.py | 4 +- hyperion/{ => np}/transforms/gaussianizer.py | 6 +- hyperion/{ => np}/transforms/lda.py | 4 +- hyperion/{ => np}/transforms/lnorm.py | 0 hyperion/{ => np}/transforms/lnorm_up.py | 1 - hyperion/{ => np}/transforms/mvn.py | 4 +- hyperion/{ => np}/transforms/nap.py | 4 +- hyperion/{ => np}/transforms/nda.py | 6 +- hyperion/{ => np}/transforms/pca.py | 4 +- hyperion/{ => np}/transforms/sb_sw.py | 6 +- hyperion/{ => np}/transforms/skl_tsne.py | 4 +- .../{ => np}/transforms/transform_list.py | 0 hyperion/pipeline/pipeline.py | 63 -------- hyperion/torch/data/audio_dataset.py | 2 +- hyperion/torch/layers/audio_feats.py | 4 +- hyperion/vb_pdfs/core/exponential_family.py | 139 ------------------ hyperion/vb_pdfs/core/pdf.py | 32 ---- requirements.txt | 1 + 143 files changed, 184 insertions(+), 418 deletions(-) create mode 100644 hyperion/np/__init__.py rename hyperion/{ => np}/augment/__init__.py (100%) rename hyperion/{ => np}/augment/noise_augment.py (98%) rename hyperion/{ => np}/augment/reverb_augment.py (99%) rename hyperion/{ => np}/augment/speech_augment.py (99%) rename hyperion/{ => np}/augment/speed_augment.py (99%) rename hyperion/{ => np}/calibration/__init__.py (100%) rename hyperion/{ => np}/calibration/gauss_calibration.py (98%) rename 
hyperion/{ => np}/calibration/unsup_gauss_calibration.py (100%) rename hyperion/{ => np}/classifiers/__init__.py (100%) rename hyperion/{ => np}/classifiers/binary_logistic_regression.py (100%) rename hyperion/{ => np}/classifiers/greedy_fusion.py (99%) rename hyperion/{ => np}/classifiers/linear_gbe.py (98%) rename hyperion/{ => np}/classifiers/linear_gbe1.py (97%) rename hyperion/{ => np}/classifiers/linear_gbe_up.py (98%) rename hyperion/{ => np}/classifiers/linear_svmc.py (98%) rename hyperion/{ => np}/classifiers/logistic_regression.py (99%) rename hyperion/{ => np}/classifiers/q_scoring_homo_gbe.py (97%) rename hyperion/{ => np}/clustering/__init__.py (100%) rename hyperion/{ => np}/clustering/ahc.py (97%) rename hyperion/{ => np}/clustering/kmeans.py (95%) rename hyperion/{ => np}/diarization/__init__.py (100%) rename hyperion/{ => np}/diarization/diar_ahc_plda.py (100%) rename hyperion/{ => np}/feats/__init__.py (100%) rename hyperion/{ => np}/feats/energy_vad.py (99%) rename hyperion/{ => np}/feats/feature_normalization.py (99%) rename hyperion/{ => np}/feats/feature_windows.py (98%) rename hyperion/{ => np}/feats/filter_banks.py (98%) rename hyperion/{ => np}/feats/frame_selector.py (100%) rename hyperion/{ => np}/feats/mfcc.py (99%) rename hyperion/{ => np}/feats/stft.py (98%) rename hyperion/{ => np}/metrics/__init__.py (100%) rename hyperion/{ => np}/metrics/acc.py (100%) rename hyperion/{ => np}/metrics/cllr.py (100%) rename hyperion/{ => np}/metrics/confidence.py (100%) rename hyperion/{ => np}/metrics/confusion_matrix.py (100%) rename hyperion/{ => np}/metrics/dcf.py (100%) rename hyperion/{ => np}/metrics/dcf_plot.py (100%) rename hyperion/{ => np}/metrics/det_plot.py (100%) rename hyperion/{ => np}/metrics/eer.py (100%) rename hyperion/{ => np}/metrics/roc.py (100%) rename hyperion/{ => np}/metrics/utils.py (99%) rename hyperion/{ => np}/metrics/verification_evaluator.py (99%) rename hyperion/{hyp_model.py => np/np_model.py} (98%) rename hyperion/{model_loader.py => np/np_model_loader.py} (78%) rename hyperion/{ => np}/pdfs/__init__.py (100%) rename hyperion/{ => np}/pdfs/core/__init__.py (100%) rename hyperion/{ => np}/pdfs/core/exp_family.py (100%) rename hyperion/{ => np}/pdfs/core/normal.py (99%) rename hyperion/{ => np}/pdfs/core/normal_diag_cov.py (99%) rename hyperion/{ => np}/pdfs/core/pdf.py (93%) rename hyperion/{ => np}/pdfs/hmm/__init__.py (100%) rename hyperion/{ => np}/pdfs/hmm/hmm.py (99%) rename hyperion/{ => np}/pdfs/jfa/__init__.py (100%) rename hyperion/{ => np}/pdfs/jfa/jfa_total.py (99%) rename hyperion/{ => np}/pdfs/mixtures/__init__.py (100%) rename hyperion/{ => np}/pdfs/mixtures/exp_family_mixture.py (99%) rename hyperion/{ => np}/pdfs/mixtures/gmm.py (99%) rename hyperion/{ => np}/pdfs/mixtures/gmm_diag_cov.py (99%) rename hyperion/{ => np}/pdfs/mixtures/gmm_tied_diag_cov.py (98%) rename hyperion/{ => np}/pdfs/plda/__init__.py (100%) rename hyperion/{ => np}/pdfs/plda/frplda.py (99%) rename hyperion/{ => np}/pdfs/plda/plda.py (99%) rename hyperion/{ => np}/pdfs/plda/plda_base.py (99%) rename hyperion/{ => np}/pdfs/plda/splda.py (99%) rename hyperion/{ => np}/score_norm/__init__.py (100%) rename hyperion/{ => np}/score_norm/adapt_s_norm.py (100%) rename hyperion/{ => np}/score_norm/s_norm.py (100%) rename hyperion/{ => np}/score_norm/score_norm.py (85%) rename hyperion/{ => np}/score_norm/t_norm.py (100%) rename hyperion/{ => np}/score_norm/tz_norm.py (100%) rename hyperion/{ => np}/score_norm/z_norm.py (100%) rename hyperion/{ => 
np}/score_norm/zt_norm.py (100%) rename hyperion/{ => np}/transforms/__init__.py (100%) rename hyperion/{ => np}/transforms/cent_whiten.py (98%) rename hyperion/{ => np}/transforms/cent_whiten_up.py (96%) rename hyperion/{ => np}/transforms/coral.py (98%) rename hyperion/{ => np}/transforms/gaussianizer.py (96%) rename hyperion/{ => np}/transforms/lda.py (98%) rename hyperion/{ => np}/transforms/lnorm.py (100%) rename hyperion/{ => np}/transforms/lnorm_up.py (99%) rename hyperion/{ => np}/transforms/mvn.py (94%) rename hyperion/{ => np}/transforms/nap.py (97%) rename hyperion/{ => np}/transforms/nda.py (94%) rename hyperion/{ => np}/transforms/pca.py (98%) rename hyperion/{ => np}/transforms/sb_sw.py (98%) rename hyperion/{ => np}/transforms/skl_tsne.py (99%) rename hyperion/{ => np}/transforms/transform_list.py (100%) delete mode 100644 hyperion/pipeline/pipeline.py delete mode 100644 hyperion/vb_pdfs/core/exponential_family.py delete mode 100644 hyperion/vb_pdfs/core/pdf.py diff --git a/hyperion/bin/apply-mvn-select-frames.py b/hyperion/bin/apply-mvn-select-frames.py index 71c52cda..4f73628e 100755 --- a/hyperion/bin/apply-mvn-select-frames.py +++ b/hyperion/bin/apply-mvn-select-frames.py @@ -23,8 +23,8 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import RandomAccessDataReaderFactory as RDRF -from hyperion.feats import MeanVarianceNorm as MVN -from hyperion.feats import FrameSelector as FSel +from hyperion.np.feats import MeanVarianceNorm as MVN +from hyperion.np.feats import FrameSelector as FSel def process_feats( diff --git a/hyperion/bin/compute-energy-vad.py b/hyperion/bin/compute-energy-vad.py index 397aea80..99f562cf 100755 --- a/hyperion/bin/compute-energy-vad.py +++ b/hyperion/bin/compute-energy-vad.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR from hyperion.io import DataWriterFactory as DWF -from hyperion.feats import EnergyVAD +from hyperion.np.feats import EnergyVAD def compute_vad(input_path, output_path, write_num_frames, **kwargs): diff --git a/hyperion/bin/compute-mfcc-feats.py b/hyperion/bin/compute-mfcc-feats.py index 589d3188..b7e90056 100755 --- a/hyperion/bin/compute-mfcc-feats.py +++ b/hyperion/bin/compute-mfcc-feats.py @@ -21,7 +21,7 @@ from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import DataWriterFactory as DWF from hyperion.io import compression_methods -from hyperion.feats import MFCC +from hyperion.np.feats import MFCC def compute_mfcc_feats( diff --git a/hyperion/bin/eval-cos-1vs1.py b/hyperion/bin/eval-cos-1vs1.py index 123221f2..16c9122a 100755 --- a/hyperion/bin/eval-cos-1vs1.py +++ b/hyperion/bin/eval-cos-1vs1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR -from hyperion.transforms import TransformList, LNorm +from hyperion.np.transforms import TransformList, LNorm def eval_cos( diff --git a/hyperion/bin/eval-linear-gbe-up.py b/hyperion/bin/eval-linear-gbe-up.py index 287117fd..a8c3d999 100755 --- a/hyperion/bin/eval-linear-gbe-up.py +++ b/hyperion/bin/eval-linear-gbe-up.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.io import HypDataWriter as HDW from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearGBEUP as GBE +from 
hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearGBEUP as GBE def eval_linear_gbe( diff --git a/hyperion/bin/eval-linear-gbe.py b/hyperion/bin/eval-linear-gbe.py index a93b6c39..0970bb5e 100755 --- a/hyperion/bin/eval-linear-gbe.py +++ b/hyperion/bin/eval-linear-gbe.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.io import HypDataWriter as HDW from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearGBE as GBE +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearGBE as GBE def eval_linear_gbe( diff --git a/hyperion/bin/eval-linear-svmc.py b/hyperion/bin/eval-linear-svmc.py index ff7b1faa..d6c96c11 100755 --- a/hyperion/bin/eval-linear-svmc.py +++ b/hyperion/bin/eval-linear-svmc.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.io import HypDataWriter as HDW from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearSVMC as SVM +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearSVMC as SVM def eval_svm( diff --git a/hyperion/bin/eval-logistic-regression.py b/hyperion/bin/eval-logistic-regression.py index d96e2473..91a092ea 100755 --- a/hyperion/bin/eval-logistic-regression.py +++ b/hyperion/bin/eval-logistic-regression.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.io import HypDataWriter as HDW from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.transforms import TransformList -from hyperion.classifiers import LogisticRegression as LR +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LogisticRegression as LR def eval_lr( diff --git a/hyperion/bin/eval-plda-1vs1.py b/hyperion/bin/eval-plda-1vs1.py index 715d043a..eadf4a87 100755 --- a/hyperion/bin/eval-plda-1vs1.py +++ b/hyperion/bin/eval-plda-1vs1.py @@ -20,7 +20,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/hyperion/bin/eval-plda-nvs1.py b/hyperion/bin/eval-plda-nvs1.py index 30ea2606..5a63e5a5 100755 --- a/hyperion/bin/eval-plda-nvs1.py +++ b/hyperion/bin/eval-plda-nvs1.py @@ -20,7 +20,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/hyperion/bin/plot-vector-hist.py b/hyperion/bin/plot-vector-hist.py index cd86b1c1..60560a80 100755 --- a/hyperion/bin/plot-vector-hist.py +++ b/hyperion/bin/plot-vector-hist.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def plot_vector_hist( diff --git a/hyperion/bin/plot-vector-tsne.py b/hyperion/bin/plot-vector-tsne.py index 030d7e39..c4c30302 100755 --- a/hyperion/bin/plot-vector-tsne.py +++ b/hyperion/bin/plot-vector-tsne.py @@ -22,7 +22,7 @@ from hyperion.hyp_defs import config_logger from hyperion.io import 
DataWriterFactory as DWF from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, PCA +from hyperion.np.transforms import TransformList, PCA colors = ["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] diff --git a/hyperion/bin/torch-compute-mfcc-feats.py b/hyperion/bin/torch-compute-mfcc-feats.py index 5f7d9f7d..4fc6bec2 100755 --- a/hyperion/bin/torch-compute-mfcc-feats.py +++ b/hyperion/bin/torch-compute-mfcc-feats.py @@ -22,7 +22,6 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import compression_methods from hyperion.torch.layers import AudioFeatsFactory as AFF -from hyperion.feats import MFCC def compute_mfcc_feats( diff --git a/hyperion/bin/torch-eval-vae.py b/hyperion/bin/torch-eval-vae.py index dfcdaa38..44ed0bfb 100755 --- a/hyperion/bin/torch-eval-vae.py +++ b/hyperion/bin/torch-eval-vae.py @@ -29,7 +29,7 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF -from hyperion.feats import MeanVarianceNorm as MVN +from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py index 8d55b719..411873ac 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py @@ -27,7 +27,7 @@ from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores from hyperion.utils.list_utils import ismember from hyperion.io import VADReaderFactory as VRF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch.utils import open_device from hyperion.torch.layers import LinBinCalibrator as Calibrator diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py index a5783654..18d6843f 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py @@ -27,7 +27,7 @@ from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores from hyperion.utils.list_utils import ismember from hyperion.io import VADReaderFactory as VRF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch.utils import open_device from hyperion.torch.layers import LinBinCalibrator as Calibrator diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py index 44a3b98f..73da6088 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py @@ -28,7 +28,7 @@ from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores from hyperion.utils.list_utils import ismember from hyperion.io import VADReaderFactory as VRF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch.utils import open_device from hyperion.torch.layers import 
LinBinCalibrator as Calibrator diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py index c7bcc50a..a8b4b962 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py @@ -26,7 +26,7 @@ from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores from hyperion.utils.list_utils import ismember from hyperion.io import VADReaderFactory as VRF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch.utils import open_device from hyperion.torch.layers import LinBinCalibrator as Calibrator diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py index 4b08c7ab..51a8afbb 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py @@ -27,7 +27,7 @@ from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores from hyperion.utils.list_utils import ismember from hyperion.io import VADReaderFactory as VRF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch.utils import open_device from hyperion.torch.layers import LinBinCalibrator as Calibrator diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py index 9d9d4666..9fcc8f30 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py @@ -28,7 +28,7 @@ from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores from hyperion.utils.list_utils import ismember from hyperion.io import VADReaderFactory as VRF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch.utils import open_device from hyperion.torch.layers import LinBinCalibrator as Calibrator diff --git a/hyperion/bin/torch-eval-xvec-logits-from-wav.py b/hyperion/bin/torch-eval-xvec-logits-from-wav.py index 58cc9005..61acebd4 100755 --- a/hyperion/bin/torch-eval-xvec-logits-from-wav.py +++ b/hyperion/bin/torch-eval-xvec-logits-from-wav.py @@ -25,7 +25,7 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF -from hyperion.augment import SpeechAugment +from hyperion.np.augment import SpeechAugment from hyperion.torch.utils import open_device from hyperion.torch.narchs import AudioFeatsMVN as AF diff --git a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py b/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py index bf227045..c4f1ba9a 100755 --- a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py +++ b/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py @@ -25,7 +25,7 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF -from hyperion.augment import SpeechAugment +from hyperion.np.augment import SpeechAugment from hyperion.torch.utils import open_device from hyperion.torch.narchs import 
AudioFeatsMVN as AF diff --git a/hyperion/bin/torch-extract-xvectors-from-wav.py b/hyperion/bin/torch-extract-xvectors-from-wav.py index 0aea084e..48c23687 100755 --- a/hyperion/bin/torch-extract-xvectors-from-wav.py +++ b/hyperion/bin/torch-extract-xvectors-from-wav.py @@ -25,7 +25,7 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF -from hyperion.augment import SpeechAugment +from hyperion.np.augment import SpeechAugment from hyperion.torch.utils import open_device from hyperion.torch.narchs import AudioFeatsMVN as AF diff --git a/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py b/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py index e3ab70e9..ecf65037 100755 --- a/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py +++ b/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py @@ -26,7 +26,7 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF -from hyperion.augment import SpeechAugment +from hyperion.np.augment import SpeechAugment from hyperion.torch.utils import open_device from hyperion.torch.narchs import AudioFeatsMVN as AF diff --git a/hyperion/bin/torch-extract-xvectors-slidwin.py b/hyperion/bin/torch-extract-xvectors-slidwin.py index 0e2f0173..7d6d9f11 100755 --- a/hyperion/bin/torch-extract-xvectors-slidwin.py +++ b/hyperion/bin/torch-extract-xvectors-slidwin.py @@ -24,7 +24,7 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF -from hyperion.feats import MeanVarianceNorm as MVN +from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML diff --git a/hyperion/bin/torch-extract-xvectors-vae-preproc.py b/hyperion/bin/torch-extract-xvectors-vae-preproc.py index 376de911..64f6359d 100755 --- a/hyperion/bin/torch-extract-xvectors-vae-preproc.py +++ b/hyperion/bin/torch-extract-xvectors-vae-preproc.py @@ -24,7 +24,7 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF -from hyperion.feats import MeanVarianceNorm as MVN +from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML diff --git a/hyperion/bin/torch-extract-xvectors.py b/hyperion/bin/torch-extract-xvectors.py index 18bab96f..f36e35e2 100755 --- a/hyperion/bin/torch-extract-xvectors.py +++ b/hyperion/bin/torch-extract-xvectors.py @@ -24,7 +24,7 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF -from hyperion.feats import MeanVarianceNorm as MVN +from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML diff --git a/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py b/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py index 58f73b00..c13bd815 100755 --- a/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py +++ b/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py @@ -29,7 +29,7 @@ from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores from hyperion.utils.list_utils import ismember from 
hyperion.io import VADReaderFactory as VRF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch.utils import open_device from hyperion.torch.layers import LinBinCalibrator as Calibrator diff --git a/hyperion/bin/train-cw-up.py b/hyperion/bin/train-cw-up.py index 48b8dfc4..a7392a32 100755 --- a/hyperion/bin/train-cw-up.py +++ b/hyperion/bin/train-cw-up.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.pdfs.core import Normal -from hyperion.transforms import TransformList, CentWhitenUP, LNormUP +from hyperion.np.pdfs.core import Normal +from hyperion.np.transforms import TransformList, CentWhitenUP, LNormUP def load_model(input_path, with_lnorm, name, **kwargs): diff --git a/hyperion/bin/train-cw.py b/hyperion/bin/train-cw.py index c64d4892..a70485a6 100755 --- a/hyperion/bin/train-cw.py +++ b/hyperion/bin/train-cw.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.pdfs.core import Normal -from hyperion.transforms import TransformList, CentWhiten, LNorm +from hyperion.np.pdfs.core import Normal +from hyperion.np.transforms import TransformList, CentWhiten, LNorm def load_model(input_path, with_lnorm, name, **kwargs): diff --git a/hyperion/bin/train-gaussianizer.py b/hyperion/bin/train-gaussianizer.py index eefd2456..a265403e 100755 --- a/hyperion/bin/train-gaussianizer.py +++ b/hyperion/bin/train-gaussianizer.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.pdfs.core import Normal -from hyperion.transforms import TransformList, Gaussianizer +from hyperion.np.pdfs.core import Normal +from hyperion.np.transforms import TransformList, Gaussianizer def load_model(input_path, **kwargs): diff --git a/hyperion/bin/train-lda.py b/hyperion/bin/train-lda.py index 17cd5ab6..36217c8f 100755 --- a/hyperion/bin/train-lda.py +++ b/hyperion/bin/train-lda.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, LDA, SbSw +from hyperion.np.transforms import TransformList, LDA, SbSw def train_lda( diff --git a/hyperion/bin/train-linear-gbe-up.py b/hyperion/bin/train-linear-gbe-up.py index 3e102b1f..5accb785 100755 --- a/hyperion/bin/train-linear-gbe-up.py +++ b/hyperion/bin/train-linear-gbe-up.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearGBEUP as GBE +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearGBEUP as GBE def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs): diff --git a/hyperion/bin/train-linear-gbe.py b/hyperion/bin/train-linear-gbe.py index 1428358e..a7ac5236 100755 --- a/hyperion/bin/train-linear-gbe.py +++ b/hyperion/bin/train-linear-gbe.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearGBE as GBE +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearGBE as GBE def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs): diff 
--git a/hyperion/bin/train-linear-svmc.py b/hyperion/bin/train-linear-svmc.py index 6c0e2de2..6b589491 100755 --- a/hyperion/bin/train-linear-svmc.py +++ b/hyperion/bin/train-linear-svmc.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList -from hyperion.classifiers import LinearSVMC as SVM +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearSVMC as SVM def train_svm(iv_file, train_list, preproc_file, output_path, **kwargs): diff --git a/hyperion/bin/train-logistic-regression.py b/hyperion/bin/train-logistic-regression.py index 6a409119..1d657dc4 100755 --- a/hyperion/bin/train-logistic-regression.py +++ b/hyperion/bin/train-logistic-regression.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList -from hyperion.classifiers import LogisticRegression as LR +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LogisticRegression as LR def train_lr(iv_file, train_list, preproc_file, output_path, **kwargs): diff --git a/hyperion/bin/train-mvn.py b/hyperion/bin/train-mvn.py index 8ddc5e92..a0204fd5 100755 --- a/hyperion/bin/train-mvn.py +++ b/hyperion/bin/train-mvn.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.pdfs.core import Normal -from hyperion.transforms import TransformList, MVN, SbSw +from hyperion.np.pdfs.core import Normal +from hyperion.np.transforms import TransformList, MVN, SbSw def train_mvn( diff --git a/hyperion/bin/train-nda.py b/hyperion/bin/train-nda.py index dcc856ed..11cd7da3 100755 --- a/hyperion/bin/train-nda.py +++ b/hyperion/bin/train-nda.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, NDA, NSbSw +from hyperion.np.transforms import TransformList, NDA, NSbSw def train_nda( diff --git a/hyperion/bin/train-pca.py b/hyperion/bin/train-pca.py index b82a7772..d1ab1c7e 100755 --- a/hyperion/bin/train-pca.py +++ b/hyperion/bin/train-pca.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, PCA +from hyperion.np.transforms import TransformList, PCA def load_model(input_path, name, **kwargs): diff --git a/hyperion/bin/train-plda.py b/hyperion/bin/train-plda.py index ba9a40c2..26f6e0a8 100755 --- a/hyperion/bin/train-plda.py +++ b/hyperion/bin/train-plda.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def train_plda( diff --git a/hyperion/helpers/__init__.py b/hyperion/helpers/__init__.py index eeaf2cce..48bf1476 100644 --- a/hyperion/helpers/__init__.py +++ b/hyperion/helpers/__init__.py @@ -11,8 +11,4 @@ from .multi_test_trial_data_reader_v2 import MultiTestTrialDataReaderV2 from .classif_trial_data_reader import ClassifTrialDataReader -# from .sequence_reader import SequenceReader -# from .sequence_class_reader import SequenceClassReader -# from .sequence_post_reader import SequencePostReader -# from .sequence_post_class_reader import SequencePostClassReader from 
.plda_factory import PLDAFactory diff --git a/hyperion/helpers/classif_trial_data_reader.py b/hyperion/helpers/classif_trial_data_reader.py index f5d74640..f7aeb727 100644 --- a/hyperion/helpers/classif_trial_data_reader.py +++ b/hyperion/helpers/classif_trial_data_reader.py @@ -14,7 +14,7 @@ from ..io import HypDataReader from ..utils import TrialNdx, SCPList -from ..transforms import TransformList +from ..np.transforms import TransformList class ClassifTrialDataReader(object): diff --git a/hyperion/helpers/multi_test_trial_data_reader.py b/hyperion/helpers/multi_test_trial_data_reader.py index 57355cd0..eeea60f2 100644 --- a/hyperion/helpers/multi_test_trial_data_reader.py +++ b/hyperion/helpers/multi_test_trial_data_reader.py @@ -13,7 +13,7 @@ from ..io import RandomAccessDataReaderFactory as DRF from ..utils import TrialNdx, TrialKey, Utt2Info -from ..transforms import TransformList +from ..np.transforms import TransformList class MultiTestTrialDataReader(object): diff --git a/hyperion/helpers/multi_test_trial_data_reader_v2.py b/hyperion/helpers/multi_test_trial_data_reader_v2.py index 306f75ae..43fd1254 100644 --- a/hyperion/helpers/multi_test_trial_data_reader_v2.py +++ b/hyperion/helpers/multi_test_trial_data_reader_v2.py @@ -13,7 +13,7 @@ from ..io import RandomAccessDataReaderFactory as DRF from ..utils import Utt2Info, TrialNdx, TrialKey -from ..transforms import TransformList +from ..np.transforms import TransformList class MultiTestTrialDataReaderV2(object): diff --git a/hyperion/helpers/plda_factory.py b/hyperion/helpers/plda_factory.py index b9c2ec60..0fdd2609 100644 --- a/hyperion/helpers/plda_factory.py +++ b/hyperion/helpers/plda_factory.py @@ -5,7 +5,7 @@ import numpy as np -from ..pdfs.plda import FRPLDA, SPLDA, PLDA +from ..np.pdfs.plda import FRPLDA, SPLDA, PLDA class PLDAFactory(object): diff --git a/hyperion/helpers/tracking_data_reader.py b/hyperion/helpers/tracking_data_reader.py index 6dfc9a19..4bac5be2 100644 --- a/hyperion/helpers/tracking_data_reader.py +++ b/hyperion/helpers/tracking_data_reader.py @@ -13,7 +13,7 @@ from ..io import RandomAccessDataReaderFactory as DRF from ..utils import Utt2Info, TrialNdx, ExtSegmentList -from ..transforms import TransformList +from ..np.transforms import TransformList class TrackingDataReader(object): diff --git a/hyperion/helpers/trial_data_reader.py b/hyperion/helpers/trial_data_reader.py index 984cdb1f..e6da5b7d 100644 --- a/hyperion/helpers/trial_data_reader.py +++ b/hyperion/helpers/trial_data_reader.py @@ -13,7 +13,7 @@ from ..io import RandomAccessDataReaderFactory as DRF from ..utils.utt2info import Utt2Info from ..utils import TrialNdx, TrialKey -from ..transforms import TransformList +from ..np.transforms import TransformList class TrialDataReader(object): diff --git a/hyperion/helpers/vector_class_reader.py b/hyperion/helpers/vector_class_reader.py index 4f893aac..0c6f346d 100644 --- a/hyperion/helpers/vector_class_reader.py +++ b/hyperion/helpers/vector_class_reader.py @@ -14,7 +14,7 @@ from ..io import RandomAccessDataReaderFactory as DRF from ..utils.utt2info import Utt2Info from ..utils.tensors import to3D_by_class -from ..transforms import TransformList +from ..np.transforms import TransformList class VectorClassReader(object): diff --git a/hyperion/helpers/vector_reader.py b/hyperion/helpers/vector_reader.py index 3f0fa1d2..0ac1b11a 100644 --- a/hyperion/helpers/vector_reader.py +++ b/hyperion/helpers/vector_reader.py @@ -13,7 +13,7 @@ from ..io import RandomAccessDataReaderFactory as DRF from 
..utils.scp_list import SCPList -from ..transforms import TransformList +from ..np.transforms import TransformList class VectorReader(object): diff --git a/hyperion/np/__init__.py b/hyperion/np/__init__.py new file mode 100644 index 00000000..d2774314 --- /dev/null +++ b/hyperion/np/__init__.py @@ -0,0 +1,7 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .np_model import NPModel +from .np_model_loader import NPModelLoader diff --git a/hyperion/augment/__init__.py b/hyperion/np/augment/__init__.py similarity index 100% rename from hyperion/augment/__init__.py rename to hyperion/np/augment/__init__.py diff --git a/hyperion/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py similarity index 98% rename from hyperion/augment/noise_augment.py rename to hyperion/np/augment/noise_augment.py index ad88ff08..e180a292 100644 --- a/hyperion/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -11,8 +11,8 @@ import numpy as np -from ..hyp_defs import float_cpu -from ..io import RandomAccessAudioReader as AR +from ...hyp_defs import float_cpu +from ...io import RandomAccessAudioReader as AR class SingleNoiseAugment(object): diff --git a/hyperion/augment/reverb_augment.py b/hyperion/np/augment/reverb_augment.py similarity index 99% rename from hyperion/augment/reverb_augment.py rename to hyperion/np/augment/reverb_augment.py index 9f80c168..fe225e53 100644 --- a/hyperion/augment/reverb_augment.py +++ b/hyperion/np/augment/reverb_augment.py @@ -14,7 +14,7 @@ import numpy as np from scipy import signal -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu from ..io import RandomAccessDataReaderFactory as DRF diff --git a/hyperion/augment/speech_augment.py b/hyperion/np/augment/speech_augment.py similarity index 99% rename from hyperion/augment/speech_augment.py rename to hyperion/np/augment/speech_augment.py index b6756ce7..e3eab4ea 100644 --- a/hyperion/augment/speech_augment.py +++ b/hyperion/np/augment/speech_augment.py @@ -9,7 +9,7 @@ import numpy as np -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu from .noise_augment import NoiseAugment from .reverb_augment import ReverbAugment diff --git a/hyperion/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py similarity index 99% rename from hyperion/augment/speed_augment.py rename to hyperion/np/augment/speed_augment.py index b72bf338..7fdaab3c 100644 --- a/hyperion/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -9,7 +9,7 @@ import numpy as np from librosa.effects import time_stretch -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu class SpeedAugment(object): diff --git a/hyperion/calibration/__init__.py b/hyperion/np/calibration/__init__.py similarity index 100% rename from hyperion/calibration/__init__.py rename to hyperion/np/calibration/__init__.py diff --git a/hyperion/calibration/gauss_calibration.py b/hyperion/np/calibration/gauss_calibration.py similarity index 98% rename from hyperion/calibration/gauss_calibration.py rename to hyperion/np/calibration/gauss_calibration.py index 07d882ed..630d5e95 100644 --- a/hyperion/calibration/gauss_calibration.py +++ b/hyperion/np/calibration/gauss_calibration.py @@ -4,10 +4,10 @@ """ import numpy as np -from ..hyp_model import HypModel +from ..np_model import NPModel -class GaussCalibration(HypModel): +class GaussCalibration(NPModel): """Class for supervised Gaussian calibration. 
The model assumes that target and non-target score distributions are Gaussians with shared covariance. diff --git a/hyperion/calibration/unsup_gauss_calibration.py b/hyperion/np/calibration/unsup_gauss_calibration.py similarity index 100% rename from hyperion/calibration/unsup_gauss_calibration.py rename to hyperion/np/calibration/unsup_gauss_calibration.py diff --git a/hyperion/classifiers/__init__.py b/hyperion/np/classifiers/__init__.py similarity index 100% rename from hyperion/classifiers/__init__.py rename to hyperion/np/classifiers/__init__.py diff --git a/hyperion/classifiers/binary_logistic_regression.py b/hyperion/np/classifiers/binary_logistic_regression.py similarity index 100% rename from hyperion/classifiers/binary_logistic_regression.py rename to hyperion/np/classifiers/binary_logistic_regression.py diff --git a/hyperion/classifiers/greedy_fusion.py b/hyperion/np/classifiers/greedy_fusion.py similarity index 99% rename from hyperion/classifiers/greedy_fusion.py rename to hyperion/np/classifiers/greedy_fusion.py index 6eff32ad..2102bc22 100644 --- a/hyperion/classifiers/greedy_fusion.py +++ b/hyperion/np/classifiers/greedy_fusion.py @@ -6,14 +6,14 @@ import logging import numpy as np -from ..hyp_defs import float_cpu, float_save -from ..hyp_model import HypModel +from ...hyp_defs import float_cpu, float_save +from ..np_model import NPModel from ..metrics import dcf from .binary_logistic_regression import BinaryLogisticRegression as BLR -class GreedyFusionBinaryLR(HypModel): +class GreedyFusionBinaryLR(NPModel): """Greedy score fusion based on binary logistic regression. It computes ``max_systems`` fusions. The best system, the best fusion of two, diff --git a/hyperion/classifiers/linear_gbe.py b/hyperion/np/classifiers/linear_gbe.py similarity index 98% rename from hyperion/classifiers/linear_gbe.py rename to hyperion/np/classifiers/linear_gbe.py index 075ea751..c786cb50 100644 --- a/hyperion/classifiers/linear_gbe.py +++ b/hyperion/np/classifiers/linear_gbe.py @@ -7,12 +7,12 @@ import numpy as np from scipy.special import gammaln -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax +from ...hyp_defs import float_cpu +from ..np_model import NPModel +from ...utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax -class LinearGBE(HypModel): +class LinearGBE(NPModel): """Linear Gaussian Back-end. 
Attributes: diff --git a/hyperion/classifiers/linear_gbe1.py b/hyperion/np/classifiers/linear_gbe1.py similarity index 97% rename from hyperion/classifiers/linear_gbe1.py rename to hyperion/np/classifiers/linear_gbe1.py index 71edd606..8c5df381 100644 --- a/hyperion/classifiers/linear_gbe1.py +++ b/hyperion/np/classifiers/linear_gbe1.py @@ -5,12 +5,12 @@ import numpy as np -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax +from ...hyp_defs import float_cpu +from ..np_model import NPModel +from ...utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax -class LinearGBE(HypModel): +class LinearGBE(NPModel): def __init__( self, mu=None, diff --git a/hyperion/classifiers/linear_gbe_up.py b/hyperion/np/classifiers/linear_gbe_up.py similarity index 98% rename from hyperion/classifiers/linear_gbe_up.py rename to hyperion/np/classifiers/linear_gbe_up.py index 8c855dfa..4a489639 100644 --- a/hyperion/classifiers/linear_gbe_up.py +++ b/hyperion/np/classifiers/linear_gbe_up.py @@ -7,9 +7,9 @@ import numpy as np from scipy.special import gammaln -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import ( +from ...hyp_defs import float_cpu +from ..np_model import NPModel +from ...utils.math import ( int2onehot, logdet_pdmat, invert_pdmat, diff --git a/hyperion/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py similarity index 98% rename from hyperion/classifiers/linear_svmc.py rename to hyperion/np/classifiers/linear_svmc.py index 244e0dc0..df14a16e 100644 --- a/hyperion/classifiers/linear_svmc.py +++ b/hyperion/np/classifiers/linear_svmc.py @@ -8,12 +8,12 @@ from sklearn.svm import LinearSVC as SVC -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import softmax +from ...hyp_defs import float_cpu +from ..np_model import NPModel +from ...utils.math import softmax -class LinearSVMC(HypModel): +class LinearSVMC(NPModel): """Linear Support Vector Machine for Classification. Attributes: diff --git a/hyperion/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py similarity index 99% rename from hyperion/classifiers/logistic_regression.py rename to hyperion/np/classifiers/logistic_regression.py index ad845170..932a28e3 100644 --- a/hyperion/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -8,12 +8,12 @@ from sklearn.linear_model import LogisticRegression as LR -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import softmax +from ...hyp_defs import float_cpu +from ..np_model import NPModel +from ...utils.math import softmax -class LogisticRegression(HypModel): +class LogisticRegression(NPModel): """Multi-class logistic regression. This is a wrapper that adds functionalities to sklearn logistic regression. 
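A minimal before/after sketch of the import change this patch applies across the tree; the class name comes from the hunks above, and the commented "old" lines are illustrative of the pre-patch layout:

    # Old layout: numpy-based models were imported from the package root:
    #   from hyperion.classifiers import LogisticRegression as LR
    # New layout: the same classes live under the np subpackage:
    from hyperion.np.classifiers import LogisticRegression as LR

    # Inside the moved modules, relative imports to package-level utilities
    # gain one level, since the code now sits one directory deeper:
    #   old (hyperion/classifiers/):    from ..hyp_defs import float_cpu
    #   new (hyperion/np/classifiers/): from ...hyp_defs import float_cpu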
diff --git a/hyperion/classifiers/q_scoring_homo_gbe.py b/hyperion/np/classifiers/q_scoring_homo_gbe.py similarity index 97% rename from hyperion/classifiers/q_scoring_homo_gbe.py rename to hyperion/np/classifiers/q_scoring_homo_gbe.py index 83f2408b..8ef42052 100644 --- a/hyperion/classifiers/q_scoring_homo_gbe.py +++ b/hyperion/np/classifiers/q_scoring_homo_gbe.py @@ -7,12 +7,12 @@ import numpy as np from scipy.special import gammaln -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel -from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax +from ...hyp_defs import float_cpu +from ..np_model import NPModel +from ...utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax -class QScoringHomoGBE(HypModel): +class QScoringHomoGBE(NPModel): def __init__( self, mu=None, diff --git a/hyperion/clustering/__init__.py b/hyperion/np/clustering/__init__.py similarity index 100% rename from hyperion/clustering/__init__.py rename to hyperion/np/clustering/__init__.py diff --git a/hyperion/clustering/ahc.py b/hyperion/np/clustering/ahc.py similarity index 97% rename from hyperion/clustering/ahc.py rename to hyperion/np/clustering/ahc.py index 2f8dbe08..fc5dbb30 100644 --- a/hyperion/clustering/ahc.py +++ b/hyperion/np/clustering/ahc.py @@ -10,11 +10,11 @@ from scipy.cluster.hierarchy import linkage from sklearn.metrics import homogeneity_score, completeness_score -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel +from ...hyp_defs import float_cpu +from ..np_model import NPModel -class AHC(HypModel): +class AHC(NPModel): def __init__(self, method="average", metric="llr", **kwargs): super().__init__(**kwargs) self.method = method diff --git a/hyperion/clustering/kmeans.py b/hyperion/np/clustering/kmeans.py similarity index 95% rename from hyperion/clustering/kmeans.py rename to hyperion/np/clustering/kmeans.py index 7da2bd01..9d8758e1 100644 --- a/hyperion/clustering/kmeans.py +++ b/hyperion/np/clustering/kmeans.py @@ -8,11 +8,11 @@ import numpy as np import h5py -from ..hyp_defs import float_cpu -from ..hyp_model import HypModel +from ...hyp_defs import float_cpu +from ..np_model import NPModel -class KMeans(HypModel): +class KMeans(NPModel): def __init__(self, num_clusters, mu=None, rtol=0.001, **kwargs): super(KMeans, self).__init__(**kwargs) self.num_clusters = num_clusters diff --git a/hyperion/diarization/__init__.py b/hyperion/np/diarization/__init__.py similarity index 100% rename from hyperion/diarization/__init__.py rename to hyperion/np/diarization/__init__.py diff --git a/hyperion/diarization/diar_ahc_plda.py b/hyperion/np/diarization/diar_ahc_plda.py similarity index 100% rename from hyperion/diarization/diar_ahc_plda.py rename to hyperion/np/diarization/diar_ahc_plda.py diff --git a/hyperion/feats/__init__.py b/hyperion/np/feats/__init__.py similarity index 100% rename from hyperion/feats/__init__.py rename to hyperion/np/feats/__init__.py diff --git a/hyperion/feats/energy_vad.py b/hyperion/np/feats/energy_vad.py similarity index 99% rename from hyperion/feats/energy_vad.py rename to hyperion/np/feats/energy_vad.py index 734e86bb..7785318f 100644 --- a/hyperion/feats/energy_vad.py +++ b/hyperion/np/feats/energy_vad.py @@ -7,8 +7,8 @@ import numpy as np from scipy.signal import lfilter -from ..hyp_defs import float_cpu -from ..utils.misc import str2bool +from ...hyp_defs import float_cpu +from ...utils.misc import str2bool from .stft import st_logE diff --git a/hyperion/feats/feature_normalization.py 
b/hyperion/np/feats/feature_normalization.py similarity index 99% rename from hyperion/feats/feature_normalization.py rename to hyperion/np/feats/feature_normalization.py index 313d027b..38f7b766 100644 --- a/hyperion/feats/feature_normalization.py +++ b/hyperion/np/feats/feature_normalization.py @@ -7,7 +7,7 @@ from jsonargparse import ArgumentParser, ActionParser from scipy.signal import convolve2d -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu class MeanVarianceNorm(object): diff --git a/hyperion/feats/feature_windows.py b/hyperion/np/feats/feature_windows.py similarity index 98% rename from hyperion/feats/feature_windows.py rename to hyperion/np/feats/feature_windows.py index 3e880f7e..ae5d07d2 100644 --- a/hyperion/feats/feature_windows.py +++ b/hyperion/np/feats/feature_windows.py @@ -8,7 +8,7 @@ import numpy as np from scipy.signal import blackman, hamming, hann -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu class FeatureWindowFactory(object): diff --git a/hyperion/feats/filter_banks.py b/hyperion/np/feats/filter_banks.py similarity index 98% rename from hyperion/feats/filter_banks.py rename to hyperion/np/feats/filter_banks.py index b92535da..3b0da644 100644 --- a/hyperion/feats/filter_banks.py +++ b/hyperion/np/feats/filter_banks.py @@ -9,8 +9,7 @@ import numpy as np from librosa.filters import mel as make_mel_librosa -from ..hyp_defs import float_cpu -from ..utils.misc import str2bool +from ...hyp_defs import float_cpu class FilterBankFactory(object): diff --git a/hyperion/feats/frame_selector.py b/hyperion/np/feats/frame_selector.py similarity index 100% rename from hyperion/feats/frame_selector.py rename to hyperion/np/feats/frame_selector.py diff --git a/hyperion/feats/mfcc.py b/hyperion/np/feats/mfcc.py similarity index 99% rename from hyperion/feats/mfcc.py rename to hyperion/np/feats/mfcc.py index 94af5c2e..cf517ee5 100644 --- a/hyperion/feats/mfcc.py +++ b/hyperion/np/feats/mfcc.py @@ -9,8 +9,8 @@ from scipy.fftpack import dct from scipy.signal import lfilter -from ..hyp_defs import float_cpu -from ..utils.misc import str2bool +from ...hyp_defs import float_cpu +from ...utils.misc import str2bool from .feature_windows import FeatureWindowFactory as FWF from .filter_banks import FilterBankFactory as FBF from .stft import strft, st_logE diff --git a/hyperion/feats/stft.py b/hyperion/np/feats/stft.py similarity index 98% rename from hyperion/feats/stft.py rename to hyperion/np/feats/stft.py index 7f22bdee..0a55453c 100644 --- a/hyperion/feats/stft.py +++ b/hyperion/np/feats/stft.py @@ -7,7 +7,7 @@ import numpy as np -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu def stft(x, frame_length, frame_shift, fft_length, window=None): diff --git a/hyperion/metrics/__init__.py b/hyperion/np/metrics/__init__.py similarity index 100% rename from hyperion/metrics/__init__.py rename to hyperion/np/metrics/__init__.py diff --git a/hyperion/metrics/acc.py b/hyperion/np/metrics/acc.py similarity index 100% rename from hyperion/metrics/acc.py rename to hyperion/np/metrics/acc.py diff --git a/hyperion/metrics/cllr.py b/hyperion/np/metrics/cllr.py similarity index 100% rename from hyperion/metrics/cllr.py rename to hyperion/np/metrics/cllr.py diff --git a/hyperion/metrics/confidence.py b/hyperion/np/metrics/confidence.py similarity index 100% rename from hyperion/metrics/confidence.py rename to hyperion/np/metrics/confidence.py diff --git a/hyperion/metrics/confusion_matrix.py b/hyperion/np/metrics/confusion_matrix.py similarity 
index 100% rename from hyperion/metrics/confusion_matrix.py rename to hyperion/np/metrics/confusion_matrix.py diff --git a/hyperion/metrics/dcf.py b/hyperion/np/metrics/dcf.py similarity index 100% rename from hyperion/metrics/dcf.py rename to hyperion/np/metrics/dcf.py diff --git a/hyperion/metrics/dcf_plot.py b/hyperion/np/metrics/dcf_plot.py similarity index 100% rename from hyperion/metrics/dcf_plot.py rename to hyperion/np/metrics/dcf_plot.py diff --git a/hyperion/metrics/det_plot.py b/hyperion/np/metrics/det_plot.py similarity index 100% rename from hyperion/metrics/det_plot.py rename to hyperion/np/metrics/det_plot.py diff --git a/hyperion/metrics/eer.py b/hyperion/np/metrics/eer.py similarity index 100% rename from hyperion/metrics/eer.py rename to hyperion/np/metrics/eer.py diff --git a/hyperion/metrics/roc.py b/hyperion/np/metrics/roc.py similarity index 100% rename from hyperion/metrics/roc.py rename to hyperion/np/metrics/roc.py diff --git a/hyperion/metrics/utils.py b/hyperion/np/metrics/utils.py similarity index 99% rename from hyperion/metrics/utils.py rename to hyperion/np/metrics/utils.py index 8a764c3d..4f06bb18 100644 --- a/hyperion/metrics/utils.py +++ b/hyperion/np/metrics/utils.py @@ -7,7 +7,7 @@ import numpy as np -from ..hyp_defs import float_cpu +from ...hyp_defs import float_cpu def effective_prior(p_tar, c_miss, c_fa): diff --git a/hyperion/metrics/verification_evaluator.py b/hyperion/np/metrics/verification_evaluator.py similarity index 99% rename from hyperion/metrics/verification_evaluator.py rename to hyperion/np/metrics/verification_evaluator.py index d2b26ed6..9c9c3208 100644 --- a/hyperion/metrics/verification_evaluator.py +++ b/hyperion/np/metrics/verification_evaluator.py @@ -17,9 +17,9 @@ matplotlib.rc("text", usetex=True) import matplotlib.pyplot as plt -from ..hyp_defs import float_cpu -from ..utils import TrialKey, TrialScores -from ..utils.trial_stats import TrialStats +from ...hyp_defs import float_cpu +from ...utils import TrialKey, TrialScores +from ...utils.trial_stats import TrialStats from .utils import effective_prior from .dcf import fast_eval_dcf_eer diff --git a/hyperion/hyp_model.py b/hyperion/np/np_model.py similarity index 98% rename from hyperion/hyp_model.py rename to hyperion/np/np_model.py index 0ffd2285..35717a82 100644 --- a/hyperion/hyp_model.py +++ b/hyperion/np/np_model.py @@ -10,10 +10,10 @@ import numpy as np import h5py -from .hyp_defs import float_save, float_cpu +from ..hyp_defs import float_save, float_cpu -class HypModel(object): +class NPModel(object): __metaclass__ = ABCMeta def __init__(self, name=None, **kwargs): diff --git a/hyperion/model_loader.py b/hyperion/np/np_model_loader.py similarity index 78% rename from hyperion/model_loader.py rename to hyperion/np/np_model_loader.py index 30780d7b..efdd27a9 100644 --- a/hyperion/model_loader.py +++ b/hyperion/np/np_model_loader.py @@ -3,12 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .hyp_model import HypModel +from .np_model import NPModel from .pdfs import * from .transforms import * -class ModelLoader(object): +class NPModelLoader(object): @staticmethod def get_object(): obj_dict = { @@ -18,6 +18,7 @@ def get_object(): "GMM": GMM, "FRPLDA": FRPLDA, "SPLDA": SPLDA, + "PLDA": PLDA, "CentWhiten": CentWhiten, "LNorm": LNorm, "PCA": PCA, @@ -31,6 +32,6 @@ def get_object(): @staticmethod def load(file_path): - class_name = HypModel.load_config(file_path)["class_name"] - class_obj = ModelLoader.get_object()[class_name] + class_name = 
diff --git a/hyperion/model_loader.py b/hyperion/np/np_model_loader.py
similarity index 78%
rename from hyperion/model_loader.py
rename to hyperion/np/np_model_loader.py
index 30780d7b..efdd27a9 100644
--- a/hyperion/model_loader.py
+++ b/hyperion/np/np_model_loader.py
@@ -3,12 +3,12 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-from .hyp_model import HypModel
+from .np_model import NPModel
 from .pdfs import *
 from .transforms import *
 
 
-class ModelLoader(object):
+class NPModelLoader(object):
     @staticmethod
     def get_object():
         obj_dict = {
@@ -18,6 +18,7 @@ def get_object():
             "GMM": GMM,
             "FRPLDA": FRPLDA,
             "SPLDA": SPLDA,
+            "PLDA": PLDA,
             "CentWhiten": CentWhiten,
             "LNorm": LNorm,
             "PCA": PCA,
@@ -31,6 +32,6 @@
 
     @staticmethod
     def load(file_path):
-        class_name = HypModel.load_config(file_path)["class_name"]
-        class_obj = ModelLoader.get_object()[class_name]
+        class_name = NPModel.load_config(file_path)["class_name"]
+        class_obj = NPModelLoader.get_object()[class_name]
         return class_obj.load(file_path)
diff --git a/hyperion/pdfs/__init__.py b/hyperion/np/pdfs/__init__.py
similarity index 100%
rename from hyperion/pdfs/__init__.py
rename to hyperion/np/pdfs/__init__.py
diff --git a/hyperion/pdfs/core/__init__.py b/hyperion/np/pdfs/core/__init__.py
similarity index 100%
rename from hyperion/pdfs/core/__init__.py
rename to hyperion/np/pdfs/core/__init__.py
diff --git a/hyperion/pdfs/core/exp_family.py b/hyperion/np/pdfs/core/exp_family.py
similarity index 100%
rename from hyperion/pdfs/core/exp_family.py
rename to hyperion/np/pdfs/core/exp_family.py
diff --git a/hyperion/pdfs/core/normal.py b/hyperion/np/pdfs/core/normal.py
similarity index 99%
rename from hyperion/pdfs/core/normal.py
rename to hyperion/np/pdfs/core/normal.py
index b1ff4224..ed60edb7 100644
--- a/hyperion/pdfs/core/normal.py
+++ b/hyperion/np/pdfs/core/normal.py
@@ -8,14 +8,14 @@
 import scipy.linalg as la
 from scipy.special import erf
 
-from ...hyp_defs import float_cpu
-from ...utils.plotting import (
+from ....hyp_defs import float_cpu
+from ....utils.plotting import (
     plot_gaussian_1D,
     plot_gaussian_ellipsoid_2D,
     plot_gaussian_ellipsoid_3D,
     plot_gaussian_3D,
 )
-from ...utils.math import (
+from ....utils.math import (
     invert_pdmat,
     invert_trimat,
     symmat2vec,
diff --git a/hyperion/pdfs/core/normal_diag_cov.py b/hyperion/np/pdfs/core/normal_diag_cov.py
similarity index 99%
rename from hyperion/pdfs/core/normal_diag_cov.py
rename to hyperion/np/pdfs/core/normal_diag_cov.py
index 562d3899..cb21f84c 100644
--- a/hyperion/pdfs/core/normal_diag_cov.py
+++ b/hyperion/np/pdfs/core/normal_diag_cov.py
@@ -10,8 +10,8 @@
 # import matplotlib.pyplot as plt
 # import matplotlib.mlab as mlab
 
-from ...hyp_defs import float_cpu
-from ...utils.plotting import (
+from ....hyp_defs import float_cpu
+from ....utils.plotting import (
     plot_gaussian_1D,
     plot_gaussian_ellipsoid_2D,
     plot_gaussian_ellipsoid_3D,
diff --git a/hyperion/pdfs/core/pdf.py b/hyperion/np/pdfs/core/pdf.py
similarity index 93%
rename from hyperion/pdfs/core/pdf.py
rename to hyperion/np/pdfs/core/pdf.py
index 2764780c..acd26105 100644
--- a/hyperion/pdfs/core/pdf.py
+++ b/hyperion/np/pdfs/core/pdf.py
@@ -6,10 +6,10 @@
 import numpy as np
 from abc import ABCMeta, abstractmethod
 
-from ...hyp_model import HypModel
+from ...np_model import NPModel
 
 
-class PDF(HypModel):
+class PDF(NPModel):
     __metaclass__ = ABCMeta
 
     def __init__(self, x_dim=1, **kwargs):
diff --git a/hyperion/pdfs/hmm/__init__.py b/hyperion/np/pdfs/hmm/__init__.py
similarity index 100%
rename from hyperion/pdfs/hmm/__init__.py
rename to hyperion/np/pdfs/hmm/__init__.py
diff --git a/hyperion/pdfs/hmm/hmm.py b/hyperion/np/pdfs/hmm/hmm.py
similarity index 99%
rename from hyperion/pdfs/hmm/hmm.py
rename to hyperion/np/pdfs/hmm/hmm.py
index aeec994a..b8497b38 100644
--- a/hyperion/pdfs/hmm/hmm.py
+++ b/hyperion/np/pdfs/hmm/hmm.py
@@ -5,8 +5,8 @@
 import numpy as np
 
-from ...hyp_defs import float_cpu
-from ...utils.math import softmax, logsumexp
+from ....hyp_defs import float_cpu
+from ....utils.math import softmax, logsumexp
 
 from ..core import PDF
 
diff --git a/hyperion/pdfs/jfa/__init__.py b/hyperion/np/pdfs/jfa/__init__.py
similarity index 100%
rename from hyperion/pdfs/jfa/__init__.py
rename to hyperion/np/pdfs/jfa/__init__.py
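Note: the np_model_loader.py hunks above rename HypModel/ModelLoader to NPModel/NPModelLoader and register the previously missing "PLDA" entry; the dispatch logic itself is unchanged. A hypothetical usage sketch (the model file path is made up, not from the patch):

    from hyperion.np.np_model_loader import NPModelLoader

    # load_config() reads "class_name" from the stored model file, get_object()
    # maps it to a class (e.g. SPLDA, or the newly registered PLDA), and that
    # class's own load() reconstructs the model
    model = NPModelLoader.load("backend/plda.h5")   # hypothetical path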
diff --git a/hyperion/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py
similarity index 99%
rename from hyperion/pdfs/jfa/jfa_total.py
rename to hyperion/np/pdfs/jfa/jfa_total.py
index 74fe0f95..4a11b5cf 100644
--- a/hyperion/pdfs/jfa/jfa_total.py
+++ b/hyperion/np/pdfs/jfa/jfa_total.py
@@ -6,8 +6,8 @@
 import numpy as np
 from scipy import linalg as sla
 
-from ...hyp_defs import float_cpu
-from ...utils.math import (
+from ....hyp_defs import float_cpu
+from ....utils.math import (
     invert_pdmat,
     invert_trimat,
     logdet_pdmat,
diff --git a/hyperion/pdfs/mixtures/__init__.py b/hyperion/np/pdfs/mixtures/__init__.py
similarity index 100%
rename from hyperion/pdfs/mixtures/__init__.py
rename to hyperion/np/pdfs/mixtures/__init__.py
diff --git a/hyperion/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
similarity index 99%
rename from hyperion/pdfs/mixtures/exp_family_mixture.py
rename to hyperion/np/pdfs/mixtures/exp_family_mixture.py
index 113bb8fc..143d7df5 100644
--- a/hyperion/pdfs/mixtures/exp_family_mixture.py
+++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
@@ -7,9 +7,9 @@
 import logging
 from abc import ABCMeta, abstractmethod
 
-from ...hyp_defs import float_cpu
-from ...utils.math import softmax, logsumexp
-from ...utils.queues import GeneratorQueue
+from ....hyp_defs import float_cpu
+from ....utils.math import softmax, logsumexp
+from ....utils.queues import GeneratorQueue
 
 from ..core import PDF
 
diff --git a/hyperion/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py
similarity index 99%
rename from hyperion/pdfs/mixtures/gmm.py
rename to hyperion/np/pdfs/mixtures/gmm.py
index b71f0a61..391c59ee 100644
--- a/hyperion/pdfs/mixtures/gmm.py
+++ b/hyperion/np/pdfs/mixtures/gmm.py
@@ -8,8 +8,8 @@
 
 from scipy.special import erf
 
-from ...hyp_defs import float_cpu
-from ...utils.math import (
+from ....hyp_defs import float_cpu
+from ....utils.math import (
     softmax,
     logsumexp,
     invert_pdmat,
@@ -19,7 +19,7 @@
     fullcov_varfloor,
     logdet_pdmat,
 )
-from ...utils.plotting import (
+from ....utils.plotting import (
     plot_gaussian_1D,
     plot_gaussian_ellipsoid_2D,
     plot_gaussian_ellipsoid_3D,
diff --git a/hyperion/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
similarity index 99%
rename from hyperion/pdfs/mixtures/gmm_diag_cov.py
rename to hyperion/np/pdfs/mixtures/gmm_diag_cov.py
index b586a900..46a30f81 100644
--- a/hyperion/pdfs/mixtures/gmm_diag_cov.py
+++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
@@ -7,9 +7,9 @@
 import h5py
 from scipy.special import erf
 
-from ...hyp_defs import float_cpu
-from ...utils.math import softmax, logsumexp
-from ...utils.plotting import (
+from ....hyp_defs import float_cpu
+from ....utils.math import softmax, logsumexp
+from ....utils.plotting import (
     plot_gaussian_1D,
     plot_gaussian_ellipsoid_2D,
     plot_gaussian_ellipsoid_3D,
diff --git a/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py
similarity index 98%
rename from hyperion/pdfs/mixtures/gmm_tied_diag_cov.py
rename to hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py
index a3e7f93e..87043cc4 100644
--- a/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py
+++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py
@@ -6,9 +6,9 @@
 import h5py
 from scipy.special import erf
 
-from ...hyp_defs import float_cpu
-from ...utils.math import softmax, logsumexp
-from ...utils.plotting import (
+from ....hyp_defs import float_cpu
+from ....utils.math import softmax, logsumexp
+from ....utils.plotting import (
     plot_gaussian_1D,
     plot_gaussian_ellipsoid_2D,
     plot_gaussian_ellipsoid_3D,
diff --git a/hyperion/pdfs/plda/__init__.py b/hyperion/np/pdfs/plda/__init__.py
similarity index 100%
rename from hyperion/pdfs/plda/__init__.py
rename to hyperion/np/pdfs/plda/__init__.py
diff --git a/hyperion/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py
similarity index 99%
rename from hyperion/pdfs/plda/frplda.py
rename to hyperion/np/pdfs/plda/frplda.py
index 5ea628fe..137276c7 100644
--- a/hyperion/pdfs/plda/frplda.py
+++ b/hyperion/np/pdfs/plda/frplda.py
@@ -6,8 +6,8 @@
 import numpy as np
 from scipy import linalg as sla
 
-from ...hyp_defs import float_cpu
-from ...utils.math import invert_pdmat, invert_trimat, logdet_pdmat
+from ....hyp_defs import float_cpu
+from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat
 
 from .plda_base import PLDABase
 
diff --git a/hyperion/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py
similarity index 99%
rename from hyperion/pdfs/plda/plda.py
rename to hyperion/np/pdfs/plda/plda.py
index 16dee5ea..30c21361 100644
--- a/hyperion/pdfs/plda/plda.py
+++ b/hyperion/np/pdfs/plda/plda.py
@@ -6,8 +6,8 @@
 import numpy as np
 from scipy import linalg as sla
 
-from ...hyp_defs import float_cpu
-from ...utils.math import invert_pdmat, invert_trimat, logdet_pdmat
+from ....hyp_defs import float_cpu
+from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat
 
 from .plda_base import PLDABase
 
diff --git a/hyperion/pdfs/plda/plda_base.py b/hyperion/np/pdfs/plda/plda_base.py
similarity index 99%
rename from hyperion/pdfs/plda/plda_base.py
rename to hyperion/np/pdfs/plda/plda_base.py
index 8a83543d..1d5d758c 100644
--- a/hyperion/pdfs/plda/plda_base.py
+++ b/hyperion/np/pdfs/plda/plda_base.py
@@ -7,7 +7,7 @@
 
 from abc import ABCMeta, abstractmethod
 
-from ...hyp_defs import float_cpu
+from ....hyp_defs import float_cpu
 from ..core.pdf import PDF
 from ...transforms import LNorm
 
diff --git a/hyperion/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py
similarity index 99%
rename from hyperion/pdfs/plda/splda.py
rename to hyperion/np/pdfs/plda/splda.py
index 1ffaaa1c..f10759cf 100644
--- a/hyperion/pdfs/plda/splda.py
+++ b/hyperion/np/pdfs/plda/splda.py
@@ -5,8 +5,8 @@
 import numpy as np
 from scipy import linalg as sla
 
-from ...hyp_defs import float_cpu
-from ...utils.math import invert_pdmat, invert_trimat, logdet_pdmat
+from ....hyp_defs import float_cpu
+from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat
 
 from .plda_base import PLDABase
 
diff --git a/hyperion/score_norm/__init__.py b/hyperion/np/score_norm/__init__.py
similarity index 100%
rename from hyperion/score_norm/__init__.py
rename to hyperion/np/score_norm/__init__.py
diff --git a/hyperion/score_norm/adapt_s_norm.py b/hyperion/np/score_norm/adapt_s_norm.py
similarity index 100%
rename from hyperion/score_norm/adapt_s_norm.py
rename to hyperion/np/score_norm/adapt_s_norm.py
diff --git a/hyperion/score_norm/s_norm.py b/hyperion/np/score_norm/s_norm.py
similarity index 100%
rename from hyperion/score_norm/s_norm.py
rename to hyperion/np/score_norm/s_norm.py
diff --git a/hyperion/score_norm/score_norm.py b/hyperion/np/score_norm/score_norm.py
similarity index 85%
rename from hyperion/score_norm/score_norm.py
rename to hyperion/np/score_norm/score_norm.py
index f20a0b98..45df0323 100644
--- a/hyperion/score_norm/score_norm.py
+++ b/hyperion/np/score_norm/score_norm.py
@@ -5,10 +5,10 @@
 
 import numpy as np
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 
 
-class ScoreNorm(HypModel):
+class ScoreNorm(NPModel):
     """
     Base class for score normalization
     """
diff --git a/hyperion/score_norm/t_norm.py b/hyperion/np/score_norm/t_norm.py
similarity index 100%
rename from hyperion/score_norm/t_norm.py
rename to hyperion/np/score_norm/t_norm.py
diff --git a/hyperion/score_norm/tz_norm.py b/hyperion/np/score_norm/tz_norm.py
similarity index 100%
rename from hyperion/score_norm/tz_norm.py
rename to hyperion/np/score_norm/tz_norm.py
diff --git a/hyperion/score_norm/z_norm.py b/hyperion/np/score_norm/z_norm.py
similarity index 100%
rename from hyperion/score_norm/z_norm.py
rename to hyperion/np/score_norm/z_norm.py
diff --git a/hyperion/score_norm/zt_norm.py b/hyperion/np/score_norm/zt_norm.py
similarity index 100%
rename from hyperion/score_norm/zt_norm.py
rename to hyperion/np/score_norm/zt_norm.py
diff --git a/hyperion/transforms/__init__.py b/hyperion/np/transforms/__init__.py
similarity index 100%
rename from hyperion/transforms/__init__.py
rename to hyperion/np/transforms/__init__.py
diff --git a/hyperion/transforms/cent_whiten.py b/hyperion/np/transforms/cent_whiten.py
similarity index 98%
rename from hyperion/transforms/cent_whiten.py
rename to hyperion/np/transforms/cent_whiten.py
index 00a83cca..f1cdf227 100644
--- a/hyperion/transforms/cent_whiten.py
+++ b/hyperion/np/transforms/cent_whiten.py
@@ -8,11 +8,11 @@
 
 import scipy.linalg as la
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 from ..pdfs import Normal
 
 
-class CentWhiten(HypModel):
+class CentWhiten(NPModel):
     """Class to do centering and whitening of i-vectors."""
 
     def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs):
diff --git a/hyperion/transforms/cent_whiten_up.py b/hyperion/np/transforms/cent_whiten_up.py
similarity index 96%
rename from hyperion/transforms/cent_whiten_up.py
rename to hyperion/np/transforms/cent_whiten_up.py
index f22488f4..f3793328 100644
--- a/hyperion/transforms/cent_whiten_up.py
+++ b/hyperion/np/transforms/cent_whiten_up.py
@@ -8,7 +8,7 @@
 
 import scipy.linalg as la
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 from ..pdfs import Normal
 from .cent_whiten import CentWhiten
 
diff --git a/hyperion/transforms/coral.py b/hyperion/np/transforms/coral.py
similarity index 98%
rename from hyperion/transforms/coral.py
rename to hyperion/np/transforms/coral.py
index 0c9dea85..9aee7579 100644
--- a/hyperion/transforms/coral.py
+++ b/hyperion/np/transforms/coral.py
@@ -8,10 +8,10 @@
 
 import scipy.linalg as la
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 
 
-class CORAL(HypModel):
+class CORAL(NPModel):
     """Class to do CORAL"""
 
     def __init__(
diff --git a/hyperion/transforms/gaussianizer.py b/hyperion/np/transforms/gaussianizer.py
similarity index 96%
rename from hyperion/transforms/gaussianizer.py
rename to hyperion/np/transforms/gaussianizer.py
index ea512ade..26294134 100644
--- a/hyperion/transforms/gaussianizer.py
+++ b/hyperion/np/transforms/gaussianizer.py
@@ -10,11 +10,11 @@
 import scipy.linalg as la
 from scipy.special import erfinv
 
-from ..hyp_defs import float_cpu
-from ..hyp_model import HypModel
+from ...hyp_defs import float_cpu
+from ..np_model import NPModel
 
 
-class Gaussianizer(HypModel):
+class Gaussianizer(NPModel):
     """Class to make i-vector distribution standard Normal."""
 
     def __init__(self, max_vectors=None, r=None, **kwargs):
diff --git a/hyperion/transforms/lda.py b/hyperion/np/transforms/lda.py
similarity index 98%
rename from hyperion/transforms/lda.py
rename to hyperion/np/transforms/lda.py
index 142ed2bd..13c74fe8 100644
--- a/hyperion/transforms/lda.py
+++ b/hyperion/np/transforms/lda.py
@@ -8,11 +8,11 @@
 
 import scipy.linalg as la
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 from .sb_sw import SbSw
 
 
-class LDA(HypModel):
+class LDA(NPModel):
     """Class to do linear discriminant analysis."""
 
     def __init__(
diff --git a/hyperion/transforms/lnorm.py b/hyperion/np/transforms/lnorm.py
similarity index 100%
rename from hyperion/transforms/lnorm.py
rename to hyperion/np/transforms/lnorm.py
diff --git a/hyperion/transforms/lnorm_up.py b/hyperion/np/transforms/lnorm_up.py
similarity index 99%
rename from hyperion/transforms/lnorm_up.py
rename to hyperion/np/transforms/lnorm_up.py
index ab7b1ec9..b6e211d5 100644
--- a/hyperion/transforms/lnorm_up.py
+++ b/hyperion/np/transforms/lnorm_up.py
@@ -3,7 +3,6 @@
     Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-
 import numpy as np
 import h5py
 
diff --git a/hyperion/transforms/mvn.py b/hyperion/np/transforms/mvn.py
similarity index 94%
rename from hyperion/transforms/mvn.py
rename to hyperion/np/transforms/mvn.py
index a3b77582..7f60206e 100644
--- a/hyperion/transforms/mvn.py
+++ b/hyperion/np/transforms/mvn.py
@@ -8,10 +8,10 @@
 
 import scipy.linalg as la
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 
 
-class MVN(HypModel):
+class MVN(NPModel):
     """Class to do global mean and variance normalization."""
 
     def __init__(self, mu=None, s=None, **kwargs):
diff --git a/hyperion/transforms/nap.py b/hyperion/np/transforms/nap.py
similarity index 97%
rename from hyperion/transforms/nap.py
rename to hyperion/np/transforms/nap.py
index 6917c6b4..ee13e7e0 100644
--- a/hyperion/transforms/nap.py
+++ b/hyperion/np/transforms/nap.py
@@ -8,10 +8,10 @@
 
 import scipy.linalg as la
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 
 
-class NAP(HypModel):
+class NAP(NPModel):
     """Class to do nussance attribute projection."""
 
     def __init__(self, U=None, **kwargs):
diff --git a/hyperion/transforms/nda.py b/hyperion/np/transforms/nda.py
similarity index 94%
rename from hyperion/transforms/nda.py
rename to hyperion/np/transforms/nda.py
index 4f9772fc..c84a4527 100644
--- a/hyperion/transforms/nda.py
+++ b/hyperion/np/transforms/nda.py
@@ -8,11 +8,11 @@
 
 import scipy.linalg as la
 
-from ..hyp_model import HypModel
-from ..hyp_defs import float_cpu
+from ..np_model import NPModel
+from ...hyp_defs import float_cpu
 
 
-class NDA(HypModel):
+class NDA(NPModel):
     """Class to do nearest-neighbors discriminant analysis"""
 
     def __init__(self, mu=None, T=None, **kwargs):
diff --git a/hyperion/transforms/pca.py b/hyperion/np/transforms/pca.py
similarity index 98%
rename from hyperion/transforms/pca.py
rename to hyperion/np/transforms/pca.py
index cd8d6973..23477c84 100644
--- a/hyperion/transforms/pca.py
+++ b/hyperion/np/transforms/pca.py
@@ -8,10 +8,10 @@
 from numpy.linalg import matrix_rank
 import scipy.linalg as la
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 
 
-class PCA(HypModel):
+class PCA(NPModel):
     """Class to do principal component analysis"""
 
     def __init__(
diff --git a/hyperion/transforms/sb_sw.py b/hyperion/np/transforms/sb_sw.py
similarity index 98%
rename from hyperion/transforms/sb_sw.py
rename to hyperion/np/transforms/sb_sw.py
index 83c8d185..92cba594 100644
--- a/hyperion/transforms/sb_sw.py
+++ b/hyperion/np/transforms/sb_sw.py
@@ -8,11 +8,11 @@
 import scipy.linalg as la
 from sklearn.neighbors import BallTree
 
-from ..hyp_model import HypModel
-from ..hyp_defs import float_cpu
+from ..np_model import NPModel
+from ...hyp_defs import float_cpu
 
 
-class SbSw(HypModel):
+class SbSw(NPModel):
     """Class to compute between and within class matrices"""
 
     def __init__(self, Sb=None, Sw=None, mu=None, num_classes=0, **kwargs):
diff --git a/hyperion/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py
similarity index 99%
rename from hyperion/transforms/skl_tsne.py
rename to hyperion/np/transforms/skl_tsne.py
index 048be0c7..b5be0fac 100644
--- a/hyperion/transforms/skl_tsne.py
+++ b/hyperion/np/transforms/skl_tsne.py
@@ -7,10 +7,10 @@
 
 from sklearn.manifold import TSNE
 
-from ..hyp_model import HypModel
+from ..np_model import NPModel
 
 
-class SklTSNE(HypModel):
+class SklTSNE(NPModel):
     """Wrapper class for sklearn TSNE manifold learner
 
     Attributes:
diff --git a/hyperion/transforms/transform_list.py b/hyperion/np/transforms/transform_list.py
similarity index 100%
rename from hyperion/transforms/transform_list.py
rename to hyperion/np/transforms/transform_list.py
diff --git a/hyperion/pipeline/pipeline.py b/hyperion/pipeline/pipeline.py
deleted file mode 100644
index 6b8076f5..00000000
--- a/hyperion/pipeline/pipeline.py
+++ /dev/null
@@ -1,63 +0,0 @@
-"""
- Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-
-import logging
-import numpy as np
-import h5py
-
-from ..hyp_model import HypModel
-
-from ..transforms import *
-
-
-class Pipeline(HypModel):
-    """Class to process a series of models."""
-
-    def __init__(self, transforms, **kwargs):
-        super(Pipeline, self).__init__(**kwargs)
-        if not isinstance(transforms, list):
-            transforms = [transforms]
-        self.transforms = transforms
-        if transforms is not None:
-            self.update_names()
-
-    def append(self, t):
-        self.transforms.append(t)
-        if self.name is not None:
-            t.name = self.name + "/" + t.name
-
-    def predict(self, x):
-        for t in self.transforms:
-            x = t.predict(x)
-        return x
-
-    def update_names(self):
-        if self.name is not None:
-            for t in self.transforms:
-                t.name = self.name + "/" + t.name
-
-    def get_config(self):
-        config = super(Pipeline, self).get_config()
-        config_t = {}
-        for i in range(len(self.transforms)):
-            config_t[i] = self.transforms[i].get_config()
-        config["transforms"] = config_t
-        return config
-
-    def save_params(self, f):
-        for t in self.transforms:
-            t.save_params(f)
-
-    @classmethod
-    def load_params(cls, f, config):
-        config_ts = config["transforms"]
-        transforms = []
-        for i in range(len(config_ts)):
-            config_t = config_ts[str(i)]
-            logging.debug(config_t)
-            class_t = globals()[config_t["class_name"]]
-            t = class_t.load_params(f, config_t)
-            transforms.append(t)
-        return cls(transforms, name=config["name"])
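Note: the deleted Pipeline class above was a thin sequential container, and no replacement is introduced in this commit. Its core behavior amounted to folding each transform's predict() over the input, as in this sketch (hypothetical objects, for reference only):

    transforms = [lda, lnorm]   # hypothetical fitted transform objects
    x = vectors                 # hypothetical input array
    for t in transforms:        # what Pipeline.predict(x) did
        x = t.predict(x)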
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py
index f0ab811d..1801f11a 100644
--- a/hyperion/torch/data/audio_dataset.py
+++ b/hyperion/torch/data/audio_dataset.py
@@ -16,7 +16,7 @@
 from ..torch_defs import floatstr_torch
 from ...io import RandomAccessAudioReader as AR
 from ...utils.utt2info import Utt2Info
-from ...augment import SpeechAugment
+from ...np.augment import SpeechAugment
 
 from torch.utils.data import Dataset
 import torch.distributed as dist
diff --git a/hyperion/torch/layers/audio_feats.py b/hyperion/torch/layers/audio_feats.py
index d435ebbd..718844f5 100644
--- a/hyperion/torch/layers/audio_feats.py
+++ b/hyperion/torch/layers/audio_feats.py
@@ -7,8 +7,6 @@
 import math
 import logging
 
-from ...utils.misc import str2bool
-
 import torch
 import torch.nn as nn
 import torch.cuda.amp as amp
@@ -24,7 +22,7 @@
 _pow_spectrogram = lambda x: x.pow(2).sum(-1)
 _spectrogram = lambda x: x.pow(2).sum(-1).sqrt()
 
-from ...feats.filter_banks import FilterBankFactory as FBF
+from ...np.feats.filter_banks import FilterBankFactory as FBF
 
 # window types
 HAMMING = "hamming"
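Note: the next two deletions drop hyperion/vb_pdfs/, presumably a stale variational-Bayes counterpart of the numpy pdfs kept under hyperion/np/pdfs/ (that package's exp_family.py and core/pdf.py were renamed above). The deleted ExpFamily followed the usual EM-style contract; schematically, using only names from the deleted code:

    # E-step: accumulate counts and sufficient statistics over the data,
    # M-step: update the natural parameters from them
    N, u_x = model.Estep(x=x, sample_weight=None, batch_size=None)  # model: hypothetical ExpFamily subclass
    model.Mstep(N, u_x)
    elbo = model.elbo(x, N=N, u_x=u_x)  # lower bound used to monitor the fit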
diff --git a/hyperion/vb_pdfs/core/exponential_family.py b/hyperion/vb_pdfs/core/exponential_family.py
deleted file mode 100644
index c3e59040..00000000
--- a/hyperion/vb_pdfs/core/exponential_family.py
+++ /dev/null
@@ -1,139 +0,0 @@
-"""
- Copyright 2017 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-
-import numpy as np
-
-from abc import ABCMeta, abstractmethod
-from .pdf import PDF
-
-
-class ExpFamily(PDF):
-    __metaclass__ = ABCMeta
-
-    def __init__(self, eta=None, **kwargs):
-        super(ExpFamily, self).__init__(**kwargs)
-        self.eta = eta
-        self.A = None
-
-    def fit(
-        self, x, sample_weight=None, x_val=None, sample_weight_val=None, batch_size=None
-    ):
-
-        N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size)
-        self.Mstep(N, u_x)
-        elbo = self.elbo(x, N=N, u_x=u_x)
-        elbo = [elbo, elbo / N]
-
-        if x_val is not None:
-            N, u_x = self.Estep(
-                x=x_val, sample_weight=sample_weight_val, batch_size=batch_size
-            )
-            elbo_val = self.elbo(x_val, N=N, u_x=u_x)
-            elbo += [elbo_val, elbo_val / N]
-        return elbo
-
-    def log_h(self, x):
-        return 0
-
-    def accum_logh(self, x, sample_weight=None):
-        if sample_weight is None:
-            return np.sum(self.logh(x))
-        return np.sum(sample_weight * self.logh(x))
-
-    def compute_suff_stats(self, x):
-        return x
-
-    def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None):
-        if u_x is not None or batch_size is None:
-            return self._accum_suff_stats_1batch(x, u_x, sample_weight)
-        else:
-            return self._accum_suff_stats_nbatches(x, sample_weight, batch_size)
-
-    def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None):
-        if u_x is None:
-            u_x = self.compute_suff_stats(x)
-        if sample_weight is None:
-            N = u_x.shape[0]
-        else:
-            u_x *= sample_weight[:, None]
-            N = np.sum(sample_weight)
-        acc_u_x = np.sum(u_x, axis=0)
-        return N, acc_u_x
-
-    def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size):
-        sw_i = None
-        for i1 in range(0, x.shape[0], batch_size):
-            i2 = np.minimum(i1 + batch_size, x.shape[0])
-            x_i = x[i1:i2, :]
-            if sample_weight is not None:
-                sw_i = sample_weight[i1:i2]
-            N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i)
-            if i1 == 0:
-                N = N_i
-                u_x = u_x_i
-            else:
-                N += N_i
-                u_x += u_x_i
-        return N, u_x
-
-    def add_suff_stats(self, N, u_x):
-        assert len(N) == len(u_x)
-        acc_N = N[1]
-        acc_u_x = u_x[1]
-        for i in range(1, len(N)):
-            acc_N += N
-            acc_u_x += u[i]
-        return acc_N, acc_u_x
-
-    def Estep(self, x, u_x=None, sample_weight=None, batch_size=None):
-        return self.accum_suff_stats(x, u_x, sample_weight, batch_size)
-
-    @abstractmethod
-    def Mstep(self, stats):
-        pass
-
-    def elbo(self, x, u_x=None, N=1, logh=None, sample_weight=None, batch_size=None):
-        if u_x is None:
-            N, u_x = self.accum_suff_stats(
-                x, sample_weight=sample_weight, batch_size=batch_size
-            )
-        if logh is None:
-            logh = self.accum_logh(x, sample_weight=sample_weight)
-        return logh + np.inner(u_x, self.eta) - N * self.A
-
-    def eval_llk(self, x, u_x=None, mode="nat"):
-        if mode == "nat":
-            return self.eval_llk_nat(x, u_x)
-        else:
-            return self.eval_llk_std(x)
-
-    def eval_llk_nat(self, x, u_x=None):
-        if u_x is None:
-            u_x = self.compute_suff_stats(x)
-        return self.logh(x) + np.inner(u_x, self.eta) - self.A
-
-    @staticmethod
-    def compute_A_nat(eta):
-        raise NotImplementedError()
-
-    @staticmethod
-    def compute_A_std(params):
-        raise NotImplementedError()
-
-    @staticmethod
-    def compute_eta(param):
-        raise NotImplementedError()
-
-    @staticmethod
-    def compute_std(eta):
-        raise NotImplementedError()
-
-    @abstractmethod
-    def _compute_nat_params(self):
-        pass
-
-    @abstractmethod
-    def _compute_std_params(self):
-        pass
diff --git a/hyperion/vb_pdfs/core/pdf.py b/hyperion/vb_pdfs/core/pdf.py
deleted file mode 100644
index 012ff96c..00000000
--- a/hyperion/vb_pdfs/core/pdf.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""
- Copyright 2017 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-
-import numpy as np
-
-from abc import ABCMeta, abstractmethod
-from ...hyp_model import HypModel
-
-
-class PDF(HypModel):
-    __metaclass__ = ABCMeta
-
-    def __init__(self, **kwargs):
-        super(PDF, self).__init__(**kwargs)
-
-    # def get_config(self):
-    #     config = {'x_dim': self.x_dim }
-    #     base_config = super(PDF, self).get_config()
-    #     return dict(list(base_config.items()) + list(config.items()))
-
-    @abstractmethod
-    def log_prob(self, x):
-        pass
-
-    def log_cdf(self, x):
-        raise NotImplementedError
-
-    @abstractmethod
-    def sample(self, num_samples):
-        pass
diff --git a/requirements.txt b/requirements.txt
index 7a1ae1b3..6f1c8bc1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ librosa>=0.8.1
 black
 twine
 wheel
+transformers>=4.16.2

From 887bd3becaeba2ba621e19f54366fac674ef40b6 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Tue, 1 Mar 2022 09:44:50 -0500
Subject: [PATCH 002/154] In egs, change hyperion -> hyperion.np where needed

---
 egs/chime5_spkdet/v1/local/score_dcf.py | 2 +-
 egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py | 2 +-
 egs/chime5_spkdet/v1/steps_be/eval-be-v1.py | 2 +-
 egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py | 4 ++--
 egs/chime5_spkdet/v1/steps_be/train-be-v1.py | 2 +-
 .../v1/steps_be/train-calibration-v1.py | 4 ++--
 egs/dihard2019/v1/steps_diar/eval-ahc-v1.py | 12 ++++++------
 egs/dihard2019/v1/steps_diar/train-plda-v1.py | 2 +-
 egs/sre18/v1.8k/local/score_dcf.py | 2 +-
 egs/sre18/v1.8k/steps_be/eval-calibration-v1.py | 4 ++--
 egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py | 4 ++--
 egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py | 2 +-
 .../v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py | 4 ++--
 egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py | 2 +-
 egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py | 4 ++--
 egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py | 2 +-
 egs/sre18/v1.8k/steps_be/train-calibration-v1.py | 4 ++--
 egs/sre18/v1.8k/steps_be/train-tel-be-v1.py | 4 ++--
 egs/sre18/v1.8k/steps_be/train-vid-be-v1.py | 2 +-
 egs/sre19-av-v/v0.1/local/score_dcf.py | 2 +-
 egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py | 4 ++--
 .../v0.1/steps_be/eval-face-vid-be-snorm-v1.py | 2 +-
 .../v0.1/steps_be/eval-face-vid-be-snorm-v2.py | 2 +-
 .../v0.1/steps_be/eval-face-vid-be-snorm-v3.py | 2 +-
 .../v0.1/steps_be/eval-face-vid-be-snorm-v4.py | 2 +-
 .../v0.1/steps_be/eval-face-vid-be-snorm-v5.py | 4 ++--
 .../v0.1/steps_be/eval-face-vid-be-snorm-v6.py | 2 +-
 .../v0.1/steps_be/eval-face-vid-be-snorm-v7.py | 2 +-
 .../v0.1/steps_be/eval-face-vid-be-snorm-v9.py | 2 +-
 egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py | 2 +-
 egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py | 2 +-
 egs/sre19-av-v/v0.1/steps_be/face_be_utils.py | 4 ++--
 .../v0.1/steps_be/face_video_trial_data_reader.py | 2 +-
 egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py | 4 ++--
 egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py | 4 ++--
 egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py | 4
++-- egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py | 4 ++-- egs/sre19-cmn2/v1/local/error_analysis.py | 2 +- egs/sre19-cmn2/v1/local/score_dcf.py | 2 +- egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py | 4 ++-- egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py | 4 ++-- egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py | 2 +- .../v1/steps_be/eval-vid-be-diar-snorm-v1.py | 4 ++-- .../v1/steps_be/eval-vid-be-diar-snorm-v2.py | 4 ++-- egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py | 2 +- egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py | 2 +- egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py | 4 ++-- egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py | 2 +- egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py | 4 ++-- egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py | 4 ++-- egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py | 4 ++-- egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py | 4 ++-- egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py | 2 +- egs/sre20-cts/v1/local/score_dcf.py | 2 +- egs/sre20-cts/v1/steps_be/apply-ahc-v1.py | 8 ++++---- egs/sre20-cts/v1/steps_be/eval-calibration-v1.py | 4 ++-- egs/sre20-cts/v1/steps_be/eval-calibration-v2.py | 4 ++-- egs/sre20-cts/v1/steps_be/eval-fusion-v1.py | 2 +- .../v1/steps_be/eval-tel-be-knn-snorm-v1.py | 4 ++-- egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py | 2 +- egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py | 4 ++-- egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py | 6 +++--- egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py | 2 +- egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py | 4 ++-- egs/sre20-cts/v1/steps_be/train-calibration-v1.py | 4 ++-- egs/sre20-cts/v1/steps_be/train-calibration-v2.py | 4 ++-- egs/sre20-cts/v1/steps_be/train-fusion-v1.py | 4 ++-- egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py | 4 ++-- egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py | 4 ++-- egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py | 4 ++-- egs/sre20-cts/v1/steps_be/train-tel-be-v1.py | 2 +- egs/sre20-cts/v1/steps_be/train-tel-be-v3.py | 2 +- egs/sre20-cts/v1/steps_be/train-tel-be-v4.py | 2 +- egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py | 2 +- egs/sre21-av-a/v1.16k/local/score_sre16.py | 2 +- egs/sre21-av-a/v1.16k/local/score_sre21.py | 2 +- .../v1.16k/local/score_sre_cts_superset.py | 2 +- .../v1.16k/steps_be/eval-be-plda-snorm-v1.py | 4 ++-- .../v1.16k/steps_be/eval-be-plda-snorm-v2.py | 4 ++-- egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py | 2 +- egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py | 2 +- egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py | 2 +- .../steps_be/eval-calibration-v1-sre-superset.py | 4 ++-- .../v1.16k/steps_be/eval-calibration-v1-sre16.py | 2 +- .../v1.16k/steps_be/eval-calibration-v1-sre21-dev.py | 4 ++-- .../steps_be/eval-calibration-v1-sre21-eval.py | 4 ++-- egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py | 2 +- egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py | 2 +- egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py | 2 +- egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py | 2 +- egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py | 2 +- .../v1.16k/steps_be/train-calibration-v1.py | 4 ++-- egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py | 4 ++-- egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py | 6 +++--- egs/sre21-av-v/v0.1/local/score_dcf.py | 2 +- egs/sre21-av/v1/local/score_sre21.py | 2 +- egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py | 4 ++-- egs/voices_challenge/v0/steps_be/eval-be-v1.py | 2 +- .../v0/steps_be/eval-calibration-v1.py | 4 ++-- egs/voices_challenge/v0/steps_be/train-be-v1.py | 2 +- 
egs/voices_challenge/v0/steps_be/train-be-v2.py | 2 +- .../v0/steps_be/train-calibration-v1.py | 4 ++-- egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py | 4 ++-- egs/voices_challenge/v1/steps_be/eval-be-v1.py | 2 +- .../v1/steps_be/eval-calibration-v1.py | 4 ++-- egs/voices_challenge/v1/steps_be/train-be-v1.py | 2 +- egs/voices_challenge/v1/steps_be/train-be-v2.py | 2 +- .../v1/steps_be/train-calibration-v1.py | 4 ++-- egs/voxceleb/adv.v2/local/make_some_figs.py | 2 +- egs/voxceleb/adv.v2/local/score_dcf.py | 2 +- egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py | 2 +- .../adv.v2/steps_backend/eval-be-cos-Nvs1.py | 4 ++-- egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py | 2 +- egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py | 2 +- .../eval-classif-perf-plda-unknown-attacks-noimp.py | 8 ++++---- .../eval-classif-perf-plda-unkown-attacks.py | 8 ++++---- .../eval-classif-perf-unknown-attacks.py | 4 ++-- .../adv.v2/steps_backend/eval-classif-perf.py | 4 ++-- egs/voxceleb/adv.v2/steps_backend/train-be-v1.py | 2 +- egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py | 2 +- egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py | 2 +- egs/voxceleb/v1/local/attack_analysis.py | 2 +- egs/voxceleb/v1/local/make_some_figs.py | 2 +- egs/voxceleb/v1/local/score_dcf.py | 2 +- egs/voxceleb/v1/steps_be/eval-be-v1.py | 2 +- egs/voxceleb/v1/steps_be/eval-be-v2.py | 2 +- egs/voxceleb/v1/steps_be/eval-calibration-v1.py | 4 ++-- egs/voxceleb/v1/steps_be/train-be-v1.py | 2 +- egs/voxceleb/v1/steps_be/train-be-v2.py | 2 +- egs/voxceleb/v1/steps_be/train-calibration-v1.py | 4 ++-- 130 files changed, 201 insertions(+), 201 deletions(-) diff --git a/egs/chime5_spkdet/v1/local/score_dcf.py b/egs/chime5_spkdet/v1/local/score_dcf.py index 1137e049..cba16610 100755 --- a/egs/chime5_spkdet/v1/local/score_dcf.py +++ b/egs/chime5_spkdet/v1/local/score_dcf.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py b/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py index b77d3595..9ef02a02 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py +++ b/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py @@ -17,7 +17,7 @@ from hyperion.utils import TrialScores from hyperion.helpers import MultiTestTrialDataReaderV2 as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def combine_diar_scores(ndx, orig_seg, subseg_scores): diff --git a/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py b/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py index dc3e3f87..19d582e4 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py b/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py index fb5dd6f9..1cf80177 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py +++ 
b/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py @@ -18,8 +18,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/chime5_spkdet/v1/steps_be/train-be-v1.py b/egs/chime5_spkdet/v1/steps_be/train-be-v1.py index 55c412ac..6d1af604 100755 --- a/egs/chime5_spkdet/v1/steps_be/train-be-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/train-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F diff --git a/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py b/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py index fa1dfcf7..9eaa7187 100755 --- a/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, verbose): diff --git a/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py b/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py index 25282718..c45767b2 100755 --- a/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py +++ b/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py @@ -27,13 +27,13 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList, PCA, LNorm -from hyperion.clustering import AHC -from hyperion.pdfs import GMMTiedDiagCov as GMM -from hyperion.diarization import DiarAHCPLDA as Diar +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.clustering import AHC +from hyperion.np.pdfs import GMMTiedDiagCov as GMM +from hyperion.np.diarization import DiarAHCPLDA as Diar -# from hyperion.pdfs import GMMDiagCov as GMM2 -# from hyperion.pdfs import GMM as GMM3 +# from hyperion.np.pdfs import GMMDiagCov as GMM2 +# from hyperion.np.pdfs import GMM as GMM3 def make_timestamps(n, win_start, win_length, win_shift, win_shrink): diff --git a/egs/dihard2019/v1/steps_diar/train-plda-v1.py b/egs/dihard2019/v1/steps_diar/train-plda-v1.py index c7589c8a..713798af 100755 --- a/egs/dihard2019/v1/steps_diar/train-plda-v1.py +++ b/egs/dihard2019/v1/steps_diar/train-plda-v1.py @@ -22,7 +22,7 @@ from hyperion.utils import Utt2Info # from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, LDA, LNorm, PCA +from hyperion.np.transforms import TransformList, LDA, LNorm, PCA from hyperion.helpers import PLDAFactory as F from hyperion.io import 
RandomAccessDataReaderFactory as DRF diff --git a/egs/sre18/v1.8k/local/score_dcf.py b/egs/sre18/v1.8k/local/score_dcf.py index 1137e049..cba16610 100755 --- a/egs/sre18/v1.8k/local/score_dcf.py +++ b/egs/sre18/v1.8k/local/score_dcf.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py b/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py index fa16dfce..31b527f7 100755 --- a/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py index d3b35fba..954a8a4a 100755 --- a/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py b/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py index d9668e1a..06b2bc87 100755 --- a/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py index c37d450a..af8895b2 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py index c19dc074..433cbbff 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py @@ -21,7 +21,7 @@ from hyperion.utils.trial_scores import TrialScores from 
hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def combine_diar_scores(ndx, diar_ndx, diar2orig, diar_scores): diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py index fc94c754..1f1ffc81 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py index f7d83d30..19ca8bdf 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre18/v1.8k/steps_be/train-calibration-v1.py b/egs/sre18/v1.8k/steps_be/train-calibration-v1.py index fa1dfcf7..9eaa7187 100755 --- a/egs/sre18/v1.8k/steps_be/train-calibration-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, verbose): diff --git a/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py b/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py index c9f22d83..46710992 100755 --- a/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py @@ -12,9 +12,9 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F -from hyperion.clustering import AHC +from hyperion.np.clustering import AHC from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py b/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py index a1b0cad6..4724a24a 100755 --- a/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py @@ -13,7 +13,7 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.scp_list import SCPList diff --git a/egs/sre19-av-v/v0.1/local/score_dcf.py b/egs/sre19-av-v/v0.1/local/score_dcf.py index 514ebf51..772d107a 100755 --- 
a/egs/sre19-av-v/v0.1/local/score_dcf.py +++ b/egs/sre19-av-v/v0.1/local/score_dcf.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py b/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py index 8087cac2..576ea3d5 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py @@ -22,8 +22,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v1.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v1.py index 1527f514..9b490e72 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v1.py @@ -14,7 +14,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v2.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v2.py index 5ac23484..40187aa4 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v2.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v2.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v3.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v3.py index 2a7abe08..3d52788e 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v3.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v3.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v4.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v4.py index 9c22cc1f..f18a53f7 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v4.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v4.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm 
+from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v5.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v5.py index 660854e3..af75f526 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v5.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v5.py @@ -15,10 +15,10 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm # from hyperion.helpers import PLDAFactory as F -# from hyperion.transforms import TransformList +# from hyperion.np.transforms import TransformList from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v6.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v6.py index 86ca6a8f..e23e52a1 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v6.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v6.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v7.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v7.py index 19f78a23..85bd8ee4 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v7.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v7.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v9.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v9.py index a6774a68..d36b91ec 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v9.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-snorm-v9.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.score_norm import AdaptSNorm as SNorm from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py index aa9539d4..a66794da 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-face-vid-be-v8.py @@ -18,7 +18,7 @@ from hyperion.utils.trial_scores import TrialScores # from hyperion.helpers import PLDAFactory as F -# from hyperion.transforms import TransformList +# from hyperion.np.transforms import TransformList from face_video_trial_data_reader import FaceVideoTrialDataReaderV1 as TDR from face_be_utils import * diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py b/egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py index 70b0c81b..fe24f947 100755 
--- a/egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-fusion-v1.py @@ -17,7 +17,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def eval_fusion(in_score_files, ndx_file, model_file, out_score_file, fus_idx): diff --git a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py index 025d11a3..14e3fc20 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py @@ -12,8 +12,8 @@ from hyperion.utils.utt2info import Utt2Info from hyperion.utils.math import softmax from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.transforms import LNorm -from hyperion.clustering import AHC +from hyperion.np.transforms import LNorm +from hyperion.np.clustering import AHC def lnorm(x): diff --git a/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py b/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py index 091a4ee1..11223607 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py @@ -18,7 +18,7 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.utils.utt2info import Utt2Info from hyperion.utils import TrialNdx, TrialKey -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList class FaceVideoTrialDataReaderV1(object): diff --git a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py index 35c1a3bc..0d97a4fb 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py @@ -21,8 +21,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py index b247f264..f1f89bdd 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v2.py @@ -21,8 +21,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py b/egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py index e28bfffa..564fccaa 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-fusion-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, 
config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def train_fusion( diff --git a/egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py b/egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py index 0679eb7c..1f97d189 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-fusion-v2.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def train_fusion( diff --git a/egs/sre19-cmn2/v1/local/error_analysis.py b/egs/sre19-cmn2/v1/local/error_analysis.py index c4dbba5a..bbdb893d 100755 --- a/egs/sre19-cmn2/v1/local/error_analysis.py +++ b/egs/sre19-cmn2/v1/local/error_analysis.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre19-cmn2/v1/local/score_dcf.py b/egs/sre19-cmn2/v1/local/score_dcf.py index deb39682..fd7a3149 100755 --- a/egs/sre19-cmn2/v1/local/score_dcf.py +++ b/egs/sre19-cmn2/v1/local/score_dcf.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py index fa16dfce..31b527f7 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py index d3b35fba..954a8a4a 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from 
hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py index 820c90db..5d77a896 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py @@ -20,7 +20,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py index c6f62957..0d5c3000 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py @@ -22,8 +22,8 @@ MultiTestTrialDataReader as TDR, ) from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py index ec4addef..e0b29fd4 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py @@ -17,8 +17,8 @@ from hyperion.utils import TrialScores from hyperion.helpers import MultiTestTrialDataReaderV2 as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py index 20e88a37..ebc77930 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py @@ -21,7 +21,7 @@ MultiTestTrialDataReader as TDR, ) from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def combine_diar_scores(ndx, orig_seg, subseg_scores): diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py index b77d3595..9ef02a02 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py @@ -17,7 +17,7 @@ from hyperion.utils import TrialScores from hyperion.helpers import MultiTestTrialDataReaderV2 as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def combine_diar_scores(ndx, orig_seg, subseg_scores): diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py index 0c5b31e0..76bf4bcd 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import 
AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py index f7d83d30..19ca8bdf 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py b/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py index 779e62af..c57a1162 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py index c9f22d83..46710992 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py @@ -12,9 +12,9 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F -from hyperion.clustering import AHC +from hyperion.np.clustering import AHC from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py index d8d82405..df435852 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py @@ -13,9 +13,9 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F -from hyperion.clustering import AHC +from hyperion.np.clustering import AHC from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py index 1b039c40..6532b9aa 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py @@ -12,9 +12,9 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm, CORAL +from hyperion.np.transforms import TransformList, LDA, LNorm, CORAL from hyperion.helpers import PLDAFactory as F -from hyperion.clustering import AHC +from hyperion.np.clustering import AHC from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py b/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py 
index f825d59b..c1087bf4 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.scp_list import SCPList diff --git a/egs/sre20-cts/v1/local/score_dcf.py b/egs/sre20-cts/v1/local/score_dcf.py index 1137e049..cba16610 100755 --- a/egs/sre20-cts/v1/local/score_dcf.py +++ b/egs/sre20-cts/v1/local/score_dcf.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py b/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py index a5373bf4..bfa0c7c3 100755 --- a/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py +++ b/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py @@ -25,11 +25,11 @@ # from hyperion.utils.trial_scores import TrialScores # from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm -from hyperion.clustering import AHC +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm +from hyperion.np.clustering import AHC from hyperion.utils import Utt2Info -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR def apply_ahc( diff --git a/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py b/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py index fb5dd6f9..1cf80177 100755 --- a/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py @@ -18,8 +18,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py b/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py index e3d1db91..92d2c2d0 100755 --- a/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py @@ -18,8 +18,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.utils import Utt2Info diff --git a/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py b/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py index 0d67a741..f1d90241 100755 --- 
a/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py @@ -17,7 +17,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def eval_fusion(in_score_files, ndx_file, model_file, out_score_file, fus_idx): diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py index 651a1b7f..7ab376c1 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py index 49ad3b42..50966aeb 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py @@ -20,7 +20,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda_e( diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py index ac6710ad..e46f729b 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py index 7430caf4..907509fd 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py @@ -17,9 +17,9 @@ from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.utils.math import cosine_scoring -from hyperion.pdfs import PLDA -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.pdfs import PLDA +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py index fb2904b1..698c0f32 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py @@ -18,7 +18,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms 
import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py index 9eaea8b5..b661cbde 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py @@ -16,10 +16,10 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores from hyperion.utils.math import cosine_scoring -from hyperion.pdfs import PLDA +from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre20-cts/v1/steps_be/train-calibration-v1.py b/egs/sre20-cts/v1/steps_be/train-calibration-v1.py index 779e62af..c57a1162 100755 --- a/egs/sre20-cts/v1/steps_be/train-calibration-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-calibration-v1.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre20-cts/v1/steps_be/train-calibration-v2.py b/egs/sre20-cts/v1/steps_be/train-calibration-v2.py index 16d09e3a..28597899 100755 --- a/egs/sre20-cts/v1/steps_be/train-calibration-v2.py +++ b/egs/sre20-cts/v1/steps_be/train-calibration-v2.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils import Utt2Info -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration_cond(cond, scr, key, model_file, prior, lambda_reg, verbose): diff --git a/egs/sre20-cts/v1/steps_be/train-fusion-v1.py b/egs/sre20-cts/v1/steps_be/train-fusion-v1.py index a76b2b6c..9c7f5315 100755 --- a/egs/sre20-cts/v1/steps_be/train-fusion-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-fusion-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def train_fusion( diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py index a024281a..8e7715e0 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py @@ -13,8 +13,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.helpers import VectorClassReader as VCR -from hyperion.pdfs import PLDA -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.pdfs import 
PLDA +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info from hyperion.utils.math import cosine_scoring diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py index 568e7edf..12f1725b 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py @@ -13,8 +13,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.helpers import VectorClassReader as VCR -from hyperion.pdfs import PLDA -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.pdfs import PLDA +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info from hyperion.utils.math import cosine_scoring diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py index 7633cf17..234f966c 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py @@ -13,8 +13,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.helpers import VectorClassReader as VCR -from hyperion.pdfs import PLDA, SPLDA -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.pdfs import PLDA, SPLDA +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info from hyperion.utils.math import cosine_scoring diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py index a388fb88..01d38b65 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py @@ -11,7 +11,7 @@ import numpy as np from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py index ac5bfa7e..e29da60b 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py @@ -12,7 +12,7 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils import Utt2Info diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py index 7326d649..baef33f1 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py @@ -12,7 +12,7 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, PCA, LDA, LNorm, CORAL +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm, CORAL from hyperion.helpers import PLDAFactory as F from hyperion.utils import Utt2Info from numpy.linalg import matrix_rank diff --git 
a/egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py b/egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py index 25cddea8..46769568 100755 --- a/egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py +++ b/egs/sre21-av-a/v1.16k/local/plot-tsne-cts.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.transforms import PCA, SklTSNE, LNorm +from hyperion.np.transforms import PCA, SklTSNE, LNorm colors = ["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] diff --git a/egs/sre21-av-a/v1.16k/local/score_sre16.py b/egs/sre21-av-a/v1.16k/local/score_sre16.py index 4064b64f..af44fb53 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre16.py +++ b/egs/sre21-av-a/v1.16k/local/score_sre16.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_file): diff --git a/egs/sre21-av-a/v1.16k/local/score_sre21.py b/egs/sre21-av-a/v1.16k/local/score_sre21.py index 986aa3f6..72fc1a13 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre21.py +++ b/egs/sre21-av-a/v1.16k/local/score_sre21.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score(key_file, score_file, sre21_subset, output_file): diff --git a/egs/sre21-av-a/v1.16k/local/score_sre_cts_superset.py b/egs/sre21-av-a/v1.16k/local/score_sre_cts_superset.py index 3f2223a4..bb61ca18 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre_cts_superset.py +++ b/egs/sre21-av-a/v1.16k/local/score_sre_cts_superset.py @@ -14,7 +14,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score(key_file, score_file, output_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v1.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v1.py index f265ca30..51d21312 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v2.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v2.py index 35b2d501..a9e7ee03 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-snorm-v2.py @@ -20,8 +20,8 @@ from hyperion.utils import TrialNdx, TrialScores, Utt2Info from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F 
-from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.io import RandomAccessDataReaderFactory as DRF diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py index d122d14c..1e45f560 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v1.py @@ -20,7 +20,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py index 3051383b..2eda0f47 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v2.py @@ -19,7 +19,7 @@ from hyperion.utils import TrialNdx, TrialScores, Utt2Info from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList conds = [ "cts_eng", diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py index ee0cb558..8cceb387 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-be-plda-v3.py @@ -19,7 +19,7 @@ from hyperion.utils import TrialNdx, TrialScores, Utt2Info from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList conds = [ "cts_eng", diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre-superset.py b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre-superset.py index 0781f9f2..21d2337b 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre-superset.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre-superset.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_ndx_and_scores(ndx_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre16.py b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre16.py index 7880e358..6b2da927 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre16.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre16.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_ndx_and_scores(ndx_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-dev.py b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-dev.py index 
ce2be18c..240baf82 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-dev.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-dev.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_ndx_and_scores(ndx_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-eval.py b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-eval.py index 407d78dc..50ce6943 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-eval.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-calibration-v1-sre21-eval.py @@ -18,8 +18,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_ndx_and_scores(ndx_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py b/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py index 205a73d3..933f8864 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v1.py @@ -17,7 +17,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def eval_fusion(in_score_files, ndx_file, model_file, out_score_file, fus_idx): diff --git a/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py b/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py index 0fa1ee59..081d8f23 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/eval-fusion-v2.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, TrialNdx, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def read_ndx(ndx_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py index 42d5d927..d7ba9129 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v1.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.scp_list import SCPList diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py index 082f69a6..f38445c5 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py +++ 
b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v2.py @@ -18,7 +18,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py index 423ab265..febda665 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-be-plda-v3.py @@ -18,7 +18,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.transforms import TransformList, PCA, LDA, LNorm +from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-calibration-v1.py b/egs/sre21-av-a/v1.16k/steps_be/train-calibration-v1.py index 2c5fa488..01a26410 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-calibration-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialScores, TrialKey, Utt2Info from hyperion.utils.list_utils import ismember -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def read_key_and_scores(key_file, score_file): diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py b/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py index 8935b431..65c78b41 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF def train_fusion( diff --git a/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py b/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py index 1c51111c..b447b81e 100755 --- a/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py +++ b/egs/sre21-av-a/v1.16k/steps_be/train-fusion-v2.py @@ -17,9 +17,9 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import GreedyFusionBinaryLR as GF -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import GreedyFusionBinaryLR as GF +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_fusion_condition( diff --git a/egs/sre21-av-v/v0.1/local/score_dcf.py b/egs/sre21-av-v/v0.1/local/score_dcf.py index 514ebf51..772d107a 100755 --- a/egs/sre21-av-v/v0.1/local/score_dcf.py +++ 
b/egs/sre21-av-v/v0.1/local/score_dcf.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/sre21-av/v1/local/score_sre21.py b/egs/sre21-av/v1/local/score_sre21.py index 986aa3f6..72fc1a13 100755 --- a/egs/sre21-av/v1/local/score_sre21.py +++ b/egs/sre21-av/v1/local/score_sre21.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score(key_file, score_file, sre21_subset, output_file): diff --git a/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py b/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py index 78231ba1..b280ab0e 100755 --- a/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/voices_challenge/v0/steps_be/eval-be-v1.py b/egs/voices_challenge/v0/steps_be/eval-be-v1.py index dc3e3f87..19d582e4 100755 --- a/egs/voices_challenge/v0/steps_be/eval-be-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py b/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py index fa16dfce..31b527f7 100755 --- a/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/voices_challenge/v0/steps_be/train-be-v1.py b/egs/voices_challenge/v0/steps_be/train-be-v1.py index 44f93a57..ed1b5f09 100755 --- a/egs/voices_challenge/v0/steps_be/train-be-v1.py +++ b/egs/voices_challenge/v0/steps_be/train-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F diff 
--git a/egs/voices_challenge/v0/steps_be/train-be-v2.py b/egs/voices_challenge/v0/steps_be/train-be-v2.py index cd4d4470..fbb961b2 100755 --- a/egs/voices_challenge/v0/steps_be/train-be-v2.py +++ b/egs/voices_challenge/v0/steps_be/train-be-v2.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info diff --git a/egs/voices_challenge/v0/steps_be/train-calibration-v1.py b/egs/voices_challenge/v0/steps_be/train-calibration-v1.py index fa1dfcf7..9eaa7187 100755 --- a/egs/voices_challenge/v0/steps_be/train-calibration-v1.py +++ b/egs/voices_challenge/v0/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, verbose): diff --git a/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py b/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py index 78231ba1..b280ab0e 100755 --- a/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py @@ -20,8 +20,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList -from hyperion.score_norm import AdaptSNorm as SNorm +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR diff --git a/egs/voices_challenge/v1/steps_be/eval-be-v1.py b/egs/voices_challenge/v1/steps_be/eval-be-v1.py index dc3e3f87..19d582e4 100755 --- a/egs/voices_challenge/v1/steps_be/eval-be-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-be-v1.py @@ -19,7 +19,7 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py b/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py index fa16dfce..31b527f7 100755 --- a/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py @@ -19,8 +19,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/voices_challenge/v1/steps_be/train-be-v1.py b/egs/voices_challenge/v1/steps_be/train-be-v1.py index 
44f93a57..ed1b5f09 100755 --- a/egs/voices_challenge/v1/steps_be/train-be-v1.py +++ b/egs/voices_challenge/v1/steps_be/train-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F diff --git a/egs/voices_challenge/v1/steps_be/train-be-v2.py b/egs/voices_challenge/v1/steps_be/train-be-v2.py index 36fbc341..fda28dc7 100755 --- a/egs/voices_challenge/v1/steps_be/train-be-v2.py +++ b/egs/voices_challenge/v1/steps_be/train-be-v2.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, LDA, LNorm +from hyperion.np.transforms import TransformList, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info diff --git a/egs/voices_challenge/v1/steps_be/train-calibration-v1.py b/egs/voices_challenge/v1/steps_be/train-calibration-v1.py index fa1dfcf7..9eaa7187 100755 --- a/egs/voices_challenge/v1/steps_be/train-calibration-v1.py +++ b/egs/voices_challenge/v1/steps_be/train-calibration-v1.py @@ -17,8 +17,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, verbose): diff --git a/egs/voxceleb/adv.v2/local/make_some_figs.py b/egs/voxceleb/adv.v2/local/make_some_figs.py index 0b2b672f..84c167a4 100755 --- a/egs/voxceleb/adv.v2/local/make_some_figs.py +++ b/egs/voxceleb/adv.v2/local/make_some_figs.py @@ -11,7 +11,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.metrics.verification_evaluator import ( +from hyperion.np.metrics.verification_evaluator import ( VerificationAdvAttackEvaluator as Eval, ) diff --git a/egs/voxceleb/adv.v2/local/score_dcf.py b/egs/voxceleb/adv.v2/local/score_dcf.py index 50babe69..1718ad4d 100755 --- a/egs/voxceleb/adv.v2/local/score_dcf.py +++ b/egs/voxceleb/adv.v2/local/score_dcf.py @@ -14,7 +14,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import SparseTrialScores, SparseTrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py index 4b017114..ea570f60 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py @@ -18,7 +18,7 @@ from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py 
b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py index 0b3c9125..85e82149 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py @@ -16,10 +16,10 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores from hyperion.utils.math import cosine_scoring -from hyperion.pdfs import PLDA +from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList, LNorm +from hyperion.np.transforms import TransformList, LNorm def eval_plda( diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py index 0438e373..d5cd6a55 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py @@ -22,7 +22,7 @@ from hyperion.utils.math import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py index 3ebac1f6..29b0a2c8 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py @@ -19,7 +19,7 @@ from hyperion.utils import TrialNdx, TrialScores, Utt2Info from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList from hyperion.io import RandomAccessDataReaderFactory as DRF diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py index 630bc244..0aeb2367 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py @@ -19,13 +19,13 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import ( +from hyperion.np.metrics.acc import compute_accuracy +from hyperion.np.metrics.confusion_matrix import ( compute_confusion_matrix, print_confusion_matrix, ) -from hyperion.transforms import PCA, LNorm -from hyperion.pdfs import SPLDA +from hyperion.np.transforms import PCA, LNorm +from hyperion.np.pdfs import SPLDA from numpy.linalg import matrix_rank # colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py index 5ad87f72..796422f8 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py @@ -19,13 +19,13 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import ( +from hyperion.np.metrics.acc import compute_accuracy +from 
hyperion.np.metrics.confusion_matrix import ( compute_confusion_matrix, print_confusion_matrix, ) -from hyperion.transforms import PCA, LNorm -from hyperion.pdfs import SPLDA +from hyperion.np.transforms import PCA, LNorm +from hyperion.np.pdfs import SPLDA from numpy.linalg import matrix_rank # colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py index e8dd6e00..cf20735f 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py @@ -19,8 +19,8 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import ( +from hyperion.np.metrics.acc import compute_accuracy +from hyperion.np.metrics.confusion_matrix import ( compute_xlabel_confusion_matrix, print_confusion_matrix, ) diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py index 6b259a2f..c174cb3b 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py @@ -20,8 +20,8 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import ( +from hyperion.np.metrics.acc import compute_accuracy +from hyperion.np.metrics.confusion_matrix import ( compute_confusion_matrix, print_confusion_matrix, ) diff --git a/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py b/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py index b681b0ac..e2c8e928 100755 --- a/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py +++ b/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py @@ -13,7 +13,7 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, LDA, LNorm, PCA +from hyperion.np.transforms import TransformList, LDA, LNorm, PCA from hyperion.helpers import PLDAFactory as F from numpy.linalg import matrix_rank diff --git a/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py b/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py index 03fa3325..b7725386 100755 --- a/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py +++ b/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py @@ -25,7 +25,7 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.transforms import LDA +from hyperion.np.transforms import LDA colors = ["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] diff --git a/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py b/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py index a76a6633..b02447e8 100755 --- a/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py +++ b/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py @@ -25,7 +25,7 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.transforms import PCA, SklTSNE, LNorm +from hyperion.np.transforms import PCA, SklTSNE, LNorm colors = 
["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] diff --git a/egs/voxceleb/v1/local/attack_analysis.py b/egs/voxceleb/v1/local/attack_analysis.py index 8c74c6e9..2e0fdb42 100755 --- a/egs/voxceleb/v1/local/attack_analysis.py +++ b/egs/voxceleb/v1/local/attack_analysis.py @@ -15,7 +15,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.metrics.verification_evaluator import ( +from hyperion.np.metrics.verification_evaluator import ( VerificationAdvAttackEvaluator as Eval, ) diff --git a/egs/voxceleb/v1/local/make_some_figs.py b/egs/voxceleb/v1/local/make_some_figs.py index 207cab20..a4117aba 100755 --- a/egs/voxceleb/v1/local/make_some_figs.py +++ b/egs/voxceleb/v1/local/make_some_figs.py @@ -9,7 +9,7 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.metrics.verification_evaluator import ( +from hyperion.np.metrics.verification_evaluator import ( VerificationAdvAttackEvaluator as Eval, ) diff --git a/egs/voxceleb/v1/local/score_dcf.py b/egs/voxceleb/v1/local/score_dcf.py index 9858583d..3524d222 100755 --- a/egs/voxceleb/v1/local/score_dcf.py +++ b/egs/voxceleb/v1/local/score_dcf.py @@ -14,7 +14,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import SparseTrialScores, SparseTrialKey -from hyperion.metrics import fast_eval_dcf_eer as fast_eval +from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval def score_dcf(key_file, score_file, output_path): diff --git a/egs/voxceleb/v1/steps_be/eval-be-v1.py b/egs/voxceleb/v1/steps_be/eval-be-v1.py index c88b05fc..f7d26390 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-v1.py +++ b/egs/voxceleb/v1/steps_be/eval-be-v1.py @@ -23,7 +23,7 @@ from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voxceleb/v1/steps_be/eval-be-v2.py b/egs/voxceleb/v1/steps_be/eval-be-v2.py index 0438e373..d5cd6a55 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-v2.py +++ b/egs/voxceleb/v1/steps_be/eval-be-v2.py @@ -22,7 +22,7 @@ from hyperion.utils.math import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F -from hyperion.transforms import TransformList +from hyperion.np.transforms import TransformList def eval_plda( diff --git a/egs/voxceleb/v1/steps_be/eval-calibration-v1.py b/egs/voxceleb/v1/steps_be/eval-calibration-v1.py index bf252f60..fdd5516f 100755 --- a/egs/voxceleb/v1/steps_be/eval-calibration-v1.py +++ b/egs/voxceleb/v1/steps_be/eval-calibration-v1.py @@ -23,8 +23,8 @@ from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): diff --git a/egs/voxceleb/v1/steps_be/train-be-v1.py b/egs/voxceleb/v1/steps_be/train-be-v1.py index a1e6fa7e..ea8cf867 100755 --- a/egs/voxceleb/v1/steps_be/train-be-v1.py +++ b/egs/voxceleb/v1/steps_be/train-be-v1.py @@ -17,7 +17,7 @@ from hyperion.hyp_defs import 
config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.transforms import TransformList, LDA, LNorm, PCA +from hyperion.np.transforms import TransformList, LDA, LNorm, PCA from hyperion.helpers import PLDAFactory as F diff --git a/egs/voxceleb/v1/steps_be/train-be-v2.py b/egs/voxceleb/v1/steps_be/train-be-v2.py index 1d72df93..4e3d7542 100755 --- a/egs/voxceleb/v1/steps_be/train-be-v2.py +++ b/egs/voxceleb/v1/steps_be/train-be-v2.py @@ -18,7 +18,7 @@ from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.transforms import TransformList, CentWhiten, PCA +from hyperion.np.transforms import TransformList, CentWhiten, PCA from numpy.linalg import matrix_rank diff --git a/egs/voxceleb/v1/steps_be/train-calibration-v1.py b/egs/voxceleb/v1/steps_be/train-calibration-v1.py index 7408fd1d..489ceed9 100755 --- a/egs/voxceleb/v1/steps_be/train-calibration-v1.py +++ b/egs/voxceleb/v1/steps_be/train-calibration-v1.py @@ -22,8 +22,8 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_scores import TrialScores from hyperion.utils.trial_key import TrialKey -from hyperion.metrics import compute_act_dcf, compute_min_dcf -from hyperion.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): From 569c9ba0eaa9b14cf2708ecf4aadbd6d5a301c7c Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 9 Mar 2022 13:47:18 -0500 Subject: [PATCH 003/154] added more docs and x_lengths support --- ...rch-extract-xvectors-from-wav-with-rttm.py | 26 -- .../bin/torch-extract-xvectors-vae-preproc.py | 17 +- hyperion/np/feats/mfcc.py | 2 +- hyperion/torch/layer_blocks/conformer_conv.py | 10 +- hyperion/torch/layer_blocks/dc1d_blocks.py | 57 ++- hyperion/torch/layer_blocks/dc2d_blocks.py | 52 +++ hyperion/torch/layer_blocks/etdnn_blocks.py | 29 +- hyperion/torch/layer_blocks/mbconv_blocks.py | 49 ++- .../torch/layer_blocks/resetdnn_blocks.py | 26 +- hyperion/torch/layer_blocks/se_blocks.py | 108 +++++- hyperion/torch/layer_blocks/tdnn_blocks.py | 25 ++ hyperion/torch/layers/activation_factory.py | 7 +- hyperion/torch/layers/audio_feats.py | 332 +++++++++++++++--- hyperion/torch/layers/audio_feats_factory.py | 62 +++- hyperion/torch/layers/calibrators.py | 16 + hyperion/torch/layers/dropout.py | 48 +++ hyperion/torch/layers/global_pool.py | 282 ++++++++++++--- hyperion/torch/layers/interpolate.py | 15 + hyperion/torch/layers/margin_losses.py | 74 ++++ hyperion/torch/layers/mvn.py | 28 +- hyperion/torch/layers/norm_layer_factory.py | 8 + hyperion/torch/layers/pdf_storage.py | 10 +- hyperion/torch/layers/pool_factory.py | 32 ++ hyperion/torch/layers/spec_augment.py | 48 ++- hyperion/torch/layers/subpixel_convs.py | 48 +++ hyperion/torch/layers/swish.py | 6 + hyperion/torch/layers/tensor2pdf.py | 121 ++++++- hyperion/torch/layers/vq.py | 218 +++++++++++- .../torch/models/wav2xvectors/__init__.py | 13 + .../hf_wav2vec2resnet1d_xvector.py | 40 +++ .../models/wav2xvectors/hf_wav2xvector.py | 26 ++ .../wav2xvectors/wav2resnet1d_xvector.py | 53 +++ .../models/wav2xvectors/wav2resnet_xvector.py | 53 +++ .../torch/models/wav2xvectors/wav2xvector.py | 128 +++++++ hyperion/torch/models/xvectors/__init__.py | 13 + .../torch/models/xvectors/resnet1d_xvector.py | 2 +- hyperion/torch/models/xvectors/xvector.py | 196 ++++------- 
hyperion/torch/narchs/audio_feats_mvn.py | 16 +- hyperion/torch/narchs/classif_head.py | 28 +- hyperion/torch/narchs/conformer_encoder_v1.py | 11 +- hyperion/torch/torch_model.py | 4 +- .../trainers/xvector_adv_trainer_from_wav.py | 11 +- hyperion/torch/trainers/xvector_finetuner.py | 117 ------ hyperion/torch/trainers/xvector_trainer.py | 2 +- .../trainers/xvector_trainer_deep_feat_reg.py | 51 +-- .../xvector_trainer_deep_feat_reg_from_wav.py | 22 +- .../trainers/xvector_trainer_from_wav.py | 4 +- hyperion/torch/utils/__init__.py | 3 + hyperion/torch/utils/collation.py | 92 +++++ hyperion/torch/utils/masking.py | 58 +++ hyperion/torch/utils/misc.py | 66 +++- hyperion/torch/utils/vad_utils.py | 59 ++++ 52 files changed, 2268 insertions(+), 556 deletions(-) create mode 100644 hyperion/torch/models/wav2xvectors/__init__.py create mode 100644 hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py create mode 100644 hyperion/torch/models/wav2xvectors/hf_wav2xvector.py create mode 100644 hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py create mode 100644 hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py create mode 100644 hyperion/torch/models/wav2xvectors/wav2xvector.py create mode 100644 hyperion/torch/models/xvectors/__init__.py delete mode 100644 hyperion/torch/trainers/xvector_finetuner.py create mode 100644 hyperion/torch/utils/collation.py create mode 100644 hyperion/torch/utils/masking.py create mode 100644 hyperion/torch/utils/vad_utils.py diff --git a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py b/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py index c4f1ba9a..101d6a10 100755 --- a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py +++ b/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py @@ -154,21 +154,6 @@ def extract_xvectors( t3 = time.time() key, x = augment(key0, x0, augmenter, aug_df, aug_id) - # if augmenter is None: - # x = x0 - # key = key0 - # else: - # x, aug_info = augmenter(x0) - # key = '%s-aug-%02d' % (key0, aug_id) - # aug_df_row = {'key_aug': key, 'key_orig': key0, - # 'noise_type': aug_info['noise']['noise_type'], - # 'snr': aug_info['noise']['snr'], - # 'rir_type': aug_info['reverb']['rir_type'], - # 'srr': aug_info['reverb']['srr'], - # 'sdr': aug_info['sdr']} - - # aug_df.append(pd.DataFrame(aug_df_row, index=[0])) - x_total = x max_samples = x.shape[0] y = np.zeros( @@ -219,17 +204,6 @@ def extract_xvectors( key, x, min_utt_length, max_utt_length, rng ) - # if random_utt_length: - # utt_length = rng.randint( - # low=min_utt_length, high=max_utt_length+1) - # if utt_length < x.shape[1]: - # first_frame = rng.randint( - # low=0, high=x.shape[1]-utt_length) - # x = x[:,first_frame:first_frame+utt_length] - # logging.info( - # 'extract-random-utt %s of length=%d first-frame=%d' % ( - # key, x.shape[1], first_frame)) - t6 = time.time() if x.shape[1] > 0: x = x.transpose(1, 2).contiguous() diff --git a/hyperion/bin/torch-extract-xvectors-vae-preproc.py b/hyperion/bin/torch-extract-xvectors-vae-preproc.py index 64f6359d..afa7a117 100755 --- a/hyperion/bin/torch-extract-xvectors-vae-preproc.py +++ b/hyperion/bin/torch-extract-xvectors-vae-preproc.py @@ -96,21 +96,6 @@ def extract_xvectors( keys = [] info = [] - # num_gpus = 1 if use_gpu else 0 - # logging.info('initializing devices num_gpus={}'.format(num_gpus)) - # device = open_device(num_gpus=num_gpus) - # logging.info('loading x-vector model {}'.format(xvec_model_path)) - # xvec_model = TML.load(xvec_model_path) - # xvec_model.to(device) - # 
xvec_model.eval()
-    # logging.info('x-vector={}'.format(xvec_model))
-
-    # logging.info('loading vae model {}'.format(vae_model_path))
-    # vae_model = TML.load(vae_model_path)
-    # vae_model.to(device)
-    # vae_model.eval()
-    # logging.info('vae={}'.format(vae_model))
-
     mse_loss = torch.nn.MSELoss()
 
     dr_args = DRF.filter_args(**kwargs)
@@ -151,7 +136,7 @@ def extract_xvectors(
             t4 = time.time()
             if x.shape[0] == 0:
-                y = np.zeros((model.embed_dim,), dtype=float_cpu())
+                y = np.zeros((xvec_model.embed_dim,), dtype=float_cpu())
             else:
                 xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype())
                 with torch.no_grad():
diff --git a/hyperion/np/feats/mfcc.py b/hyperion/np/feats/mfcc.py
index cf517ee5..d6b8dd3f 100644
--- a/hyperion/np/feats/mfcc.py
+++ b/hyperion/np/feats/mfcc.py
@@ -64,7 +64,7 @@ class MFCC(object):
     """Compute MFCC features.
 
     Attributes:
-        sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000)
+      sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000)
       frame_length: Frame length in milliseconds (default = 25)
       frame_shift: Frame shift in milliseconds (default = 10)
       fft_length: Length of FFT (default = 512)
diff --git a/hyperion/torch/layer_blocks/conformer_conv.py b/hyperion/torch/layer_blocks/conformer_conv.py
index 7ed9a43a..0c42f34a 100644
--- a/hyperion/torch/layer_blocks/conformer_conv.py
+++ b/hyperion/torch/layer_blocks/conformer_conv.py
@@ -100,14 +100,16 @@ def __init__(
 
         self.context = stride * (kernel_size - 1) // 2
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
         """Forward function
 
         Args:
-          x: input size = (batch, num_channels, time)
+          x: input tensor with shape = (batch, num_channels, time)
+          x_mask: mask indicating the valid frames in the sequence with
+            shape = (batch, 1, time) or (batch, time)
 
         Returns
-          torch.Tensor size = (batch, num_channels, (time-1)//stride+1)
+          Tensor with shape = (batch, num_channels, (time-1)//stride+1)
         """
         residual = x
@@ -121,7 +123,7 @@ def forward(self, x):
         # depthwide conv phase
         x = self.act(self.norm_dw(self.conv_dw(x)))
         if self.has_se:
-            x = self.se_layer(x)
+            x = self.se_layer(x, x_mask=x_mask)
 
         # final projection
         x = self.conv_proj(x)
diff --git a/hyperion/torch/layer_blocks/dc1d_blocks.py b/hyperion/torch/layer_blocks/dc1d_blocks.py
index f5b794ef..da643c34 100644
--- a/hyperion/torch/layer_blocks/dc1d_blocks.py
+++ b/hyperion/torch/layer_blocks/dc1d_blocks.py
@@ -12,6 +12,22 @@
 
 class DC1dEncBlock(nn.Module):
+    """Build block for deep convolutional encoder 1d.
+
+    Args:
+      in_channels: input channels.
+      out_channels: output channels.
+      kernel_size: kernel size for the convolution.
+      stride: downsampling stride.
+      dilation: kernel dilation.
+      activation: non-linear activation function object, string or config dict.
+      dropout_rate: dropout rate.
+      use_norm: if True, it uses layer normalization.
+      norm_layer: Normalization Layer constructor; if None, it uses BatchNorm1d.
+      norm_before: if True, layer normalization is before the non-linearity, else
+        after the non-linearity.
+    """
+
     def __init__(
         self,
         in_channels,
@@ -62,14 +78,25 @@ def __init__(
         self.context = dilation * (kernel_size - 1) // 2
 
     def freeze(self):
+        """Freezes trainable parameters."""
         for param in self.parameters():
             param.requires_grad = False
 
     def unfreeze(self):
+        """Unfreezes trainable parameters."""
         for param in self.parameters():
             param.requires_grad = True
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
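
# A usage sketch (not part of this patch) of the new x_mask argument threaded
# through the block forward() methods above. The mask marks the valid frames,
# so padded frames can be excluded from the SE pooling. The block name and
# constructor arguments are assumed for illustration.
import torch

batch, channels, time = 4, 64, 200
x = torch.randn(batch, channels, time)
lengths = torch.tensor([200, 180, 150, 90])

# binary mask of valid frames with shape (batch, 1, time), as documented above
x_mask = (torch.arange(time)[None, :] < lengths[:, None]).unsqueeze(1).float()

# block = ConformerConvBlock(...)   # hypothetical instantiation
# y = block(x, x_mask=x_mask)       # masked frames are ignored by the SE layer
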
+ + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ x = self.conv1(x) if self.norm_before: @@ -88,6 +115,22 @@ def forward(self, x): class DC1dDecBlock(nn.Module): + """Build block for deep convolutional decoder 1d. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + stride: upsampling stride. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -149,15 +192,25 @@ def __init__( self.context = dilation * (kernel_size - 1) // 2 def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unfreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ x = self.conv1(x) if self.norm_before: x = self.bn1(x) diff --git a/hyperion/torch/layer_blocks/dc2d_blocks.py b/hyperion/torch/layer_blocks/dc2d_blocks.py index 0d251528..bae8e203 100644 --- a/hyperion/torch/layer_blocks/dc2d_blocks.py +++ b/hyperion/torch/layer_blocks/dc2d_blocks.py @@ -11,6 +11,22 @@ class DC2dEncBlock(nn.Module): + """Build block for deep convolutional encoder 2d. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + stride: downsampling stride. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm2d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -61,15 +77,25 @@ def __init__( self.context = dilation * (kernel_size - 1) // 2 def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unfreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ x = self.conv1(x) if self.norm_before: x = self.bn1(x) @@ -87,6 +113,22 @@ def forward(self, x): class DC2dDecBlock(nn.Module): + """Build block for deep convolutional decoder 2d. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + stride: upsampling stride. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm2d. 
+ norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -148,15 +190,25 @@ def __init__( self.context = dilation * (kernel_size - 1) // 2 def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unfreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ x = self.conv1(x) if self.norm_before: x = self.bn1(x) diff --git a/hyperion/torch/layer_blocks/etdnn_blocks.py b/hyperion/torch/layer_blocks/etdnn_blocks.py index 958c31ba..17f3f8ef 100644 --- a/hyperion/torch/layer_blocks/etdnn_blocks.py +++ b/hyperion/torch/layer_blocks/etdnn_blocks.py @@ -13,6 +13,21 @@ class ETDNNBlock(nn.Module): + """Building block for Extended-TDNN. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -62,15 +77,21 @@ def __init__( ) self.conv2 = Conv1d(out_channels, out_channels, bias=bias, kernel_size=1) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. - x = self.conv1(x) + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ + x = self.conv1(x) if self.norm_before: x = self.bn1(x) x = self.activation1(x) - if self.norm_after: x = self.bn1(x) @@ -78,12 +99,10 @@ def forward(self, x): x = self.dropout1(x) x = self.conv2(x) - if self.norm_before: x = self.bn2(x) x = self.activation2(x) - if self.norm_after: x = self.bn2(x) diff --git a/hyperion/torch/layer_blocks/mbconv_blocks.py b/hyperion/torch/layer_blocks/mbconv_blocks.py index 6d9a3141..89c746ea 100644 --- a/hyperion/torch/layer_blocks/mbconv_blocks.py +++ b/hyperion/torch/layer_blocks/mbconv_blocks.py @@ -42,6 +42,22 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer): class MBConvBlock(nn.Module): + """MobileNet/EfficentNet Inverted bottleneck Block + + Attributes: + in_channels: input channels. + out_channels: output channels + expansion: expansion of channels for the inverted bottleneck. + kernel_size: kernel size of the convs. + stride: downsampling stride of the convs. + activation: Non-linear activation object, string of configuration dictionary. + drop_connect_rate: Drop-connect rate for stochastic number of layers. + norm_layer: Normalization layer constructor, if None BatchNorm2d is used. + se_r=None: Squeeze-excitation compression ratio. + time_se: If true, squeeze is done only in time dimension. + num_feats: Number of features in dimension 2, needed if time_se=True. 
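
# A sketch of the inverted-bottleneck arithmetic described above (values
# assumed): channels are expanded by `expansion`, filtered depthwise, then
# projected back, with a residual add when the shapes allow it.
in_channels, expansion, out_channels, stride = 24, 4, 24, 1
inner_channels = in_channels * expansion   # conv_exp: 24 -> 96 channels
# conv_dw runs depthwise at 96 channels (optionally followed by the SE layer);
# conv_proj maps 96 -> 24; the residual connection applies because stride == 1
# and in_channels == out_channels.
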
+ """ + def __init__( self, in_channels, @@ -113,8 +129,17 @@ def __init__( self.context = stride * (kernel_size - 1) // 2 self.downsample_factor = stride - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x if self.expansion > 1: x = self.act(self.bn_exp(self.conv_exp(x))) @@ -137,6 +162,19 @@ def forward(self, x): class MBConvInOutBlock(nn.Module): + """Convolutional block used as input/output + in MobileNet/EffcientNet + + Attributes: + in_channels: input channels. + out_channels: output channels + kernel_size: kernel size of the convs. + stride: downsampling stride of the convs. + activation: Non-linear activation object, string of configuration dictionary. + norm_layer: Normalization layer constructor, if None BatchNorm2d is used. + + """ + def __init__( self, in_channels, @@ -169,4 +207,13 @@ def __init__( self.downsample_factor = stride def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ return self.act(self.bn(self.conv(x))) diff --git a/hyperion/torch/layer_blocks/resetdnn_blocks.py b/hyperion/torch/layer_blocks/resetdnn_blocks.py index 9d849719..775118d1 100644 --- a/hyperion/torch/layer_blocks/resetdnn_blocks.py +++ b/hyperion/torch/layer_blocks/resetdnn_blocks.py @@ -15,6 +15,21 @@ class ResETDNNBlock(ETDNNBlock): + """Building block for Residual Extended-TDNN. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, num_channels, @@ -39,7 +54,16 @@ def __init__( norm_before, ) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ residual = x x = self.conv1(x) diff --git a/hyperion/torch/layer_blocks/se_blocks.py b/hyperion/torch/layer_blocks/se_blocks.py index 3d33f7d4..c3ba8e20 100644 --- a/hyperion/torch/layer_blocks/se_blocks.py +++ b/hyperion/torch/layer_blocks/se_blocks.py @@ -11,7 +11,15 @@ class SEBlock2D(nn.Module): - """From https://arxiv.org/abs/1709.01507""" + """Squeeze-excitation block 2d + from https://arxiv.org/abs/1709.01507. + + Attributes: + num_channels: input/output channels. + r: Squeeze-excitation compression ratio. + activation: Non-linear activation object, string of configuration dictionary. 
+ + """ def __init__( self, num_channels, r=16, activation={"name": "relu", "inplace": True} @@ -26,8 +34,33 @@ def __init__( ) self.sigmoid = nn.Sigmoid() - def forward(self, x): - z = torch.mean(x, dim=(2, 3), keepdim=True) + def _standardize_mask(self, mask): + if mask.dim() == 2: + return mask.view(mask.size(0), 1, 1, mask.size(-1)) + + if mask.dim() == 3: + return mask.unsqueeze(1) + + return mask + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, channels, heigh, width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, channels, heigh, width). + """ + if x_mask is None: + z = torch.mean(x, dim=(2, 3), keepdim=True) + else: + x_mask = self._standardize_mask(x_mask) + total = torch.mean(x_mask, dim=(2, 3), keepdim=True) + z = torch.mean(x * x_mask, dim=(2, 3), keepdim=True) / total + scale = self.sigmoid(self.conv2(self.act(self.conv1(z)))) y = scale * x return y @@ -35,7 +68,14 @@ def forward(self, x): class TSEBlock2D(nn.Module): """From https://arxiv.org/abs/1709.01507 - Modified to do pooling only in time dimension + Modified to do pooling only in time dimension. + + Attributes: + num_channels: input/output channels. + num_feats: Number of features in dimension 2. + r: Squeeze-excitation compression ratio. + activation: Non-linear activation object, string of configuration dictionary. + """ def __init__( @@ -62,10 +102,35 @@ def __init__( ) self.sigmoid = nn.Sigmoid() - def forward(self, x): + def _standardize_mask(self, mask): + if mask.dim() == 2: + return mask.view(mask.size(0), 1, 1, mask.size(-1)) + + if mask.dim() == 3: + return mask.unsqueeze(1) + + return mask + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, channels, heigh, width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, channels, heigh, width). + """ num_feats = x.shape[2] num_channels = x.shape[1] - z = torch.mean(x, dim=-1, keepdim=True) + if x_mask is None: + z = torch.mean(x, dim=-1, keepdim=True) + else: + x_mask = self._standardize_mask(x_mask) + total = torch.mean(x_mask, dim=-1, keepdim=True) + z = torch.mean(x * x_mask, dim=-1, keepdim=True) / total + z = z.view(-1, self.num_channels_1d, 1, 1) scale = self.sigmoid(self.conv2(self.act(self.conv1(z)))) scale = scale.view(-1, num_channels, num_feats, 1) @@ -76,6 +141,11 @@ def forward(self, x): class SEBlock1d(nn.Module): """1d Squeeze Excitation version of https://arxiv.org/abs/1709.01507 + + Attributes: + num_channels: input/output channels. + r: Squeeze-excitation compression ratio. + activation: Non-linear activation object, string of configuration dictionary. """ def __init__( @@ -91,8 +161,30 @@ def __init__( ) self.sigmoid = nn.Sigmoid() - def forward(self, x): - z = torch.mean(x, dim=2, keepdim=True) + def _standardize_mask(self, mask): + if mask.dim() == 2: + return mask.unsqueeze(1) + + return mask + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, channels, time). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time) + + Returns: + Tensor with shape = (batch, channels, time). 
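
# A numerical sketch of the masked squeeze used by the SE blocks above:
# dividing the masked mean by the mask mean recovers the average over the
# valid frames only, so padded frames cannot bias the channel statistics.
import torch

x = torch.ones(1, 2, 3)                     # (batch, channels, time)
x[:, :, 2] = 100.0                          # padded frame with garbage values
x_mask = torch.tensor([[[1.0, 1.0, 0.0]]])  # (batch, 1, time)

total = torch.mean(x_mask, dim=-1, keepdim=True)
z = torch.mean(x * x_mask, dim=-1, keepdim=True) / total
# z is 1.0 for both channels; an unmasked mean would give 34.0
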
+ """ + if x_mask is None: + z = torch.mean(x, dim=2, keepdim=True) + else: + x_mask = self._standardize_mask(x_mask) + total = torch.mean(x_mask, dim=-1, keepdim=True) + z = torch.mean(x * x_mask, dim=-1, keepdim=True) / total + scale = self.sigmoid(self.conv2(self.act(self.conv1(z)))) y = scale * x return y diff --git a/hyperion/torch/layer_blocks/tdnn_blocks.py b/hyperion/torch/layer_blocks/tdnn_blocks.py index 8fcbb056..e979b7db 100644 --- a/hyperion/torch/layer_blocks/tdnn_blocks.py +++ b/hyperion/torch/layer_blocks/tdnn_blocks.py @@ -11,6 +11,21 @@ class TDNNBlock(nn.Module): + """Building block for TDNN. + + Args: + in_channels: input channels. + out_channels: output channels. + kernel_size: kernels size for the convolution. + dilation: kernel dilation. + activation: non-linear activation function object, string or config dict. + dropout_rate: dropout rate. + use_norm: if True, if uses layer normalization. + norm_layer: Normalization Layer constructor, if None it used BatchNorm1d. + norm_before: if True, layer normalization is before the non-linearity, else + after the non-linearity. + """ + def __init__( self, in_channels, @@ -56,15 +71,25 @@ def __init__( ) def freeze(self): + """Freezes trainable parameters.""" for param in self.parameters(): param.requires_grad = False def unfreeze(self): + """Unreezes trainable parameters.""" for param in self.parameters(): param.requires_grad = True def forward(self, x): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_time). + x_mask: unused. + Returns: + Tensor with shape = (batch, out_channels, out_time). + """ x = self.conv1(x) if self.norm_before: diff --git a/hyperion/torch/layers/activation_factory.py b/hyperion/torch/layers/activation_factory.py index 7bc09827..1d3bdfd2 100644 --- a/hyperion/torch/layers/activation_factory.py +++ b/hyperion/torch/layers/activation_factory.py @@ -41,12 +41,13 @@ def create(activation, **kwargs): """Creates a non-linear activation object Args: - activation: str with activation type, - dictionary with name field indicating the activation type, and extra activation arguments + activation: String with activation type, + dictionary with name field indicating the activation type, + and extra activation arguments None, then it returns None, Activation constructor - **kwargs: extra arguments for activation constructor + **kwargs: Extra arguments for activation constructor Return: Non-linear activation object diff --git a/hyperion/torch/layers/audio_feats.py b/hyperion/torch/layers/audio_feats.py index 718844f5..34cb9aa3 100644 --- a/hyperion/torch/layers/audio_feats.py +++ b/hyperion/torch/layers/audio_feats.py @@ -66,21 +66,23 @@ def _get_feature_window_function(window_type, window_size, blackman_coeff=0.42): def _get_strided_batch(waveform, window_length, window_shift, snip_edges, center=False): - r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``) - representing how the window is shifted along the waveform. Each row is a frame. + """Given a waveform (2D tensor of size (batch, num_samples), + it returns a 3D tensor (batch, m, window_size) + representing how the window is shifted along the waveform. Each row is a frame. Args: - waveform (torch.Tensor): Tensor of size ``num_samples`` - window_size (int): Frame length - window_shift (int): Frame shift - snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit - in the file, and the number of frames depends on the frame_length. 
If False, the number of frames
-            depends only on the frame_shift, and we reflect the data at the ends.
-        center (bool): If true, if puts the center of the frame at t*window_shift, starting at t=0,
-            If overwrides snip_edges and set it to False
+        waveform: Tensor of size (batch, num_samples).
+        window_size: Frame length in samples.
+        window_shift: Frame shift in samples.
+        snip_edges: If True, end effects will be handled by outputting only frames
+          that completely fit in the file, and the number of frames depends
+          on the frame_length. If False, the number of frames depends only
+          on the frame_shift, and we reflect the data at the ends.
+        center (bool): If true, it puts the center of the frame at t*window_shift,
+          starting at t=0; it overrides snip_edges, setting it to False.
 
     Returns:
-        torch.Tensor: 3D tensor of size (m, ``window_size``) where each row is a frame
+        3D tensor of size (batch, m, ``window_size``) where each row is a frame
     """
     assert waveform.dim() == 2
     batch_size = waveform.size(0)
@@ -121,7 +123,7 @@ def _get_strided_batch(waveform, window_length, window_shift, snip_edges, center=False):
 
 def _get_log_energy(x, energy_floor):
-    r"""Returns the log energy of size (m) for a strided_input (m,*)"""
+    r"""Returns the log energy of size (batch, m) for a strided_input (batch, m, *)"""
     log_energy = (x.pow(2).sum(-1) + 1e-15).log()  # size (m)
     if energy_floor > 0.0:
         log_energy = torch.max(
@@ -133,6 +135,13 @@
 
 class Wav2Win(nn.Module):
+    """Class that takes a batch of waveforms and returns windowed frames
+    with a given frame-shift and frame-length.
+
+    Attributes:
+
+    """
+
     def __init__(
         self,
         fs=16000,
@@ -235,7 +244,7 @@
 
         if self.return_log_energy and not self.raw_energy:
             signal_log_energy = _get_log_energy(
-                strided_input, self.energy_floor
+                x_strided, self.energy_floor
             )  # size (batch, m)
 
         # Pad columns with zero until we reach size (batch, num_frames, pad_length)
@@ -252,6 +261,37 @@
 
 class Wav2FFT(nn.Module):
+    """Computes FFT from waveforms.
+
+    Attributes:
+      fs: Waveform data sample frequency (must match the waveform
+          file, if specified there) (default = 16000)
+      frame_length: Frame length in milliseconds (default = 25)
+      frame_shift: Frame shift in milliseconds (default = 10)
+      fft_length: Length of FFT (default = 512)
+      remove_dc_offset: Subtract mean from waveform on each frame (default = True)
+      preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97)
+      window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"|
+          "blackmann"] (default = 'povey')
+      use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|,
+          (default = False)
+      dither: Dithering constant (0.0 means no dither) (default = 1)
+      snip_edges: If true, end effects will be handled by outputting only
+          frames that completely fit in the file, and the number of
+          frames depends on the frame-length.
+          If false, the number of frames depends only on the
+          frame-shift, and we reflect the data at the ends.
+ (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -324,14 +364,19 @@ def dither(self): return self.wav2win.dither def forward(self, x): + """Computes the comples Fourier transform. + + Args: + x: waveform tensor with shape = (batch, num_samples). + Returns: + FFT tensor with shape = (batch, num_frames, fft_length//2+1) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) - if self.use_energy: X[:, 0, :, 0] = log_e @@ -339,6 +384,37 @@ def forward(self, x): class Wav2Spec(Wav2FFT): + """Computes Spectrograms from waveforms. + + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -380,18 +456,21 @@ def __init__( self._to_spec = _pow_spectrogram def forward(self, x): + """Computes the Spectrogram. + + Args: + x: waveform tensor with shape = (batch, num_samples). + + Returns: + Spectrogram tensor with shape = (batch, num_frames, fft_length//2+1) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) - # pow_spec = X.pow(2).sum(-1) - # if self.use_fft_mag: - # pow_spec = pow_spec.sqrt() - if self.use_energy: pow_spec[:, 0] = log_e @@ -399,6 +478,37 @@ def forward(self, x): class Wav2LogSpec(Wav2FFT): + """Computes log-spectrograms from waveforms. 
+ + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -440,21 +550,21 @@ def __init__( self._to_spec = _pow_spectrogram def forward(self, x): + """Computes the log-spectrogram. + + Args: + x: waveform tensor with shape = (batch, num_samples). + Returns: + Spectrogram tensor with shape = (batch, num_frames, fft_length//2+1) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) - - # pow_spec = X.pow(2).sum(-1) - # if self.use_fft_mag: - # pow_spec = pow_spec.sqrt() - pow_spec = (pow_spec + 1e-15).log() - if self.use_energy: pow_spec[:, 0] = log_e @@ -462,6 +572,46 @@ def forward(self, x): class Wav2LogFilterBank(Wav2FFT): + """Computes log-filter-bank from waveforms. + + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + fb_type: Filter-bank type in ["mel_kaldi", "mel_etsi", + "mel_librosa", "mel_librosa_htk", "linear"] + (default = 'mel_kaldi') + low_freq: Low cutoff frequency for mel bins (default = 20) + high_freq: High cutoff frequency for mel bins, if < 0, + offset from Nyquist (default = 0) + num_filters: Number of triangular mel-frequency bins (default = 23) + norm_filters: Normalize filters coeff to sum up to 1, if librosa + it uses Stanley norm (default = False) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. 
+ If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -526,29 +676,24 @@ def __init__( self._to_spec = _pow_spectrogram def forward(self, x): + """Computes the log-filter-banks. + + Args: + x: waveform tensor with shape = (batch, num_samples). + Returns: + Filter-bank tensor with shape = (batch, num_frames, num_filters) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) - # logging.info('X={} {}'.format(X, X.type())) - # logging.info('X={}'.format(X.type())) pow_spec = self._to_spec(X) - # pow_spec = X.pow(2).sum(-1) - # # logging.info('p={} {} nan={}'.format(pow_spec, pow_spec.type(), torch.sum(torch.isnan(pow_spec)))) - # # logging.info('p={}'.format(pow_spec.type())) - # if self.use_fft_mag: - # pow_spec = pow_spec.sqrt() - with amp.autocast(enabled=False): pow_spec = torch.matmul(pow_spec.float(), self._fb.float()) - # logging.info('fb={} {}'.format(pow_spec, pow_spec.type())) - # logging.info('fb={}'.format(pow_spec.type())) + pow_spec = (pow_spec + 1e-10).log() - # logging.info('lfb={} {}'.format(pow_spec, pow_spec.type())) - # logging.info('lfb={}'.format(pow_spec.type())) if self.use_energy: pow_spec = torch.cat((log_e.unsqueeze(-1), pow_spec), dim=-1) @@ -556,6 +701,49 @@ def forward(self, x): class Wav2MFCC(Wav2FFT): + """Computes MFCC from waveforms. + + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|, + (default = False) + dither: Dithering constant (0.0 means no dither) (default = 1) + fb_type: Filter-bank type in ["mel_kaldi", "mel_etsi", + "mel_librosa", "mel_librosa_htk", "linear"] + (default = 'mel_kaldi') + low_freq: Low cutoff frequency for mel bins (default = 20) + high_freq: High cutoff frequency for mel bins, if < 0, + offset from Nyquist (default = 0) + num_filters: Number of triangular mel-frequency bins (default = 23) + norm_filters: Normalize filters coeff to sum up to 1, if librosa + it uses Stanley norm (default = False) + num_ceps: Number of cepstra in MFCC computation (including C0) + (default = 13) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. 
+ (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False + cepstral_lifter: Constant that controls scaling of MFCCs (default = 22) + energy_floor: Floor on energy (absolute, not relative) in MFCC computation + (default = 0) + raw_energy: If true, compute energy before preemphasis and + windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + + """ + def __init__( self, fs=16000, @@ -648,6 +836,15 @@ def make_lifter(N, Q): @staticmethod def make_dct_matrix(num_ceps, num_filters): + """Calculates the DCT Matrix. + + Args: + num_ceps: Number of cepstral coeffs. + num_filters: Number of filters. + + Returns + DCT matrix (num_ceps, num_filters) + """ n = torch.arange(float(num_filters)).unsqueeze(1) k = torch.arange(float(num_ceps)) dct = torch.cos( @@ -658,23 +855,25 @@ def make_dct_matrix(num_ceps, num_filters): return dct def forward(self, x): + """Computes the MFCC. + + Args: + x: Waveform tensor with shape = (batch, num_samples). + + Returns: + MFCC tensor with shape = (batch, num_frames, num_ceps) + """ x_strided = self.wav2win(x) if self.use_energy: x_strided, log_e = x_strided - # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) - # pow_spec = X.pow(2).sum(-1) - # if self.use_fft_mag: - # pow_spec = pow_spec.sqrt() - with amp.autocast(enabled=False): pow_spec = torch.matmul(pow_spec.float(), self._fb.float()) pow_spec = (pow_spec + 1e-10).log() - mfcc = torch.matmul(pow_spec, self._dct) if self.cepstral_lifter > 0: mfcc *= self._lifter @@ -689,6 +888,31 @@ class Wav2KanBayashiLogFilterBank(Wav2LogFilterBank): """Class to replicate log-filter-banks used in Kan Bayashi's ParallelWaveGAN repository: https://github.com/kan-bayashi/ParallelWaveGAN + + Attributes: + fs: Waveform data sample frequency (must match the waveform + file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds + frame_shift: Frame shift in milliseconds + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"| + "blackmann"] (default = 'povey') + fb_type: Filter-bank type in ["mel_kaldi", "mel_etsi", + "mel_librosa", "mel_librosa_htk", "linear"] + (default = 'mel_kaldi') + low_freq: Low cutoff frequency for mel bins (default = 20) + high_freq: High cutoff frequency for mel bins, if < 0, + offset from Nyquist (default = 0) + num_filters: Number of triangular mel-frequency bins (default = 23) + snip_edges: If true, end effects will be handled by outputting only + frames that completely fit in the file, and the number of + frames depends on the frame-length. + If false, the number of frames depends only on the + frame-shift, and we reflect the data at the ends. + (default = True) + center: If true, if puts the center of the frame at t*window_shift, starting at t=0, + If overwrides snip_edges and set it to False """ def __init__( @@ -730,6 +954,14 @@ def __init__( self.scale = 1.0 / math.log(10) def forward(self, x): + """Computes the Log filter banks using Kan Bayashi configuration. + + Args: + x: Waveform tensor with shape = (batch, num_samples). 
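
# A sketch of the DCT-II matrix assembled by make_dct_matrix() above
# (normalization omitted). Its (num_filters, num_ceps) orientation matches
# the torch.matmul(pow_spec, self._dct) call in forward().
import math
import torch

num_filters, num_ceps = 23, 13
n = torch.arange(float(num_filters)).unsqueeze(1)        # filter index, rows
k = torch.arange(float(num_ceps))                        # cepstral index, columns
dct = torch.cos(math.pi / num_filters * (n + 0.5) * k)   # (23, 13)
mfcc = torch.rand(4, 98, num_filters).matmul(dct)        # (4, 98, 13)
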
+
+        Returns:
+          Filter-bank tensor with shape = (batch, num_frames, num_filters)
+        """
         return self.scale * super().forward(x)
 
 
@@ -768,6 +1000,14 @@ def __init__(
         )
 
     def forward(self, x):
+        """Computes the log filter banks from spectrograms.
+
+        Args:
+          x: Spectrogram tensor with shape = (batch, num_frames, fft_length//2+1).
+
+        Returns:
+          Filter-bank tensor with shape = (batch, num_frames, num_filters)
+        """
         with amp.autocast(enabled=False):
             pow_spec = torch.matmul(x.float(), self._fb.float())
             pow_spec = (pow_spec + 1e-10).log()
diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py
index ac463f07..71c3a8e8 100644
--- a/hyperion/torch/layers/audio_feats_factory.py
+++ b/hyperion/torch/layers/audio_feats_factory.py
@@ -6,7 +6,7 @@
 import re
 
 from ...utils.misc import str2bool
-from ...feats.filter_banks import FilterBankFactory as FBF
+from ...np.feats.filter_banks import FilterBankFactory as FBF
 from .audio_feats import *
 
 FFT = "fft"
@@ -20,6 +20,10 @@
 
 class AudioFeatsFactory(object):
+    """Factory class to create acoustic feature layers like
+    FFT, Spectrogram, log-Spectrogram, log-filter-bank, MFCC.
+    """
+
     @staticmethod
     def create(
         audio_feat,
@@ -45,6 +49,53 @@
         raw_energy=True,
         use_energy=True,
     ):
+        """
+        Method that creates acoustic feature layers like
+        FFT, Spectrogram, log-Spectrogram, log-filter-bank, MFCC.
+
+        Args:
+          audio_feat: Type of feature extractor in ["fft", "spec", "log_spec",
+                      "logfb", "mfcc", "kanbayashi_logfb"]. "kanbayashi_logfb"
+                      should produce features compatible with the WaveGAN repository.
+          sample_frequency: Waveform data sample frequency (must match the waveform
+                            file, if specified there) (default = 16000)
+          frame_length: Frame length in milliseconds (default = 25)
+          frame_shift: Frame shift in milliseconds (default = 10)
+          fft_length: Length of FFT (default = 512)
+          remove_dc_offset: Subtract mean from waveform on each frame (default = True)
+          preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97)
+          window_type: Type of window ["hamming"|"hanning"|"povey"|"rectangular"|
+                       "blackmann"] (default = 'povey')
+          use_fft_mag: If false, it uses |X(f)|^2, if true, it uses |X(f)|,
+                       (default = False)
+          dither: Dithering constant (0.0 means no dither) (default = 1)
+          fb_type: Filter-bank type in ["mel_kaldi", "mel_etsi",
+                   "mel_librosa", "mel_librosa_htk", "linear"]
+                   (default = 'mel_kaldi')
+          low_freq: Low cutoff frequency for mel bins (default = 20)
+          high_freq: High cutoff frequency for mel bins, if < 0,
+                     offset from Nyquist (default = 0)
+          num_filters: Number of triangular mel-frequency bins (default = 23)
+          norm_filters: Normalize filters coeff to sum up to 1, if librosa
+                        it uses Slaney norm (default = False)
+          num_ceps: Number of cepstra in MFCC computation (including C0)
+                    (default = 13)
+          snip_edges: If true, end effects will be handled by outputting only
+                      frames that completely fit in the file, and the number of
+                      frames depends on the frame-length.
+                      If false, the number of frames depends only on the
+                      frame-shift, and we reflect the data at the ends.
+                      (default = True)
+          center: If true, it puts the center of the frame at t*window_shift,
+                  starting at t=0; it overrides snip_edges, setting it to False.
+          cepstral_lifter: Constant that controls scaling of MFCCs (default = 22)
+          energy_floor: Floor on energy (absolute, not relative) in MFCC computation
+                        (default = 0)
+          raw_energy: If true, compute energy before preemphasis and
+                      windowing (default = True)
+          use_energy: Use energy (not C0) in MFCC computation (default = True)
+
+        """
 
         if audio_feat == FFT:
             return Wav2FFT(
@@ -163,13 +214,13 @@ def create(
 
     @staticmethod
     def filter_args(**kwargs):
-        """Filters MFCC args from arguments dictionary.
+        """Filters feature extractor args from arguments dictionary.
 
         Args:
           kwargs: Arguments dictionary.
 
         Returns:
-          Dictionary with MFCC options.
+          Dictionary with feature extractor options.
         """
         valid_args = (
             "sample_frequency",
             "frame_length",
             "frame_shift",
             "fft_length",
             "remove_dc_offset",
             "preemphasis_coeff",
             "window_type",
             "blackman_coeff",
             "use_fft_mag",
             "dither",
             "fb_type",
             "low_freq",
             "high_freq",
             "num_filters",
             "norm_filters",
             "num_ceps",
             "snip_edges",
-            "energy_floor",
+            "center",
+            "energy_floor",
             "raw_energy",
             "use_energy",
             "cepstral_lifter",
@@ -201,7 +252,7 @@
 
     @staticmethod
     def add_class_args(parser, prefix=None):
-        """Adds MFCC options to parser.
+        """Adds feature extractor options to parser.
 
         Args:
           parser: Arguments parser
@@ -337,6 +388,5 @@
 
     if prefix is not None:
         outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
-        # help='acoustic features options')
 
 add_argparse_args = add_class_args
diff --git a/hyperion/torch/layers/calibrators.py b/hyperion/torch/layers/calibrators.py
index 4b38a858..51d363b8 100644
--- a/hyperion/torch/layers/calibrators.py
+++ b/hyperion/torch/layers/calibrators.py
@@ -8,10 +8,26 @@
 
 class LinBinCalibrator(nn.Module):
+    """Linear score calibrator.
+    Applies a scale and bias to a tensor.
+
+    Attributes:
+      a: Scale
+      b: Bias
+    """
+
     def __init__(self, a, b):
         super().__init__()
         self.a = a
         self.b = b
 
     def forward(self, x):
+        """Applies scale and bias to a tensor.
+
+        Args:
+          x: Input tensor.
+
+        Returns:
+          Calibrated tensor.
+        """
         return self.a * x + self.b
diff --git a/hyperion/torch/layers/dropout.py b/hyperion/torch/layers/dropout.py
index 6765baa5..22bff733 100644
--- a/hyperion/torch/layers/dropout.py
+++ b/hyperion/torch/layers/dropout.py
@@ -10,7 +10,21 @@
 
 class Dropout1d(Dropout2d):
+    """Dropout for tensors with 1d spatial (time) dimension (3d tensors).
+
+    Attributes:
+      p: Drop probability.
+    """
+
     def forward(self, inputs):
+        """Applies dropout 1d.
+
+        Args:
+          inputs: Input tensor with shape = (batch, C, time).
+
+        Returns:
+          Tensor with shape = (batch, C, time).
+        """
         x = torch.unsqueeze(inputs, dim=-2)
         x = F.dropout2d(x, self.p, self.training, self.inplace)
         return torch.squeeze(x, dim=-2)
@@ -24,6 +38,15 @@ def __str__(self):
 
 class DropConnect2d(nn.Module):
+    """DropConnect for tensors with 2d spatial dimensions (4d tensors).
+    It drops the full feature map. It is used to create residual networks
+    with stochastic depth.
+
+    Attributes:
+      p: Probability of dropping the feature map.
+
+    """
+
     def __init__(self, p=0.2):
         super().__init__()
         self.p = p
@@ -36,6 +59,14 @@ def __str__(self):
         return s
 
     def forward(self, inputs):
+        """Applies drop-connect.
+
+        Args:
+          inputs: Input tensor with shape = (batch, C, H, W).
+
+        Returns:
+          Tensor with shape = (batch, C, H, W).
+        """
         if not self.training:
             return inputs
@@ -51,6 +82,15 @@
 
 class DropConnect1d(nn.Module):
+    """DropConnect for tensors with 1d spatial dimensions (3d tensors).
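
# A sketch of why Dropout1d above routes through F.dropout2d: unsqueezing a
# (batch, C, time) tensor to (batch, C, 1, time) makes dropout2d zero out
# whole channels, which is the intended 1d behavior.
import torch
import torch.nn.functional as F

x = torch.randn(2, 4, 10)                                           # (batch, C, time)
y = F.dropout2d(x.unsqueeze(-2), p=0.5, training=True).squeeze(-2)
# entire channels of y are zeroed and survivors are rescaled by 1/(1-p)
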
+    It drops the full feature map. It is used to create residual networks
+    with stochastic depth.
+
+    Attributes:
+      p: Probability of dropping the feature map.
+
+    """
+
     def __init__(self, p=0.2):
         super().__init__()
         self.p = p
@@ -63,6 +103,14 @@ def __str__(self):
         return s
 
     def forward(self, inputs):
+        """Applies drop-connect.
+
+        Args:
+          inputs: Input tensor with shape = (batch, C, time).
+
+        Returns:
+          Tensor with shape = (batch, C, time).
+        """
         if not self.training:
             return inputs
diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py
index 5a2e960c..467ea589 100644
--- a/hyperion/torch/layers/global_pool.py
+++ b/hyperion/torch/layers/global_pool.py
@@ -10,7 +10,12 @@
 import torch.nn as nn
 import torch.nn.functional as nnf
 
+from ..utils.masking import seq_lengths_to_mask
+
 SQRT_EPS = 1e-5
+N_EPS = 1e-6
 
 
 def _conv1(in_channels, out_channels, bias=False):
@@ -19,19 +24,34 @@
 
 class _GlobalPool1d(nn.Module):
+    """Abstract base class for global pooling in 1d.
+
+    Attributes:
+      dim: Pooling dimension
+      keepdim: If True, it keeps the same number of dimensions after pooling
+
+    """
+
     def __init__(self, dim=-1, keepdim=False):
         super().__init__()
         self.dim = dim
         self.keepdim = keepdim
         self.size_multiplier = 1
 
-    def _standarize_weights(self, weights, ndims):
+    def _standardize_weights(self, x, x_lengths=None, weights=None):
+        """standardizes the weights to have the proper shape to be
+        multiplied by the input data.
+        """
+        if weights is None:
+            return seq_lengths_to_mask(
+                x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=self.dim
+            )
 
-        if weights.dim() == ndims:
+        if weights.dim() == x.dim():
             return weights
 
         assert weights.dim() == 2
-        shape = ndims * [1]
+        shape = x.dim() * [1]
         shape[0] = weights.shape[0]
         shape[self.dim] = weights.shape[1]
         return weights.view(tuple(shape))
@@ -68,21 +88,30 @@
 
 class GlobalAvgPool1d(_GlobalPool1d):
     """Global average pooling in 1d
 
     Attributes:
-      dim: pooling dimension
-      keepdim: it True keeps the same number of dimensions after pooling
+      dim: Pooling dimension
+      keepdim: if True, it keeps the same number of dimensions after pooling
 
     """
 
     def __init__(self, dim=-1, keepdim=False):
         super().__init__(dim, keepdim)
 
-    def forward(self, x, weights=None):
+    def forward(self, x, x_lengths=None, weights=None):
+        """Applies pooling to the input.
+
+        Args:
+          x: Input tensor.
+          x_lengths: Lengths of the input sequences in the pooling dimension.
+            x_lengths is only used if weights is not given.
+          weights: Weights for weighted pooling with shape=(batch, max_length)
+            or (batch,..., max_length,...)
with shape matching the one + of the input tensor + """ + weights = self._standardize_weights(x, x_lengths, weights) if weights is None: y = torch.mean(x, dim=self.dim, keepdim=self.keepdim) return y - weights = self._standarize_weights(weights, x.dim()) - xbar = torch.mean(weights * x, dim=self.dim, keepdim=self.keepdim) wbar = torch.mean(weights, dim=self.dim, keepdim=self.keepdim) return xbar / wbar @@ -146,8 +175,8 @@ class GlobalMeanStdPool1d(_GlobalPool1d): """Global mean + standard deviation pooling in 1d Attributes: - dim: pooling dimension - keepdim: it True keeps the same number of dimensions after pooling + dim: Pooling dimension + keepdim: If True, it keeps the same number of dimensions after pooling """ @@ -155,7 +184,18 @@ def __init__(self, dim=-1, keepdim=False): super().__init__(dim, keepdim) self.size_multiplier = 2 - def forward(self, x, weights=None): + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor. + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor + """ + weights = self._standardize_weights(x, x_lengths, weights) if weights is None: mu = torch.mean(x, dim=self.dim, keepdim=True) delta = x - mu @@ -173,7 +213,6 @@ def forward(self, x, weights=None): return mus - weights = self._standarize_weights(weights, x.dim()) xbar = torch.mean(weights * x, dim=self.dim, keepdim=True) wbar = torch.mean(weights, dim=self.dim, keepdim=True) mu = xbar / wbar @@ -342,8 +381,8 @@ class GlobalMeanLogVarPool1d(_GlobalPool1d): """Global mean + log-variance pooling in 1d Attributes: - dim: pooling dimension - keepdim: it True keeps the same number of dimensions after pooling + dim: Pooling dimension + keepdim: If True, it keeps the same number of dimensions after pooling """ @@ -351,15 +390,24 @@ def __init__(self, dim=-1, keepdim=False): super().__init__(dim, keepdim) self.size_multiplier = 2 - def forward(self, x, weights=None): + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor. + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor + """ + weights = self._standardize_weights(x, x_lengths, weights) if weights is None: mu = torch.mean(x, dim=self.dim, keepdim=self.keepdim) x2bar = torch.mean(x ** 2, dim=self.dim, keepdim=self.keepdim) logvar = torch.log(x2bar - mu * mu + 1e-5) # for stability in case var=0 return torch.cat((mu, logvar), dim=-1) - weights = self._standarize_weights(weights, x.dim()) - xbar = torch.mean(weights * x, dim=self.dim, keepdim=self.keepdim) wbar = torch.mean(weights, dim=self.dim, keepdim=self.keepdim) mu = xbar / wbar @@ -371,15 +419,16 @@ def forward(self, x, weights=None): class LDEPool1d(_GlobalPool1d): - """Learnable dictionary encoder pooling in 1d + """Learnable dictionary encoder pooling in 1d. + It only works for 3d tensors. 
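
# A numerical sketch of the weighted statistics used by the pooling layers
# above: with binary weights, mean(w * x) / mean(w) equals the mean over the
# valid frames only.
import torch

x = torch.tensor([[[1.0, 3.0, 99.0]]])   # (batch, feat_dim, time); last frame is padding
w = torch.tensor([[[1.0, 1.0, 0.0]]])

mu = torch.mean(w * x, dim=-1, keepdim=True) / torch.mean(w, dim=-1, keepdim=True)
# mu == 2.0, the mean of the two valid frames
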
    Attributes:
-      in_feats: input feature dimension
-      num_comp: number of cluster components
-      dist_pow: power for distance metric
-      use_bias: use bias parameter when computing posterior responsibility
-      dim: pooling dimension
-      keepdim: it True keeps the same number of dimensions after pooling
+      in_feats: Input feature dimension.
+      num_comp: Number of cluster components.
+      dist_pow: Power for distance metric.
+      use_bias: Use bias parameter when computing posterior responsibility.
+      dim: Pooling dimension.
+      keepdim: if True, it keeps the same number of dimensions after pooling.
 
     """
@@ -426,29 +475,52 @@ def __str__(self):
         )
         return s
 
-    def forward(self, x, weights=None):
+    def _standardize_weights(self, x, x_lengths=None, weights=None):
+        """standardizes the weights to have shape (batch, max_length)."""
+        if weights is None:
+            return seq_lengths_to_mask(x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=1)
+
+        if weights.dim() == x.dim():
+            return weights.transpose(1, self.dim)
+
+        assert weights.dim() == 2
+        return weights
+
+    def forward(self, x, x_lengths=None, weights=None):
+        """Applies pooling to the input.
+
+        Args:
+          x: Input tensor of shape=(batch, time, feat_dim) or (batch, feat_dim, time).
+          x_lengths: Lengths of the input sequences in the pooling dimension.
+            x_lengths is only used if weights is not given.
+          weights: Weights for weighted pooling with shape=(batch, max_length)
+            or (batch,..., max_length,...) with shape matching the one
+            of the input tensor.
+        """
+        weights = self._standardize_weights(x, x_lengths, weights)
         if self.dim != 1 or self.dim != -2:
-            x = x.transpose(1, self.dim)
+            x = x.transpose(1, self.dim)  # (batch, time, feat_dim)
 
-        x = torch.unsqueeze(x, dim=2)
-        delta = x - self.mu
-        dist = self.dist_f(delta)
+        x = torch.unsqueeze(x, dim=2)  # (batch, time, 1, feat_dim)
+        delta = x - self.mu  # (batch, time, num_comp, feat_dim)
+        dist = self.dist_f(delta)  # (batch, time, num_comp)
         llk = -self.prec ** 2 * dist + self.bias
-        r = nnf.softmax(llk, dim=-1)
+        r = nnf.softmax(llk, dim=-1)  # (batch, time, num_comp)
         if weights is not None:
             r *= weights
-        r = torch.unsqueeze(r, dim=-1)
-        N = torch.sum(r, dim=1) + 1e-9
-        F = torch.sum(r * delta, dim=1)
-        pool = F / N
+        r = torch.unsqueeze(r, dim=-1)  # (batch, time, num_comp, 1)
+        N = torch.sum(r, dim=1) + N_EPS  # (batch, num_comp, 1)
+        F = torch.sum(r * delta, dim=1)  # (batch, num_comp, feat_dim)
+        pool = F / N  # (batch, num_comp, feat_dim)
         pool = pool.contiguous().view(-1, self.num_comp * self.in_feats)
+        # (batch, num_comp * feat_dim)
 
         if self.keepdim:
             if self.dim == 1 or self.dim == -2:
-                pool.unsqueeze_(1)
+                pool = pool.unsqueeze(1)
             else:
-                pool.unsqueeze_(-1)
+                pool = pool.unsqueeze(-1)
 
         return pool
@@ -466,6 +538,23 @@ def get_config(self):
 
 class ScaledDotProdAttV1Pool1d(_GlobalPool1d):
+    """Scaled dot product attention pooling in 1d.
+    The attention weights are obtained by scaled inner product
+    between the feature frames and learned parameters contained
+    inside the layer.
+    This class only works on 3d tensors.
+
+    Attributes:
+      in_feats: Input feature dimension.
+      num_heads: Number of attention heads.
+      d_k: Dimension of the keys.
+      d_v: Dimension of the values.
+      bin_attn: If True, use binary attention. Attention values are obtained by
+        applying sigmoid to the dot products instead of softmax.
+      dim: Pooling dimension.
+      keepdim: if True, it keeps the same number of dimensions after pooling.
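
# A shape sketch of the attention pooling described above (sizes assumed):
# a learned query attends over the time axis and the attention-weighted
# values form the pooled embedding.
import torch

batch, time, num_heads, d_k, d_v = 4, 200, 8, 32, 32
k = torch.randn(batch, num_heads, time, d_k)   # keys derived from the input
q = torch.randn(1, num_heads, 1, d_k)          # learned query parameter
v = torch.randn(batch, num_heads, time, d_v)   # values derived from the input

scores = torch.matmul(q, k.transpose(-2, -1)) / d_k ** 0.5   # (4, 8, 1, 200)
attn = torch.softmax(scores, dim=-1)
pooled = torch.matmul(attn, v).view(batch, num_heads * d_v)  # (4, 256)
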
+ """ + def __init__( self, in_feats, num_heads, d_k, d_v, bin_attn=False, dim=-1, keepdim=False ): @@ -505,9 +594,32 @@ def __str__(self): ) return s - def forward(self, x, weights=None): + def _standardize_weights(self, x, x_lengths=None, weights=None): + """standardizes the weights to have shape (batch, max_length).""" + if weights is None: + return seq_lengths_to_mask(x, x.size(self.dim), dtype=x.dtype, time_dim=1) + + if weights.dim() == x.dim(): + return weights.traspose(1, self.dim) + + assert weights.dim() == 2 + return weights + + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor of shape=(batch, time, feat_dim) or (batch, feat_dim, time). + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor. In this implementation only binary weights + are allowed. + """ + weights = self._standardize_weights(x, x_lengths, weights) batch_size = x.size(0) - if self.dim != 1: + if self.dim == 2 or self.dim == -1: x = x.transpose(1, self.dim) k = self.linear_k(x).view(batch_size, -1, self.num_heads, self.d_k) @@ -519,16 +631,20 @@ def forward(self, x, weights=None): self.d_k ) # (batch, head, 1, time) if self.bin_attn: + # use binary attention. scores = torch.sigmoid(scores + self.bias) # scores = scores.squeeze(dim=-1) # (batch, head, time) if weights is not None: - mask = weights.view(batch_size, 1, 1, -1).eq(0) # (batch, 1, 1,time) + mask = weights.view(batch_size, 1, 1, -1).eq(0) # (batch, 1, 1, time) if self.bin_attn: scores = scores.masked_fill(mask, 0.0) self.attn = scores / (torch.sum(scores, dim=-1, keepdim=True) + 1e-9) else: - min_value = -1e200 + if scores.dtype == torch.half: + min_value = -65504 + else: + min_value = -1e200 scores = scores.masked_fill(mask, min_value) self.attn = torch.softmax(scores, dim=-1).masked_fill( mask, 0.0 @@ -541,7 +657,14 @@ def forward(self, x, weights=None): x = torch.matmul(self.attn, v) # (batch, head, 1, d_v) if self.keepdim: - x = x.view(batch_size, 1, self.num_heads * self.d_v) # (batch, 1, d_model) + if self.dim == 1 or self.dim == -2: + x = x.view( + batch_size, 1, self.num_heads * self.d_v + ) # (batch, 1, d_model) + else: + x = x.view( + batch_size, 1, self.num_heads * self.d_v + ) # (batch, d_model, 1) else: x = x.view(batch_size, self.num_heads * self.d_v) # (batch, d_model) return x @@ -560,7 +683,20 @@ def get_config(self): class GlobalChWiseAttMeanStdPool1d(_GlobalPool1d): - """Attentive mean + stddev pooling for each channel""" + """Attentive mean + stddev pooling for each channel. + This class only works on 3d tensors. + + Attributes: + in_feats: Input feature dimension. + inner_feats: Feature dimension in the hidden layer of the content based attention. + bin_attn: If True, use binary attention. Attention values are obtained by applying sigmoid to + the dot products instead of softmax. + use_global_context: If True, concat global stats pooling to the input features to + compute the attention. + norm_layer: Normalization layer object, if None, it used BatchNorm1d. + dim: Pooling dimension. + keepdim: it True, it keeps the same number of dimensions after pooling. 
+ """ def __init__( self, @@ -588,9 +724,9 @@ def __init__( self.norm_layer = norm_layer(inner_feats) self.activation = nn.Tanh() self.conv2 = _conv1(inner_feats, in_feats, bias=True) - self.stats_pool = GlobalMeanStdPool1d(dim=dim) + self.stats_pool = GlobalMeanStdPool1d(dim=-1) if self.bin_attn: - self.bias = nn.Parameter(torch.ones((1, in_feats, 1))) + self.bias = nn.Parameter(torch.zeros((1, in_feats, 1))) def __repr__(self): return self.__str__() @@ -607,23 +743,69 @@ def __str__(self): ) return s - def forward(self, x, weights=None): + def _standardize_weights(self, x, x_lengths=None, weights=None): + """standardizes the weights to have the proper shape to be + multiplied by the input data. + """ + if weights is None: + return seq_lengths_to_mask(x, x.size(self.dim), dtype=x.dtype, time_dim=-1) - x_inner = self.conv1(x) + if weights.dim() == x.dim(): + return weights.transpose(self.dim, -1) + + assert weights.dim() == 2 + shape = x.dim() * [1] + shape[0] = weights.shape[0] + shape[-1] = weights.shape[1] + return weights.view(tuple(shape)) + + def forward(self, x, x_lengths=None, weights=None): + """Applies pooling to the input. + + Args: + x: Input tensor of shape=(batch, time, feat_dim) or (batch, feat_dim, time). + x_lengths: Lengths of the input sequences in the pooling dimension. + x_lengths is only used if weights is not given. + weights: Weights for weighted pooling with shape=(batch, max_length) + or (batch,..., max_length,...) with shape matching the one + of the input tensor. + """ + assert x.dim() == 3, "Input should be a 3d tensor" + if self.dim == 1 or self.dim == -2: + x = x.transpose(1, self.dim) + + # x = (batch, feat_dim, time) + weights = self._standardize_weights(x, x_lengths, weights) # (batch, 1, time) + x_inner = self.conv1(x) # (batch, inner_dim, time) # logging.info('x_inner1={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) if self.use_global_context: - global_mus = self.stats_pool(x) + global_mus = self.stats_pool(x, weights=weights) x_inner = x_inner + self.lin_global(global_mus).unsqueeze(-1) # logging.info('x_inner2={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) - attn = self.conv2(self.activation(self.norm_layer(x_inner))) + attn = self.conv2( + self.activation(self.norm_layer(x_inner)) + ) # (batch, feat_dim, time) if self.bin_attn: - # attn = torch.sigmoid(attn+self.bias) - attn = torch.sigmoid(attn) + attn = torch.sigmoid(attn + self.bias).clamp(min=N_EPS) else: + if weights is not None: + if attn.dtype == torch.half: + min_value = -65504 + else: + min_value = -1e200 + mask = weights.eq(0) + attn = attn.masked_fill(mask, min_value) + attn = nnf.softmax(attn, dim=-1) + if weights is not None: + attn = attn * weights + mus = self.stats_pool(x, weights=attn) # logging.info('mus={} {}'.format(torch.sum(torch.isnan(mus)), torch.sum(torch.isinf(mus)))) + if self.keepdim: + mus = mus.unsqueeze(self.dim) + return mus def get_config(self): diff --git a/hyperion/torch/layers/interpolate.py b/hyperion/torch/layers/interpolate.py index fa76fd2a..94b3d2ed 100644 --- a/hyperion/torch/layers/interpolate.py +++ b/hyperion/torch/layers/interpolate.py @@ -9,6 +9,13 @@ class Interpolate(nn.Module): + """Interpolation class. + + Attributes: + scale_factor: upsampling scale factor. + mode: algorithm used for upsampling: 'nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'. 
+ """ + def __init__(self, scale_factor, mode="nearest"): super().__init__() self.interp = nnf.interpolate @@ -24,5 +31,13 @@ def __repr__(self): return s def forward(self, x): + """Interpolates the input. + + Args: + x: input tensor. + + Returns: + Interpolated tensor. + """ x = self.interp(x, scale_factor=self.scale_factor, mode=self.mode) return x diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index 36fd2a5f..5ae2b518 100644 --- a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -20,6 +20,17 @@ def _l2_norm(x, axis=-1): class ArcLossOutput(nn.Module): + """Additive angular margin softmax (ArcFace) output layer. + + Attributes: + in_feats: input feature dimension. + num_classes: number of output classes. + cos_scale: cosine scale. + margin: angular margin. + margin_warmup_epochs: number of epochs to warm up the margin from 0 to + its final value. + """ + def __init__( self, in_feats, num_classes, cos_scale=64, margin=0.3, margin_warmup_epochs=0 ): @@ -59,6 +70,11 @@ def _compute_aux(self): self.sin_m = math.sin(self.cur_margin) def update_margin(self, epoch): + """Updates the value of the margin. + + Args: + epoch: value of current epoch. + """ if self.margin_warmup_epochs == 0: return @@ -73,6 +89,16 @@ def update_margin(self, epoch): self._compute_aux() def forward(self, x, y=None): + """Computes penalized logits. + + Args: + x: input feature tensor with shape = (batch, in_feats). + y: ground truth classes. This is required to penalize the logit of + the true class at training time. + + Returns: + Logit tensor with shape = (batch, num_classes) + """ with amp.autocast(enabled=False): s = self.cos_scale batch_size = len(x) @@ -98,6 +124,17 @@ def forward(self, x, y=None): class CosLossOutput(nn.Module): + """Additive margin softmax (CosFace) output layer. + + Attributes: + in_feats: input feature dimension. + num_classes: number of output classes. + cos_scale: cosine scale. + margin: angular margin. + margin_warmup_epochs: number of epochs to warm up the margin from 0 to + its final value. + """ + def __init__( self, in_feats, num_classes, cos_scale=64, margin=0.3, margin_warmup_epochs=0 ): @@ -116,6 +153,11 @@ def __init__( self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5) def update_margin(self, epoch): + """Updates the value of the margin. + + Args: + epoch: value of current epoch. + """ if self.margin_warmup_epochs == 0: return @@ -130,6 +172,16 @@ def update_margin(self, epoch): return def forward(self, x, y=None): + """Computes penalized logits. + + Args: + x: input feature tensor with shape = (batch, in_feats). + y: ground truth classes. This is required to penalize the logit of + the true class at training time. + + Returns: + Logit tensor with shape = (batch, num_classes) + """ with amp.autocast(enabled=False): s = self.cos_scale x = _l2_norm(x.float()) @@ -152,6 +204,18 @@ def forward(self, x, y=None): class SubCenterArcLossOutput(ArcLossOutput): + """Sub-Center Additive angular margin softmax (ArcFace) output layer. + + Attributes: + in_feats: input feature dimension. + num_classes: number of output classes. + num_subcenters: number of subcenters. + cos_scale: cosine scale. + margin: angular margin. + margin_warmup_epochs: number of epochs to warm up the margin from 0 to + its final value. + """ + def __init__( self, in_feats, @@ -184,6 +248,16 @@ def __str__(self): return s def forward(self, x, y=None): + """Computes penalized logits. 
+
+        Args:
+          x: input feature tensor with shape = (batch, in_feats).
+          y: ground truth classes. This is required to penalize the logit of
+            the true class at training time.
+
+        Returns:
+          Logit tensor with shape = (batch, num_classes).
+        """
         with amp.autocast(enabled=False):
             s = self.cos_scale
             batch_size = len(x)
diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py
index 3ee1e121..4f569089 100644
--- a/hyperion/torch/layers/mvn.py
+++ b/hyperion/torch/layers/mvn.py
@@ -9,11 +9,22 @@


 class MeanVarianceNorm(nn.Module):
+    """Class to apply short-time mean-variance normalization to features.
+
+    Attributes:
+      norm_mean: if True, it normalizes the mean.
+      norm_var: if True, it also normalizes the variance.
+      left_context: left context for the window that computes the normalization stats.
+      right_context: right context for the window that computes the normalization stats.
+      dim: normalization dimension (time dimension).
+
+    If left_context = right_context = 0, it computes the stats on the whole utterance.
+    """

     def __init__(
         self, norm_mean=True, norm_var=False, left_context=0, right_context=0, dim=1
     ):
-        super(MeanVarianceNorm, self).__init__()
+        super().__init__()
         self.norm_mean = norm_mean
         self.norm_var = norm_var
         self.left_context = left_context
@@ -35,6 +46,14 @@ def __str__(self):
         return s

     def forward(self, x):
+        """Short-time mean-var normalizes feature tensor.
+
+        Args:
+          x: feature tensor.
+
+        Returns:
+          Normalized feature tensor.
+        """
         T = x.shape[self.dim]

         if (self.left_context == 0 and self.right_context == 0) or (
@@ -45,6 +64,7 @@
         return self.normalize_cumsum(x)

     def normalize_global(self, x):
+        """Applies global mean-var normalization."""
         # Global mean/var norm.
         if self.norm_mean:
             m_x = torch.mean(x, dim=self.dim, keepdim=True)
@@ -57,7 +77,7 @@
         return x

     def normalize_cumsum(self, x):
-
+        """Applies short-time mean-var normalization using cumulative sums."""
         if self.norm_mean:
             # substract first global mean
             # it will help cumsum numerical stability
@@ -99,13 +119,13 @@

     @staticmethod
     def filter_args(**kwargs):
-        """Filters ST-CMVN args from arguments dictionary.
+        """Filters ST-MVN args from arguments dictionary.

         Args:
           kwargs: Arguments dictionary.

         Returns:
-          Dictionary with ST-CMVN options.
+          Dictionary with ST-MVN options.
         """

         valid_args = (
diff --git a/hyperion/torch/layers/norm_layer_factory.py b/hyperion/torch/layers/norm_layer_factory.py
index cd7e542f..8543b31b 100644
--- a/hyperion/torch/layers/norm_layer_factory.py
+++ b/hyperion/torch/layers/norm_layer_factory.py
@@ -7,6 +7,10 @@


 class NormLayer2dFactory(object):
+    """Factory class to create normalization layers for
+    tensors with 2D spatial dimension.
+    """
+
     @staticmethod
     def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5):
         """Creates a layer-norm callabe constructor
@@ -54,6 +58,10 @@


 class NormLayer1dFactory(object):
+    """Factory class to create normalization layers for
+    tensors with 1D spatial (time) dimension.
+ """ + @staticmethod def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): """Creates a layer-norm callabe constructor diff --git a/hyperion/torch/layers/pdf_storage.py b/hyperion/torch/layers/pdf_storage.py index bac48d27..f3f34b37 100644 --- a/hyperion/torch/layers/pdf_storage.py +++ b/hyperion/torch/layers/pdf_storage.py @@ -10,18 +10,22 @@ class StdNormal(nn.Module): - """Storage for Standard Normal distribution""" + """Storage for Standard Normal distribution parameters + + Attributes: + shape: shape of the location/scale tensors. + """ def __init__(self, shape): super().__init__() self.register_buffer("loc", torch.zeros(shape)) self.register_buffer("scale", torch.ones(shape)) - # self.loc = nn.Parameter(torch.zeros(shape), requires_grad=False) - # self.scale = nn.Parameter(torch.ones(shape), requires_grad=False) @property def pdf(self): + """Probability density function for N(0,I).""" return pdf.normal.Normal(self.loc, self.scale) def forward(self): + """Probability density function for N(0,I).""" return self.pdf diff --git a/hyperion/torch/layers/pool_factory.py b/hyperion/torch/layers/pool_factory.py index 41cf2ac2..fa1032a8 100644 --- a/hyperion/torch/layers/pool_factory.py +++ b/hyperion/torch/layers/pool_factory.py @@ -9,6 +9,8 @@ class GlobalPool1dFactory(object): + """Factory class to create global pooling layers 1d.""" + @staticmethod def create( pool_type, @@ -27,6 +29,28 @@ def create( keepdim=False, **kwargs ): + """Creates a global pooling layer from arguments. + + Args: + pool_type: pooling type in ["avg", "mean+stddev", "mean+logvar", "lde", + "scaled-dot-prod-att-v1", "ch-wise-att-mean+stddev"] + in_feats: input feature dimension. + inner_feats: feature dimension in the hidden layer of the content based attention, + in channel-wise attention. + num_comp: number of LDE components. + dist_power: distance type in LDE in L1 or L2. + use_bias: use bias in LDE. + num_heads: number of attention heads. + d_k: dimension of the keys in scaled dot product attn. + d_v: dimension of the values in scaled dot product attn. + bin_attn: it True, use binary attention. Attention values are obtained by applying sigmoid to + the dot products instead of softmax. + use_global_context: if True, concat global stats pooling to the input features to + compute the attention in channel-wise attention. + norm_layer: normalization layer object, if None, it used BatchNorm1d. + dim: pooling dimension. + keepdim: it True keeps the same number of dimensions after pooling. + """ if pool_type == "avg": return GlobalAvgPool1d(dim=dim, keepdim=keepdim) @@ -71,6 +95,14 @@ def create( @staticmethod def filter_args(**kwargs): + """Filters the arguments corresponding to the creation of a pooling layer. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with the pooling layer options. + """ if "wo_bias" in kwargs: kwargs["use_bias"] = not kwargs["wo_bias"] diff --git a/hyperion/torch/layers/spec_augment.py b/hyperion/torch/layers/spec_augment.py index ecb3609f..1366172b 100644 --- a/hyperion/torch/layers/spec_augment.py +++ b/hyperion/torch/layers/spec_augment.py @@ -17,8 +17,10 @@ class AxisMasker(nn.Module): Implementation based on espnet. Attributes: - mask_width_range: range for the width of the masks - mask_num_range: range for the number of masks + min_width: minimum width of the mask. + max_width: maximum width of the mask. + min_num_mask: minimum number of masks. + max_num_mask: maximum number of masks. 
       dim: axis where we apply the mask
       fill_value: masking value
     """
@@ -121,7 +123,9 @@ class SpecWarper(nn.Module):
     Implementation based on espnet.

     Attributes:
-      window: time warp parameter
+      window: time warp parameter.
+      mode: interpolation mode in ["nearest", "linear", "bilinear"].
+      dim: warping dimension.
     """

     def __init__(self, window=80, mode="bicubic", dim=-2):
@@ -136,14 +140,14 @@ def __repr__(self):
         )
         return s

-    def forward(self, x, lengths=None):
+    def forward(self, x, x_lengths=None):
         """warps x along time or freq dimension
         Args:
-          x: spectrogram (batch, *, time, freq)
-          lengths: length ratios
+          x: spectrogram shape=(batch, *, time, freq)
+          x_lengths: time lengths of the sequences.
         Returns:
-          warped spectrogram (batch, *, time, freq)
+          warped spectrogram shape=(batch, *, time, freq)
         """
         if not self.training:
             return x
@@ -166,10 +170,10 @@
         # the first n frames where n is the length of the
         # shortest utterance
         # the end of the utterance will not be warped
-        if dim == -1 or lengths is None:
+        if dim == -1 or x_lengths is None:
             warp_length = x.shape[-2]
         else:
-            warp_length = int(x.shape[-2] * torch.min(lengths))
+            warp_length = int(x.shape[-2] * torch.min(x_lengths))

         center = torch.randint(self.window, warp_length - self.window, (1,))[0]
         warped = torch.randint(center - self.window, center + self.window, (1,))[0] + 1
@@ -208,6 +212,20 @@ class SpecAugment(nn.Module):
     Augmentation Method for Automatic Speech Recognition"

     Attributes:
+      time_warp_prob: probability of applying time warping.
+      time_warp_window: time warp parameter.
+      time_warp_mode: interpolation mode in ["nearest", "linear", "bilinear"].
+      time_mask_prob: probability of applying masking in time.
+      time_min_width: minimum width of the time mask.
+      time_max_width: maximum width of the time mask.
+      time_min_num_mask: minimum number of time masks.
+      time_max_num_mask: maximum number of time masks.
+      freq_mask_prob: probability of applying frequency masking.
+      freq_min_width: minimum width of the frequency mask.
+      freq_max_width: maximum width of the frequency mask.
+      freq_min_num_mask: minimum number of frequency masks.
+      freq_max_num_mask: maximum number of frequency masks.
+      fill_value: masking value.
     """

     def __init__(
@@ -287,7 +305,14 @@ def __repr__(self):
         )
         return s

-    def forward(self, x, lengths=None):
+    def forward(self, x, x_lengths=None):
+        """Applies spec augment to the input.
+
+        Args:
+          x: spectrogram with shape = (batch, time, freq)
+          x_lengths: time lengths of the sequences.
+
+        Returns:
+          Augmented spectrogram with shape = (batch, time, freq)
+        """
         if not self.training:
             return x
         # global count
@@ -300,7 +325,7 @@
         # ax.imshow(x.cpu().numpy()[0].T)
         r = torch.rand((3,), device=x.device)
         if self.time_warp_prob > r[0]:
-            x = self.time_warper(x, lengths)
+            x = self.time_warper(x, x_lengths)

         # ax = plt.subplot(222)
         # ax.imshow(x.cpu().numpy()[0].T)
@@ -319,6 +344,7 @@
         # count += 1
         return x

+    @staticmethod
     def filter_args(**kwargs):
         """Filters SpecAugment args from arguments dictionary.
diff --git a/hyperion/torch/layers/subpixel_convs.py b/hyperion/torch/layers/subpixel_convs.py
index 6b529aff..19c0283f 100644
--- a/hyperion/torch/layers/subpixel_convs.py
+++ b/hyperion/torch/layers/subpixel_convs.py
@@ -9,6 +9,22 @@


 class SubPixelConv1d(nn.Module):
+    """Implements a SubPixel Convolution in 1d proposed in:
+    https://arxiv.org/abs/1609.05158
+
+    Attributes:
+      in_channels: Number of input channels.
+      out_channels: Number of output channels.
+      kernel_size: Kernel size.
+      stride: Downsampling stride.
+      padding: Int or Int Tuple with the number of left/right padding samples.
+      dilation: Kernel dilation.
+      groups: Number of groups in the convolution.
+      bias: If True, the convolution has bias.
+      padding_mode: Padding mode in ['zeros', 'reflect', 'replicate' or 'circular'].
+
+    """
+
     def __init__(
         self,
         in_channels,
@@ -38,6 +54,14 @@
         self.stride = stride

     def forward(self, x):
+        """Applies subpixel convolution 1d.
+
+        Args:
+          x: Input tensor with shape = (batch, in_channels, in_time)
+
+        Returns:
+          Output tensor with shape = (batch, out_channels, out_time)
+        """
         x = self.conv(x)
         if self.stride == 1:
             return x
@@ -51,6 +75,22 @@


 class SubPixelConv2d(nn.Module):
+    """Implements a SubPixel Convolution in 2d proposed in:
+    https://arxiv.org/abs/1609.05158
+
+    Attributes:
+      in_channels: Number of input channels.
+      out_channels: Number of output channels.
+      kernel_size: Kernel size.
+      stride: Downsampling stride.
+      padding: Int or Int Tuple with the number of left/right padding samples.
+      dilation: Kernel dilation.
+      groups: Number of groups in the convolution.
+      bias: If True, the convolution has bias.
+      padding_mode: Padding mode in ['zeros', 'reflect', 'replicate' or 'circular'].
+
+    """
+
     def __init__(
         self,
         in_channels,
@@ -81,6 +121,14 @@
         self.pixel_shuffle = nn.PixelShuffle(self.stride)

     def forward(self, x):
+        """Applies subpixel convolution 2d.
+
+        Args:
+          x: Input tensor with shape = (batch, in_channels, in_W, in_H)
+
+        Returns:
+          Output tensor with shape = (batch, out_channels, out_W, out_H)
+        """
         x = self.conv(x)
         if self.stride == 1:
             return x
diff --git a/hyperion/torch/layers/swish.py b/hyperion/torch/layers/swish.py
index 520a71fb..a313455e 100644
--- a/hyperion/torch/layers/swish.py
+++ b/hyperion/torch/layers/swish.py
@@ -7,6 +7,8 @@


 class SwishImplementation(torch.autograd.Function):
+    """Implementation for Swish activation function."""
+
     @staticmethod
     def forward(ctx, i):
         result = i * torch.sigmoid(i)
@@ -21,6 +23,10 @@ def backward(ctx, grad_output):


 class Swish(nn.Module):
+    """Swish activation class:
+    y = x * sigmoid(x)
+    """
+
     def forward(self, x):
         return SwishImplementation.apply(x)
diff --git a/hyperion/torch/layers/tensor2pdf.py b/hyperion/torch/layers/tensor2pdf.py
index e38b1bc7..55c890a3 100644
--- a/hyperion/torch/layers/tensor2pdf.py
+++ b/hyperion/torch/layers/tensor2pdf.py
@@ -13,6 +13,12 @@

 class Tensor2PDF(nn.Module):
     """Base class for layers that create a prob distribution from
     an input tensor
+
+    Attributes:
+      pdf_feats: Feature dimension of the probability distribution.
+      project: If True, it applies a projection to the input tensor.
+      in_feats: Feature dimension of the input tensor.
+      in_dim: Number of dimensions of the input tensor.
     """

     def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None):
@@ -44,7 +50,14 @@ def _make_proj(self, in_feats, out_feats, ndims):


 class Tensor2NormalICov(Tensor2PDF):
-    """Transforms a Tensor into Normal distribution with identitiy variance"""
+    """Transforms a Tensor into Normal distribution with identity variance.
+
+    Attributes:
+      pdf_feats: Feature dimension of the probability distribution.
+      project: If True, it applies a projection to the input tensor.
+      in_feats: Feature dimension of the input tensor.
+      in_dim: Number of dimensions of the input tensor.
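+
+    Example:
+        An illustrative usage sketch (values are assumptions) for a 2d input:
+
+            >>> t2pdf = Tensor2NormalICov(pdf_feats=64, in_feats=256, in_dim=2)
+            >>> pdf = t2pdf(torch.randn(8, 256))
+            >>> pdf.mean.shape
+            torch.Size([8, 64])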
+ """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): super().__init__(pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) @@ -53,6 +66,16 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._proj = self._make_proj(self.in_feats, self.pdf_feats, self.in_dim) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + prior: Not used. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -70,6 +93,12 @@ class Tensor2NormalGlobDiagCov(Tensor2PDF): Input tensor will be the mean of the distribution and the standard deviation is a global trainable parameter. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -85,6 +114,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self.logvar = nn.Parameter(torch.zeros(pdf_shape)) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -108,6 +149,12 @@ class Tensor2NormalDiagCov(Tensor2PDF): Applies two linear transformation to the tensors to obtain the mean and the log-variance. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -117,6 +164,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._proj = self._make_proj(self.in_feats, self.pdf_feats * 2, self.in_dim) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -138,7 +197,13 @@ def forward(self, inputs, prior=None, squeeze_dim=None): class Tensor2BayNormalICovGivenNormalPrior(Tensor2PDF): """Transforms a Tensor into Normal distribution with identitiy variance - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. 
""" def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -151,6 +216,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._alpha = nn.Parameter(torch.zeros(1)) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -173,7 +250,13 @@ class Tensor2BayNormalGlobDiagCovGivenNormalPrior(Tensor2PDF): Input tensor will be the ML mean of the distribution and the ML standard deviation is a global trainable parameter. - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -193,6 +276,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._beta = nn.Parameter(torch.zeros(1)) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) @@ -231,7 +326,13 @@ class Tensor2BayNormalDiagCovGivenNormalPrior(Tensor2PDF): Applies two linear transformation to the tensors to obtain the maximum likelihood mean and the log-variance. - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation. + + Attributes: + pdf_feats: Feature dimension of the probability distribution. + project: If True, it applies a projection to the input tensor. + in_feats: Feature dimension of the input tensor. + in_dim: Number of dimensions of the input tensor. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): @@ -245,6 +346,18 @@ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): self._beta = nn.Parameter(torch.zeros(1)) def forward(self, inputs, prior=None, squeeze_dim=None): + """Creates a Normal distribution from input tensor. + + Args: + inputs: Input tensor. + Args: + inputs: Input tensor. + prior: prior pdf object. + squeeze_dim: Squeezes pdf parameters dimensions. + + Returns: + torch.distributions.normal.Normal object. + """ if self.project: inputs = self._proj(inputs) diff --git a/hyperion/torch/layers/vq.py b/hyperion/torch/layers/vq.py index 98307438..c56b58f6 100644 --- a/hyperion/torch/layers/vq.py +++ b/hyperion/torch/layers/vq.py @@ -9,8 +9,20 @@ import torch.nn.functional as F import torch.distributed as dist +from ..utils import seq_lengths_to_mask + class VectorQuantizer(nn.Module): + """Abstract base class for vector quantization layers. + + Attributes: + num_embed: codebook size. + embed_feats: feature dimension of the codebook vectors. + project: if True, it projects the input features to the embed_feats dim. 
+      in_feats: input feature dimension, needed when project=True.
+      in_dim: number of dimensions of the input tensor in [2,5], needed when project=True.
+    """
+
     def __init__(
         self, num_embed, embed_feats, project=True, in_feats=None, in_dim=None
     ):
@@ -43,6 +55,7 @@ def __repr__(self):
         return self.__str__()

     def _make_proj(self, in_feats, out_feats, ndims):
+        """Creates the feature projection layer."""
         if ndims == 2:
             return nn.Linear(in_feats, out_feats)
         elif ndims == 3:
@@ -56,6 +69,18 @@


 class KMeansVectorQuantizer(VectorQuantizer):
+    """Class for K-Means vector quantization layers,
+    where codebook vectors are trained by gradient descent losses.
+
+    Attributes:
+      num_embed: codebook size.
+      embed_feats: feature dimension of the codebook vectors.
+      commitment_cost: weight for loss that makes input features close to the codebook vectors.
+      project: if True, it projects the input features to the embed_feats dim.
+      in_feats: input feature dimension, needed when project=True.
+      in_dim: number of dimensions of the input tensor in [2,5], needed when project=True.
+    """
+
     def __init__(
         self,
         num_embed,
         embed_feats,
@@ -95,11 +120,33 @@ def __str__(self):
         )
         return s

-    def forward(self, inputs, return_r=False):
+    def forward(self, inputs, lengths=None, mask=None, return_r=False):
+        """Quantizes the input tensor.
+
+        Args:
+          inputs: input tensor 2d - 5d dimension with shape (batch, channels, ...)
+          lengths: when inputs is 3d, it is the length of each sequence in the batch.
+            Not used if mask is given.
+          mask: indicates which elements are valid to quantize. The elements with zero
+            mask are set to 0. The mask tensor should have the same shape as the
+            input tensor with the channel dimension removed, shape=(batch, ...).
+          return_r: if True, it returns the responsibilities.
+
+        Returns:
+          Dictionary containing quantized vectors, vq_loss, KL(q(z)||p(z)), where q(z) is
+          the distribution of posterior responsibilities and p(z) is a uniform categorical
+          distribution, and the log_perplexity of the responsibilities. If return_r is True,
+          it also returns the responsibilities.
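+
+        Example:
+            An illustrative usage sketch (argument values are assumptions)
+            with a 3d input of shape=(batch, channels, time):
+
+                >>> vq = KMeansVectorQuantizer(num_embed=512, embed_feats=64, in_feats=256, in_dim=3)
+                >>> out = vq(torch.randn(8, 256, 100))
+                >>> out["z_q"].shape  # quantized tensor (batch, embed_feats, time)
+                torch.Size([8, 64, 100])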
+ """ # inputs -> z_e in paper if self.project: inputs = self._proj(inputs) + if mask is None and lengths is not None: + mask = seq_lengths_to_mask( + lengths, inputs.size(-1), time_dim=1, dtype=inputs.dtype + ) + # convert inputs from BCHW -> BHWC inputs = inputs.transpose(1, -1).contiguous() input_shape = inputs.shape @@ -112,26 +159,37 @@ def forward(self, inputs, return_r=False): torch.sum(flat_inputs ** 2, dim=1, keepdim=True) + torch.sum(self.embed ** 2, dim=1) - 2 * torch.matmul(flat_inputs, self.embed.t()) - ) + ) # (batch x time, num_embeds) # Encoding # quantization integer indexes - q_idx = torch.argmin(d2, dim=1).unsqueeze(1) + q_idx = torch.argmin(d2, dim=1).unsqueeze(1) # (batch x time, 1) # 1 hot responsibilities r = torch.zeros(q_idx.shape[0], self.num_embed, device=inputs.device) - r.scatter_(1, q_idx, 1) - z_q = torch.matmul(r, self.embed).view(input_shape) + r.scatter_(1, q_idx, 1) # (batch x time, num_embeds) + z_q = torch.matmul(r, self.embed).view(input_shape) # (batch, time, embed_dim) + + if mask is not None: + z_q = z_q * mask + inputs = inputs * mask # Loss - vq_loss = F.mse_loss(z_q, inputs.detach()) - commitment_loss = F.mse_loss(z_q.detach(), inputs) + vq_loss = F.mse_loss(z_q, inputs.detach()) # || z_q - sg(z) ||_2 + commitment_loss = F.mse_loss(z_q.detach(), inputs) # || z - sg (z_q) ||_2 + loss = vq_loss + self.commitment_cost * commitment_loss + if mask is not None: + loss /= torch.mean(mask) # this allows to backprogate the gradients as if the output were equal to z_e z_q = inputs + (z_q - inputs).detach() # compute the perplexity - probs = torch.mean(r, dim=0) + if mask is None: + probs = torch.mean(r, dim=0) + else: + probs = torch.mean(r[mask.flatten()], dim=0) + log_perplexity = -torch.sum(probs * torch.log(probs + 1e-10)) # compute KL divergence between r and uniform categorical prior @@ -147,7 +205,7 @@ def forward(self, inputs, return_r=False): ) # convert quantized from BHWC -> BCHW - z_q = z_q.transpose(1, -1).contiguous() + z_q = z_q.transpose(1, -1).contiguous() # (batch, embed_dim, time) output = { "z_q": z_q, "loss": loss, @@ -162,6 +220,20 @@ def forward(self, inputs, return_r=False): class MultiKMeansVectorQuantizer(VectorQuantizer): + """Class for Mulit-group K-Means vector quantization layers, + where codebook vectors are trained by gradient descend losses. + The input tensors are divided into groups and quantized separately. + + Attributes: + num_groups: number of codebooks. + num_embed: codebook size. + embed_feats: feature dimension of the codebook vectors. + commitment_cost: weight for loss that makes input features close to the codebook vectors. + project: if True, it projects the input features to the embed_feats dim. + in_feats: input feature dimension, needed when project=True. + in_dim: number of dimensions of the input tensor in [2,5], needed when project=True + """ + def __init__( self, num_groups, @@ -212,15 +284,37 @@ def __str__(self): ) return s - def forward(self, inputs, return_r=False): + def forward(self, inputs, lengths=None, mask=None, return_r=False): + """Quantizes the input tensor. + + Args: + input: input tensor 2d - 5d dimension with shape (batch, channels, ...) + lengths: when inputs is 3d, it the length of each sequence in the batch. + Not used if mask is given. + mask: indicates which elements are valid, to quantize. The elements with zero + mask are set to 0. The mask tensor should have the same shape as the + input tensor with the channel dimension removed, shape=(batch, ...). 
+          return_r: if True, it returns the responsibilities.
+
+        Returns:
+          Dictionary containing quantized vectors, vq_loss, KL(q(z)||p(z)), where q(z) is
+          the distribution of posterior responsibilities and p(z) is a uniform categorical
+          distribution, and the log_perplexity of the responsibilities. If return_r is True,
+          it also returns the responsibilities.
+        """
         if self.project:
             inputs = self._proj(inputs)

+        if mask is None and lengths is not None:
+            mask = seq_lengths_to_mask(
+                lengths, inputs.size(-1), time_dim=1, dtype=inputs.dtype
+            )
+
         inputs = inputs.chunk(self.num_groups, dim=1)
         z_q = []
         r = []
         for i in range(self.num_groups):
-            output_i = self.vq_layers[i](inputs[i], return_r=return_r)
+            output_i = self.vq_layers[i](inputs[i], mask=mask, return_r=return_r)
             z_qi = output_i["z_q"]
             loss_i = output_i["loss"]
             kldiv_ri = output_i["kldiv_qrpr"]
@@ -255,6 +349,19 @@


 class EMAKMeansVectorQuantizer(VectorQuantizer):
+    """Class for exponential moving average vector quantization layers.
+
+    Attributes:
+      num_embed: codebook size.
+      embed_feats: feature dimension of the codebook vectors.
+      commitment_cost: weight for loss that makes input features close to the codebook vectors.
+      gamma: exponential average coefficient.
+      eps: epsilon for Laplace smoothing of the counts.
+      project: if True, it projects the input features to the embed_feats dim.
+      in_feats: input feature dimension, needed when project=True.
+      in_dim: number of dimensions of the input tensor in [2,5], needed when project=True.
+    """
+
     def __init__(
         self,
         num_embed,
@@ -302,11 +409,34 @@ def __str__(self):
         )
         return s

-    def forward(self, inputs, return_r=False):
+    def forward(self, inputs, lengths=None, mask=None, return_r=False):
+        """Quantizes the input tensor. In training phase, it also
+        updates the codebooks by EMA.
+
+        Args:
+          inputs: input tensor 2d - 5d dimension with shape (batch, channels, ...)
+          lengths: when inputs is 3d, it is the length of each sequence in the batch.
+            Not used if mask is given.
+          mask: indicates which elements are valid to quantize. The elements with zero
+            mask are set to 0. The mask tensor should have the same shape as the
+            input tensor with the channel dimension removed, shape=(batch, ...).
+          return_r: if True, it returns the responsibilities.
+
+        Returns:
+          Dictionary containing quantized vectors, vq_loss, KL(q(z)||p(z)), where q(z) is
+          the distribution of posterior responsibilities and p(z) is a uniform categorical
+          distribution, and the log_perplexity of the responsibilities. If return_r is True,
+          it also returns the responsibilities.
+ """ # inputs -> z_e in paper if self.project: inputs = self._proj(inputs) + if mask is None and lengths is not None: + mask = seq_lengths_to_mask( + lengths, inputs.size(-1), time_dim=1, dtype=inputs.dtype + ) + # convert inputs from BCHW -> BHWC inputs = inputs.transpose(1, -1).contiguous() input_shape = inputs.shape @@ -331,9 +461,15 @@ def forward(self, inputs, return_r=False): # Use Exponetial Moving Average (EMA) to update the embedding vectors if self.training: + if mask is not None: + flat_mask = mask.flatten() + r = r[flat_mask] + flat_inputs = flat_inputs[flat_mask] + N = torch.sum(r, dim=0) # required to sync gpus in DDP - dist.all_reduce(N, op=dist.ReduceOp.SUM) + if dist.is_initialized(): + dist.all_reduce(N, op=dist.ReduceOp.SUM) ema_N = self._ema_N * self.gamma + (1 - self.gamma) * N @@ -345,21 +481,31 @@ def forward(self, inputs, return_r=False): z_acc = torch.matmul(r.t(), flat_inputs) # required to sync gpus in DDP - dist.all_reduce(z_acc, op=dist.ReduceOp.SUM) + if dist.is_initialized(): + dist.all_reduce(z_acc, op=dist.ReduceOp.SUM) self._ema_z_acc = ( self.gamma * self._ema_z_acc + (1 - self.gamma) * z_acc ).detach() self.embed = (self._ema_z_acc / self._ema_N.unsqueeze(1)).detach() + if mask is not None: + z_q = z_q * mask + inputs = inputs * mask # Loss commitment_loss = F.mse_loss(z_q.detach(), inputs) loss = self.commitment_cost * commitment_loss + if mask is not None: + loss /= torch.mean(mask) # this allows to backprogate the gradients as if the output were equal to z_e z_q = inputs + (z_q - inputs).detach() # compute the perplexity - probs = torch.mean(r, dim=0) + if mask is None: + probs = torch.mean(r, dim=0) + else: + probs = torch.mean(r[mask.flatten()], dim=0) + log_perplexity = -torch.sum(probs * torch.log(probs + 1e-10)) # compute KL divergence between r and uniform categorical prior @@ -390,6 +536,22 @@ def forward(self, inputs, return_r=False): class MultiEMAKMeansVectorQuantizer(VectorQuantizer): + """Class for Mulit-group exponential moving average vector quantization layers, + where codebook vectors are trained by gradient descend losses. + The input tensors are divided into groups and quantized separately. + + Attributes: + num_groups: number of codebooks. + num_embed: codebook size. + embed_feats: feature dimension of the codebook vectors. + commitment_cost: weight for loss that makes input features close to the codebook vectors. + gamma: exponential average coefficient. + eps: epsilon for Laplace smoothing of the counts. + project: if True, it projects the input features to the embed_feats dim. + in_feats: input feature dimension, needed when project=True. + in_dim: number of dimensions of the input tensor in [2,5], needed when project=True + """ + def __init__( self, num_groups, @@ -452,15 +614,37 @@ def __str__(self): ) return s - def forward(self, inputs, return_r=False): + def forward(self, inputs, lengths=None, mask=None, return_r=False): + """Quantizes the input tensor. + + Args: + input: input tensor 2d - 5d dimension with shape=(batch, channels, ...) + lengths: when inputs is 3d, it the length of each sequence in the batch. + Not used if mask is given. + mask: indicates which elements are valid, to quantize. The elements with zero + mask are set to 0. The mask tensor should have the same shape as the + input tensor with the channel dimension removed, shape=(batch, ...). + return_r: it True, it returns the responsibilities. 
+
+        Returns:
+          Dictionary containing quantized vectors, vq_loss, KL(q(z)||p(z)), where q(z) is
+          the distribution of posterior responsibilities and p(z) is a uniform categorical
+          distribution, and the log_perplexity of the responsibilities. If return_r is True,
+          it also returns the responsibilities.
+        """
         if self.project:
             inputs = self._proj(inputs)

+        if mask is None and lengths is not None:
+            mask = seq_lengths_to_mask(
+                lengths, inputs.size(-1), time_dim=1, dtype=inputs.dtype
+            )
+
         inputs = inputs.chunk(self.num_groups, dim=1)
         z_q = []
         r = []
         for i in range(self.num_groups):
-            output_i = self.vq_layers[i](inputs[i])
+            output_i = self.vq_layers[i](inputs[i], mask=mask)
             z_qi = output_i["z_q"]
             loss_i = output_i["loss"]
             kldiv_ri = output_i["kldiv_qrpr"]
diff --git a/hyperion/torch/models/wav2xvectors/__init__.py b/hyperion/torch/models/wav2xvectors/__init__.py
new file mode 100644
index 00000000..d1e65dd0
--- /dev/null
+++ b/hyperion/torch/models/wav2xvectors/__init__.py
@@ -0,0 +1,13 @@
+"""
+ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+
+# from .wav2tdnn_xvector import Wav2TDNNXVector
+from .wav2resnet_xvector import Wav2ResNetXVector
+
+# from .wav2efficient_net_xvector import Wav2EfficientNetXVector
+# from .wav2transformer_xvector_v1 import Wav2TransformerXVectorV1
+# from .wav2spinenet_xvector import Wav2SpineNetXVector
+from .wav2resnet1d_xvector import Wav2ResNet1dXVector
diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py
new file mode 100644
index 00000000..78724174
--- /dev/null
+++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py
@@ -0,0 +1,40 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from jsonargparse import ArgumentParser, ActionParser
+
+import torch
+import torch.nn as nn
+
+from ..xvectors import ResNet1dXVector
+from ...tpm import HFWav2Vec
+from .hf_wav2xvector import HFWav2XVector
+
+
+class HFWav2Vec2ResNet1dXVector(HFWav2XVector):
+    """Class extracting ResNet1d x-vectors from waveform.
+    It contains acoustic feature extraction, feature normalization and
+    ResNet1dXVector extractor.
+
+    Attributes:
+      hf_feats: HFWav2Vec configuration dictionary or object.
+        This is a wrapper over the Hugging Face Wav2Vec model.
+      xvector: ResNet1dXVector configuration dictionary or object.
+ """ + + def __init__(self, hf_feats, xvector): + + if isinstance(hf_feats, dict): + hf_feats = HFWav2Vec(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec) + + if isinstance(xvector, dict): + xvector = ResNet1dXVector(**xvector) + else: + assert isinstance(xvector, ResNet1dXVector) + + super().__init__(hf_feats, xvector) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py new file mode 100644 index 00000000..a471343c --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -0,0 +1,26 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + + +from ...torch_model import TorchModel + + +class HFWav2XVector(TorchModel): + """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + xvector: x-vector model object. + """ + + def __init__(self, hf_feats, xvector): + + self.hf_feats = hf_feats + self.xvector = xvector diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py new file mode 100644 index 00000000..983fbac2 --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -0,0 +1,53 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +from .wav2xvector import Wav2XVector +from ..xvectors import ResNet1dXVector + + +class Wav2ResNet1dXVector(Wav2XVector): + """Class extracting ResNet1d x-vectors from waveform. + It contains acoustic feature extraction, feature normalization and + ResNet1dXVector extractor. + + Attributes: + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: ResNet1dXVector configuration dictionary or object. + """ + + def __init__(self, feats, xvector): + + if isinstance(xvector, dict): + xvector = ResNet1dXVector.filter_args(**xvector) + xvector = ResNet1dXVector(**xvector) + else: + assert isinstance(xvector, ResNet1dXVector) + + super().__init__(feats, xvector) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2ResNet1dXVector options to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2XVector.add_class_args(parser) + ResNet1dXVector.add_class_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py new file mode 100644 index 00000000..dea2e442 --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -0,0 +1,53 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +from .wav2xvector import Wav2XVector +from ..xvectors import ResNetXVector + + +class Wav2ResNetXVector(Wav2XVector): + """Class extracting ResNet x-vectors from waveform. + It contains acoustic feature extraction, feature normalization and + ResNetXVector extractor. + + Attributes: + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: ResNetXVector configuration dictionary or object. + """ + + def __init__(self, feats, xvector): + + if isinstance(xvector, dict): + xvector = ResNetXVector.filter_args(**xvector) + xvector = ResNetXVector(**xvector) + else: + assert isinstance(xvector, ResNetXVector) + + super().__init__(feats, xvector) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2ResNet1dXVector options to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2XVector.add_class_args(parser) + ResNetXVector.add_class_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py new file mode 100644 index 00000000..0c5a1698 --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -0,0 +1,128 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +from ...torch_model import TorchModel +from ...narchs import AudioFeatsMVN +from ...utils import remove_silence + + +class Wav2XVector(TorchModel): + """Base class for models that integrate the acoustic feature extractor and and x-vector model that takes acoustic features as input. + + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: x-vector model object. 
+ """ + + def __init__(self, feats, xvector): + + super().__init__() + + if isinstance(feats, dict): + feats = AudioFeatsMVN.filter_args(**feats) + feats["trans"] = True + feats = AudioFeatsMVN(**feats) + else: + assert isinstance(feats, AudioFeatsMVN) + + self.feats = feats + self.xvector = xvector + + def forward( + self, + x, + x_lengths=None, + y=None, + vad_samples=None, + vad_feats=None, + enc_layers=None, + classif_layers=None, + return_output=True, + ): + + if vad_samples is not None: + x, x_lengths = remove_silence(x, x_lengths) + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, feat_lengths) + + # feat_lengths = torch.div(x_lengths * feats.size(-1), x.size(-1)) + return self.xvector( + feats, feat_lengths, y, enc_layers, classif_layers, return_output + ) + + def extract_embed( + self, + x, + x_lengths=None, + vad_samples=None, + vad_feats=None, + chunk_length=0, + embed_layer=None, + detach_chunks=False, + ): + + if vad_samples is not None: + x, x_lengths = remove_silence(x, x_lengths) + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, feat_lengths) + + return self.xvector.extract_embed( + feats, feat_lengths, chunk_length, embed_layer, detach_chunks + ) + + def train_mode(self, mode="ft-embed-affine"): + self.xvector.train_mode(mode) + + def get_config(self): + feat_cfg = self.feats.get_config() + xvector_cfg = self.xvector.get_config() + config = { + "feats": feat_cfg, + "xvector": xvector_cfg, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(*kwargs): + """Filters Wav2XVector class arguments from arguments dictionary. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with SpecAugment options. + """ + valid_args = ( + "feats", + "xvector", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2XVector options common to all child classes to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + AudioFeatsMVN.add_class_args(parser, prefix="feats") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/__init__.py b/hyperion/torch/models/xvectors/__init__.py new file mode 100644 index 00000000..408de716 --- /dev/null +++ b/hyperion/torch/models/xvectors/__init__.py @@ -0,0 +1,13 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .xvector import XVector +from .tdnn_xvector import TDNNXVector +from .resnet_xvector import ResNetXVector +from .efficient_net_xvector import EfficientNetXVector +from .transformer_xvector_v1 import TransformerXVectorV1 +from .spinenet_xvector import SpineNetXVector +from .resnet1d_xvector import ResNet1dXVector diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index 8db9a073..295824f3 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -138,7 +138,7 @@ def load(cls, file_path=None, cfg=None, state_dict=None): del cfg["in_feats"] except: pass - print(cfg, flush=True) + model = cls(**cfg) if state_dict is not None: model.load_state_dict(state_dict) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 21932491..685ead4a 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -12,7 +12,7 @@ from ...layer_blocks import TDNNBlock from ...narchs import ClassifHead, TorchNALoader from ...torch_model import TorchModel -from ...utils import eval_nnet_by_chunks +from ...utils import eval_nnet_by_chunks, scale_lengths class XVector(TorchModel): @@ -201,40 +201,38 @@ def _pre_enc(self, x): x = x.view(x.size(0), 1, x.size(1), x.size(2)) return x - def _post_enc(self, x): + def _post_enc(self, x, in_lengths=None, max_in_length=None): if self.encoder_net.out_dim() == 4: x = x.view(x.size(0), -1, x.size(-1)) if self.proj is not None: x = self.proj(x) - return x + if in_lengths is not None: + out_lengths = scale_lengths(in_lengths, x.size(-1), max_in_length) + else: + out_lengths = None + + return x, out_lengths def forward( - self, x, y=None, enc_layers=None, classif_layers=None, return_output=True + self, + x, + x_lengths=None, + y=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, ): - if enc_layers is None and classif_layers is None: - return self.forward_output(x, y) + if return_enc_layers is None and return_classif_layers is None: + return self.forward_logits(x, x_lengths, y) - h = self.forward_hid_feats(x, y, enc_layers, classif_layers, return_output) - output = {} - if enc_layers is not None: - if classif_layers is None: - output["h_enc"] = h - else: - output["h_enc"] = h[0] - else: - output["h_enc"] = [] - if classif_layers is not None: - output["h_classif"] = h[1] - else: - output["h_classif"] = [] - if return_output: - output["output"] = h[2] - return output + return self.forward_hid_feats( + x, x_lengths, y, return_enc_layers, return_classif_layers, return_logits + ) - def forward_output(self, x, y=None): + def forward_logits(self, x, x_lengths=None, y=None): """Forward function Args: @@ -242,59 +240,57 @@ def forward_output(self, x, y=None): y: target classes torch.long tensor with shape=(batch,) Returns: - 
class posteriors tensor with shape=(batch, num_classes)
+          class logits tensor with shape=(batch, num_classes)
         """
-        if self.encoder_net.in_dim() == 4 and x.dim() == 3:
-            x = x.view(x.size(0), 1, x.size(1), x.size(2))
-
+        max_in_length = x.size(-1)
+        x = self._pre_enc(x)
         x = self.encoder_net(x)
-
-        if self.encoder_net.out_dim() == 4:
-            x = x.view(x.size(0), -1, x.size(-1))
-
-        if self.proj is not None:
-            x = self.proj(x)
-
-        p = self.pool_net(x)
+        x, x_lengths = self._post_enc(x, x_lengths, max_in_length)
+        p = self.pool_net(x, x_lengths=x_lengths)
         y = self.classif_net(p, y)
         return y

     def forward_hid_feats(
         self,
         x,
         x_lengths=None,
         y=None,
         return_enc_layers=None,
         return_classif_layers=None,
         return_logits=False,
     ):
         """forwards hidden representations in the x-vector network"""
-
-        if self.encoder_net.in_dim() == 4 and x.dim() == 3:
-            x = x.view(x.size(0), 1, x.size(1), x.size(2))
-
-        h_enc, x = self.encoder_net.forward_hid_feats(x, enc_layers, return_output=True)
-
-        if not return_output and classif_layers is None:
-            return h_enc
-
-        if self.encoder_net.out_dim() == 4:
-            x = x.view(x.size(0), -1, x.size(-1))
-
-        if self.proj is not None:
-            x = self.proj(x)
-
-        p = self.pool_net(x)
-        h_classif = self.classif_net.forward_hid_feats(
-            p, y, classif_layers, return_output=return_output
+        max_in_length = x.size(-1)
+        x = self._pre_enc(x)
+        h_enc, x = self.encoder_net.forward_hid_feats(
+            x, return_enc_layers, return_logits=True
+        )
+        output = {"h_enc": h_enc}
+        if not return_logits and return_classif_layers is None:
+            return output
+
+        x, x_lengths = self._post_enc(x, x_lengths, max_in_length)
+        p = self.pool_net(x, x_lengths=x_lengths)
+        h_classif, y_pred = self.classif_net.forward_hid_feats(
+            p, y, return_classif_layers, return_logits=return_logits
         )
-        if return_output:
-            h_classif, y = h_classif
-            return h_enc, h_classif, y
+        if return_logits:
+            output["h_classif"] = h_classif
+            output["logits"] = y_pred
+            return output

-        return h_enc, h_classif
+        output["h_classif"] = h_classif
+        return output

-    def extract_embed(self, x, chunk_length=0, embed_layer=None, detach_chunks=False):
+    def extract_embed(
+        self, x, x_lengths=None, chunk_length=0, embed_layer=None, detach_chunks=False
+    ):
         if embed_layer is None:
             embed_layer = self.embed_layer

+        max_in_length = x.size(-1)
         x = self._pre_enc(x)
-        # if self.encoder_net.in_dim() == 4 and x.dim() == 3:
-        #     x = x.view(x.size(0), 1, x.size(1), x.size(2))
         x = eval_nnet_by_chunks(
             x, self.encoder_net, chunk_length, detach_chunks=detach_chunks
         )
@@ -302,15 +298,8 @@
         if x.device != self.device:
             x = x.to(self.device)

-        x = self._post_enc(x)
-
-        # if self.encoder_net.out_dim() == 4:
-        #     x = x.view(x.size(0), -1, x.size(-1))
-
-        # if self.proj is not None:
-        #     x = self.proj(x)
-
-        p = self.pool_net(x)
+        x, x_lengths = self._post_enc(x, x_lengths, max_in_length)
+        p = self.pool_net(x, x_lengths=x_lengths)
         y = self.classif_net.extract_embed(p, embed_layer)
         return y

@@ -344,7 +333,7 @@ def extract_embed_slidwin(
             embed_layer = self.embed_layer

         in_time = x.size(-1)
         x = self._pre_enc(x)
         x = eval_nnet_by_chunks(
             x, self.encoder_net, chunk_length, detach_chunks=detach_chunks
         )
@@ -501,7 +490,7 @@ def rebuild_output_layer(
         # if we change the number of classes or the loss-type
         # we need to reinitiate the last layer
self.classif_net.rebuild_output_layer( - num_classes, loss_type, s, margin, margin_warmup_epochs + num_classes, loss_type, cos_scale, margin, margin_warmup_epochs ) return @@ -538,11 +527,6 @@ def train_mode(self, mode="ft-embed-affine"): @staticmethod def filter_args(**kwargs): - # # get boolean args that are negated - # if 'pool_wo_bias' in kwargs: - # kwargs['pool_use_bias'] = not kwargs['pool_wo_bias'] - # del kwargs['pool_wo_bias'] - if "wo_norm" in kwargs: kwargs["use_norm"] = not kwargs["wo_norm"] del kwargs["wo_norm"] @@ -553,19 +537,6 @@ def filter_args(**kwargs): # get arguments for pooling pool_args = PF.filter_args(**kwargs["pool_net"]) - # pool_valid_args = ( - # 'pool_type', 'pool_num_comp', 'pool_use_bias', - # 'pool_dist_pow', 'pool_d_k', 'pool_d_v', 'pool_num_heads', - # 'pool_bin_attn', 'pool_inner_feats') - # pool_args = dict((k, kwargs[k]) - # for k in pool_valid_args if k in kwargs) - - # # remove pooling prefix from arg name - # for k in pool_valid_args[1:]: - # if k in pool_args: - # k2 = k.replace('pool_','') - # pool_args[k2] = pool_args[k] - # del pool_args[k] valid_args = ( "num_classes", @@ -573,7 +544,7 @@ def filter_args(**kwargs): "num_embed_layers", "hid_act", "loss_type", - "s", + "cos_scale", "margin", "margin_warmup_epochs", "num_subcenters", @@ -600,49 +571,6 @@ def add_class_args(parser, prefix=None, skip=set()): parser, prefix="pool_net", skip=["dim", "in_feats", "keepdim"] ) - # parser.add_argument('--pool-type', type=str.lower, - # default='mean+stddev', - # choices=['avg','mean+stddev', 'mean+logvar', - # 'lde', 'scaled-dot-prod-att-v1', 'ch-wise-att-mean-stddev'], - # help=('Pooling methods: Avg, Mean+Std, Mean+logVar, LDE, ' - # 'scaled-dot-product-attention-v1')) - - # parser.add_argument('--pool-num-comp', - # default=64, type=int, - # help=('number of components for LDE pooling')) - - # parser.add_argument('--pool-dist-pow', - # default=2, type=int, - # help=('Distace power for LDE pooling')) - - # parser.add_argument('--pool-wo-bias', - # default=False, action='store_true', - # help=('Don\' use bias in LDE')) - - # parser.add_argument( - # '--pool-num-heads', default=8, type=int, - # help=('number of attention heads')) - - # parser.add_argument( - # '--pool-d-k', default=256, type=int, - # help=('key dimension for attention')) - - # parser.add_argument( - # '--pool-d-v', default=256, type=int, - # help=('value dimension for attention')) - - # parser.add_argument( - # '--pool-bin-attn', default=False, action='store_true', - # help=('Use binary attention, i.e. 
sigmoid instead of softmax')) - - # parser.add_argument( - # '--pool-inner-feats', default=128, type=int, - # help=('inner feature size for attentive pooling')) - - # parser.add_argument('--num-classes', - # required=True, type=int, - # help=('number of classes')) - parser.add_argument( "--embed-dim", default=256, type=int, help=("x-vector dimension") ) diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 1d5cb0a3..9092e9d8 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -4,6 +4,7 @@ """ from jsonargparse import ArgumentParser, ActionParser +import torch import torch.nn as nn from ..layers import AudioFeatsFactory as AFF @@ -56,16 +57,24 @@ def frame_length(self): def frame_shift(self): return self.audio_feats.frame_shift - def forward(self, x, lengths=None): + @staticmethod + def _compute_feat_lengths(x_lengths, max_samples, max_frames): + if x_lengths is None: + return None + + return torch.div(x_lengths * max_frames, max_samples, rounding_mode="floor") + + def forward(self, x, x_lengths=None): f = self.audio_feats(x) + f_lengths = self._compute_feat_lengths(x_lengths, x.size(-1), f.size(1)) if self.spec_augment is not None and not self.aug_after_mvn: - f = self.spec_augment(f, lengths) + f = self.spec_augment(f, f_lengths) if self.mvn is not None: f = self.mvn(f) if self.spec_augment is not None and self.aug_after_mvn: - f = self.spec_augment(f, lengths) + f = self.spec_augment(f, f_lengths) if self.trans: f = f.transpose(1, 2).contiguous() @@ -105,4 +114,3 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='feature extraction options') diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index e3af9f2d..5824cb1b 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -8,6 +8,7 @@ import torch.nn as nn from torch.nn import Linear +from ..layers import ActivationFactory as AF from ..layers import CosLossOutput, ArcLossOutput, SubCenterArcLossOutput from ..layers import NormLayer1dFactory as NLF from ..layer_blocks import FCBlock @@ -143,7 +144,13 @@ def __init__( ) def rebuild_output_layer( - self, num_classes, loss_type, s, margin, margin_warmup_epochs, num_subcenters=2 + self, + num_classes, + loss_type, + cos_scale, + margin, + margin_warmup_epochs, + num_subcenters=2, ): embed_dim = self.embed_dim @@ -228,16 +235,16 @@ def forward(self, x, y=None): return y - def forward_hid_feats(self, x, y=None, layers=None, return_output=False): + def forward_hid_feats(self, x, y=None, return_layers=None, return_logits=False): - assert layers is not None or return_output - if layers is None: - layers = [] + assert return_layers is not None or return_logits + if return_layers is None: + return_layers = [] h = [] for l in range(self.num_embed_layers): x = self.fc_blocks[l](x) - if l in layers: + if l in return_layers: h.append(x) if self.loss_type == "softmax": @@ -245,16 +252,19 @@ def forward_hid_feats(self, x, y=None, layers=None, return_output=False): else: y = self.output(x, y) - if return_output: + if return_logits: return h, y - return h + return h, None def extract_embed(self, x, embed_layer=0): for l in range(embed_layer): x = self.fc_blocks[l](x) - y = self.fc_blocks[embed_layer].forward_linear(x) + if self.loss_type == "softmax" or embed_layer < self.num_embed_layers: + y = 
self.fc_blocks[embed_layer].forward_linear(x) + else: + y = self.fc_blocks[l](x) return y def get_config(self): diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index 69f9300c..4fabe8d2 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -232,25 +232,28 @@ def _make_in_layer(self): nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc ) elif isinstance(self.in_layer_type, nn.Module): - self.in_layer = nn.Sequential(in_layer_type, pos_enc) + self.in_layer = nn.Sequential(self.in_layer_type, pos_enc) elif self.in_layer_type is None: self.in_layer = pos_enc else: raise ValueError("unknown in_layer_type: " + self.in_layer_type) - def forward(self, x, mask=None, target_shape=None): + def forward(self, x, x_lengths=None, x_mask=None, target_shape=None): """Forward pass function Args: x: input tensor with size=(batch, time, num_feats) - mask: mask to indicate valid time steps for x (batch, time) + x_lengths: lengths of the input sequences. + x_mask: mask to indicate valid time steps for x (batch, time). + It overwrites the mask of x_lengths. Returns: Tensor with output features Tensor with mask """ + if isinstance(self.in_layer, Conv2dSubsampler): - x, mask = self.in_layer(x, mask) + x, mask = self.in_layer(x, x_mask) else: if self.in_time_dim != 1: x = x.transpose(1, self.in_time_dim).contiguous() diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 66c4d028..dc5de737 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -2,7 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import os from copy import deepcopy import torch @@ -39,7 +39,7 @@ def unfreeze(self): @staticmethod def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): model_data = None - if cfg is None: + if cfg is None or state_dict is None: assert file_path is not None model_data = torch.load(file_path) if cfg is None: diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 75c3ece8..fef0b3b5 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -11,7 +11,7 @@ import torch import torch.nn as nn -from ..utils import MetricAcc # , TorchDataParallel +from ..utils import MetricAcc from .xvector_trainer_from_wav import XVectorTrainerFromWav @@ -128,11 +128,6 @@ def __init__( % (p_attack, 1.0 / self.grad_acc_steps) ) - # if data_parallel: - # # change model in attack by the data parallel version - # self.attack.model = TorchDataParallel(self.attack.model) - # # make loss function in attack data parallel - # self.attack.make_data_parallel() def train_epoch(self, data_loader): @@ -167,7 +162,7 @@ def train_epoch(self, data_loader): feats = self.feat_extractor(data) with self.amp_autocast(): - output = self.model(feats, target) + output = self.model(feats, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -263,4 +258,4 @@ def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='trainer options') + diff --git a/hyperion/torch/trainers/xvector_finetuner.py b/hyperion/torch/trainers/xvector_finetuner.py deleted file mode 100644 index cf833257..00000000 --- 
a/hyperion/torch/trainers/xvector_finetuner.py +++ /dev/null @@ -1,117 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import os -from collections import OrderedDict as ODict - -import time -import logging - -import torch -import torch.nn as nn - -from ..utils import MetricAcc -from .xvector_trainer import XVectorTrainer - - -class XVectorFinetuner(XVectorTrainer): - def __init__( - self, - model, - optimizer, - epochs, - exp_path, - cur_epoch=0, - grad_acc_steps=1, - device=None, - metrics=None, - lr_scheduler=None, - loggers=None, - data_parallel=False, - loss=None, - finetune_mode="ft-embed-affine", - ): - - super(XVectorFinetuner, self).__init__( - model, - optimizer, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lr_scheduler=lr_scheduler, - loggers=loggers, - data_parallel=data_parallel, - loss=loss, - ) - - self.finetune_mode = finetune_mode - - def train_epoch(self, data_loader): - # epoch_batches = len(data_loader.dataset) - # total_batches = self.cur_epoch * epoch_batches - - self.model.update_loss_margin(self.cur_epoch) - - metric_acc = MetricAcc() - batch_metrics = ODict() - # self.model.train_mode(self.finetune_mode) - self.model.eval() - for batch, (data, target) in enumerate(data_loader): - self.loggers.on_batch_begin(batch) - - if batch % self.grad_acc_steps == 0: - self.optimizer.zero_grad() - - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - - output = self.model(data, target) - loss = self.loss(output, target).mean() / self.grad_acc_steps - loss.backward() - - if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None: - self.lr_scheduler.on_opt_step() - self.optimizer.step() - - batch_metrics["loss"] = loss.item() * self.grad_acc_steps - for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) - - # logging.info('batch={} shape={} loss={} acc={}'.format(batch,data.shape, batch_metrics['loss'], batch_metrics['acc'])) - - # if batch > 63: - # logging.info(str(self.model.classif_net.fc_blocks[0].linear.weight)) - # logging.info(str(self.model.classif_net.fc_blocks[0].linear.weight.grad)) - # if batch > 63 : - # t=torch.nn.functional.cross_entropy(output, target, reduction='none') - # logging.info(str(t)) - # if batch == 65: - # #torch.set_printoptions(profile="full") - # #logging.info(str(data[1])) - # #logging.info(str(target[1])) - # #logging.info(str(output[1])) - - # #logging.info(str(data[33])) - # #logging.info(str(target[33])) - # logging.info(str(output[33, target[33]])) - # #time.sleep(1000) - # #torch.set_printoptions(profile="default") - - # #logging.info(str(torch.sum(torch.isnan(data)))) - # #logging.info(str(torch.sum(torch.isnan(target)))) - # #logging.info(str(torch.sum(torch.isnan(output)))) - - metric_acc.update(batch_metrics, batch_size) - logs = metric_acc.metrics - logs["lr"] = self._get_lr() - self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - # total_batches +=1 - - logs = metric_acc.metrics - logs["lr"] = self._get_lr() - return logs diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 190b2a30..2e032a49 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -127,7 +127,7 @@ def train_epoch(self, data_loader): batch_size = data.shape[0] with self.amp_autocast(): - output = 
self.model(data, target, **self.amp_args) + output = self.model(data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 7b7cb21c..47801c29 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import os +from jsonargparse import ArgumentParser, ActionParser from collections import OrderedDict as ODict import logging @@ -10,30 +11,9 @@ import torch import torch.nn as nn -from ..utils import MetricAcc # , TorchDataParallel +from ..utils import MetricAcc from .xvector_trainer import XVectorTrainer -# class DFRModelWrapper(nn.Module): -# """Wrapper class for the xvector model, which -# replace the forward method by the forward_hid_feats method - -# This is need because nn.DataParallel only support multi-gpu when colling the -# forward method, but not the other methods in the nn.Module classes. -# """ -# def __init__(self, model): -# super().__init__() -# self.model = model - -# def forward(self, x, y=None, enc_layers=None, classif_layers=None, -# return_output=False, use_amp=False): -# if use_amp: -# with torch.cuda.amp.autocast(): -# return self.model.forward_hid_feats( -# x, y, enc_layers, classif_layers, return_output) - -# return self.model.forward_hid_feats( -# x, y, enc_layers, classif_layers, return_output) - class XVectorTrainerDeepFeatReg(XVectorTrainer): """Trainer to train x-vector style models. @@ -149,19 +129,6 @@ def __init__( if device is not None: self.prior_model.to(device) - # self.model_wrapper = DFRModelWrapper(self.model) - # self.prior_model_wrapper = DFRModelWrapper(self.prior_model) - - # if device is not None: - # self.model_wrapper.to(device) - # self.prior_model_wrapper.to(device) - # self.reg_loss.to(device) - - # if data_parallel: - # self.model_wrapper = TorchDataParallel(self.model_wrapper) - # self.prior_model_wrapper = TorchDataParallel(self.prior_model_wrapper) - # self.reg_loss = TorchDataParallel(self.reg_loss) - def train_epoch(self, data_loader): """Training epoch loop @@ -184,14 +151,11 @@ def train_epoch(self, data_loader): batch_size = data.shape[0] with self.amp_autocast(): - # h_enc, h_classif, output = self.model_wrapper( - # data, target, self.reg_layers_enc, self.reg_layers_classif, - # return_output=True, **self.amp_args) outputs = self.model( data, - target, - self.reg_layers_enc, - self.reg_layers_classif, + y=target, + return_enc_layers=self.reg_layers_enc, + return_classif_layers=self.reg_layers_classif, return_output=True, ) h_enc, h_classif, output = ( @@ -207,9 +171,8 @@ def train_epoch(self, data_loader): prior_outputs = self.prior_model( data, - target, - self.reg_layers_enc, - self.reg_layers_classif, + return_enc_layers=self.reg_layers_enc, + return_classif_layers=self.reg_layers_classif, return_output=False, ) prior_h_enc, prior_h_classif = ( diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 29964322..6763b035 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -10,7 +10,7 @@ import torch import torch.nn as nn -from ..utils import MetricAcc # , TorchDataParallel +from ..utils import MetricAcc from .torch_trainer 
import TorchTrainer from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg @@ -126,9 +126,6 @@ def __init__( if device is not None: self.feat_extractor.to(device) - # if data_parallel: - # self.feat_extractor = TorchDataParallel(self.feat_extractor) - def train_epoch(self, data_loader): """Training epoch loop @@ -154,14 +151,11 @@ def train_epoch(self, data_loader): feats = self.feat_extractor(data) with self.amp_autocast(): - # h_enc, h_classif, output = self.model_wrapper( - # feats, target, self.reg_layers_enc, self.reg_layers_classif, - # return_output=True, **self.amp_args) outputs = self.model( feats, - target, - self.reg_layers_enc, - self.reg_layers_classif, + y=target, + return_enc_layers=self.reg_layers_enc, + return_classif_layers=self.reg_layers_classif, return_output=True, ) h_enc, h_classif, output = ( @@ -175,14 +169,10 @@ def train_epoch(self, data_loader): ).mean() # you need to take the mean here because of the multi-gpu training batch_metrics["loss-classif"] = loss.item() - # prior_h_enc, prior_h_classif = self.prior_model_wrapper( - # feats, target, self.reg_layers_enc, self.reg_layers_classif, - # return_output=False, **self.amp_args) prior_outputs = self.prior_model( feats, - target, - self.reg_layers_enc, - self.reg_layers_classif, + return_enc_layers=self.reg_layers_enc, + return_classif_layers=self.reg_layers_classif, return_output=False, ) prior_h_enc, prior_h_classif = ( diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 06086d32..a00016e6 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -135,7 +135,7 @@ def train_epoch(self, data_loader): feats = self.feat_extractor(data) with self.amp_autocast(): - output = self.model(feats, target) + output = self.model(feats, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -184,7 +184,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): feats = self.feat_extractor(data) with self.amp_autocast(): - output = self.model(feats, **self.amp_args) + output = self.model(feats) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index 6db39ef3..22af492c 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -5,6 +5,9 @@ from .devices import open_device from .metric_acc import MetricAcc +from .masking import seq_lengths_to_mask, scale_lengths +from .collation import collate_seq_1d, collate_seq_2d, collate_seq_nd from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add +from .vad_utils import remove_silence from .data_parallel import TorchDataParallel from .ddp import TorchDDP, FairShardedDDP, FairFullyShardedDDP diff --git a/hyperion/torch/utils/collation.py b/hyperion/torch/utils/collation.py new file mode 100644 index 00000000..25b3790b --- /dev/null +++ b/hyperion/torch/utils/collation.py @@ -0,0 +1,92 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import torch +import torch.nn as nn + + +def collate_seq_1d(x, pad_value=0): + """Combines a list/tuple of vectors with different lengths + into a single tensor. + + Args: + x: input lits/tuple of vectors. + + Returns: + 2D tensor with shape (num_vectors, max_vector_length). 
+        1D long tensor containing the vector lengths.
+    """
+    max_length = max([x_i.size(0) for x_i in x])
+    y = pad_value * torch.ones(len(x), max_length, dtype=x[0].dtype, device=x[0].device)
+    y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device)
+    for i, x_i in enumerate(x):
+        y[i, : x_i.size(0)] = x_i
+        y_lengths[i] = x_i.size(0)
+
+    return y, y_lengths
+
+
+def collate_seq_2d(x, pad_value=0, pad_dim=-1):
+    """Combines a list/tuple of matrices with different sizes in one of
+    the dimensions into a single 3d tensor.
+    Padding is applied to the dimension that is not constant.
+
+    Args:
+      x: input list/tuple of matrices.
+      pad_dim: padding dimension.
+
+    Returns:
+      3D tensor with shape (num_vectors, max_length, feat_dim) or (num_vectors, feat_dim, max_length).
+      1D long tensor containing the dimensions lengths.
+    """
+    max_length = max([x_i.size(pad_dim) for x_i in x])
+    y_size = list(x[0].size())
+    y_size[pad_dim] = max_length
+    y = pad_value * torch.ones(len(x), *y_size, dtype=x[0].dtype, device=x[0].device)
+    y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device)
+    if pad_dim == -1 or pad_dim == 1:
+        for i, x_i in enumerate(x):
+            y[i, :, : x_i.size(pad_dim)] = x_i
+            y_lengths[i] = x_i.size(pad_dim)
+    else:
+        for i, x_i in enumerate(x):
+            y[i, : x_i.size(pad_dim)] = x_i
+            y_lengths[i] = x_i.size(pad_dim)
+
+    return y, y_lengths
+
+
+def collate_seq_nd(x, pad_value=0, pad_dim=-1):
+    """Combines a list/tuple of N-d tensors with different sizes in one of
+    the dimensions into a single (N+1)-d tensor.
+    Padding is applied to the dimension that is not constant.
+
+    Args:
+      x: input list/tuple of tensors.
+      pad_dim: padding dimension.
+
+    Returns:
+      (N+1)-D combined tensor.
+      1D long tensor containing the dimensions lengths.
+    """
+    if x[0].dim() == 1:
+        return collate_seq_1d(x, pad_value)
+
+    if x[0].dim() == 2:
+        return collate_seq_2d(x, pad_value, pad_dim)
+
+    # here the general case: move the padded dim to the front, pad, and
+    # transpose back at the end.
+    max_length = max([x_i.size(pad_dim) for x_i in x])
+    y_trans_size = list(x[0].transpose(0, pad_dim).size())
+    y_trans_size[0] = max_length
+    y = pad_value * torch.ones(len(x), *y_trans_size, dtype=x[0].dtype, device=x[0].device)
+    y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device)
+    for i, x_i in enumerate(x):
+        y[i, : x_i.size(pad_dim)] = x_i.transpose(0, pad_dim)
+        y_lengths[i] = x_i.size(pad_dim)
+
+    if pad_dim > 0:
+        pad_dim = pad_dim + 1
+    y = y.transpose(1, pad_dim).contiguous()
+    return y, y_lengths
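A minimal usage sketch for the collation helpers above (illustrative, not part of the patch); the import path follows the utils __init__.py exports added in this commit:

import torch
from hyperion.torch.utils import collate_seq_nd

# three spectrograms with feat_dim=40 and different numbers of frames
feats = [torch.randn(40, 100), torch.randn(40, 73), torch.randn(40, 121)]
x, x_lengths = collate_seq_nd(feats, pad_dim=-1)
# x: (3, 40, 121) zero-padded batch; x_lengths: tensor([100, 73, 121])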
diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py
new file mode 100644
index 00000000..b6ccd5ef
--- /dev/null
+++ b/hyperion/torch/utils/masking.py
@@ -0,0 +1,58 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import torch
+import torch.nn as nn
+
+
+def scale_lengths(lengths, max_out_length, max_in_length=None):
+    if lengths is None:
+        return None
+
+    if max_in_length is None:
+        max_in_length = lengths.max()
+
+    return torch.div(lengths * max_out_length, max_in_length, rounding_mode="floor")
+
+
+def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1):
+    """Creates a binary mask indicating the valid values in a sequence.
+
+    Args:
+      lengths: sequence lengths with shape=(batch,). If None, it returns None.
+      max_length: maximum length of the sequence.
+      dtype: dtype for the mask.
+      time_dim: dimension corresponding to time in the mask. This will
+                return a view of the mask which will adapt to the shape
+                of the tensor where we want to apply the mask.
+                This has to be a positive integer.
+
+    Returns:
+      Binary mask with shape=(batch,...,max_length) or None.
+    """
+    if lengths is None:
+        return None
+
+    assert lengths.dim() == 1
+
+    if max_length is None:
+        max_length = lengths.max()
+    idx = torch.arange(max_length, dtype=lengths.dtype, device=lengths.device)
+
+    # compute mask shape=(batch, max_length)
+    mask = idx.unsqueeze(0) < lengths.unsqueeze(1)
+
+    # view to match the tensor where we want to apply the mask
+    if time_dim > 1:
+        shape = [1] * (time_dim + 1)
+        shape[0] = lengths.size(0)
+        shape[time_dim] = -1
+        mask = mask.view(*shape)
+
+    # change dtype if needed
+    if dtype is not None:
+        mask = mask.to(dtype)
+
+    return mask
diff --git a/hyperion/torch/utils/misc.py b/hyperion/torch/utils/misc.py
index 2b4f6034..69d209eb 100644
--- a/hyperion/torch/utils/misc.py
+++ b/hyperion/torch/utils/misc.py
@@ -1,26 +1,69 @@
 """
- Copyright 2020 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen)
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen)
  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import torch
+import torch.nn as nn
 import torch.cuda.amp as amp
 
 
-def l2_norm(x, axis=-1):
+def l2_norm(x, dim=1, axis=None):
+    """Applies length normalization to vectors.
+
+    Args:
+      x: input tensor.
+      dim: dimension along which to normalize the vectors.
+      axis: same as dim (deprecated).
+
+    Returns:
+      Normalized tensor.
+    """
+    if axis is not None:
+        dim = axis
+
     with amp.autocast(enabled=False):
-        norm = torch.norm(x.float(), 2, axis, True) + 1e-10
+        norm = torch.norm(x.float(), 2, dim, True) + 1e-10
         y = torch.div(x, norm)
     return y
 
 
-def compute_snr(x, n, axis=-1):
-    P_x = 10 * torch.log10(torch.mean(x ** 2, dim=axis))
-    P_n = 10 * torch.log10(torch.mean(n ** 2, dim=axis))
+def compute_snr(x, n, dim=1, axis=None):
+    """Computes SNR (dB).
+
+    Args:
+      x: tensor with the clean signal.
+      n: tensor with the noise.
+      dim: dimension along which to compute the power.
+      axis: same as dim (deprecated).
+
+    Returns:
+      Tensor with SNR (dB).
+    """
+    if axis is not None:
+        dim = axis
+    P_x = 10 * torch.log10(torch.mean(x ** 2, dim=dim))
+    P_n = 10 * torch.log10(torch.mean(n ** 2, dim=dim))
     return P_x - P_n
 
 
 def compute_stats_adv_attack(x, x_adv):
+    """Computes statistics of adversarial attack samples.
+
+    Args:
+      x: benign signal tensor.
+      x_adv: adversarial signal tensor.
+
+    Returns:
+      SNR (dB).
+      Power of x.
+      Power of n.
+      L2 norm of x.
+      Linf norm of x.
+      L0 norm of n.
+      L2 norm of n.
+      Linf norm of n.
+    """
     if x.dim() > 2:
         x = torch.flatten(x, start_dim=1)
@@ -42,6 +85,17 @@ def compute_stats_adv_attack(x, x_adv):
 
 
 def get_selfsim_tarnon(y, return_mask=False):
+    """Computes the ground truth self-similarity matrix given
+    integer class labels.
+
+    Args:
+      y: integer tensor with class labels of shape (batch,).
+      return_mask: If True, it returns an upper triangular mask with zero diagonal.
+
+    Returns:
+      Self-similarity binary matrix with shape=(batch, batch).
+      Upper triangular mask.
+    """
     y_bin = y.unsqueeze(-1) - y.unsqueeze(0) + 1
     y_bin[y_bin != 1] = 0
     y_bin = y_bin.float()
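As an illustration (not part of the patch) of how the two masking helpers combine:

import torch
from hyperion.torch.utils import seq_lengths_to_mask, scale_lengths

x_lengths = torch.tensor([300, 270, 250])
mask = seq_lengths_to_mask(x_lengths)                # (3, 300) bool, True on valid frames
mask3d = seq_lengths_to_mask(x_lengths, time_dim=2)  # view of shape (3, 1, 300)
# lengths after a layer that subsamples time from 300 to 75 frames
sub_lengths = scale_lengths(x_lengths, max_out_length=75, max_in_length=300)
# -> tensor([75, 67, 62])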
diff --git a/hyperion/torch/utils/vad_utils.py b/hyperion/torch/utils/vad_utils.py
new file mode 100644
index 00000000..a47b92ef
--- /dev/null
+++ b/hyperion/torch/utils/vad_utils.py
@@ -0,0 +1,59 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import torch
+import torch.nn as nn
+
+from .collation import collate_seq_nd
+
+
+def remove_silence(x, vad, x_lengths=None, time_dim=1, tol=0):
+    """Removes silence samples/frames.
+
+    Args:
+      x: input signal/spectrogram of shape=(batch,...,time,...).
+      vad: binary voice activity detection mask of shape=(batch, time).
+      x_lengths: lengths of each sequence in x.
+      time_dim: which dimension in x is time.
+      tol: tolerance for the difference between time dimensions in x and vad.
+
+    Returns:
+      x without silence samples/frames.
+      1D long tensor with the lengths of the sequences without silence.
+    """
+
+    # we make x and vad time dimensions of the same size.
+    assert x.size(0) == vad.size(0), "batch-size is different for x and vad"
+    x_max_length = x.size(time_dim)
+    vad_max_length = vad.size(-1)
+    length_err = x_max_length - vad_max_length
+    assert abs(length_err) <= tol, (
+        f"Difference between x_length({x_max_length}) and "
+        f"vad_length({vad_max_length}) > tol ({tol})"
+    )
+    if length_err > 0:
+        vad = nn.functional.pad(vad, (0, length_err), mode="constant", value=0)
+    elif length_err < 0:
+        vad = vad[:, :x_max_length]
+
+    # if x_lengths is passed, we make sure that vad is 0 for time steps larger
+    # than x_length
+    if x_lengths is not None:
+        for i in range(x.size(0)):
+            vad[i, x_lengths[i] :] = 0
+
+    trans = False
+    if time_dim != 1 and time_dim != 1 - x.dim():
+        x = x.transpose(1, time_dim)
+        trans = True
+
+    y = []
+    for i in range(x.size(0)):
+        y.append(x[i, vad[i]])
+
+    y, y_lengths = collate_seq_nd(y, pad_dim=0)
+    if trans:
+        y = y.transpose(1, time_dim).contiguous()
+
+    return y, y_lengths

From 01ada9f64621f923321fc3e2102ec200a91c1ef3 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Wed, 9 Mar 2022 18:23:45 -0500
Subject: [PATCH 004/154] more docs and x_lengths support

---
 .../torch/layer_blocks/res2net1d_blocks.py    |  71 ++++-
 .../torch/layer_blocks/res2net2d_blocks.py    |  72 ++++-
 hyperion/torch/layer_blocks/res2net_blocks.py |  71 ++++-
 .../torch/layer_blocks/resnet1d_blocks.py     | 287 ++++++++++++++++--
 .../torch/layer_blocks/resnet2d_blocks.py     | 235 +++++++++++++-
 hyperion/torch/layer_blocks/resnet_blocks.py  |  76 ++++-
 .../torch/layer_blocks/seresnet_blocks.py     |  55 +++-
 hyperion/torch/layer_blocks/spine_blocks.py   |  30 ++
 .../transformer_conv2d_subsampler.py          |  14 +-
 .../layer_blocks/transformer_feedforward.py   |   4 +-
 hyperion/torch/layers/interpolate.py          |   7 +-
 11 files changed, 855 insertions(+), 67 deletions(-)

diff --git a/hyperion/torch/layer_blocks/res2net1d_blocks.py b/hyperion/torch/layer_blocks/res2net1d_blocks.py
index 6f66557b..59706f61 100644
--- a/hyperion/torch/layer_blocks/res2net1d_blocks.py
+++ b/hyperion/torch/layer_blocks/res2net1d_blocks.py
@@ -46,6 +46,28 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before)
 
 
 class Res2Net1dBasicBlock(nn.Module):
+    """Res2Net basic Block. This is a modified Res2Net block with
+    two 3x3 convolutions, instead of the standard bottleneck block.
+
+    Attributes:
+      in_channels: input channels.
+      channels: output channels.
+      kernel_size: kernel size.
+      activation: Non-linear activation object, string of configuration dictionary.
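A usage sketch for the remove_silence helper defined above (illustrative, not part of the patch):

import torch
from hyperion.torch.utils import remove_silence

x = torch.randn(2, 16000)                      # (batch, time) waveforms
vad = torch.zeros(2, 16000, dtype=torch.bool)  # voice activity masks
vad[0, 2000:12000] = True
vad[1, :8000] = True
y, y_lengths = remove_silence(x, vad)
# y: (2, 10000) with silence removed and zero padding; y_lengths: tensor([10000, 8000])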
+ stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + width_factor: multiplication factor for the number of channels in the first layer + or the block. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. + """ + expansion = 1 def __init__( @@ -160,7 +182,17 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time). + + Returns: + Tensor with shape = (batch, out_channels, time). + """ residual = x split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) @@ -194,7 +226,7 @@ def forward(self, x): x = self.bn2(x) if self.se_layer: - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) if self.drop_connect_rate > 0: x = self.drop_connect(x) @@ -215,6 +247,26 @@ def forward(self, x): class Res2Net1dBNBlock(nn.Module): + """Res2Net bottleneck Block. + + Attributes: + in_channels: input channels. + channels: channels in bottleneck layer when width_factor=1. + kernel_size: kernel size in bottleneck layers. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. + """ + def __init__( self, in_channels, @@ -232,7 +284,6 @@ def __init__( norm_layer=None, norm_before=True, se_r=None, - num_feats=None, ): super().__init__() @@ -322,7 +373,17 @@ def out_channels(self): def expansion(self): return self.channels / self.width / self.scale - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time). + + Returns: + Tensor with shape = (batch, out_channels, time). 
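For illustration (not part of the patch), a sketch of how the new x_mask argument might be fed to one of these blocks; `block` is assumed to be any of the 1d blocks above built with in_channels=64:

import torch
from hyperion.torch.utils import seq_lengths_to_mask

x = torch.randn(4, 64, 200)                          # (batch, channels, time)
x_lengths = torch.tensor([200, 180, 150, 120])
x_mask = seq_lengths_to_mask(x_lengths, time_dim=2)  # (batch, 1, time)
y = block(x, x_mask=x_mask)  # SE pooling inside the block ignores padded frames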
+ """ residual = x x = self.conv1(x) @@ -360,7 +421,7 @@ def forward(self, x): x = self.bn3(x) if self.se_layer: - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) if self.drop_connect_rate > 0: x = self.drop_connect(x) diff --git a/hyperion/torch/layer_blocks/res2net2d_blocks.py b/hyperion/torch/layer_blocks/res2net2d_blocks.py index 37bbd966..e426d809 100644 --- a/hyperion/torch/layer_blocks/res2net2d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net2d_blocks.py @@ -45,6 +45,29 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) class Res2Net2dBasicBlock(nn.Module): + """Res2Net basic Block. This is a modified Res2Net block with + two 3x3 convolutions, instead of the standard bottleneck block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the first layer + or the block. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r=None: squeeze-excitation compression ratio. + time_se: If true, squeeze is done only in time dimension. + num_feats: Number of features in dimension 2, needed if time_se=True. + """ + expansion = 1 def __init__( @@ -159,7 +182,17 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) @@ -196,7 +229,7 @@ def forward(self, x): residual = self.downsample(residual) if self.se_layer: - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) x += residual x = self.act2(x) @@ -211,6 +244,27 @@ def forward(self, x): class Res2Net2dBNBlock(nn.Module): + """Res2Net bottleneck Block. + + Attributes: + in_channels: input channels. + channels: channels in bottleneck layer when width_factor=1. + kernel_size: kernel size in bottleneck layers. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r=None: squeeze-excitation compression ratio. + time_se: If true, squeeze is done only in time dimension. 
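For intuition, a concept sketch of the hierarchical channel split that Res2Net blocks implement (illustrative only; this is not the blocks' actual code, and the per-group convolutions are assumed):

import torch
import torch.nn as nn

scale, width = 4, 16                        # 4 channel groups of width 16
convs = nn.ModuleList(
    nn.Conv2d(width, width, kernel_size=3, padding=1) for _ in range(scale - 1)
)
x = torch.randn(2, scale * width, 50, 40)   # (batch, channels, height, width)
splits = torch.split(x, width, dim=1)
outs, y = [splits[0]], None                 # first group passes through
for i in range(1, scale):
    z = splits[i] if y is None else splits[i] + y  # feed previous group's output
    y = convs[i - 1](z)
    outs.append(y)
x_out = torch.cat(outs, dim=1)              # same shape as x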
+ num_feats: Number of features in dimension 2, needed if time_se=True. + """ + def __init__( self, in_channels, @@ -316,7 +370,17 @@ def out_channels(self): def expansion(self): return self.channels / self.width / self.scale - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -357,7 +421,7 @@ def forward(self, x): residual = self.downsample(residual) if self.se_layer: - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) x += residual x = self.act3(x) diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 56804307..daf391be 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -42,6 +42,28 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) class Res2NetBasicBlock(nn.Module): + """Res2Net basic Block. This is a modified Res2Net block with + two 3x3 convolutions, instead of the standard bottleneck block. + + Attributes: + in_channels: input channels. + channels: output channels. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the first layer + or the block. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. + time_se: If true, squeeze is done only in time dimension. + num_feats: Number of features in dimension 2, needed if time_se=True. + """ + expansion = 1 def __init__( @@ -136,12 +158,22 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) split_x = torch.split(x, split_size, 1) - # split_x = torch.split(x, self.width_in, 1) + x = [] for i in range(self.num_3x3): if i == 0 or self.stride > 1: @@ -173,7 +205,7 @@ def forward(self, x): residual = self.downsample(residual) if self.se_layer: - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) x += residual x = self.act2(x) @@ -188,6 +220,25 @@ def forward(self, x): class Res2NetBNBlock(nn.Module): + """Res2Net bottleneck Block. + + Attributes: + in_channels: input channels. + channels: channels in bottleneck layer when width_factor=1. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. 
+ width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + se_r: squeeze-excitation compression ratio. + time_se: If true, squeeze is done only in time dimension. + num_feats: Number of features in dimension 2, needed if time_se=True. + """ + expansion = 4 def __init__( @@ -279,7 +330,17 @@ def __init__( def out_channels(self): return self.channels * self.expansion - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -320,7 +381,7 @@ def forward(self, x): residual = self.downsample(residual) if self.se_layer: - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) x += residual x = self.act3(x) diff --git a/hyperion/torch/layer_blocks/resnet1d_blocks.py b/hyperion/torch/layer_blocks/resnet1d_blocks.py index d1965708..01fd1087 100644 --- a/hyperion/torch/layer_blocks/resnet1d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet1d_blocks.py @@ -113,6 +113,23 @@ def _make_upsample( class ResNet1dBasicBlock(nn.Module): + """ResNet 1d basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + expansion = 1 # __constants__ = ['downsample'] @@ -182,7 +199,16 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -218,6 +244,22 @@ def forward(self, x): class ResNet1dBasicDecBlock(nn.Module): + """ResNet 1d basic Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. 
+ """ expansion = 1 # __constants__ = ['downsample'] @@ -288,7 +330,16 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -324,6 +375,25 @@ def forward(self, x): class ResNet1dBNBlock(nn.Module): + """ResNet 1d bottleneck Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, @@ -401,7 +471,17 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ + residual = x x = self.conv1(x) @@ -443,6 +523,24 @@ def forward(self, x): class ResNet1dBNDecBlock(nn.Module): + """ResNet 1d bottleneck Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ def __init__( self, in_channels, @@ -514,7 +612,16 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -556,6 +663,23 @@ def forward(self, x): class SEResNet1dBasicBlock(ResNet1dBasicBlock): + """Squeeze-excitation ResNet 1d basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. 
+ drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ expansion = 1 def __init__( @@ -592,7 +716,17 @@ def __init__( self.se_layer = SEBlock1d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -609,7 +743,7 @@ def forward(self, x): if self.norm_before: x = self.bn2(x) - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) if self.drop_connect_rate > 0: x = self.drop_connect(x) @@ -629,6 +763,23 @@ def forward(self, x): class SEResNet1dBasicDecBlock(ResNet1dBasicDecBlock): + """Squeeze-excitation ResNet 1d basic Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ expansion = 1 def __init__( @@ -669,7 +820,17 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -686,7 +847,7 @@ def forward(self, x): if self.norm_before: x = self.bn2(x) - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) if self.drop_connect_rate > 0: x = self.drop_connect(x) @@ -706,6 +867,26 @@ def forward(self, x): class SEResNet1dBNBlock(ResNet1dBNBlock): + """Squeeze-excitation ResNet 1d bottleneck Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + se_r: squeeze-excitation compression ratio. 
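To make the se_r compression ratio concrete, here is a toy sketch of a masked squeeze-excitation step (illustrative only; this is not the SEBlock1d implementation):

import torch
import torch.nn as nn

class ToySE1d(nn.Module):
    """Masked mean over time, bottleneck of width channels // se_r."""
    def __init__(self, channels, se_r=16):
        super().__init__()
        self.fc1 = nn.Conv1d(channels, channels // se_r, kernel_size=1)
        self.fc2 = nn.Conv1d(channels // se_r, channels, kernel_size=1)

    def forward(self, x, x_mask=None):  # x: (batch, channels, time)
        if x_mask is None:
            s = x.mean(dim=-1, keepdim=True)
        else:
            m = x_mask.to(x.dtype)      # (batch, 1, time), zeros on padding
            s = (x * m).sum(dim=-1, keepdim=True) / m.sum(dim=-1, keepdim=True)
        scale = torch.sigmoid(self.fc2(torch.relu(self.fc1(s))))
        return x * scale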
+ use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, @@ -742,7 +923,17 @@ def __init__( self.se_layer = SEBlock1d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -765,7 +956,7 @@ def forward(self, x): if self.norm_before: x = self.bn3(x) - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) if self.drop_connect_rate > 0: x = self.drop_connect(x) @@ -785,6 +976,25 @@ def forward(self, x): class SEResNet1dBNDecBlock(ResNet1dBNDecBlock): + """Squeeze-excitation ResNet 1d bottleneck Block for decoders. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + width_factor: multiplication factor for the number of channels in the bottleneck. + scale: scale parameter of the Res2Net. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ def __init__( self, in_channels, @@ -821,7 +1031,17 @@ def __init__( self.se_layer = SEBlock1d(channels, se_r, activation) - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -844,7 +1064,7 @@ def forward(self, x): if self.norm_before: x = self.bn3(x) - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) if self.drop_connect_rate > 0: x = self.drop_connect(x) @@ -864,6 +1084,24 @@ def forward(self, x): class ResNet1dEndpoint(nn.Module): + """ Class that connects the ouputs of the ResNet1d to the rest of the network + when using multilevel feature aggregation. + + It converts the features of all the levels that we are going to aggregate + to the same temporal scale. + + Attributes: + in_channels: input channels. + channels: output channels. + in_scale: resolution scale of the input feature maps. + scale: resolution scale of the output feature maps. + upsampling_mode: algorithm used for upsampling: 'nearest' | 'linear' | 'bilinear' + activation: Non-linear activation object, string of configuration dictionary. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm1d is used. 
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+
+    """
     def __init__(
         self,
         in_channels,
         channels,
         in_scale,
         scale,
         upsampling_mode="nearest",
         activation={"name": "relu6", "inplace": True},
+        use_norm=True,
         norm_layer=None,
         norm_before=True,
     ):
-        """
-        Class that connects the ouputs of the ResNet1d to the rest of the network
-        when using multilevel feature aggregation
-
-        It converts the features of all the levels that we are going to aggregate
-        to the same temporal scale
-        """
+        super().__init__()
         if norm_layer is None:
             norm_layer = nn.BatchNorm1d
         self.in_channels = in_channels
         self.channels = channels
+        self.use_norm = use_norm
         self.norm_before = norm_before
         self.rel_scale = in_scale / scale
         if scale >= in_scale:
@@ -906,12 +1140,21 @@ def __init__(
             )
 
         self.act = AF.create(activation)
-        if not self.norm_before:
+        if use_norm and not self.norm_before:
             self.bn = norm_layer(channels)
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_time).
+          x_mask: unused.
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_time).
+        """
         x = self.resample(x)
         x = self.act(x)
-        if not self.norm_before:
+        if self.use_norm and not self.norm_before:
             x = self.bn(x)
         return x
diff --git a/hyperion/torch/layer_blocks/resnet2d_blocks.py b/hyperion/torch/layer_blocks/resnet2d_blocks.py
index 6149319c..7fbb8327 100644
--- a/hyperion/torch/layer_blocks/resnet2d_blocks.py
+++ b/hyperion/torch/layer_blocks/resnet2d_blocks.py
@@ -79,6 +79,23 @@ def _make_upsample(in_channels, out_channels, stride, norm_layer, norm_before):
 
 
 class ResNet2dBasicBlock(nn.Module):
+    """ResNet 2d basic Block.
+
+    Attributes:
+      in_channels: input channels.
+      channels: output channels.
+      kernel_size: kernel size.
+      activation: Non-linear activation object, string of configuration dictionary.
+      stride: downsampling stride of the convs.
+      dropout_rate: dropout rate.
+      groups: number of groups in the convolutions.
+      dilation: dilation factor of the conv. kernels.
+      use_norm: if True, it uses normalization layers, otherwise it does not.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
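As a worked example of the endpoint's resampling logic (values illustrative, assuming in_scale and scale denote the input and target downsampling strides):

in_scale, scale = 8, 4
rel_scale = in_scale / scale   # 2.0 -> upsample time by 2 using `upsampling_mode`
in_scale, scale = 2, 4
rel_scale = in_scale / scale   # 0.5 -> downsample time by 2 (scale >= in_scale branch)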
+ + """ + expansion = 1 def __init__( @@ -235,7 +278,16 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -268,6 +320,23 @@ def forward(self, x): class ResNet2dBNBlock(nn.Module): + """ResNet 2d bottleneck Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size in bottleneck. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ + def __init__( self, in_channels, @@ -339,7 +408,16 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -378,6 +456,22 @@ def forward(self, x): class ResNet2dBNDecBlock(nn.Module): + """ResNet 2d bottleneck Block decoder. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size in bottleneck. + activation: Non-linear activation object, string of configuration dictionary. + stride: upsampling stride of the convs. + dropout_rate: dropout rate. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + expansion: expansion factor of the bottlneck channels to output channels. + use_norm: if True, it uses normalization layers, otherwise it does not. + norm_layer: normalization layer constructor, if None BatchNorm2d is used. + norm_before: if True, normalization layer is before the activation, after otherwise. + """ def __init__( self, in_channels, @@ -443,7 +537,16 @@ def __init__( def out_channels(self): return self.channels - def forward(self, x): + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: unused. + + Returns: + Tensor with shape = (batch, out_channels, out_heigh, out_width). + """ residual = x x = self.conv1(x) @@ -482,6 +585,23 @@ def forward(self, x): class SEResNet2dBasicBlock(ResNet2dBasicBlock): + """Squeeze-excitation ResNet 2d basic Block. + + Attributes: + in_channels: input channels. + channels: output channels. + kernel_size: kernel size. + activation: Non-linear activation object, string of configuration dictionary. + stride: downsampling stride of the convs. + dropout_rate: dropout rate. + drop_connect_rate: drop-connect rate for stochastic number of layers. + groups: number of groups in the convolutions. + dilation: dilation factor of the conv. kernels. + se_r: squeeze-excitation compression ratio. + use_norm: if True, it uses normalization layers, otherwise it does not. 
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+    """
     expansion = 1
 
     def __init__(
@@ -516,7 +636,17 @@ def __init__(
 
         self.se_layer = SEBlock2d(channels, se_r, activation)
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: Binary mask indicating which spatial dimensions are valid, with
+            shape=(batch, time), (batch, 1, time), or (batch, height, width).
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         residual = x
 
         x = self.conv1(x)
@@ -536,7 +666,7 @@ def forward(self, x):
         if self.downsample is not None:
             residual = self.downsample(residual)
 
-        x = self.se_layer(x)
+        x = self.se_layer(x, x_mask=x_mask)
         x += residual
         x = self.act2(x)
 
@@ -550,6 +680,23 @@ def forward(self, x):
 
 
 class SEResNet2dBasicDecBlock(ResNet2dBasicDecBlock):
+    """Squeeze-excitation ResNet 2d basic Block for decoders.
+
+    Attributes:
+      in_channels: input channels.
+      channels: output channels.
+      kernel_size: kernel size.
+      activation: Non-linear activation object, string, or configuration dictionary.
+      stride: upsampling stride of the convs.
+      dropout_rate: dropout rate.
+      drop_connect_rate: drop-connect rate for stochastic number of layers.
+      groups: number of groups in the convolutions.
+      dilation: dilation factor of the conv. kernels.
+      se_r: squeeze-excitation compression ratio.
+      use_norm: if True, it uses normalization layers, otherwise it does not.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+    """
     expansion = 1
 
     def __init__(
@@ -588,7 +735,17 @@ def __init__(
     def out_channels(self):
         return self.channels
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: Binary mask indicating which spatial dimensions are valid, with
+            shape=(batch, time), (batch, 1, time), or (batch, height, width).
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         residual = x
 
         x = self.conv1(x)
@@ -608,7 +765,7 @@ def forward(self, x):
         if self.upsample is not None:
             residual = self.upsample(residual)
 
-        x = self.se_layer(x)
+        x = self.se_layer(x, x_mask=x_mask)
         x += residual
         x = self.act2(x)
 
@@ -622,6 +779,23 @@ def forward(self, x):
 
 
 class SEResNet2dBNBlock(ResNet2dBNBlock):
+    """Squeeze-excitation ResNet 2d bottleneck Block.
+
+    Attributes:
+      in_channels: input channels.
+      channels: output channels.
+      kernel_size: kernel size.
+      activation: Non-linear activation object, string, or configuration dictionary.
+      stride: downsampling stride of the convs.
+      dropout_rate: dropout rate.
+      groups: number of groups in the convolutions.
+      dilation: dilation factor of the conv. kernels.
+      expansion: expansion factor of the bottleneck channels to output channels.
+      se_r: squeeze-excitation compression ratio.
+      use_norm: if True, it uses normalization layers, otherwise it does not.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+    """
     def __init__(
         self,
         in_channels,
@@ -656,7 +830,17 @@ def __init__(
 
         self.se_layer = SEBlock2d(channels, se_r, activation)
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: Binary mask indicating which spatial dimensions are valid, with
+            shape=(batch, time), (batch, 1, time), or (batch, height, width).
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         residual = x
 
         x = self.conv1(x)
@@ -682,7 +866,7 @@ def forward(self, x):
         if self.downsample is not None:
             residual = self.downsample(residual)
 
-        x = self.se_layer(x)
+        x = self.se_layer(x, x_mask=x_mask)
         x += residual
         x = self.act3(x)
 
@@ -696,6 +880,23 @@ def forward(self, x):
 
 
 class SEResNet2dBNDecBlock(ResNet2dBNDecBlock):
+    """Squeeze-excitation ResNet 2d bottleneck Block for decoders.
+
+    Attributes:
+      in_channels: input channels.
+      channels: output channels.
+      kernel_size: kernel size.
+      activation: Non-linear activation object, string, or configuration dictionary.
+      stride: upsampling stride of the convs.
+      dropout_rate: dropout rate.
+      groups: number of groups in the convolutions.
+      dilation: dilation factor of the conv. kernels.
+      expansion: expansion factor of the bottleneck channels to output channels.
+      se_r: squeeze-excitation compression ratio.
+      use_norm: if True, it uses normalization layers, otherwise it does not.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+    """
     def __init__(
         self,
         in_channels,
@@ -730,7 +931,17 @@ def __init__(
 
         self.se_layer = SEBlock2d(channels, se_r, activation)
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: Binary mask indicating which spatial dimensions are valid, with
+            shape=(batch, time), (batch, 1, time), or (batch, height, width).
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         residual = x
 
         x = self.conv1(x)
@@ -756,7 +967,7 @@ def forward(self, x):
         if self.upsample is not None:
             residual = self.upsample(residual)
 
-        x = self.se_layer(x)
+        x = self.se_layer(x, x_mask=x_mask)
         x += residual
         x = self.act3(x)
 
diff --git a/hyperion/torch/layer_blocks/resnet_blocks.py b/hyperion/torch/layer_blocks/resnet_blocks.py
index 439a440a..83e6d174 100644
--- a/hyperion/torch/layer_blocks/resnet_blocks.py
+++ b/hyperion/torch/layer_blocks/resnet_blocks.py
@@ -112,9 +112,21 @@ def forward(self, x):
 
 
 class ResNetBasicBlock(nn.Module):
-    expansion = 1
+    """ResNet basic Block.
+
+    Attributes:
+      in_channels: input channels.
+      channels: output channels.
+      activation: Non-linear activation object, string, or configuration dictionary.
+      stride: downsampling stride of the convs.
+      dropout_rate: dropout rate.
+      groups: number of groups in the convolutions.
+      dilation: dilation factor of the conv. kernels.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+    """
 
-    # __constants__ = ['downsample']
+    expansion = 1
 
     def __init__(
         self,
@@ -166,7 +178,16 @@ def __init__(
     def out_channels(self):
         return self.channels
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: unused.
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         residual = x
 
         x = self.conv1(x)
@@ -199,6 +220,20 @@ def forward(self, x):
 
 
 class ResNetBNBlock(nn.Module):
+    """ResNet bottleneck Block.
+
+    Attributes:
+      in_channels: input channels.
+      channels: channels in bottleneck layer when width_factor=1.
+      activation: Non-linear activation object, string, or configuration dictionary.
+      stride: downsampling stride of the convs.
+      dropout_rate: dropout rate.
+      groups: number of groups in the convolutions.
+      dilation: dilation factor of the conv. kernels.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+    """
+
     expansion = 4
 
     # __constants__ = ['downsample']
@@ -256,7 +291,16 @@ def __init__(
     def out_channels(self):
         return self.channels * self.expansion
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: unused.
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         residual = x
 
         x = self.conv1(x)
@@ -305,6 +349,18 @@ def forward(self, x):
 
 
 class ResNetEndpointBlock(nn.Module):
+    """ResNet endpoint basic block. This is used as the output block when
+    the output combines feature maps from different resolution levels.
+
+    Attributes:
+      in_channels: input channels.
+      out_channels: output channels.
+      scale: interpolation factor.
+      activation: Non-linear activation object, string, or configuration dictionary.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+    """
+
     def __init__(
         self,
         in_channels,
@@ -334,7 +390,16 @@ def __init__(
         if self.scale > 1:
             self.upsample = Interpolate(scale_factor=scale, mode="nearest")
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: unused.
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         if self.in_channels != self.out_channels:
             x = self.conv(x)
 
diff --git a/hyperion/torch/layer_blocks/seresnet_blocks.py b/hyperion/torch/layer_blocks/seresnet_blocks.py
index a5a7fecd..7f8d0bae 100644
--- a/hyperion/torch/layer_blocks/seresnet_blocks.py
+++ b/hyperion/torch/layer_blocks/seresnet_blocks.py
@@ -13,6 +13,23 @@
 
 
 class SEResNetBasicBlock(ResNetBasicBlock):
+    """Squeeze-excitation ResNet basic Block.
+
+    Attributes:
+      in_channels: input channels.
+      channels: output channels.
+      activation: Non-linear activation object, string, or configuration dictionary.
+      stride: downsampling stride of the convs.
+      dropout_rate: dropout rate.
+      groups: number of groups in the convolutions.
+      dilation: dilation factor of the conv. kernels.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+      se_r: squeeze-excitation compression ratio.
+      time_se: if True, squeeze is done only in the time dimension.
+      num_feats: number of features in dimension 2, needed if time_se=True.
+    """
+
     def __init__(
         self,
         in_channels,
@@ -46,7 +63,16 @@ def __init__(
         else:
             self.se_layer = SEBlock2D(channels, se_r, activation)
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: unused.
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         residual = x
 
         x = self.conv1(x)
@@ -80,6 +106,23 @@ def forward(self, x):
 
 
 class SEResNetBNBlock(ResNetBNBlock):
+    """Squeeze-excitation ResNet bottleneck Block.
+
+    Attributes:
+      in_channels: input channels.
+      channels: channels in bottleneck layer when width_factor=1.
+      activation: Non-linear activation object, string, or configuration dictionary.
+      stride: downsampling stride of the convs.
+      dropout_rate: dropout rate.
+      groups: number of groups in the convolutions.
+      dilation: dilation factor of the conv. kernels.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+      se_r: squeeze-excitation compression ratio.
+      time_se: if True, squeeze is done only in the time dimension.
+      num_feats: number of features in dimension 2, needed if time_se=True.
+    """
+
     def __init__(
         self,
         in_channels,
@@ -115,7 +158,16 @@ def __init__(
         else:
             self.se_layer = SEBlock2D(channels * self.expansion, se_r, activation)
 
-    def forward(self, x):
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+          x_mask: unused.
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         residual = x
 
         x = self.conv1(x)
diff --git a/hyperion/torch/layer_blocks/spine_blocks.py b/hyperion/torch/layer_blocks/spine_blocks.py
index 21978192..c97cb027 100644
--- a/hyperion/torch/layer_blocks/spine_blocks.py
+++ b/hyperion/torch/layer_blocks/spine_blocks.py
@@ -132,6 +132,14 @@ def __init__(
         self.act1 = AF.create(activation)
 
     def forward(self, x):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         x = self.conv1(x)
         if self.norm_before:
             x = self.bn1(x)
@@ -200,6 +208,14 @@ def __init__(
         )
 
     def forward(self, x):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         if self.do_endpoint_conv and self.in_channels != self.channels:
             x = self.conv1(x)
             if self.norm_before:
@@ -254,6 +270,14 @@ def __init__(
         self.bn2 = norm_layer(out_channels)
 
     def forward(self, x):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, in_channels, in_height, in_width).
+
+        Returns:
+          Tensor with shape = (batch, out_channels, out_height, out_width).
+        """
         x = self.conv1(x)
         if self.norm_before:
             x = self.bn1(x)
diff --git a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py b/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py
index c841a056..bdd9b707 100644
--- a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py
+++ b/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py
@@ -31,16 +31,16 @@ def __init__(self, in_feats, out_feats, hid_act, pos_enc, time_dim=1):
             nn.Linear(out_feats * (((in_feats - 1) // 2 - 1) // 2), out_feats), pos_enc
         )
 
-    def forward(self, x, mask):
+    def forward(self, x, x_mask=None):
         """Forward function.
 
         Args:
-            x: input tensor with size=(batch, time, num_feats)
-            mask: mask to indicate valid time steps for x (batch, time1, time2)
+            x: input tensor with size=(batch, time, in_feats)
+            x_mask: mask to indicate valid time steps for x (batch, time1, time2)
 
         Returns:
-           Tensor with output features
-           Tensor with subsampled mask
+           Tensor with output features with shape = (batch, time//4, out_feats)
+           Tensor with the mask subsampled by a factor of 4 in time.
         """
         if self.time_dim == 1:
             x = x.transpose(1, 2)
@@ -49,6 +49,6 @@ def forward(self, x, mask):
         x = self.conv(x)
         b, c, f, t = x.size()
         x = self.out(x.contiguous().view(b, c * f, t).transpose(1, 2))
-        if mask is None:
+        if x_mask is None:
             return x, None
-        return x, mask[:, :, :-2:2][:, :, :-2:2]
+        return x, x_mask[:, :, :-2:2][:, :, :-2:2]
diff --git a/hyperion/torch/layer_blocks/transformer_feedforward.py b/hyperion/torch/layer_blocks/transformer_feedforward.py
index 900500ff..93cc6b66 100644
--- a/hyperion/torch/layer_blocks/transformer_feedforward.py
+++ b/hyperion/torch/layer_blocks/transformer_feedforward.py
@@ -40,7 +40,7 @@ def forward(self, x):
            x: input size=(batch, time, num_feats)
 
         Returns:
-           tensor size=(batch, time, num_feats)
+           Tensor size=(batch, time, num_feats)
         """
         if self.time_dim != 1:
             x = x.transpose(1, time_dim)
@@ -157,7 +157,7 @@ def forward(self, x):
            x: input tensors with size=(batch, time, num_channels) or size=(batch, num_channels, time).
 
         Returns:
-           output tensor same size as input
+           Output tensor same size as input
         """
         if self.time_dim != -1:
             x.transpose(-1, self.time_dim)
diff --git a/hyperion/torch/layers/interpolate.py b/hyperion/torch/layers/interpolate.py
index 94b3d2ed..335433fe 100644
--- a/hyperion/torch/layers/interpolate.py
+++ b/hyperion/torch/layers/interpolate.py
@@ -12,8 +12,9 @@ class Interpolate(nn.Module):
     """Interpolation class.
 
     Attributes:
-      scale_factor: upsampling scale factor.
-      mode: algorithm used for upsampling: 'nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'.
+      scale_factor: Upsampling scale factor.
+      mode: Algorithm used for upsampling:
+        'nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'.
     """
 
     def __init__(self, scale_factor, mode="nearest"):
@@ -34,7 +35,7 @@ def forward(self, x):
         """Interpolates the input.
 
         Args:
-          x: input tensor.
+          x: Input tensor.
 
         Returns:
           Interpolated tensor.
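The hunks above converge on a single convention: layer blocks take forward(self, x, x_mask=None), where x_mask flags the valid time steps of padded sequences, and only the squeeze-excitation blocks actually consume the mask. A minimal sketch of the intended call pattern follows; the helper body here is an illustrative re-implementation written for this note, not the library's own code, and it only assumes the call signature seq_lengths_to_mask(x_lengths, max_length, time_dim) that appears in the encoder patches below:

    import torch

    def seq_lengths_to_mask(lengths, max_length, time_dim=2):
        # True marks valid steps; singleton dims are inserted so the time axis
        # ends up at position time_dim.
        idx = torch.arange(max_length, device=lengths.device)
        mask = idx.unsqueeze(0) < lengths.unsqueeze(1)  # (batch, max_length)
        for _ in range(time_dim - 1):
            mask = mask.unsqueeze(1)
        return mask

    x = torch.randn(4, 64, 200)  # (batch, channels, time)
    x_lengths = torch.tensor([200, 180, 150, 90])
    x_mask = seq_lengths_to_mask(x_lengths, x.size(-1), time_dim=2)
    # Non-SE blocks accept the mask and ignore it, so an encoder can pass it
    # uniformly to every block: x = block(x, x_mask=x_mask)

Because the mask argument is accepted everywhere, encoders can thread one mask through heterogeneous stacks of blocks without special-casing which block types use it.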
From 89c697ca288d76a1eb1ba407b6000999979460fb Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 10 Mar 2022 10:02:01 -0500 Subject: [PATCH 005/154] more minor bugs fixed --- hyperion/torch/narchs/dc1d_decoder.py | 4 +- hyperion/torch/narchs/dc1d_encoder.py | 4 +- hyperion/torch/narchs/dc2d_decoder.py | 6 +- hyperion/torch/narchs/dc2d_encoder.py | 4 +- hyperion/torch/narchs/efficient_net.py | 2 +- hyperion/torch/narchs/resnet.py | 24 +-- hyperion/torch/narchs/resnet1d_decoder.py | 6 +- hyperion/torch/narchs/resnet1d_encoder.py | 33 +++- hyperion/torch/narchs/resnet2d_decoder.py | 6 +- hyperion/torch/narchs/resnet2d_encoder.py | 44 +++++- hyperion/torch/narchs/torch_na_loader.py | 2 +- .../torch/narchs/transformer_encoder_v1.py | 2 +- hyperion/torch/narchs/xvector_classif.py | 145 ------------------ 13 files changed, 101 insertions(+), 181 deletions(-) delete mode 100644 hyperion/torch/narchs/xvector_classif.py diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index c35d7720..82ac5a8a 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -66,7 +66,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(self.conv_channels) // 2, 32) + norm_groups = min(min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block @@ -196,7 +196,7 @@ def _standarize_convblocks_param(p, num_blocks, p_name): return p def _compute_out_size(self, in_size): - out_size = in_size * in_stride + out_size = in_size * self.in_stride for stride in self.conv_strides: out_size *= stride diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py index 091629f4..c2fb3d02 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -64,7 +64,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(self.conv_channels) // 2, 32) + norm_groups = min(min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block @@ -209,7 +209,7 @@ def out_shape(self, in_shape=None): else: T = self._compute_out_size(in_shape[2]) - return (in_shape[0], out_chanels, T) + return (in_shape[0], out_channels, T) def forward(self, x): diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 6ad7c4c9..e21d615a 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -66,7 +66,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(self.conv_channels) // 2, 32) + norm_groups = min(min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block @@ -210,7 +210,7 @@ def _standarize_convblocks_param(p, num_blocks, p_name): return p def _compute_out_size(self, in_size): - out_size = in_size * in_stride + out_size = in_size * self.in_stride for stride in self.conv_strides: out_size *= stride @@ -243,7 +243,7 @@ def out_shape(self, in_shape=None): else: W = self._compute_out_size(in_shape[3]) - return (in_shape[0], out_chanels, H, W) + return (in_shape[0], out_channels, H, W) def _match_shape(self, x, target_shape): x_dim = x.dim() diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index c6857ff6..4102c4f7 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ 
b/hyperion/torch/narchs/dc2d_encoder.py
@@ -65,7 +65,7 @@ def __init__(
         self.norm_layer = norm_layer
         norm_groups = None
         if norm_layer == "group-norm":
-            norm_groups = min(np.min(self.conv_channels) // 2, 32)
+            norm_groups = min(min(self.conv_channels) // 2, 32)
         self._norm_layer = NLF.create(norm_layer, norm_groups)
 
         # stem block
@@ -215,7 +215,7 @@ def out_shape(self, in_shape=None):
         else:
             W = self._compute_out_size(in_shape[3])
 
-        return (in_shape[0], out_chanels, H, W)
+        return (in_shape[0], out_channels, H, W)
 
     def forward(self, x):
 
diff --git a/hyperion/torch/narchs/efficient_net.py b/hyperion/torch/narchs/efficient_net.py
index ab60b8e2..8a71d6f4 100644
--- a/hyperion/torch/narchs/efficient_net.py
+++ b/hyperion/torch/narchs/efficient_net.py
@@ -132,7 +132,7 @@ def __init__(
         # set depth/width scales from net name
         self.cfg_width_scale = width_scale
         self.cfg_depth_scale = depth_scale
-        if width_scale is None or dept_scale is None:
+        if width_scale is None or depth_scale is None:
             width_scale, depth_scale = self.efficientnet_params(effnet_type)[:2]
         self.width_scale = width_scale
         self.depth_scale = depth_scale
diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py
index ca972713..9185964c 100644
--- a/hyperion/torch/narchs/resnet.py
+++ b/hyperion/torch/narchs/resnet.py
@@ -29,9 +29,12 @@ class ResNet(NetArch):
     """ResNet2D base class
 
     Attributes:
-      block: resnet basic block type in ['basic', 'bn', 'sebasic', 'sebn'], meaning
+      block: resnet basic block type in
+             ['basic', 'bn', 'sebasic', 'sebn', 'res2basic',
+              'res2bn', 'seres2basic', 'seres2bn'], meaning
        basic resnet block, bottleneck resnet block, basic block with squeeze-excitation,
-       and bottleneck block with squeeze-excitation
+       bottleneck block with squeeze-excitation, Res2Net basic and bottleneck, and
+       squeeze-excitation Res2Net basic and bottleneck.
       num_layers: list with the number of layers in each of the 4 layer blocks that we find in
                   resnets, after each layer block feature maps are downsmapled times 2 in each dimension
@@ -46,6 +49,8 @@ class ResNet(NetArch):
       out_act: output activation
       zero_init_residual: initializes batchnorm weights to zero so each residual block behaves as identitiy
           at the beggining. We observed worse results when using this option in x-vectors
+      multilevel: if True, the output is the combination of the feature maps at different resolution levels.
+      endpoint_channels: number of output channels when multilevel is True.
       groups: number of groups in convolutions
       replace_stride_with_dilation: use dialted conv nets instead of downsammpling, we never tested this.
       dropout_rate: dropout rate
@@ -57,7 +62,8 @@ class ResNet(NetArch):
          instead of time-freq dimension or HxW dimensions
       in_feats: input feature size (number of components in dimension of 2 of input tensor), this is
                 only required when time_se=True to calculcate the size of the squeeze excitation matrices.
-
+      res2net_scale: Res2Net scale parameter
+      res2net_width_factor: Res2Net multiplier for the width of the bottleneck layers.
""" def __init__( @@ -395,20 +401,14 @@ def out_shape(self, in_shape=None): return (in_shape[0], self.layer4[-1].out_channels, H, W) - def forward(self, x, use_amp=False): - if use_amp: - with torch.cuda.amp.autocast(): - return self._forward(x) - - return self._forward(x) - - def _forward(self, x): + def forward(self, x, x_lengths=None): """forward function Args: x: input tensor of size=(batch, Cin, Hin, Win) for image or size=(batch, C, freq, time) for audio - + x_lengths: when x are sequences with time in Win dimension, it + contains the lengths of the sequences. Returns: Tensor with output logits of size=(batch, out_units) if out_units>0, otherwise, it returns tensor of represeantions of size=(batch, Cout, Hout, Wout) diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index e3290c71..f24887fe 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -85,7 +85,7 @@ def __init__( self.norm_layer = norm_layer norm_groups = None if norm_layer == "group-norm": - norm_groups = min(np.min(resb_channels) // 2, 32) + norm_groups = min(min(resb_channels) // 2, 32) norm_groups = max(norm_groups, resb_groups) self._norm_layer = NLF.create(norm_layer, norm_groups) @@ -236,7 +236,7 @@ def _standarize_resblocks_param(p, num_blocks, p_name): return p def _compute_out_size(self, in_size): - out_size = in_size * in_stride + out_size = in_size * self.in_stride for stride in self.conv_strides: out_size *= stride @@ -264,7 +264,7 @@ def out_shape(self, in_shape=None): else: T = self._compute_out_size(in_shape[2]) - return (in_shape[0], out_chanels, T) + return (in_shape[0], out_channels, T) def _match_shape(self, x, target_shape): t = x.size(-1) diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 78ceeac6..2044f528 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -11,6 +11,7 @@ import torch import torch.nn as nn +from ..utils import seq_lengths_to_mask from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF from ..layer_blocks import ( @@ -371,13 +372,36 @@ def _match_lens(endpoints): return endpoints - def forward(self, x): + @staticmethod + def _update_mask(x, x_lengths, x_mask=None): + if x_lengths is None: + return None - x = self.in_block(x) + if x_mask is not None and x.size(-1) == x_mask.size(-1): + return x_mask + + return seq_lengths_to_mask(x_lengths, x.size(-1), time_dim=2) + + def forward(self, x, x_lengths=None): + """forward function + + Args: + x: input tensor of size=(batch, C, time) + x_lengths: it contains the lengths of the sequences. 
+        Returns:
+          Tensor with output logits of size=(batch, out_units) if out_units>0,
+          otherwise, it returns tensor of representations of size=(batch, Cout, out_time)
+
+        """
+
+        x_mask = self._update_mask(x, x_lengths)
+        x = self.in_block(x, x_mask=x_mask)
         endpoints = []
+
         for i, superblock in enumerate(self.blocks):
             for j, block in enumerate(superblock):
-                x = block(x)
+                x_mask = self._update_mask(x, x_lengths, x_mask)
+                x = block(x, x_mask=x_mask)
 
             if self.multilayer and self.is_endpoint[i]:
                 endpoint_i = x
@@ -401,11 +425,12 @@ def forward(self, x):
             x = torch.mean(torch.stack(endpoints), 0)
 
         if self.head_channels > 0:
+            x_mask = self._update_mask(x, x_lengths, x_mask)
             x = self.head_block(x)
 
         return x
 
-    def forward_hid_feats(self, x, layers=None, return_output=False):
+    def forward_hid_feats(self, x, x_lengths=None, layers=None, return_output=False):
 
         assert layers is not None or return_output
         if layers is None:
diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py
index f5becf76..6457ada1 100644
--- a/hyperion/torch/narchs/resnet2d_decoder.py
+++ b/hyperion/torch/narchs/resnet2d_decoder.py
@@ -86,7 +86,7 @@ def __init__(
         self.norm_layer = norm_layer
         norm_groups = None
         if norm_layer == "group-norm":
-            norm_groups = min(np.min(resb_channels) // 2, 32)
+            norm_groups = min(min(resb_channels) // 2, 32)
             norm_groups = max(norm_groups, resb_groups)
         self._norm_layer = NLF.create(norm_layer, norm_groups)
 
@@ -237,7 +237,7 @@ def _standarize_resblocks_param(p, num_blocks, p_name):
         return p
 
     def _compute_out_size(self, in_size):
-        out_size = in_size * in_stride
+        out_size = in_size * self.in_stride
 
         for stride in self.conv_strides:
             out_size *= stride
@@ -270,7 +270,7 @@ def out_shape(self, in_shape=None):
         else:
             W = self._compute_out_size(in_shape[3])
 
-        return (in_shape[0], out_chanels, H, W)
+        return (in_shape[0], out_channels, H, W)
 
     def _match_shape(self, x, target_shape):
         x_dim = x.dim()
diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py
index 22fc7fdd..a72cabac 100644
--- a/hyperion/torch/narchs/resnet2d_encoder.py
+++ b/hyperion/torch/narchs/resnet2d_encoder.py
@@ -9,6 +9,7 @@
 import torch
 import torch.nn as nn
 
+from ..utils import seq_lengths_to_mask
 from ..layers import ActivationFactory as AF
 from ..layers import NormLayer2dFactory as NLF
 from ..layer_blocks import ResNet2dBasicBlock, ResNet2dBNBlock, DC2dEncBlock
@@ -18,6 +19,35 @@
 
 
 class ResNet2dEncoder(NetArch):
+    """ResNet 2d Encoder.
+    This is similar to the ResNet class, but it offers more configuration possibilities.
+
+    Attributes:
+      in_channels: input channels.
+      in_conv_channels: output channels of the input convolution block.
+      in_kernel_size: kernel size of the input convolution block.
+      in_stride: stride of the input convolution block.
+      resb_type: residual block type in ['basic', 'bn', 'sebasic', 'sebn'].
+      resb_repeats: number of residual blocks in each residual superblock.
+      resb_channels: output channels of each residual superblock.
+      resb_kernel_sizes: kernel sizes of the residual blocks.
+      resb_strides: strides of the residual superblocks.
+      resb_dilations: dilation factors of the residual blocks.
+      resb_groups: number of groups in the convolutions of the residual blocks.
+      head_channels: channels of the output head block, if 0 there is no head block.
+      hid_act: hidden activation.
+      head_act: activation of the head block.
+      dropout_rate: dropout rate.
+      se_r: squeeze-excitation compression ratio.
+      time_se: if True, squeeze is done only in the time dimension.
+      in_feats: input feature size, needed when time_se=True.
+      res2net_width_factor: Res2Net multiplier for the width of the bottleneck layers.
+      res2net_scale: Res2Net scale parameter.
+      use_norm: if True, it uses normalization layers, otherwise it does not.
+      norm_layer: normalization layer constructor, if None BatchNorm2d is used.
+      norm_before: if True, normalization layer is before the activation, after otherwise.
+    """
+
     def __init__(
         self,
         in_channels=1,
@@ -104,7 +134,7 @@ def __init__(
         self.norm_layer = norm_layer
         norm_groups = None
         if norm_layer == "group-norm":
-            norm_groups = min(np.min(resb_channels) // 2, 32)
+            norm_groups = min(min(resb_channels) // 2, 32)
             norm_groups = max(norm_groups, resb_groups)
         self._norm_layer = NLF.create(norm_layer, norm_groups)
 
@@ -266,7 +296,17 @@ def out_shape(self, in_shape=None):
         else:
             W = self._compute_out_size(in_shape[3])
 
-        return (in_shape[0], out_chanels, H, W)
+        return (in_shape[0], out_channels, H, W)
+
+    @staticmethod
+    def _update_mask(x, x_lengths, x_mask=None):
+        if x_lengths is None:
+            return None
+
+        if x_mask is not None and x.size(-1) == x_mask.size(-1):
+            return x_mask
+
+        return seq_lengths_to_mask(x_lengths, x.size(-1), time_dim=3)
 
     def forward(self, x):
 
diff --git a/hyperion/torch/narchs/torch_na_loader.py b/hyperion/torch/narchs/torch_na_loader.py
index 542742fa..97bf5fb9 100644
--- a/hyperion/torch/narchs/torch_na_loader.py
+++ b/hyperion/torch/narchs/torch_na_loader.py
@@ -37,7 +37,7 @@ class TorchNALoader(object):
     @staticmethod
     def load(file_path, extra_objs={}):
-        model_data = torch.load(model_path)
+        model_data = torch.load(file_path)
         cfg = model_data["model_cfg"]
         class_name = cfg["class_name"]
         del cfg["class_name"]
diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py
index 8d479f24..f1a5b26c 100644
--- a/hyperion/torch/narchs/transformer_encoder_v1.py
+++ b/hyperion/torch/narchs/transformer_encoder_v1.py
@@ -157,7 +157,7 @@ def _make_in_layer(self):
                 nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc
             )
         elif isinstance(self.in_layer_type, nn.Module):
-            self.in_layer = nn.Sequential(in_layer_type, pos_enc)
+            self.in_layer = nn.Sequential(self.in_layer_type, pos_enc)
         elif self.in_layer_type is None:
             self.in_layer = pos_enc
         else:
diff --git a/hyperion/torch/narchs/xvector_classif.py b/hyperion/torch/narchs/xvector_classif.py
deleted file mode 100644
index e87c3db1..00000000
--- a/hyperion/torch/narchs/xvector_classif.py
+++ /dev/null
@@ -1,145 +0,0 @@
-"""
- Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-
-import numpy as np
-
-import torch.nn as nn
-from torch.nn import Linear, BatchNorm1d, Dropout
-
-from ..layers import ActivationFactory as AF
-from .net_arch import NetArch
-
-
-class XVectorClassifV1(NetArch):
-    def __init__(
-        self,
-        input_units,
-        num_classes,
-        embed_dim=512,
-        num_hid_layers=2,
-        hid_act="relu",
-        outputs="logits",
-        use_batchnorm=True,
-        dropout_rate=0,
-    ):
-
-        super(XVectorClassifV1, self).__init__()
-        assert num_hid_layers >= 1, "num_hid_layers (%d < 1)" % num_hid_layers
-
-        self.num_hid_layers = num_hid_layers
-        self.input_units = input_units
-        self.embed_dim = embed_dim
-        self.num_classes = num_classes
-        self.use_batchnorm = use_batchnorm
-        self.dropout_rate = dropout_rate
-        self.outputs = outputs
-
-        if 
isinstance(hid_units, list): - assert num_hid_layers == len(embed_dim) - else: - embed_dim = [embed_dim for i in range(num_hid_layers)] - - units = [input_units] + embed_dim - - # fully connected layers - fc_layers = [] - for i in range(1, num_hid_layers + 1): - fc_layers.append(Linear(units[i - 1], units[i])) - - self.fc_layers = nn.ModuleList(fc_layers) - - # hidden activations - self.hid_acts = None - if hid_act is not None: - hid_acts = [] - for i in range(num_hid_layers): - hid_act = AF.create(hid_act) - hid_acts.append(hid_act) - self.hid_acts = nn.ModuleList(hid_acts) - - # batch normalization - self.batchnorm_layers = None - if use_batchnorm: - batchnorm_layers = [] - for i in range(num_hid_layers): - batchnorm_layers.append(BatchNorm1d(units[i])) - self.batchnorm_layers = nn.ModuleList(batchnorm_layers) - - # dropout - self.dropout_layers = None - if dropout_rate > 0: - dropout_layers = [] - for i in range(num_hid_layers): - dropout_layers.append(Dropout(dropout_rate)) - self.dropout_layers = nn.ModuleList(dropout_layers) - - # output layers - self.logits_layer = Linear(units[-1], num_classes) - - def forward(self, x): - - for l in range(self.num_hid_layers): - if self.use_batchnorm: - x = self.batchnorm_layers[l](x) - - x = self.fc_layers[l](x) - if self.hid_acts is not None: - x = self.hid_acts[l](x) - - if self.dropout_rate > 0: - x = self.dropout_layers[l](x) - - y = self.logits_layer(x) - - return y - - def extract_embed(self, x, embed_layers=0): - - if isinstance(embed_layers, int): - embed_layers = [embed_layers] - - last_embed_layer = np.max(embed_layers) - embed_layers = set(embed_layers) - - embed_list = [] - for l in range(self.num_hid_layers): - if self.use_batchnorm: - x = self.batchnorm_layers[l](x) - - x = self.fc_layers[l](x) - if l in embed_layers: - embed_list.append(x) - - if l == last_embed_layer: - break - - if self.hid_acts is not None: - x = self.hid_acts[l](x) - - if self.dropout_rate > 0: - x = self.dropout_layers[l](x) - - y = torch.cat((embed_list), dim=-1) - return y - - def get_config(self): - - if self.hid_acts is None: - hid_act = None - else: - hid_act = AF.get_config(self.hid_acts[0]) - - config = { - "num_hid_layers": self.num_hid_layers, - "num_classes": self.num_classes, - "embed_dim": self.embed_dim, - "input_units": self.input_units, - "use_batchnorm": self.use_batchnorm, - "dropout_rate": self.dropout_rate, - "hid_act": hid_act, - } - - base_config = super(XVectorClassifV1, self).get_config() - return dict(list(base_config.items()) + list(config.items())) From c82231f2749565544bd073607fefcc9850d67fa3 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 17 Mar 2022 09:43:11 -0400 Subject: [PATCH 006/154] *** --- hyperion/torch/models/xvectors/spinenet_xvector.py | 2 +- hyperion/torch/torch_model_loader.py | 2 +- hyperion/torch/trainers/xvector_trainer_from_wav.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index 1b7401a4..d3a22bce 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -3,7 +3,7 @@ Copyright 2020 Magdalena Rybicka Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +from jsonargparse import ArgumentParser, ActionParser import logging import torch diff --git a/hyperion/torch/torch_model_loader.py b/hyperion/torch/torch_model_loader.py index 142656d1..92e0beb4 100644 --- a/hyperion/torch/torch_model_loader.py +++ 
b/hyperion/torch/torch_model_loader.py @@ -55,7 +55,7 @@ def load(file_path, extra_objs={}, map_location=None): if "n_averaged" in state_dict: del state_dict["n_averaged"] - cfg = self._fix_compatibilty(class_obj, cfg) + cfg = TorchModelLoader._fix_compatibilty(class_obj, cfg) p = re.compile("^module\.") num_tries = 3 diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index a00016e6..3519b6d6 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -122,6 +122,7 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() + self.feat_extractor.train() self.set_train_mode() for batch, (data, target) in enumerate(data_loader): @@ -170,6 +171,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): """ metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() + self.feat_extractor.eval() with torch.no_grad(): if swa_update_bn: log_tag = "train_" From 80cfc3c14478e9f4812d77becd71621a47833654 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 25 Mar 2022 15:37:00 -0400 Subject: [PATCH 007/154] find bug not initializing augmentation --- .../v1/conf/fbank40_nomn_16k.pyconf | 17 -- .../conf/fbank40_nope_hammw_stmn_16k.pyconf | 22 -- .../v1/conf/fbank40_stmn_16k.pyconf | 18 -- .../v1/conf/fbank40_stmvn_16k.pyconf | 19 -- egs/sre19-cmn2/v1/conf/fbank64_8k.pyconf | 14 - egs/sre19-cmn2/v1/conf/fbank64_mvn_8k.pyconf | 18 -- egs/sre19-cmn2/v1/conf/fbank64_stmn_8k.pyconf | 18 -- egs/sre19-cmn2/v1/conf/fbank80_16k.pyconf | 14 - egs/sre19-cmn2/v1/conf/fbank80_mvn_16k.pyconf | 18 -- .../v1/conf/fbank80_stmn_16k.pyconf | 18 -- .../v1/conf/linfbank40_stmn_16k.pyconf | 20 -- egs/sre19-cmn2/v1/conf/vad_16k.pyconf | 16 - ..._resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | 24 +- egs/voxceleb/v1.1/run_011_train_xvector.sh | 60 +--- hyperion/__init__.py | 13 +- .../bin/torch-train-spinenet-xvec-from-wav.py | 2 +- hyperion/bin/torch-train-xvec-from-wav.py | 215 -------------- hyperion/bin/torch_train_xvec_from_wav.py | 275 ++++++++++++++++++ hyperion/io/data_reader.py | 2 +- hyperion/io/data_rw_factory.py | 1 - hyperion/io/int32_writer.py | 2 +- hyperion/io/kaldi_data_reader.py | 2 +- hyperion/np/__init__.py | 1 + hyperion/np/augment/reverb_augment.py | 2 +- hyperion/np/augment/speed_augment.py | 2 +- hyperion/np/metrics/confusion_matrix.py | 2 +- hyperion/np/transforms/transform_list.py | 6 +- hyperion/torch/__init__.py | 15 - hyperion/torch/data/audio_dataset.py | 31 +- hyperion/torch/data/weighted_embed_sampler.py | 4 + hyperion/torch/data/weighted_seq_sampler.py | 23 +- .../torch/layer_blocks/resnet1d_blocks.py | 16 +- .../torch/layer_blocks/resnet2d_blocks.py | 11 +- .../layer_blocks/transformer_feedforward.py | 20 +- hyperion/torch/layers/global_pool.py | 14 +- hyperion/torch/layers/margin_losses.py | 4 +- hyperion/torch/lr_schedulers/cos_lr.py | 8 +- hyperion/torch/lr_schedulers/lr_scheduler.py | 5 +- hyperion/torch/models/xvectors/xvector.py | 2 +- hyperion/torch/torch_model_loader.py | 2 +- hyperion/torch/trainers/ae_trainer.py | 2 + hyperion/torch/trainers/dvae_trainer.py | 2 + hyperion/torch/trainers/plda_trainer.py | 2 + hyperion/torch/trainers/torch_trainer.py | 50 +++- hyperion/torch/trainers/vae_trainer.py | 2 + hyperion/torch/trainers/vq_dvae_trainer.py | 2 + hyperion/torch/trainers/vq_vae_trainer.py | 2 + .../torch/trainers/xvector_adv_trainer.py | 3 + .../trainers/xvector_adv_trainer_from_wav.py | 7 
+- hyperion/torch/trainers/xvector_trainer.py | 2 + .../trainers/xvector_trainer_deep_feat_reg.py | 2 + .../xvector_trainer_deep_feat_reg_from_wav.py | 2 + .../trainers/xvector_trainer_from_wav.py | 2 + 53 files changed, 485 insertions(+), 571 deletions(-) delete mode 100644 egs/sre19-cmn2/v1/conf/fbank40_nomn_16k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank40_nope_hammw_stmn_16k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank40_stmn_16k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank40_stmvn_16k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank64_8k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank64_mvn_8k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank64_stmn_8k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank80_16k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank80_mvn_16k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/fbank80_stmn_16k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/linfbank40_stmn_16k.pyconf delete mode 100644 egs/sre19-cmn2/v1/conf/vad_16k.pyconf delete mode 100755 hyperion/bin/torch-train-xvec-from-wav.py create mode 100755 hyperion/bin/torch_train_xvec_from_wav.py diff --git a/egs/sre19-cmn2/v1/conf/fbank40_nomn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank40_nomn_16k.pyconf deleted file mode 100644 index d04eb2ec..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank40_nomn_16k.pyconf +++ /dev/null @@ -1,17 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-no-norm-mean diff --git a/egs/sre19-cmn2/v1/conf/fbank40_nope_hammw_stmn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank40_nope_hammw_stmn_16k.pyconf deleted file mode 100644 index da766d41..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank40_nope_hammw_stmn_16k.pyconf +++ /dev/null @@ -1,22 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-preemphasis-coeff -0 ---feats-window-type -hamming ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank40_stmn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank40_stmn_16k.pyconf deleted file mode 100644 index 919efdec..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank40_stmn_16k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank40_stmvn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank40_stmvn_16k.pyconf deleted file mode 100644 index b81e9283..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank40_stmvn_16k.pyconf +++ /dev/null @@ -1,19 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 ---mvn-norm-var diff --git a/egs/sre19-cmn2/v1/conf/fbank64_8k.pyconf b/egs/sre19-cmn2/v1/conf/fbank64_8k.pyconf deleted file mode 100644 index f5a57052..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank64_8k.pyconf +++ /dev/null @@ -1,14 +0,0 @@ ---sample-frequency -8000 ---frame-length -25 ---low-freq -20 ---high-freq -3700 ---num-filters -64 ---snip-edges -false ---use-energy -false diff --git 
a/egs/sre19-cmn2/v1/conf/fbank64_mvn_8k.pyconf b/egs/sre19-cmn2/v1/conf/fbank64_mvn_8k.pyconf deleted file mode 100644 index 29ce58a9..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank64_mvn_8k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -8000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -3700 ---feats-num-filters -64 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank64_stmn_8k.pyconf b/egs/sre19-cmn2/v1/conf/fbank64_stmn_8k.pyconf deleted file mode 100644 index 29ce58a9..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank64_stmn_8k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -8000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -3700 ---feats-num-filters -64 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank80_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank80_16k.pyconf deleted file mode 100644 index 3e65fe32..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank80_16k.pyconf +++ /dev/null @@ -1,14 +0,0 @@ ---sample-frequency -16000 ---frame-length -25 ---low-freq -20 ---high-freq -7600 ---num-filters -80 ---snip-edges -false ---use-energy -false diff --git a/egs/sre19-cmn2/v1/conf/fbank80_mvn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank80_mvn_16k.pyconf deleted file mode 100644 index ffdbf165..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank80_mvn_16k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -80 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/fbank80_stmn_16k.pyconf b/egs/sre19-cmn2/v1/conf/fbank80_stmn_16k.pyconf deleted file mode 100644 index ffdbf165..00000000 --- a/egs/sre19-cmn2/v1/conf/fbank80_stmn_16k.pyconf +++ /dev/null @@ -1,18 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -20 ---feats-high-freq -7600 ---feats-num-filters -80 ---feats-snip-edges -false ---feats-use-energy -false ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/linfbank40_stmn_16k.pyconf b/egs/sre19-cmn2/v1/conf/linfbank40_stmn_16k.pyconf deleted file mode 100644 index f80faad2..00000000 --- a/egs/sre19-cmn2/v1/conf/linfbank40_stmn_16k.pyconf +++ /dev/null @@ -1,20 +0,0 @@ ---feats-audio-feat -logfb ---feats-sample-frequency -16000 ---feats-frame-length -25 ---feats-low-freq -64 ---feats-high-freq -8000 ---feats-num-filters -40 ---feats-snip-edges -false ---feats-use-energy -false ---feats-fb-type -linear ---mvn-context -150 diff --git a/egs/sre19-cmn2/v1/conf/vad_16k.pyconf b/egs/sre19-cmn2/v1/conf/vad_16k.pyconf deleted file mode 100644 index b52af74b..00000000 --- a/egs/sre19-cmn2/v1/conf/vad_16k.pyconf +++ /dev/null @@ -1,16 +0,0 @@ ---sample-frequency -16000 ---frame-shift -10 ---frame-length -25 ---snip-edges -false ---vad-energy-threshold -5.5 ---vad-energy-mean-scale -0.5 ---vad-proportion-threshold -0.12 ---vad-frames-context -2 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh index fc62c86b..63fecf32 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ 
b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,31 +9,27 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg + +nnet_type=resnet + +resnet_type=resnet34 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=resnet34 dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index 0b0e4d50..9b30369e 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -10,9 +10,8 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" use_tb=false use_wandb=false @@ -20,20 +19,17 @@ use_wandb=false . $config_file . 
datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$use_tb" == "true" ];then - args="$args --use-tensorboard" + extra_args="$extra_args --trainer.use-tensorboard" fi if [ "$use_wandb" == "true" ];then - args="$args --use-wandb --wandb.project voxceleb-v1.1 --wandb.name $nnet_name.$(date -Iminutes)" + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" fi if [ "$interactive" == "true" ];then @@ -43,47 +39,21 @@ fi # Network Training if [ $stage -le 1 ]; then - if [[ ${nnet_type} =~ resnet1d ]]; then - train_exec=torch-train-resnet1d-xvec-from-wav.py - elif [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]] || [[ ${nnet_type} =~ res2net ]] || [[ ${nnet_type} =~ res2next ]]; then - train_exec=torch-train-resnet-xvec-from-wav.py - elif [[ ${nnet_type} =~ efficientnet ]]; then - train_exec=torch-train-efficientnet-xvec-from-wav.py - elif [[ ${nnet_type} =~ tdnn ]]; then - train_exec=torch-train-tdnn-xvec-from-wav.py - elif [[ ${nnet_type} =~ transformer ]]; then - train_exec=torch-train-transformer-xvec-v1-from-wav.py - elif [[ ${nnet_type} =~ spinenet ]] || [[ ${nnet_type} =~ spine2net ]] || [[ ${nnet_type} =~ r0_sp53 ]]; then - train_exec=torch-train-spinenet-xvec-from-wav.py - else - echo "$nnet_type not supported" - exit 1 - fi mkdir -p $nnet_dir/log $cuda_cmd \ --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --embed-dim $embed_dim $nnet_opt $opt_opt $lrs_opt \ - --epochs $nnet_num_epochs \ - --cos-scale $s --margin $margin --margin-warmup-epochs $margin_warmup \ - --dropout-rate $dropout \ + torch_train_xvec_from_wav.py $nnet_type --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ + --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir $args \ --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args fi - -exit diff --git a/hyperion/__init__.py b/hyperion/__init__.py index 6e59062b..055441cd 100644 --- a/hyperion/__init__.py +++ b/hyperion/__init__.py @@ -5,17 +5,10 @@ from . import utils -from . import metrics -from . import pdfs -from . import transforms +from . import np from . import io -from . import feats -from . import calibration -from . import score_norm - -# from . import keras +from . 
import torch from . import helpers -# from . import generators -__version__ = "0.3.1" +__version__ = "0.4.0a" diff --git a/hyperion/bin/torch-train-spinenet-xvec-from-wav.py b/hyperion/bin/torch-train-spinenet-xvec-from-wav.py index 91aa17b1..7bac503c 100755 --- a/hyperion/bin/torch-train-spinenet-xvec-from-wav.py +++ b/hyperion/bin/torch-train-spinenet-xvec-from-wav.py @@ -15,6 +15,7 @@ import time import logging import multiprocessing +from pathlib import Path import numpy as np @@ -22,7 +23,6 @@ import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device from hyperion.torch.utils import ddp from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.models import SpineNetXVector as XVec diff --git a/hyperion/bin/torch-train-xvec-from-wav.py b/hyperion/bin/torch-train-xvec-from-wav.py deleted file mode 100755 index 8dcd0482..00000000 --- a/hyperion/bin/torch-train-xvec-from-wav.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -import multiprocessing - -import numpy as np - -import torch -import torch.nn as nn - -from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.data import AudioDataset as AD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch.models import ResNetXVector as RXVec -from hyperion.torch.models import EfficientNetXVector as EXVec -from hyperion.torch.models import TDNNXVector as TDXVec -from hyperion.torch.models import TransformerXVectorV1 as TFXVec - -xvec_dict = { - "resnet": RXVec, - "efficientnet": EXVec, - "tdnn": TDXVec, - "transformer": TFXVec, -} - - -def init_data( - audio_path, - train_list, - val_list, - train_aug_cfg, - val_aug_cfg, - num_workers, - num_gpus, - rank, - **kwargs -): - - ad_args = AD.filter_args(**kwargs) - sampler_args = Sampler.filter_args(**kwargs) - if rank == 0: - logging.info("audio dataset args={}".format(ad_args)) - logging.info("sampler args={}".format(sampler_args)) - logging.info("init datasets") - - train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) - val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - - if rank == 0: - logging.info("init samplers") - train_sampler = Sampler(train_data, **sampler_args) - val_sampler = Sampler(val_data, **sampler_args) - - num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ( - {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} - ) - - train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler=train_sampler, **largs - ) - - test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler=val_sampler, **largs - ) - - return train_loader, test_loader - - -def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - if rank == 0: - logging.info("feat args={}".format(feat_args)) - logging.info("initializing feature 
extractor") - feat_extractor = AF(trans=True, **feat_args) - if rank == 0: - logging.info("feat-extractor={}".format(feat_extractor)) - return feat_extractor - - -def init_xvector(num_classes, rank, xvec_class, **kwargs): - - xvec_args = xvec_class.filter_args(**kwargs) - if rank == 0: - logging.info("xvector network args={}".format(xvec_args)) - xvec_args["num_classes"] = num_classes - model = xvec_class(**xvec_args) - if rank == 0: - logging.info("x-vector-model={}".format(model)) - return model - - -def train_xvec(gpu_id, args): - - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - kwargs = namespace_to_dict(args) - torch.manual_seed(args.seed) - set_float_cpu("float32") - - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank - - train_loader, test_loader = init_data(**kwargs) - feat_extractor = init_feats(**kwargs) - model = init_xvector(train_loader.dataset.num_classes, **kwargs) - - trn_args = Trainer.filter_args(**kwargs) - if rank == 0: - logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} - trainer = Trainer( - model, - feat_extractor, - device=device, - metrics=metrics, - ddp=world_size > 1, - **trn_args - ) - if args.resume: - trainer.load_last_checkpoint() - trainer.fit(train_loader, test_loader) - - ddp.ddp_cleanup() - - -def make_parser(xvec_class): - parser = ArgumentParser() - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--audio-path", required=True) - parser.add_argument("--train-list", required=True) - parser.add_argument("--val-list", required=True) - - AD.add_class_args(parser) - Sampler.add_class_args(parser) - - parser.add_argument("--train-aug-cfg", default=None) - parser.add_argument("--val-aug-cfg", default=None) - - parser.add_argument( - "--num-workers", type=int, default=5, help="num_workers of data loader" - ) - - AF.add_class_args(parser, prefix="feats") - xvec_class.add_class_args(parser) - Trainer.add_class_args(parser) - ddp.add_ddp_args(parser) - parser.add_argument("--seed", type=int, default=1123581321, help="random seed") - parser.add_argument( - "--resume", - action="store_true", - default=False, - help="resume training from checkpoint", - ) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - return parser - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Train XVector from audio files") - - parser.add_argument("--cfg", action=ActionConfigFile) - - subcommands = parser.add_subcommands() - - for k, v in xvec_dict.items(): - parser_k = make_parser(v) - subcommands.add_subcommand(k, parser_k) - - args = parser.parse_args() - try: - gpu_id = int(os.environ["LOCAL_RANK"]) - except: - gpu_id = 0 - - xvec_type = args.subcommand - args_sc = vars(args)[xvec_type] - - if gpu_id == 0: - try: - config_file = Path(args_sc.exp_path) / "config.yaml" - parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass - - args_sc.xvec_class = xvec_dict[xvec_type] - # torch docs recommend using forkserver - multiprocessing.set_start_method("forkserver") - train_xvec(gpu_id, args_sc) diff --git a/hyperion/bin/torch_train_xvec_from_wav.py b/hyperion/bin/torch_train_xvec_from_wav.py new file mode 100755 index 00000000..df948b87 --- /dev/null +++ b/hyperion/bin/torch_train_xvec_from_wav.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + 
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import open_device +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import TDNNXVector as TDXVec +from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "efficientnet": EXVec, + "tdnn": TDXVec, + "transformer": TFXVec, + "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = Sampler.filter_args(**kwargs["sampler"]) + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + ad_args["is_val"] = partition == "val" + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = Sampler(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +# def init_data( +# audio_path, +# train_list, +# val_list, +# train_aug_cfg, +# val_aug_cfg, +# num_workers, +# num_gpus, +# rank, +# **kwargs +# ): + +# ad_args = AD.filter_args(**kwargs) +# sampler_args = Sampler.filter_args(**kwargs) +# if rank == 0: +# logging.info("audio dataset args={}".format(ad_args)) +# logging.info("sampler args={}".format(sampler_args)) +# logging.info("init datasets") + +# train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) +# val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) + +# if rank == 0: +# logging.info("init samplers") +# train_sampler = Sampler(train_data, **sampler_args) +# val_sampler = Sampler(val_data, **sampler_args) + +# num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) +# largs = ( +# {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} +# ) + +# train_loader = torch.utils.data.DataLoader( +# train_data, batch_sampler=train_sampler, **largs +# ) + +# test_loader = torch.utils.data.DataLoader( +# val_data, batch_sampler=val_sampler, **largs +# ) + +# return train_loader, test_loader + + +def init_feats(rank, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + if rank == 0: + logging.info("feat 
args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + if rank == 0: + logging.info("feat-extractor={}".format(feat_extractor)) + return feat_extractor + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + feat_extractor = init_feats(**kwargs) + model = init_xvector(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + # parser.add_argument("--audio-path", required=True) + # parser.add_argument("--train-list", required=True) + # parser.add_argument("--val-list", required=True) + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + Sampler.add_class_args(train_parser, prefix="sampler") + # parser.add_argument("--train-aug-cfg", default=None) + # parser.add_argument("--val-aug-cfg", default=None) + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + Sampler.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_file", "data.val.dataset.class_file" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + parser.link_arguments( + "data.train.sampler.batch_size", "data.val.sampler.batch_size" + ) + + AF.add_class_args(parser, prefix="feats") + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, prefix="trainer") + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + # parser.add_argument( + # "--resume", + # action="store_true", + # default=False, + # help="resume training from checkpoint", + # ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + 
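The make_parser above nests separate train and val parsers under a single --data namespace and relies on jsonargparse's link_arguments so the options that must agree across partitions (class list, workers, batch size) are only typed once. A minimal, self-contained sketch of the same pattern; the option names and the lists/classes.csv path are illustrative, not taken from the recipes:

from jsonargparse import ArgumentParser, ActionParser

train_parser = ArgumentParser(prog="")
train_parser.add_argument("--dataset.class_file", default=None)
train_parser.add_argument("--data_loader.num_workers", type=int, default=5)

val_parser = ArgumentParser(prog="")
val_parser.add_argument("--dataset.class_file", default=None)
val_parser.add_argument("--data_loader.num_workers", type=int, default=5)

data_parser = ArgumentParser(prog="")
data_parser.add_argument("--train", action=ActionParser(parser=train_parser))
data_parser.add_argument("--val", action=ActionParser(parser=val_parser))

parser = ArgumentParser()
parser.add_argument("--data", action=ActionParser(parser=data_parser))
# val options are derived from the train ones, so the two cannot diverge
parser.link_arguments("data.train.dataset.class_file", "data.val.dataset.class_file")
parser.link_arguments("data.train.data_loader.num_workers", "data.val.data_loader.num_workers")

args = parser.parse_args(["--data.train.dataset.class_file", "lists/classes.csv"])
print(args.data.val.dataset.class_file)  # lists/classes.csv, propagated from train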
+ +if __name__ == "__main__": + + parser = ArgumentParser(description="Train XVector from audio files") + + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) diff --git a/hyperion/io/data_reader.py b/hyperion/io/data_reader.py index f0c61d3a..da0279e1 100644 --- a/hyperion/io/data_reader.py +++ b/hyperion/io/data_reader.py @@ -10,7 +10,7 @@ from ..hyp_defs import float_cpu from ..utils.scp_list import SCPList -from ..transforms import TransformList +from ..np.transforms import TransformList class DataReader(object): diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index ed408156..0c49cd9f 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -13,7 +13,6 @@ from .ark_data_reader import SequentialArkFileDataReader as SAFDR from .ark_data_reader import SequentialArkScriptDataReader as SASDR from .ark_data_reader import RandomAccessArkDataReader as RADR -from .h5_data_writer import H5DataWriter as H5DW from .h5_data_reader import SequentialH5FileDataReader as SH5FDR from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR from .h5_data_reader import RandomAccessH5FileDataReader as RH5FDR diff --git a/hyperion/io/int32_writer.py b/hyperion/io/int32_writer.py index c823dc0e..d881fb16 100644 --- a/hyperion/io/int32_writer.py +++ b/hyperion/io/int32_writer.py @@ -12,4 +12,4 @@ class Int32Writer(DataWriter): """Class to write data to int32 files.""" def __init__(self, wspecifier): - super(Int32Writer, self).__init__(wspecifier) + super().__init__(wspecifier) diff --git a/hyperion/io/kaldi_data_reader.py b/hyperion/io/kaldi_data_reader.py index 712941ec..6313cb29 100644 --- a/hyperion/io/kaldi_data_reader.py +++ b/hyperion/io/kaldi_data_reader.py @@ -133,7 +133,7 @@ def _read_ascii_matrix(f): while 1: line = f.readline() if len(line) == 0: - raise BadInputFormat # eof, should not happen! + raise ValueError() # eof, should not happen! 
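The __main__ block above follows the torchrun launch convention: each worker process finds its GPU in the LOCAL_RANK environment variable (falling back to 0 for single-process runs) and switches multiprocessing to forkserver before any DataLoader workers are spawned, as the torch docs recommend. A stripped-down sketch of that entry point, where main() is a placeholder for the real training function:

import os
import multiprocessing

import torch

def main(gpu_id):
    # placeholder for train_xvec(gpu_id, args)
    device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    print("worker running on", device)

if __name__ == "__main__":
    # torchrun exports LOCAL_RANK per process; default to 0 when launched directly
    gpu_id = int(os.environ.get("LOCAL_RANK", 0))
    multiprocessing.set_start_method("forkserver")
    main(gpu_id)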
if len(line.strip()) == 0: continue # skip empty line arr = line.strip().split() diff --git a/hyperion/np/__init__.py b/hyperion/np/__init__.py index d2774314..86fff349 100644 --- a/hyperion/np/__init__.py +++ b/hyperion/np/__init__.py @@ -3,5 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + from .np_model import NPModel from .np_model_loader import NPModelLoader diff --git a/hyperion/np/augment/reverb_augment.py b/hyperion/np/augment/reverb_augment.py index fe225e53..ef5293d6 100644 --- a/hyperion/np/augment/reverb_augment.py +++ b/hyperion/np/augment/reverb_augment.py @@ -15,7 +15,7 @@ from scipy import signal from ...hyp_defs import float_cpu -from ..io import RandomAccessDataReaderFactory as DRF +from ...io import RandomAccessDataReaderFactory as DRF class RIRNormType(Enum): diff --git a/hyperion/np/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py index 7fdaab3c..2f353ebe 100644 --- a/hyperion/np/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -5,7 +5,7 @@ import logging from copy import deepcopy -import multiprocessing +import yaml import numpy as np from librosa.effects import time_stretch diff --git a/hyperion/np/metrics/confusion_matrix.py b/hyperion/np/metrics/confusion_matrix.py index 2efdd9e4..084aa7a9 100644 --- a/hyperion/np/metrics/confusion_matrix.py +++ b/hyperion/np/metrics/confusion_matrix.py @@ -8,7 +8,7 @@ import matplotlib.pyplot as plt from sklearn.metrics import confusion_matrix -from ..utils.list_utils import list2ndarray +from ...utils.list_utils import list2ndarray def compute_confusion_matrix( diff --git a/hyperion/np/transforms/transform_list.py b/hyperion/np/transforms/transform_list.py index 3e89966a..62bc802e 100644 --- a/hyperion/np/transforms/transform_list.py +++ b/hyperion/np/transforms/transform_list.py @@ -8,7 +8,7 @@ import numpy as np import h5py -from ..hyp_model import HypModel +from ..np_model import NPModel from .cent_whiten import CentWhiten from .cent_whiten_up import CentWhitenUP @@ -22,11 +22,11 @@ from .gaussianizer import Gaussianizer -class TransformList(HypModel): +class TransformList(NPModel): """Class to perform a list of transformations""" def __init__(self, transforms, **kwargs): - super(TransformList, self).__init__(**kwargs) + super().__init__(**kwargs) if not isinstance(transforms, list): transforms = [transforms] self.transforms = transforms diff --git a/hyperion/torch/__init__.py b/hyperion/torch/__init__.py index 8fade929..41745d38 100644 --- a/hyperion/torch/__init__.py +++ b/hyperion/torch/__init__.py @@ -3,21 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# - -# from . import utils -# from . import loggers -# from . import metrics -# from . import lr_schedulers -# from . import data -# from . import layers -# from . import layer_blocks -# from . import narchs -# from . import trainers -# from . import transforms -# from . import adv_attacks -# from . import helpers -# from . 
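The import churn above is all fallout from the move into the np subpackage: modules that used to sit at hyperion/<pkg> now live one level deeper at hyperion/np/<pkg>, so every relative import that reaches a sibling top-level package needs one extra dot. Schematically, using the reverb_augment hunk as the example:

# old location: hyperion/augment/reverb_augment.py
from ..io import RandomAccessDataReaderFactory as DRF    # ..  -> hyperion

# new location: hyperion/np/augment/reverb_augment.py
from ...io import RandomAccessDataReaderFactory as DRF   # ... -> hyperion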
import seq_embed from .torch_model import TorchModel from .torch_model_loader import TorchModelLoader diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 1801f11a..38da8eb9 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -25,7 +25,7 @@ class AudioDataset(Dataset): def __init__( self, - audio_path, + audio_file, key_file, class_file=None, time_durs_file=None, @@ -51,8 +51,8 @@ def __init__( self.world_size = world_size if rank == 0: - logging.info("opening dataset %s" % audio_path) - self.r = AR(audio_path, wav_scale=wav_scale) + logging.info("opening dataset %s", audio_file) + self.r = AR(audio_file, wav_scale=wav_scale) if rank == 0: logging.info("loading utt2info file %s" % key_file) self.u2c = Utt2Info.load(key_file, sep=" ") @@ -62,7 +62,6 @@ def __init__( self.is_val = is_val self._read_time_durs_file(time_durs_file) - # self._seq_lengths = self.r.read_time_duration(self.u2c.key) self._prune_short_seqs(min_chunk_length) self.short_seq_exist = self._seq_shorter_than_max_length_exists( @@ -366,6 +365,9 @@ def filter_args(**kwargs): ar_args = AR.filter_args(**kwargs) valid_args = ( + "audio_file", + "key_file", + "aug_cfg", "path_prefix", "class_file", "time_durs_file", @@ -380,7 +382,7 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix=None, skip={"audio_file", "key_file"}): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -388,6 +390,19 @@ def add_class_args(parser, prefix=None): # parser.add_argument('--path-prefix', # default='', # help=('path prefix for rspecifier scp file')) + if "audio_file" not in skip: + parser.add_argument( + "--audio-file", + required=True, + help=("audio manifest file"), + ) + + if "key_file" not in skip: + parser.add_argument( + "--key-file", + required=True, + help=("key manifest file"), + ) parser.add_argument( "--class-file", @@ -399,6 +414,12 @@ def add_class_args(parser, prefix=None): "--time-durs-file", default=None, help=("utt to duration in secs file") ) + parser.add_argument( + "--aug-cfg", + default=None, + help=("augmentation configuration file."), + ) + parser.add_argument( "--min-chunk-length", type=float, diff --git a/hyperion/torch/data/weighted_embed_sampler.py b/hyperion/torch/data/weighted_embed_sampler.py index 61e4a0ad..2c381365 100644 --- a/hyperion/torch/data/weighted_embed_sampler.py +++ b/hyperion/torch/data/weighted_embed_sampler.py @@ -36,6 +36,10 @@ def __iter__(self): self.batch = 0 return self + @property + def avg_batch_size(self): + return self.batch_size + def _remove_duplicate_idx(self, utt_idx): utt_idx_uniq = torch.unique(utt_idx) c = 0 diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py index 9d128bb8..34c3fcbc 100644 --- a/hyperion/torch/data/weighted_seq_sampler.py +++ b/hyperion/torch/data/weighted_seq_sampler.py @@ -35,7 +35,7 @@ def __init__( world_size = 1 self.dataset = dataset - self.batch_size = int(math.ceil(batch_size / world_size)) + self.batch_size = batch_size self.num_egs_per_class = num_egs_per_class self.num_egs_per_utt = num_egs_per_utt self.var_batch_size = var_batch_size @@ -65,13 +65,22 @@ def __init__( self.iters_per_epoch * dataset.num_seqs / avg_batch_size / world_size ) ) - - logging.info("num batches per epoch: %d" % self._len) + print( + "num_batches", + self.iters_per_epoch, + dataset.num_seqs, + avg_batch_size, + world_size, + self._len, + 
flush=True, + ) + self.avg_batch_size = avg_batch_size + logging.info("num batches per epoch: %d", self._len) self._num_classes_per_batch = int( - math.ceil(batch_size / num_egs_per_class / num_egs_per_utt) + math.ceil(avg_batch_size / num_egs_per_class / num_egs_per_utt) ) - logging.info("num classes per batch: %d" % self._num_classes_per_batch) + logging.info("num classes per batch: %d", self._num_classes_per_batch) # self.weights = torch.as_tensor(dataset.class_weights, dtype=torch.double) @@ -228,7 +237,9 @@ def add_class_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") - parser.add_argument("--batch-size", default=128, type=int, help=("batch size")) + parser.add_argument( + "--batch-size", default=128, type=int, help=("batch size per gpu") + ) parser.add_argument( "--var-batch-size", diff --git a/hyperion/torch/layer_blocks/resnet1d_blocks.py b/hyperion/torch/layer_blocks/resnet1d_blocks.py index 01fd1087..ca99bb3d 100644 --- a/hyperion/torch/layer_blocks/resnet1d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet1d_blocks.py @@ -260,6 +260,7 @@ class ResNet1dBasicDecBlock(nn.Module): norm_layer: normalization layer constructor, if None BatchNorm1d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + expansion = 1 # __constants__ = ['downsample'] @@ -541,6 +542,7 @@ class ResNet1dBNDecBlock(nn.Module): norm_layer: normalization layer constructor, if None BatchNorm1d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + def __init__( self, in_channels, @@ -680,6 +682,7 @@ class SEResNet1dBasicBlock(ResNet1dBasicBlock): norm_layer: normalization layer constructor, if None BatchNorm1d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + expansion = 1 def __init__( @@ -780,6 +783,7 @@ class SEResNet1dBasicDecBlock(ResNet1dBasicDecBlock): norm_layer: normalization layer constructor, if None BatchNorm1d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + expansion = 1 def __init__( @@ -886,7 +890,7 @@ class SEResNet1dBNBlock(ResNet1dBNBlock): norm_layer: normalization layer constructor, if None BatchNorm1d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ - + def __init__( self, in_channels, @@ -976,7 +980,7 @@ def forward(self, x, x_mask=None): class SEResNet1dBNDecBlock(ResNet1dBNDecBlock): - """Squeeze-excitation ResNet 1d bottleneck Block for decoders. + """Squeeze-excitation ResNet 1d bottleneck Block for decoders. Attributes: in_channels: input channels. @@ -995,6 +999,7 @@ class SEResNet1dBNDecBlock(ResNet1dBNDecBlock): norm_layer: normalization layer constructor, if None BatchNorm1d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + def __init__( self, in_channels, @@ -1084,7 +1089,7 @@ def forward(self, x, x_mask=None): class ResNet1dEndpoint(nn.Module): - """ Class that connects the ouputs of the ResNet1d to the rest of the network + """Class that connects the ouputs of the ResNet1d to the rest of the network when using multilevel feature aggregation. It converts the features of all the levels that we are going to aggregate @@ -1102,6 +1107,7 @@ class ResNet1dEndpoint(nn.Module): norm_before: if True, normalization layer is before the activation, after otherwise. 
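With the sampler change above, --batch-size is interpreted per GPU and is no longer divided by world_size; the epoch length and the number of classes drawn per batch then follow from avg_batch_size. The arithmetic is easy to check by hand (all numbers below are made up for illustration):

import math

num_seqs = 100_000       # utterances in the train list
iters_per_epoch = 6
batch_size = 32          # per GPU, after this change
num_egs_per_class = 1
num_egs_per_utt = 1
world_size = 4

avg_batch_size = batch_size  # var_batch_size=False, so batches are fixed size
batches_per_epoch = int(math.ceil(iters_per_epoch * num_seqs / avg_batch_size / world_size))
classes_per_batch = int(math.ceil(avg_batch_size / num_egs_per_class / num_egs_per_utt))
print(batches_per_epoch, classes_per_batch)  # 4688 32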
""" + def __init__( self, in_channels, @@ -1114,7 +1120,7 @@ def __init__( norm_layer=None, norm_before=True, ): - + super().__init__() if norm_layer is None: norm_layer = nn.BatchNorm1d @@ -1155,6 +1161,6 @@ def forward(self, x, x_mask=None): """ x = self.resample(x) x = self.act(x) - if self.use_norm not self.norm_before: + if self.use_norm and not self.norm_before: x = self.bn(x) return x diff --git a/hyperion/torch/layer_blocks/resnet2d_blocks.py b/hyperion/torch/layer_blocks/resnet2d_blocks.py index 7fbb8327..65761526 100644 --- a/hyperion/torch/layer_blocks/resnet2d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet2d_blocks.py @@ -93,7 +93,7 @@ class ResNet2dBasicBlock(nn.Module): use_norm: if True, it uses normalization layers, otherwise it does not. norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. - + """ expansion = 1 @@ -213,7 +213,7 @@ class ResNet2dBasicDecBlock(nn.Module): use_norm: if True, it uses normalization layers, otherwise it does not. norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. - + """ expansion = 1 @@ -456,7 +456,7 @@ def forward(self, x, x_mask=None): class ResNet2dBNDecBlock(nn.Module): - """ResNet 2d bottleneck Block decoder. + """ResNet 2d bottleneck Block decoder. Attributes: in_channels: input channels. @@ -472,6 +472,7 @@ class ResNet2dBNDecBlock(nn.Module): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + def __init__( self, in_channels, @@ -602,6 +603,7 @@ class SEResNet2dBasicBlock(ResNet2dBasicBlock): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + expansion = 1 def __init__( @@ -697,6 +699,7 @@ class SEResNet2dBasicDecBlock(ResNet2dBasicDecBlock): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + expansion = 1 def __init__( @@ -796,6 +799,7 @@ class SEResNet2dBNBlock(ResNet2dBNBlock): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. """ + def __init__( self, in_channels, @@ -897,6 +901,7 @@ class SEResNet2dBNDecBlock(ResNet2dBNDecBlock): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. 
""" + def __init__( self, in_channels, diff --git a/hyperion/torch/layer_blocks/transformer_feedforward.py b/hyperion/torch/layer_blocks/transformer_feedforward.py index 93cc6b66..7d2e8c1b 100644 --- a/hyperion/torch/layer_blocks/transformer_feedforward.py +++ b/hyperion/torch/layer_blocks/transformer_feedforward.py @@ -43,7 +43,7 @@ def forward(self, x): Tensor size=(batch, time, num_feats) """ if self.time_dim != 1: - x = x.transpose(1, time_dim) + x = x.transpose(1, self.time_dim) x = self.activation(self.w_1(x)) if self.dropout_rate > 0: @@ -51,7 +51,7 @@ def forward(self, x): x = self.w_2(x) if self.time_dim != 1: - x = x.transpose(1, time_dim) + x = x.transpose(1, self.time_dim) return x @@ -73,7 +73,13 @@ class Conv1dx2(nn.Module): """ def __init__( - self, num_channels, hid_channels, kernel_size, dropout_rate=0, time_dim=-1 + self, + num_channels, + hid_channels, + kernel_size, + activation="relu6", + dropout_rate=0, + time_dim=-1, ): super().__init__() @@ -133,7 +139,13 @@ class Conv1dLinear(nn.Module): """ def __init__( - self, num_channels, hid_channels, kernel_size, dropout_rate=0, time_dim=-1 + self, + num_channels, + hid_channels, + kernel_size, + activation="relu6", + dropout_rate=0, + time_dim=-1, ): super().__init__() self.w_1 = nn.Conv1d( diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 467ea589..b6b3569e 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -10,9 +10,7 @@ import torch.nn as nn import torch.nn.functional as nnf -from hyperion.torch.utils.masking import seq_lengths_to_mask - -from ..utils import seq_le +from ..utils import seq_lengths_to_mask SQRT_EPS = 1e-5 N_EPS = 1e-6 @@ -44,7 +42,7 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): """ if weights is None: return seq_lengths_to_mask( - x, x.size(self.dim), dtype=x.dtype, time_dim=self.dim + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=self.dim ) if weights.dim() == x.dim(): @@ -478,7 +476,9 @@ def __str__(self): def _standardize_weights(self, x, x_lengths=None, weights=None): """standardizes the weights to have shape (batch, max_length).""" if weights is None: - return seq_lengths_to_mask(x, x.size(self.dim), dtype=x.dtype, time_dim=1) + return seq_lengths_to_mask( + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=1 + ) if weights.dim() == x.dim(): return weights.traspose(1, self.dim) @@ -597,7 +597,9 @@ def __str__(self): def _standardize_weights(self, x, x_lengths=None, weights=None): """standardizes the weights to have shape (batch, max_length).""" if weights is None: - return seq_lengths_to_mask(x, x.size(self.dim), dtype=x.dtype, time_dim=1) + return seq_lengths_to_mask( + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=1 + ) if weights.dim() == x.dim(): return weights.traspose(1, self.dim) diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index 5ae2b518..63da2493 100644 --- a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -251,8 +251,8 @@ def forward(self, x, y=None): """Computes penalized logits. Args: - x: input feature tensor with shape = (batch, in_feats). - y: ground truth classes. This is required to penalize the logit of + x: Input feature tensor with shape = (batch, in_feats). + y: Ground truth classes. This is required to penalize the logit of the true class at training time. 
Returns: diff --git a/hyperion/torch/lr_schedulers/cos_lr.py b/hyperion/torch/lr_schedulers/cos_lr.py index 6e36cf2a..83b9206f 100644 --- a/hyperion/torch/lr_schedulers/cos_lr.py +++ b/hyperion/torch/lr_schedulers/cos_lr.py @@ -64,7 +64,7 @@ def __init__( self.gamma = gamma def on_epoch_begin(self, epoch=None, epoch_updates=1, **kwargs): - super(CosineLR, self).on_epoch_begin(epoch) + super().on_epoch_begin(epoch) if self.update_lr_on_opt_step: # T has to correspond to an integer number of epochs T = int(math.ceil(self.T / epoch_updates) * epoch_updates) @@ -122,7 +122,7 @@ def __init__( step=-1, update_lr_on_opt_step=False, ): - super(AdamCosineLR, super).__init__( + super().__init__( optimizer, T, T_mul, @@ -143,12 +143,12 @@ def get_lr(self, step): if self.warm_restarts: self.last_restart = step x = 0 - self.T *= T_mul + self.T *= self.T_mul self.num_restarts += 1 else: return self.min_lrs - alpha = gamma ** self.num_restarts + alpha = self.gamma ** self.num_restarts r = math.pi / self.T return [ diff --git a/hyperion/torch/lr_schedulers/lr_scheduler.py b/hyperion/torch/lr_schedulers/lr_scheduler.py index 319ea7a2..2ad1740e 100644 --- a/hyperion/torch/lr_schedulers/lr_scheduler.py +++ b/hyperion/torch/lr_schedulers/lr_scheduler.py @@ -56,7 +56,7 @@ def __init__( @property def in_warmup(self): - return self.step <= self.warmup_steps + return self.step < self.warmup_steps def state_dict(self): """Returns the state of the scheduler as a :class:`dict`. @@ -104,9 +104,6 @@ def on_epoch_end(self, metrics=None): def on_opt_step(self): - # self.update_lr_on_opt_step=True - # print('exp-lr', self.last_step, self.hold_steps, self.decay_rate, self.decay_steps) - if self.in_warmup: for param_group, lr in zip( self.optimizer.param_groups, self.get_warmup_lr() diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 685ead4a..3e9e9fcd 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -699,7 +699,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - valid_args = ("loss_type", "s", "margin", "margin_warmup_epochs") + valid_args = ("loss_type", "cos_scale", "margin", "margin_warmup_epochs") args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args diff --git a/hyperion/torch/torch_model_loader.py b/hyperion/torch/torch_model_loader.py index 92e0beb4..c173cd50 100644 --- a/hyperion/torch/torch_model_loader.py +++ b/hyperion/torch/torch_model_loader.py @@ -55,7 +55,7 @@ def load(file_path, extra_objs={}, map_location=None): if "n_averaged" in state_dict: del state_dict["n_averaged"] - cfg = TorchModelLoader._fix_compatibilty(class_obj, cfg) + cfg = TorchModelLoader._fix_compatibility(class_obj, cfg) p = re.compile("^module\.") num_tries = 3 diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 8646c79f..4bd6790a 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -56,6 +56,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -86,6 +87,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index 0d9b1de3..5649cc01 100644 --- 
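The cos_lr.py hunks above fix real bugs: super(AdamCosineLR, super).__init__ could never have run, and T_mul and gamma were read as bare names instead of attributes. The schedule being repaired is SGDR-style cosine annealing with warm restarts, where each restart stretches the period by T_mul and damps the peak by gamma. The sketch below is an assumption-level reading of that rule for illustration, not a copy of the hyperion class:

import math

def cosine_restarts_lr(step, max_lr, min_lr, T=1000, T_mul=2, gamma=0.5):
    """LR at a global step, restarting the cosine whenever a period ends."""
    num_restarts = 0
    while step >= T:
        step -= T
        T *= T_mul                 # each cycle lasts T_mul times longer
        num_restarts += 1
    alpha = gamma ** num_restarts  # each cycle peaks lower
    return min_lr + 0.5 * alpha * (max_lr - min_lr) * (1 + math.cos(math.pi * step / T))

for s in (0, 500, 1000, 2000):
    print(s, cosine_restarts_lr(s, 0.05, 1e-5))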
a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -54,6 +54,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -82,6 +83,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py index 4365ed56..dd797996 100644 --- a/hyperion/torch/trainers/plda_trainer.py +++ b/hyperion/torch/trainers/plda_trainer.py @@ -58,6 +58,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -91,6 +92,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 72f6d164..ecdb4dd8 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -4,6 +4,7 @@ """ import os +import math import contextlib from collections import OrderedDict as ODict from enum import Enum @@ -76,6 +77,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -102,6 +104,7 @@ def __init__( self.epochs = epochs self.cur_epoch = cur_epoch self.grad_acc_steps = grad_acc_steps + self.eff_batch_size = eff_batch_size self.exp_path = Path(exp_path) if loggers is None: @@ -113,8 +116,6 @@ def __init__( else: self.loggers = loggers - # self.lr_scheduler = lr_scheduler - self.metrics = metrics self.device = device self.train_mode = train_mode @@ -211,8 +212,7 @@ def fit(self, train_data, val_data=None): val_data: PyTorch data loader for the validation loop """ self.exp_path.mkdir(parents=True, exist_ok=True) - # if not os.path.exists(self.exp_path): - # os.makedirs(self.exp_path) + self._compute_grad_acc_steps(train_data) if self.do_swa and self.cur_epoch >= self.swa_start: self.in_swa = True @@ -435,6 +435,40 @@ def _get_lr(self): for param_group in self.optimizer.param_groups: return param_group["lr"] + def _compute_grad_acc_steps(self, data_loader): + if self.eff_batch_size is None: + return + + if data_loader.batch_sampler is not None: + try: + batch_size = data_loader.batch_sampler.avg_batch_size + except: + logging.warn( + "batch sampler doesn't have avg_batch_size property, " + "we cannot estimate grad_acc_steps, using grad_acc_steps=%d", + self.grad_acc_steps, + ) + return + + self.grad_acc_steps = int( + math.ceil(self.eff_batch_size / batch_size / self.world_size) + ) + logging.info( + "Setting grad_acc_steps=%d for" + "eff_batch_size=%d, avg_batch_size=%d, world_size=%d", + self.grad_acc_steps, + self.eff_batch_size, + batch_size, + self.world_size, + ) + return + + logging.warn( + "We cannot determine the batch_size, " + "we cannot estimate grad_acc_steps, using grad_acc_steps=%d", + self.grad_acc_steps, + ) + def checkpoint(self, logs=None): """Creates a checkpoint of the training, to save and posterior recovery @@ -566,6 +600,7 @@ def load_last_checkpoint(self): def filter_args(**kwargs): valid_args = ( "grad_acc_steps", + "eff_batch_size", "epochs", "log_interval", "use_amp", @@ -604,6 +639,12 @@ def add_class_args(parser, prefix=None, skip=[]): default=1, help="gradient accumulation batches before weigth update", 
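The new --eff-batch-size option lets a recipe state the total batch size it was tuned for and have the trainer derive grad_acc_steps from whatever per-GPU batch the sampler actually delivers, which is what _compute_grad_acc_steps above does. Worked through with illustrative numbers:

import math

eff_batch_size = 512  # target effective batch (the recipe-level setting)
avg_batch_size = 32   # per-GPU batch reported by the sampler
world_size = 4        # number of GPUs

grad_acc_steps = int(math.ceil(eff_batch_size / avg_batch_size / world_size))
print(grad_acc_steps)  # 4 -> 4 accumulated batches * 32 per GPU * 4 GPUs = 512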
) + parser.add_argument( + "--eff-batch-size", + type=int, + default=None, + help="effective total batch size, if given, it overrides grad_acc_steps", + ) parser.add_argument("--epochs", type=int, default=200, help="number of epochs") parser.add_argument( "--log-interval", @@ -680,6 +721,5 @@ def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='trainer options') add_argparse_args = add_class_args diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index 53486c7b..bc72bbe2 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -54,6 +54,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -82,6 +83,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index a2da616c..ac87ba5a 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -54,6 +54,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -81,6 +82,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index d187af79..1b13bac1 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -54,6 +54,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -81,6 +82,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 0784a2ea..7dee1303 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -7,6 +7,7 @@ import time import logging +from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn @@ -58,6 +59,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, p_attack=0.8, p_val_attack=0, device=None, @@ -88,6 +90,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index fef0b3b5..0719f350 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -7,11 +7,12 @@ import time import logging +from jsonargparse import ArgumentParser, ActionParser import torch import torch.nn as nn -from ..utils import MetricAcc +from ..utils import MetricAcc from .xvector_trainer_from_wav import XVectorTrainerFromWav @@ -60,6 +61,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, p_attack=0.8, p_val_attack=0, device=None, @@ -91,6 
+93,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, @@ -128,7 +131,6 @@ def __init__( % (p_attack, 1.0 / self.grad_acc_steps) ) - def train_epoch(self, data_loader): self.model.update_loss_margin(self.cur_epoch) @@ -258,4 +260,3 @@ def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 2e032a49..3e704bd5 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -54,6 +54,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -85,6 +86,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 47801c29..e6014750 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -61,6 +61,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, reg_layers_enc=None, reg_layers_classif=None, reg_weight_enc=0.1, @@ -94,6 +95,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 6763b035..dafeb0c5 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -62,6 +62,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, reg_layers_enc=None, reg_layers_classif=None, reg_weight_enc=0.1, @@ -96,6 +97,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, reg_layers_enc=reg_layers_enc, reg_layers_classif=reg_layers_classif, reg_weight_enc=reg_weight_enc, diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 3519b6d6..a8f9da99 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -55,6 +55,7 @@ def __init__( exp_path="./train", cur_epoch=0, grad_acc_steps=1, + eff_batch_size=None, device=None, metrics=None, lrsched=None, @@ -83,6 +84,7 @@ def __init__( exp_path, cur_epoch=cur_epoch, grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, device=device, metrics=metrics, lrsched=lrsched, From 3a0eeff1baa3ac6ba38be9a9db429878c67ae0a6 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 1 Apr 2022 12:02:17 -0400 Subject: [PATCH 008/154] voxceleb/v1.1 refactorized --- egs/sre19-cmn2/v1/conf/ecapatdnn_small.yaml | 34 +++++++++++++ egs/sre19-cmn2/v1/conf/efficientnet_b4.yaml | 20 ++++++++ egs/sre19-cmn2/v1/conf/efficientnet_b7.yaml | 22 ++++++++ .../v1/conf/lrsched_exp_default.yaml | 7 +++ .../v1/conf/optim_adam_default.yaml | 6 +++ egs/sre19-cmn2/v1/conf/res2net50.yaml | 13 +++++ egs/sre19-cmn2/v1/conf/resnet34.yaml | 11 ++++ 
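This second patch replaces the conf symlink in voxceleb/v1.1 with per-recipe YAML files that compose: train_*_default.yaml points at data, feats, model, and trainer YAMLs, and trainer_default.yaml in turn names the optimizer and LR-scheduler files. That nesting works because jsonargparse lets the value of an ActionParser argument be a path to a sub-config file. A small self-contained sketch of the mechanism; treat it as an assumption-level illustration, with file name and keys standing in for the conf files added below:

from pathlib import Path
from jsonargparse import ArgumentParser, ActionParser

trainer_parser = ArgumentParser(prog="")
trainer_parser.add_argument("--optim.lr", type=float, default=0.05)
trainer_parser.add_argument("--eff_batch_size", type=int, default=None)

parser = ArgumentParser()
parser.add_argument("--trainer", action=ActionParser(parser=trainer_parser))

# stand-in for conf/trainer_default.yaml
Path("trainer_default.yaml").write_text("optim:\n  lr: 0.01\neff_batch_size: 512\n")
args = parser.parse_args(["--trainer", "trainer_default.yaml"])
print(args.trainer.optim.lr, args.trainer.eff_batch_size)  # 0.01 512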
egs/sre19-cmn2/v1/conf/spinenet49.yaml | 11 ++++ .../v1/conf/train_data_default.yaml | 10 ++++ .../v1/conf/train_ecapatdnn_xvec_default.yaml | 7 +++ .../v1/conf/train_effnetb4_xvec_default.yaml | 7 +++ .../v1/conf/train_res2net50_xvec_default.yaml | 7 +++ .../v1/conf/train_resnet34_xvec_default.yaml | 7 +++ .../conf/train_spinenet49_xvec_default.yaml | 7 +++ egs/sre19-cmn2/v1/conf/trainer_default.yaml | 6 +++ .../v1/conf/trainer_swa_default.yaml | 9 ++++ egs/sre19-cmn2/v1/conf/val_data_default.yaml | 10 ++++ egs/voxceleb/v1.1/conf | 1 - egs/voxceleb/v1.1/conf/clsp.conf | 11 ++++ egs/voxceleb/v1.1/conf/coe_gpu_bigmem.conf | 11 ++++ egs/voxceleb/v1.1/conf/coe_gpu_long.conf | 13 +++++ egs/voxceleb/v1.1/conf/coe_gpu_rtx.conf | 11 ++++ egs/voxceleb/v1.1/conf/coe_gpu_short.conf | 11 ++++ egs/voxceleb/v1.1/conf/coe_gpu_v100.conf | 11 ++++ egs/voxceleb/v1.1/conf/ecapatdnn_small.yaml | 34 +++++++++++++ egs/voxceleb/v1.1/conf/efficientnet_b4.yaml | 20 ++++++++ egs/voxceleb/v1.1/conf/efficientnet_b7.yaml | 22 ++++++++ egs/voxceleb/v1.1/conf/fbank64_8k.yaml | 7 +++ egs/voxceleb/v1.1/conf/fbank64_stmn_8k.yaml | 12 +++++ egs/voxceleb/v1.1/conf/fbank80_16k.yaml | 7 +++ egs/voxceleb/v1.1/conf/fbank80_stmn_16k.yaml | 12 +++++ .../v1.1/conf/lrsched_exp_default.yaml | 7 +++ egs/voxceleb/v1.1/conf/noise_aug.yaml | 19 +++++++ egs/voxceleb/v1.1/conf/online_pitch.conf | 1 + .../v1.1/conf/optim_adam_default.yaml | 6 +++ egs/voxceleb/v1.1/conf/res2net50.yaml | 13 +++++ egs/voxceleb/v1.1/conf/resnet34.yaml | 11 ++++ egs/voxceleb/v1.1/conf/reverb_noise_aug.yaml | 35 +++++++++++++ egs/voxceleb/v1.1/conf/spinenet49.yaml | 11 ++++ .../v1.1/conf/train_data_default.yaml | 10 ++++ .../conf/train_ecapatdnn_xvec_default.yaml | 7 +++ .../conf/train_effnetb4_xvec_default.yaml | 7 +++ .../conf/train_res2net50_xvec_default.yaml | 7 +++ .../conf/train_resnet34_xvec_default.yaml | 7 +++ .../conf/train_spinenet49_xvec_default.yaml | 7 +++ egs/voxceleb/v1.1/conf/trainer_default.yaml | 6 +++ .../v1.1/conf/trainer_swa_default.yaml | 9 ++++ egs/voxceleb/v1.1/conf/vad_16k.yaml | 8 +++ egs/voxceleb/v1.1/conf/vad_8k.yaml | 8 +++ egs/voxceleb/v1.1/conf/val_data_default.yaml | 10 ++++ ...statsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh | 50 ++++--------------- ...fnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | 29 +++++------ ..._eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | 28 +++++------ ...net34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh | 32 ++++++------ ...lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | 26 ++++------ ...pinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | 27 ++++------ ...et34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 33 ++++++------ ...et34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 35 ++++++------- ...et50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | 33 ++++++------ ...et50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 34 ++++++------- ...et50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | 33 ++++++------ ...w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | 35 ++++++------- ..._resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | 3 +- ...net34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | 27 +++++----- ...4_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh | 27 ++++------ ..._resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | 34 ++++++------- ...et50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 36 ++++++------- ...ine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | 29 +++++------ ...ne2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | 29 +++++------ ...pinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | 26 ++++------ ...inenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | 29 +++++------ ...et50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 35 
++++++------- ...eresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | 23 ++++----- ...ine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | 30 +++++------ ...ne2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | 31 +++++------- hyp_utils/conda_env.sh | 37 +++++++------- hyp_utils/feats/make_evad.sh | 2 +- .../xvectors/extract_xvectors_from_wav.sh | 4 +- .../make_babble_noise_for_nnet_train.sh | 2 +- .../xvectors/pack_rirs_for_nnet_train.sh | 2 +- .../preprocess_audios_for_nnet_train.sh | 2 +- ...te-energy-vad.py => compute_energy_vad.py} | 0 ...om-wav.py => extract_xvectors_from_wav.py} | 0 ...es.py => make_babble_noise_audio_files.py} | 0 .../{pack-wav-rirs.py => pack_wav_rirs.py} | 0 ...dio-files.py => preprocess_audio_files.py} | 0 hyperion/bin/torch_train_xvec_from_wav.py | 3 +- hyperion/torch/layer_blocks/res2net_blocks.py | 4 +- hyperion/torch/layers/global_pool.py | 4 +- hyperion/torch/layers/pool_factory.py | 2 + hyperion/torch/models/xvectors/xvector.py | 1 + hyperion/torch/trainers/torch_trainer.py | 6 ++- 92 files changed, 915 insertions(+), 474 deletions(-) create mode 100644 egs/sre19-cmn2/v1/conf/ecapatdnn_small.yaml create mode 100644 egs/sre19-cmn2/v1/conf/efficientnet_b4.yaml create mode 100644 egs/sre19-cmn2/v1/conf/efficientnet_b7.yaml create mode 100644 egs/sre19-cmn2/v1/conf/lrsched_exp_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/optim_adam_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/res2net50.yaml create mode 100644 egs/sre19-cmn2/v1/conf/resnet34.yaml create mode 100644 egs/sre19-cmn2/v1/conf/spinenet49.yaml create mode 100644 egs/sre19-cmn2/v1/conf/train_data_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/train_ecapatdnn_xvec_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/train_effnetb4_xvec_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/train_res2net50_xvec_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/train_resnet34_xvec_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/train_spinenet49_xvec_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/trainer_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/trainer_swa_default.yaml create mode 100644 egs/sre19-cmn2/v1/conf/val_data_default.yaml delete mode 120000 egs/voxceleb/v1.1/conf create mode 100644 egs/voxceleb/v1.1/conf/clsp.conf create mode 100644 egs/voxceleb/v1.1/conf/coe_gpu_bigmem.conf create mode 100644 egs/voxceleb/v1.1/conf/coe_gpu_long.conf create mode 100644 egs/voxceleb/v1.1/conf/coe_gpu_rtx.conf create mode 100644 egs/voxceleb/v1.1/conf/coe_gpu_short.conf create mode 100644 egs/voxceleb/v1.1/conf/coe_gpu_v100.conf create mode 100644 egs/voxceleb/v1.1/conf/ecapatdnn_small.yaml create mode 100644 egs/voxceleb/v1.1/conf/efficientnet_b4.yaml create mode 100644 egs/voxceleb/v1.1/conf/efficientnet_b7.yaml create mode 100644 egs/voxceleb/v1.1/conf/fbank64_8k.yaml create mode 100644 egs/voxceleb/v1.1/conf/fbank64_stmn_8k.yaml create mode 100644 egs/voxceleb/v1.1/conf/fbank80_16k.yaml create mode 100644 egs/voxceleb/v1.1/conf/fbank80_stmn_16k.yaml create mode 100644 egs/voxceleb/v1.1/conf/lrsched_exp_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/noise_aug.yaml create mode 100644 egs/voxceleb/v1.1/conf/online_pitch.conf create mode 100644 egs/voxceleb/v1.1/conf/optim_adam_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/res2net50.yaml create mode 100644 egs/voxceleb/v1.1/conf/resnet34.yaml create mode 100644 egs/voxceleb/v1.1/conf/reverb_noise_aug.yaml create mode 100644 egs/voxceleb/v1.1/conf/spinenet49.yaml create mode 100644 
egs/voxceleb/v1.1/conf/train_data_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn_xvec_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_effnetb4_xvec_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_resnet34_xvec_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_spinenet49_xvec_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/trainer_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/trainer_swa_default.yaml create mode 100644 egs/voxceleb/v1.1/conf/vad_16k.yaml create mode 100644 egs/voxceleb/v1.1/conf/vad_8k.yaml create mode 100644 egs/voxceleb/v1.1/conf/val_data_default.yaml rename hyperion/bin/{compute-energy-vad.py => compute_energy_vad.py} (100%) rename hyperion/bin/{torch-extract-xvectors-from-wav.py => extract_xvectors_from_wav.py} (100%) rename hyperion/bin/{make-babble-noise-audio-files.py => make_babble_noise_audio_files.py} (100%) rename hyperion/bin/{pack-wav-rirs.py => pack_wav_rirs.py} (100%) rename hyperion/bin/{preprocess-audio-files.py => preprocess_audio_files.py} (100%) diff --git a/egs/sre19-cmn2/v1/conf/ecapatdnn_small.yaml b/egs/sre19-cmn2/v1/conf/ecapatdnn_small.yaml new file mode 100644 index 00000000..fd386500 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/ecapatdnn_small.yaml @@ -0,0 +1,34 @@ +resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 +pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/efficientnet_b4.yaml b/egs/sre19-cmn2/v1/conf/efficientnet_b4.yaml new file mode 100644 index 00000000..f87c1e02 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/efficientnet_b4.yaml @@ -0,0 +1,20 @@ +effnet_type: efficientnet-b4 +in_feats: 80 +in_channels: 1 +in_kernel_size: 3 +in_stride: 1 +se_r: 4 +fix_stem_head: true +mbconv_strides: +- 1 +- 1 +- 2 +- 2 +- 1 +- 2 +- 1 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/efficientnet_b7.yaml b/egs/sre19-cmn2/v1/conf/efficientnet_b7.yaml new file mode 100644 index 00000000..bae5c7cb --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/efficientnet_b7.yaml @@ -0,0 +1,22 @@ +effnet_type: efficientnet-b7 +in_feats: 80 +in_channels: 1 +in_kernel_size: 3 +in_stride: 1 +se_r: 4 +fix_stem_head: true +mbconv_strides: +- 1 +- 1 +- 2 +- 2 +- 1 +- 2 +- 1 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 +norm_layer: instance-norm-affine +head_norm_layer: layer-norm diff --git a/egs/sre19-cmn2/v1/conf/lrsched_exp_default.yaml b/egs/sre19-cmn2/v1/conf/lrsched_exp_default.yaml new file mode 100644 index 00000000..fe08b704 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/lrsched_exp_default.yaml @@ -0,0 +1,7 @@ +lrsch_type: exp_lr +decay_rate: 0.5 +decay_steps: 8000 +hold_steps: 40000 +min_lr: 1.0e-05 +update_lr_on_opt_step: true +warmup_steps: 1000 diff --git a/egs/sre19-cmn2/v1/conf/optim_adam_default.yaml b/egs/sre19-cmn2/v1/conf/optim_adam_default.yaml new file mode 100644 index 00000000..b6620069 --- /dev/null +++ 
b/egs/sre19-cmn2/v1/conf/optim_adam_default.yaml @@ -0,0 +1,6 @@ +opt_type: adam +lr: 0.05 +amsgrad: true +beta1: 0.9 +beta2: 0.95 +weight_decay: 1.0e-05 diff --git a/egs/sre19-cmn2/v1/conf/res2net50.yaml b/egs/sre19-cmn2/v1/conf/res2net50.yaml new file mode 100644 index 00000000..48067a3d --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/res2net50.yaml @@ -0,0 +1,13 @@ +resnet_type: res2net50 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +res2net_width_factor: 3.25 +res2net_scale: 8 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/resnet34.yaml b/egs/sre19-cmn2/v1/conf/resnet34.yaml new file mode 100644 index 00000000..98695823 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/resnet34.yaml @@ -0,0 +1,11 @@ +resnet_type: resnet34 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/spinenet49.yaml b/egs/sre19-cmn2/v1/conf/spinenet49.yaml new file mode 100644 index 00000000..66b8d517 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/spinenet49.yaml @@ -0,0 +1,11 @@ +spinenet_type: spinenet49 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/sre19-cmn2/v1/conf/train_data_default.yaml b/egs/sre19-cmn2/v1/conf/train_data_default.yaml new file mode 100644 index 00000000..451ffa35 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_data_default.yaml @@ -0,0 +1,10 @@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_ecapatdnn_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_ecapatdnn_xvec_default.yaml new file mode 100644 index 00000000..46298946 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_ecapatdnn_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: ecapatdnn_small.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_effnetb4_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_effnetb4_xvec_default.yaml new file mode 100644 index 00000000..1bc74de6 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_effnetb4_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: efficientnet_b4.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_res2net50_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_res2net50_xvec_default.yaml new file mode 100644 index 00000000..1d387790 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_res2net50_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: resnet34.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_resnet34_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_resnet34_xvec_default.yaml new file mode 100644 index 00000000..1d387790 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_resnet34_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: 
val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: resnet34.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/train_spinenet49_xvec_default.yaml b/egs/sre19-cmn2/v1/conf/train_spinenet49_xvec_default.yaml new file mode 100644 index 00000000..07167987 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/train_spinenet49_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: spinenet49.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/sre19-cmn2/v1/conf/trainer_default.yaml b/egs/sre19-cmn2/v1/conf/trainer_default.yaml new file mode 100644 index 00000000..86dcc2e4 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/trainer_default.yaml @@ -0,0 +1,6 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 70 +eff_batch_size: 512 diff --git a/egs/sre19-cmn2/v1/conf/trainer_swa_default.yaml b/egs/sre19-cmn2/v1/conf/trainer_swa_default.yaml new file mode 100644 index 00000000..0cafad01 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/trainer_swa_default.yaml @@ -0,0 +1,9 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 80 +eff_batch_size: 512 +swa_start: 60 +swa_lr: 1e-3 +swa_anneal_epochs: 5 diff --git a/egs/sre19-cmn2/v1/conf/val_data_default.yaml b/egs/sre19-cmn2/v1/conf/val_data_default.yaml new file mode 100644 index 00000000..451ffa35 --- /dev/null +++ b/egs/sre19-cmn2/v1/conf/val_data_default.yaml @@ -0,0 +1,10 @@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf b/egs/voxceleb/v1.1/conf deleted file mode 120000 index 25a735e3..00000000 --- a/egs/voxceleb/v1.1/conf +++ /dev/null @@ -1 +0,0 @@ -../v1/conf \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/clsp.conf b/egs/voxceleb/v1.1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v1.1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_long.conf b/egs/voxceleb/v1.1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ 
b/egs/voxceleb/v1.1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/v1.1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_short.conf b/egs/voxceleb/v1.1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v1.1/conf/coe_gpu_v100.conf b/egs/voxceleb/v1.1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v1.1/conf/ecapatdnn_small.yaml b/egs/voxceleb/v1.1/conf/ecapatdnn_small.yaml new file mode 100644 index 00000000..fd386500 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/ecapatdnn_small.yaml @@ -0,0 +1,34 @@ +resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 +pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/efficientnet_b4.yaml b/egs/voxceleb/v1.1/conf/efficientnet_b4.yaml new file mode 100644 index 00000000..f87c1e02 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/efficientnet_b4.yaml @@ -0,0 +1,20 @@ +effnet_type: efficientnet-b4 +in_feats: 80 +in_channels: 1 +in_kernel_size: 3 +in_stride: 1 +se_r: 4 +fix_stem_head: true 
+mbconv_strides: +- 1 +- 1 +- 2 +- 2 +- 1 +- 2 +- 1 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/efficientnet_b7.yaml b/egs/voxceleb/v1.1/conf/efficientnet_b7.yaml new file mode 100644 index 00000000..bae5c7cb --- /dev/null +++ b/egs/voxceleb/v1.1/conf/efficientnet_b7.yaml @@ -0,0 +1,22 @@ +effnet_type: efficientnet-b7 +in_feats: 80 +in_channels: 1 +in_kernel_size: 3 +in_stride: 1 +se_r: 4 +fix_stem_head: true +mbconv_strides: +- 1 +- 1 +- 2 +- 2 +- 1 +- 2 +- 1 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 +norm_layer: instance-norm-affine +head_norm_layer: layer-norm diff --git a/egs/voxceleb/v1.1/conf/fbank64_8k.yaml b/egs/voxceleb/v1.1/conf/fbank64_8k.yaml new file mode 100644 index 00000000..a77eb899 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank64_8k.yaml @@ -0,0 +1,7 @@ +sample_frequency: 8000 +frame_length: 25 +low_freq: 20 +high_freq: 3700 +num_filters: 64 +snip_edges: false +use_energy: false diff --git a/egs/voxceleb/v1.1/conf/fbank64_stmn_8k.yaml b/egs/voxceleb/v1.1/conf/fbank64_stmn_8k.yaml new file mode 100644 index 00000000..dfd0d3e5 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank64_stmn_8k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.1/conf/fbank80_16k.yaml b/egs/voxceleb/v1.1/conf/fbank80_16k.yaml new file mode 100644 index 00000000..88bae69e --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank80_16k.yaml @@ -0,0 +1,7 @@ +sample_frequency: 16000 +frame_length: 25 +low_freq: 20 +high_freq: 7600 +num_filters: 80 +snip_edges: false +use_energy: false diff --git a/egs/voxceleb/v1.1/conf/fbank80_stmn_16k.yaml b/egs/voxceleb/v1.1/conf/fbank80_stmn_16k.yaml new file mode 100644 index 00000000..f4091f5d --- /dev/null +++ b/egs/voxceleb/v1.1/conf/fbank80_stmn_16k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.1/conf/lrsched_exp_default.yaml b/egs/voxceleb/v1.1/conf/lrsched_exp_default.yaml new file mode 100644 index 00000000..fe08b704 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/lrsched_exp_default.yaml @@ -0,0 +1,7 @@ +lrsch_type: exp_lr +decay_rate: 0.5 +decay_steps: 8000 +hold_steps: 40000 +min_lr: 1.0e-05 +update_lr_on_opt_step: true +warmup_steps: 1000 diff --git a/egs/voxceleb/v1.1/conf/noise_aug.yaml b/egs/voxceleb/v1.1/conf/noise_aug.yaml new file mode 100644 index 00000000..7e575faf --- /dev/null +++ b/egs/voxceleb/v1.1/conf/noise_aug.yaml @@ -0,0 +1,19 @@ +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v1.1/conf/online_pitch.conf b/egs/voxceleb/v1.1/conf/online_pitch.conf new file mode 100644 index 00000000..926bcfca --- /dev/null +++ b/egs/voxceleb/v1.1/conf/online_pitch.conf @@ -0,0 +1 @@ +--sample-frequency=8000 diff --git a/egs/voxceleb/v1.1/conf/optim_adam_default.yaml 
b/egs/voxceleb/v1.1/conf/optim_adam_default.yaml new file mode 100644 index 00000000..b6620069 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/optim_adam_default.yaml @@ -0,0 +1,6 @@ +opt_type: adam +lr: 0.05 +amsgrad: true +beta1: 0.9 +beta2: 0.95 +weight_decay: 1.0e-05 diff --git a/egs/voxceleb/v1.1/conf/res2net50.yaml b/egs/voxceleb/v1.1/conf/res2net50.yaml new file mode 100644 index 00000000..48067a3d --- /dev/null +++ b/egs/voxceleb/v1.1/conf/res2net50.yaml @@ -0,0 +1,13 @@ +resnet_type: res2net50 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +res2net_width_factor: 3.25 +res2net_scale: 8 +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/resnet34.yaml b/egs/voxceleb/v1.1/conf/resnet34.yaml new file mode 100644 index 00000000..98695823 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/resnet34.yaml @@ -0,0 +1,11 @@ +resnet_type: resnet34 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v1.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v1.1/conf/spinenet49.yaml b/egs/voxceleb/v1.1/conf/spinenet49.yaml new file mode 100644 index 00000000..66b8d517 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/spinenet49.yaml @@ -0,0 +1,11 @@ +spinenet_type: spinenet49 +in_channels: 1 +in_feats: 80 +in_kernel_size: 3 +in_stride: 1 +no_maxpool: true +embed_dim: 256 +cos_scale: 30.0 +margin: 0.3 +margin_warmup_epochs: 20.0 +dropout_rate: 0.0 diff --git a/egs/voxceleb/v1.1/conf/train_data_default.yaml b/egs/voxceleb/v1.1/conf/train_data_default.yaml new file mode 100644 index 00000000..451ffa35 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_data_default.yaml @@ -0,0 +1,10 @@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn_xvec_default.yaml new file mode 100644 index 00000000..46298946 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: ecapatdnn_small.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_effnetb4_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_effnetb4_xvec_default.yaml new file mode 100644 index 00000000..1bc74de6 --- /dev/null +++ 
b/egs/voxceleb/v1.1/conf/train_effnetb4_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: efficientnet_b4.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml new file mode 100644 index 00000000..1d387790 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: resnet34.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_resnet34_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_resnet34_xvec_default.yaml new file mode 100644 index 00000000..1d387790 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_resnet34_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: resnet34.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/train_spinenet49_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_spinenet49_xvec_default.yaml new file mode 100644 index 00000000..07167987 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_spinenet49_xvec_default.yaml @@ -0,0 +1,7 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +feats: fbank80_stmn_16k.yaml +model: spinenet49.yaml +trainer: trainer_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/trainer_default.yaml b/egs/voxceleb/v1.1/conf/trainer_default.yaml new file mode 100644 index 00000000..86dcc2e4 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/trainer_default.yaml @@ -0,0 +1,6 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 70 +eff_batch_size: 512 diff --git a/egs/voxceleb/v1.1/conf/trainer_swa_default.yaml b/egs/voxceleb/v1.1/conf/trainer_swa_default.yaml new file mode 100644 index 00000000..0cafad01 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/trainer_swa_default.yaml @@ -0,0 +1,9 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 80 +eff_batch_size: 512 +swa_start: 60 +swa_lr: 1e-3 +swa_anneal_epochs: 5 diff --git a/egs/voxceleb/v1.1/conf/vad_16k.yaml b/egs/voxceleb/v1.1/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/v1.1/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v1.1/conf/vad_8k.yaml b/egs/voxceleb/v1.1/conf/vad_8k.yaml new file mode 100644 index 00000000..7592c9d1 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/vad_8k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v1.1/conf/val_data_default.yaml b/egs/voxceleb/v1.1/conf/val_data_default.yaml new file mode 100644 index 00000000..451ffa35 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/val_data_default.yaml @@ -0,0 +1,10 @@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml +sampler: + batch_size: 32 + 
iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh index 2b4f07a7..3cd4b108 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,4 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training +# ECAPA-TDNN small # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,56 +9,26 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=512 -ep_channels=1536 -width_factor=1 -scale=8 -se_r=4 -dropout=0 -attstats_inner=128 +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +dropout=0 embed_dim=256 +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet_enc.in-feats 80 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +xvec_train_base_cfg=conf/train_ecapatdnn_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu" nnet_name=${feat_type}_ecapatdnn512x3_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 + nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh index 0765b60d..2806a422 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh @@ -9,32 +9,28 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size 
-ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.01 +# x-vector cfg +nnet_type=efficientnet -nnet_type=efficientnet-b4 +effnet_type=efficientnet-b4 dropout=0 embed_dim=256 -se_r=4 - s=30 margin_warmup=20 margin=0.3 +se_r=4 -nnet_opt="--effnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --se-r $se_r --fix-stem-head --mbconv-strides 1 1 2 2 1 2 1" +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size +lr=0.01 +nnet_num_epochs=70 -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +xvec_train_base_cfg=conf/train_effnetb4_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" + +nnet_name=${feat_type}_${effnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -51,4 +47,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh index 7d1fd1dc..d83ca483 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh @@ -9,32 +9,28 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=2 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.01 +# x-vector cfg +nnet_type=efficientnet -nnet_type=efficientnet-b7 +effnet_type=efficientnet-b7 dropout=0 embed_dim=256 -se_r=4 - s=30 margin_warmup=20 margin=0.3 +se_r=4 -nnet_opt="--effnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --se-r $se_r --fix-stem-head --mbconv-strides 1 1 2 2 1 2 1 --norm-layer instance-norm-affine --head-norm-layer layer-norm" +batch_size_1gpu=2 +eff_batch_size=512 # effective batch size +lr=0.01 +nnet_num_epochs=70 -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +xvec_train_base_cfg=conf/train_effnetb4_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model $PWD/conf/efficientnet_b7.yaml --trainer.optim.lr $lr" + +nnet_name=${feat_type}_${effnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_eina_hln_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 
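# A note on the layered config plumbing these scripts now rely on (a sketch only; the
# training entry point itself is outside this patch, so the command name below is
# hypothetical): the base YAML named by xvec_train_base_cfg wires the data/feats/
# model/trainer sub-configs together, and xvec_train_args can override either a whole
# section by pointing its key at another file, as --model $PWD/conf/efficientnet_b7.yaml
# does above, or a single nested key via a dotted flag such as --trainer.optim.lr $lr:
#
#   train_xvec --cfg $xvec_train_base_cfg $xvec_train_args   # hypothetical launcher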
-nnet_name=${feat_type}_${nnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_eina_hln_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh index dbab12ae..9bfb7bb7 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_345_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,38 +1,35 @@ -# LResNet34_345 (multi-level feature) x-vector with mixed precision training +# LResNet34_345 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=64 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg + +nnet_type=resnet -nnet_type=lresnet34_345 +resnet_type=lresnet34_345 +batch_size_1gpu=128 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -50,3 +47,4 @@ lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh index c243020f..4aabd592 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -7,34 +7,29 @@ feat_type=fbank80_stmn #vad vad_config=conf/vad_16k.yaml - # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg + +nnet_type=resnet + +resnet_type=lresnet34 batch_size_1gpu=128 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=lresnet34 #light resnet dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 
--in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -52,3 +47,4 @@ lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lspinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lspinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh index 9ba45ab5..2afe35ef 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lspinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_lspinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,35 +4,31 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=lspinenet49 batch_size_1gpu=64 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=lspinenet49 dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -49,4 +45,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 7cfe8894..f995fc0f 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,38 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training 
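# How eff_batch_size relates to batch_size_1gpu (the accumulation rule below is an
# assumption; only the two variables themselves appear in these scripts):
#
#   eff_batch_size = num_gpus * batch_size_1gpu * grad_acc_steps
#
# e.g. with batch_size_1gpu=64 on 4 GPUs, reaching eff_batch_size=512 takes
# 512 / (4 * 64) = 2 gradient-accumulation steps.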
nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=64 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net34 +nnet_type=resnet + +resnet_type=res2net34 +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=1 -scale=4 -ws_tag=w16s4 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +width_factor=1 +scale=4 +ws_tag=w16s4 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" +xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 75f3bbbd..a2e8cdba 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,4 @@ -# Res2Net34 w26 s4 x-vector with mixed precision training +# Res2Net34 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,38 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=64 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net34 +nnet_type=resnet + +resnet_type=res2net34 +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +width_factor=1.625 +scale=4 +ws_tag=w26s4 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" +xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad 
--use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh index cbd13a22..6ddb9e2c 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,38 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net50 +nnet_type=resnet + +resnet_type=res2net50 +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=1.625 -scale=8 -ws_tag=w13s8 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +width_factor=1.625 +scale=8 +ws_tag=w13s8 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" +xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 4c7e6fc5..cfec2b09 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,38 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg 
conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net50 +nnet_type=resnet + +resnet_type=res2net50 +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +width_factor=1.625 +scale=4 +ws_tag=w26s4 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" +xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 @@ -54,3 +49,4 @@ lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh index db3bfea8..3cf18fcf 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,38 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net50 +nnet_type=resnet + +resnet_type=res2net50 +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +width_factor=3.25 +scale=8 +ws_tag=w26s8 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" +xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" 
+nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh index c2191649..a5767e50 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh @@ -1,4 +1,4 @@ -# Res2Net50 w26s8 x-vector with mixed precision training +# Res2Net50 w26s8 x-vector with mixed precision training and SWA # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,38 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=res2net50 +nnet_type=resnet + +resnet_type=res2net50 +batch_size_1gpu=16 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +width_factor=3.25 +scale=8 +ws_tag=w26s8 +nnet_num_epochs=90 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" +xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --trainer.epochs $nnet_num_epochs --trainer.swa-start 70 --trainer.swa-lr 1e-3 --trainer.swa-anneal-epochs 5" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 70 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 -nnet_num_epochs=90 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/swa_model_ep0091.pth - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh index 63fecf32..b10e5e86 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -11,7 +11,6 @@ vad_config=conf/vad_16k.yaml 
nnet_data=voxceleb2cat_train # x-vector cfg - nnet_type=resnet resnet_type=resnet34 @@ -25,7 +24,7 @@ margin_warmup=20 margin=0.3 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_xvec_default.yaml +xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu" nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh index d5f9e623..2666b93e 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh @@ -1,4 +1,4 @@ -# ResNet34 x-vector with mixed precision training +# ResNet34 x-vector with mixed precision training and SWA # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,31 +9,27 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg + +nnet_type=resnet + +resnet_type=resnet34 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=resnet34 dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer $PWD/conf/trainer_swa_default.yaml" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 60 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 -nnet_num_epochs=80 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/swa_model_ep0081.pth @@ -51,3 +47,4 @@ lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh index b172ad91..0ec34ef1 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh @@ -1,4 +1,4 @@ -# ResNet34 x-vector with mixed precision training +# ResNet34 x-vector with mixed precision training and sharded distrib. 
data parallel # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,33 +9,28 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg + +nnet_type=resnet + +resnet_type=resnet34 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.01 - -nnet_type=resnet34 dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.ddp-type oss_sharded_ddp" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --ddp-type oss_sharded_ddp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_sharded_ddp_amp.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_sharded_ddp_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/swa_model_ep0071.pth nnet=$nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh index f9b8c038..ced8b8d6 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,5 @@ -# LResNet34 x-vector with mixed precision training +# ResNet50 x-vector with mixed precision training + # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -8,32 +9,28 @@ feat_type=fbank80_stmn vad_config=conf/vad_16k.yaml # x-vector training -nnet_data=voxceleb2cat -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=resnet +resnet_type=resnet50 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=resnet50 dropout=0 embed_dim=256 - +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 
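# --trainer.ddp-type oss_sharded_ddp carries over the removed --ddp-type flag: the
# name suggests fairscale-style OSS sharded data parallel (an inference, not stated
# in this patch), which shards optimizer state across ranks to cut per-GPU memory
# relative to plain DDP while targeting the same model update.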
-nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -42,12 +39,13 @@ nnet=$nnet_dir/model_ep0070.pth plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat + plda_data=voxceleb2cat_train else - plda_data=voxceleb2cat_augx${plda_num_augs} + plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 8ee1b484..08669114 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,4 @@ -# SE ResNet34 x-vector with mixed precision training +# Squeeze-Excitation Res2Net50 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,39 +9,34 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=seres2net50 +nnet_type=resnet + +resnet_type=seres2net50 +batch_size_1gpu=24 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 width_factor=1.625 scale=4 ws_tag=w26s4 +nnet_num_epochs=70 se_r=16 -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r" +xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 @@ -55,3 +50,4 @@ lda_dim=200 plda_y_dim=150 plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh index 5ea146b6..f3a5ef5a 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ 
b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,38 +4,34 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=spine2net49 batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=spine2net49 dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - width_factor=1.625 scale=4 ws_tag=w26s4 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" + +nnet_name=${feat_type}_${spinenet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -52,4 +48,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh index 6aa20991..40957669 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,38 +4,34 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=spine2net49s batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=spine2net49s dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - width_factor=1.625 scale=4 ws_tag=w26s4 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" 
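# The dotted CLI keys patch the YAML tree of the base config (dashes in flag names
# become underscores in YAML keys, judging from the flag/key pairs visible in this
# patch), so the xvec_train_args line that follows effectively amends
# conf/spinenet49.yaml with:
#
#   spinenet_type: spine2net49s
#   res2net_width_factor: 1.625
#   res2net_scale: 4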
+xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" + +nnet_name=${feat_type}_${spinenet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -52,4 +48,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh index 28418a2e..43f539f9 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,35 +4,31 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=spinenet49 batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=spinenet49 dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh index 8be0e057..f834b2cb 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,35 +4,31 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=64 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg +nnet_type=spinenet -nnet_type=spinenet49s 
+spinenet_type=spinenet49s +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" +xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -49,4 +45,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 7a3b0351..243dab65 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -1,4 +1,4 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training +# Time-Squeeze-Excitation Res2Net50 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,39 +9,34 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 +# x-vector cfg -nnet_type=tseres2net50 +nnet_type=resnet + +resnet_type=tseres2net50 +batch_size_1gpu=24 +eff_batch_size=512 # effective batch size dropout=0 embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 width_factor=1.625 scale=4 ws_tag=w26s4 +nnet_num_epochs=70 se_r=256 -s=30 -margin_warmup=20 -margin=0.3 +xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r" +nnet_name=${feat_type}_${resnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" 
-nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=6 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh index 35a146a5..749ca557 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -9,32 +9,29 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg + +nnet_type=resnet + +resnet_type=tseresnet34 batch_size_1gpu=32 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=tseresnet34 dropout=0 embed_dim=256 se_r=16 +lr=0.05 s=30 margin_warmup=20 margin=0.3 +nnet_num_epochs=70 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --se-r $se_r" +xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.se-r $se_r" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +nnet_name=${feat_type}_${resnet_type}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh index 48f54f8b..d3a5595c 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,39 +4,35 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +# vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=tsespine2net49 batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=tsespine2net49 dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - width_factor=1.625 scale=4 -se_r=256 ws_tag=w26s4 +se_r=256 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r" 
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" + +nnet_name=${feat_type}_${spinenet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh index 6253ee54..4ffdd48b 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -4,39 +4,35 @@ feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +# vad +vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" +# x-vector cfg +nnet_type=spinenet + +spinenet_type=tsespine2net49s batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=tsespine2net49s dropout=0 embed_dim=256 - -loss_type=arc-softmax +lr=0.05 s=30 margin_warmup=20 margin=0.3 - width_factor=1.625 scale=4 -se_r=256 ws_tag=w26s4 +se_r=256 +nnet_num_epochs=70 -nnet_opt="--spinenet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" +xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" + +nnet_name=${feat_type}_${spinenet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_name=${feat_type}_${nnet_type}_${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth @@ -53,4 +49,3 @@ plda_type=splda lda_dim=200 plda_y_dim=150 plda_z_dim=200 - diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 1aea9eb9..1ed39a7d 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -50,27 +50,28 @@ conda activate $conda_env command="python" if [ 
$num_gpus -gt 0 ];then # set CUDA_VISIBLE_DEVICES + if [ ! -z "$SGE_HGR_gpu" ]; then echo "SGE_HGR_gpu=$SGE_HGR_gpu" - if [ ! -z "$SGE_HGR_gpu" ]; then - export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') - else - # seach location of free-gpu program in the PATH or hyp_utils directory - free_gpu=$(which free-gpu) - if [ -z "$free_gpu" ];then - free_gpu=$(which hyp_utils/free-gpu) - fi - - if [ ! -z "$free_gpu" ];then - # if free-gpu found set env var, otherwise we assume that you can use any gpu - export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) - fi + export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') + else + # search location of free-gpu program in the PATH or hyp_utils directory + free_gpu=$(which free-gpu) + if [ -z "$free_gpu" ];then + free_gpu=$(which hyp_utils/free-gpu) fi - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" - if [ $num_gpus -gt 1 ];then - [[ $(type -P "$torchrun") ]] && command="torchrun" \ - || command="python -m torch.distributed.run" - command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" + + if [ ! -z "$free_gpu" ];then + # if free-gpu is found, set the env var; otherwise we assume that you can use any gpu + export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) fi + fi + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" + # export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters + if [ $num_gpus -gt 1 ];then + [[ $(type -P "$torchrun") ]] && command="torchrun" \ + || command="python -m torch.distributed.run" + command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" + fi fi py_exec=$(which $1) diff --git a/hyp_utils/feats/make_evad.sh b/hyp_utils/feats/make_evad.sh index 8717fc3c..373fc4a6 100755 --- a/hyp_utils/feats/make_evad.sh +++ b/hyp_utils/feats/make_evad.sh @@ -86,7 +86,7 @@ fi $cmd JOB=1:$nj $logdir/make_vad_${name}.JOB.log \ hyp_utils/conda_env.sh \ - compute-energy-vad.py --cfg $vad_config $opt_args \ + compute_energy_vad.py --cfg $vad_config $opt_args \ --input $scp --output ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ --part-idx JOB --num-parts $nj || exit 1 diff --git a/hyp_utils/xvectors/extract_xvectors_from_wav.sh b/hyp_utils/xvectors/extract_xvectors_from_wav.sh index 2aa0d460..ef06d94d 100755 --- a/hyp_utils/xvectors/extract_xvectors_from_wav.sh +++ b/hyp_utils/xvectors/extract_xvectors_from_wav.sh @@ -85,7 +85,7 @@ if [ $stage -le 0 ];then set +e $cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-extract-xvectors-from-wav.py \ + extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ --part-idx JOB --num-parts $nj \ --input $data_dir/wav.scp \ @@ -107,7 +107,7 @@ if [ $stage -le 1 ];then fi $cmd $output_dir/log/extract_xvectors.$i.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-extract-xvectors-from-wav.py \ + extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ --part-idx $i --num-parts $nj \ --input $data_dir/wav.scp \ diff --git a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh index 92256004..27c77454 100755 --- a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh +++ b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh @@ -51,7 +51,7 @@ output_dir=$(utils/make_absolute.sh $dir) args="" $cmd $dir/log/make_babble_noise_${name}.log \ hyp_utils/conda_env.sh \ - make-babble-noise-audio-files.py ${args} \
--output-audio-format $file_format $args $proc_opts \ --min-spks $min_spks --max-spks $max_spks --num-reuses $num_reuses \ --write-time-durs $data_out/utt2dur \ diff --git a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh index 9c122f1e..c6634135 100755 --- a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh +++ b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh @@ -63,7 +63,7 @@ utils/create_data_link.pl $output_dir/rirs_${name}.${file_format} args="" $cmd $dir/log/pack_rirs_${name}.log \ hyp_utils/conda_env.sh \ - pack-wav-rirs.py ${args} --input $data_in/wav.scp \ + pack_wav_rirs.py ${args} --input $data_in/wav.scp \ --output ${file_format},scp:$output_dir/rirs_${name}.${file_format},$data_out/rirs.scp || exit 1; diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 35794d65..1a1fd7ad 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -88,7 +88,7 @@ fi $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess-audio-files.py ${args} --output-audio-format $file_format $args $proc_opts \ + preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ --write-time-durs $output_dir/utt2dur.${name}.JOB \ --part-idx JOB --num-parts $nj \ --input $data_in/wav.scp \ diff --git a/hyperion/bin/compute-energy-vad.py b/hyperion/bin/compute_energy_vad.py similarity index 100% rename from hyperion/bin/compute-energy-vad.py rename to hyperion/bin/compute_energy_vad.py diff --git a/hyperion/bin/torch-extract-xvectors-from-wav.py b/hyperion/bin/extract_xvectors_from_wav.py similarity index 100% rename from hyperion/bin/torch-extract-xvectors-from-wav.py rename to hyperion/bin/extract_xvectors_from_wav.py diff --git a/hyperion/bin/make-babble-noise-audio-files.py b/hyperion/bin/make_babble_noise_audio_files.py similarity index 100% rename from hyperion/bin/make-babble-noise-audio-files.py rename to hyperion/bin/make_babble_noise_audio_files.py diff --git a/hyperion/bin/pack-wav-rirs.py b/hyperion/bin/pack_wav_rirs.py similarity index 100% rename from hyperion/bin/pack-wav-rirs.py rename to hyperion/bin/pack_wav_rirs.py diff --git a/hyperion/bin/preprocess-audio-files.py b/hyperion/bin/preprocess_audio_files.py similarity index 100% rename from hyperion/bin/preprocess-audio-files.py rename to hyperion/bin/preprocess_audio_files.py diff --git a/hyperion/bin/torch_train_xvec_from_wav.py b/hyperion/bin/torch_train_xvec_from_wav.py index df948b87..c488b5c5 100755 --- a/hyperion/bin/torch_train_xvec_from_wav.py +++ b/hyperion/bin/torch_train_xvec_from_wav.py @@ -131,7 +131,6 @@ def init_feats(rank, **kwargs): def init_xvector(num_classes, rank, xvec_class, **kwargs): - xvec_args = xvec_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("xvector network args={}".format(xvec_args)) @@ -171,7 +170,7 @@ def train_xvec(gpu_id, args): device=device, metrics=metrics, ddp=world_size > 1, - **trn_args + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index daf391be..83ce7601 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -96,7 +96,7 @@ def __init__( width_in = in_channels // scale width_mid = int(width_factor * channels) // scale self.width_in 
= width_in - self.has_proj1 = width_in != width_mid + self.has_proj1 = width_in != width_mid and stride == 1 self.scale = scale channels_mid = width_mid * scale if scale == 1: @@ -189,7 +189,7 @@ def forward(self, x, x_mask=None): x_i = self.bn1s[i](x_i) x_i = self.act1(x_i) if not self.norm_before: - x_i = self.bn1(x_i) + x_i = self.bn1s[i](x_i) x.append(x_i) if self.scale > 1: diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index b6b3569e..5001bfd0 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -750,7 +750,9 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): multiplied by the input data. """ if weights is None: - return seq_lengths_to_mask(x, x.size(self.dim), dtype=x.dtype, time_dim=-1) + return seq_lengths_to_mask( + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=-1 + ) if weights.dim() == x.dim(): return weights.transpose(self.dim, -1) diff --git a/hyperion/torch/layers/pool_factory.py b/hyperion/torch/layers/pool_factory.py index fa1032a8..723c64a4 100644 --- a/hyperion/torch/layers/pool_factory.py +++ b/hyperion/torch/layers/pool_factory.py @@ -93,6 +93,8 @@ def create( keepdim=keepdim, ) + raise ValueError(f"Invalid pooling type {pool_type}") + @staticmethod def filter_args(**kwargs): """Filters the arguments corresponding to the creation of a pooling layer. diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 3e9e9fcd..d11fb020 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -563,6 +563,7 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index ecdb4dd8..2755bbbe 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -149,7 +149,9 @@ def __init__( oss = False if ddp_type == DDPType.DDP else True self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( - self.model, device_ids=[device], output_device=device + self.model, + device_ids=[device], + output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) @@ -454,7 +456,7 @@ def _compute_grad_acc_steps(self, data_loader): math.ceil(self.eff_batch_size / batch_size / self.world_size) ) logging.info( - "Setting grad_acc_steps=%d for" + "Setting grad_acc_steps=%d for " "eff_batch_size=%d, avg_batch_size=%d, world_size=%d", self.grad_acc_steps, self.eff_batch_size, From ed08db306b4ec584c654f438427dcd7439eb22f9 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 29 Apr 2022 16:29:14 -0400 Subject: [PATCH 009/154] hugging face x-vectors working on multi-gpu --- egs/voxceleb/v1.1/default_config.sh | 55 +- egs/voxceleb/v1.1/run_011_train_xvector.sh | 2 +- egs/voxceleb/v2/README.md | 205 ++++++ egs/voxceleb/v2/cmd.sh | 28 + egs/voxceleb/v2/conf/clsp.conf | 11 + egs/voxceleb/v2/conf/coe_gpu_bigmem.conf | 11 + egs/voxceleb/v2/conf/coe_gpu_long.conf | 13 + egs/voxceleb/v2/conf/coe_gpu_rtx.conf | 11 + egs/voxceleb/v2/conf/coe_gpu_short.conf | 11 + egs/voxceleb/v2/conf/coe_gpu_v100.conf | 11 + .../v2/conf/hubertbase_ecapatdnn512x2.yaml | 37 + egs/voxceleb/v2/conf/lrsched_exp_default.yaml | 7 + egs/voxceleb/v2/conf/optim_adam_default.yaml | 6 + 
egs/voxceleb/v2/conf/reverb_noise_aug.yaml | 35 + egs/voxceleb/v2/conf/train_data_default.yaml | 11 + ...ain_hubertbase_ecapatdnn512x2_default.yaml | 6 + ...v2vec2base960h_ecapatdnn512x2_default.yaml | 6 + ...n_wav2vec2base_ecapatdnn512x2_default.yaml | 6 + ...rain_wavlmbase_ecapatdnn512x2_default.yaml | 6 + egs/voxceleb/v2/conf/trainer_swa_default.yaml | 9 + egs/voxceleb/v2/conf/vad_16k.yaml | 8 + egs/voxceleb/v2/conf/val_data_default.yaml | 11 + .../conf/wav2vec2base960h_ecapatdnn512x2.yaml | 37 + .../v2/conf/wav2vec2base_ecapatdnn512x2.yaml | 37 + .../v2/conf/wavlmbase_ecapatdnn512x2.yaml | 37 + egs/voxceleb/v2/datapath.sh | 22 + egs/voxceleb/v2/default_config.sh | 1 + ...dnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh | 48 ++ ...tdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh | 48 ++ ...dnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh | 51 ++ ...dnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh | 49 ++ egs/voxceleb/v2/hyp_utils | 1 + egs/voxceleb/v2/local | 1 + egs/voxceleb/v2/path.sh | 5 + egs/voxceleb/v2/run_001_prepare_data.sh | 28 + egs/voxceleb/v2/run_002_compute_evad.sh | 57 ++ .../v2/run_010_prepare_xvec_train_data.sh | 42 ++ egs/voxceleb/v2/run_011_train_xvector.sh | 59 ++ egs/voxceleb/v2/run_030_extract_xvectors.sh | 60 ++ egs/voxceleb/v2/run_040_eval_be.sh | 125 ++++ egs/voxceleb/v2/steps | 1 + egs/voxceleb/v2/steps_be | 1 + egs/voxceleb/v2/steps_pyfe | 1 + egs/voxceleb/v2/steps_xvec | 1 + egs/voxceleb/v2/utils | 1 + hyp_utils/conda_env.sh | 2 +- .../xvectors/extract_wav2vec2xvectors.sh | 155 ++++ hyperion/bin/extract_wav2vec2xvectors.py | 313 ++++++++ hyperion/bin/train_wav2vec2xvector.py | 197 ++++++ hyperion/bin/train_xvector_from_feats.py | 204 ++++++ ..._from_wav.py => train_xvector_from_wav.py} | 1 - hyperion/torch/data/audio_dataset.py | 3 - hyperion/torch/data/feat_seq_dataset.py | 24 +- hyperion/torch/models/__init__.py | 6 + .../torch/models/wav2xvectors/__init__.py | 4 + .../hf_hubert2resnet1d_xvector.py | 74 ++ .../hf_wav2vec2resnet1d_xvector.py | 54 +- .../models/wav2xvectors/hf_wav2xvector.py | 229 +++++- .../wav2xvectors/hf_wavlm2resnet1d_xvector.py | 74 ++ .../torch/models/wav2xvectors/wav2xvector.py | 1 + .../torch/models/xvectors/resnet1d_xvector.py | 2 +- hyperion/torch/models/xvectors/xvector.py | 45 +- hyperion/torch/tpm/__init__.py | 6 + hyperion/torch/tpm/hf/__init__.py | 8 + hyperion/torch/tpm/hf/hf_hubert.py | 553 +++++++++++++++ hyperion/torch/tpm/hf/hf_wav2vec2.py | 668 ++++++++++++++++++ hyperion/torch/tpm/hf/hf_wav2vec_base.py | 331 +++++++++ hyperion/torch/tpm/hf/hf_wavlm.py | 622 ++++++++++++++++ hyperion/torch/trainers/torch_trainer.py | 7 +- hyperion/torch/trainers/xvector_trainer.py | 12 + hyperion/torch/utils/__init__.py | 2 +- hyperion/torch/utils/ddp.py | 25 +- hyperion/torch/utils/masking.py | 2 +- 73 files changed, 4756 insertions(+), 87 deletions(-) mode change 100644 => 120000 egs/voxceleb/v1.1/default_config.sh create mode 100644 egs/voxceleb/v2/README.md create mode 100755 egs/voxceleb/v2/cmd.sh create mode 100644 egs/voxceleb/v2/conf/clsp.conf create mode 100644 egs/voxceleb/v2/conf/coe_gpu_bigmem.conf create mode 100644 egs/voxceleb/v2/conf/coe_gpu_long.conf create mode 100644 egs/voxceleb/v2/conf/coe_gpu_rtx.conf create mode 100644 egs/voxceleb/v2/conf/coe_gpu_short.conf create mode 100644 egs/voxceleb/v2/conf/coe_gpu_v100.conf create mode 100644 egs/voxceleb/v2/conf/hubertbase_ecapatdnn512x2.yaml create mode 100644 egs/voxceleb/v2/conf/lrsched_exp_default.yaml create mode 100644 egs/voxceleb/v2/conf/optim_adam_default.yaml create mode 100644 
egs/voxceleb/v2/conf/reverb_noise_aug.yaml create mode 100644 egs/voxceleb/v2/conf/train_data_default.yaml create mode 100644 egs/voxceleb/v2/conf/train_hubertbase_ecapatdnn512x2_default.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2base_ecapatdnn512x2_default.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbase_ecapatdnn512x2_default.yaml create mode 100644 egs/voxceleb/v2/conf/trainer_swa_default.yaml create mode 100644 egs/voxceleb/v2/conf/vad_16k.yaml create mode 100644 egs/voxceleb/v2/conf/val_data_default.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2base960h_ecapatdnn512x2.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2base_ecapatdnn512x2.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbase_ecapatdnn512x2.yaml create mode 100644 egs/voxceleb/v2/datapath.sh create mode 120000 egs/voxceleb/v2/default_config.sh create mode 100644 egs/voxceleb/v2/global_conf/config_hubertbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base960h_ecapatdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh create mode 120000 egs/voxceleb/v2/hyp_utils create mode 120000 egs/voxceleb/v2/local create mode 100755 egs/voxceleb/v2/path.sh create mode 100755 egs/voxceleb/v2/run_001_prepare_data.sh create mode 100755 egs/voxceleb/v2/run_002_compute_evad.sh create mode 100755 egs/voxceleb/v2/run_010_prepare_xvec_train_data.sh create mode 100755 egs/voxceleb/v2/run_011_train_xvector.sh create mode 100755 egs/voxceleb/v2/run_030_extract_xvectors.sh create mode 100755 egs/voxceleb/v2/run_040_eval_be.sh create mode 120000 egs/voxceleb/v2/steps create mode 120000 egs/voxceleb/v2/steps_be create mode 120000 egs/voxceleb/v2/steps_pyfe create mode 120000 egs/voxceleb/v2/steps_xvec create mode 120000 egs/voxceleb/v2/utils create mode 100755 hyp_utils/xvectors/extract_wav2vec2xvectors.sh create mode 100755 hyperion/bin/extract_wav2vec2xvectors.py create mode 100755 hyperion/bin/train_wav2vec2xvector.py create mode 100755 hyperion/bin/train_xvector_from_feats.py rename hyperion/bin/{torch_train_xvec_from_wav.py => train_xvector_from_wav.py} (99%) create mode 100644 hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py create mode 100644 hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py create mode 100644 hyperion/torch/tpm/__init__.py create mode 100644 hyperion/torch/tpm/hf/__init__.py create mode 100644 hyperion/torch/tpm/hf/hf_hubert.py create mode 100644 hyperion/torch/tpm/hf/hf_wav2vec2.py create mode 100644 hyperion/torch/tpm/hf/hf_wav2vec_base.py create mode 100644 hyperion/torch/tpm/hf/hf_wavlm.py diff --git a/egs/voxceleb/v1.1/default_config.sh b/egs/voxceleb/v1.1/default_config.sh deleted file mode 100644 index 652b4d61..00000000 --- a/egs/voxceleb/v1.1/default_config.sh +++ /dev/null @@ -1,54 +0,0 @@ -# Default parameters -# LResNet34 x-vector without mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=128 
-eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=lresnet34 #light resnet -dropout=0 -embed_dim=256 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v1.1/default_config.sh b/egs/voxceleb/v1.1/default_config.sh new file mode 120000 index 00000000..8f713463 --- /dev/null +++ b/egs/voxceleb/v1.1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh \ No newline at end of file diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index 9b30369e..17d50722 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -44,7 +44,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch_train_xvec_from_wav.py $nnet_type --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ + train_xvector_from_wav.py $nnet_type --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md new file mode 100644 index 00000000..5b5b93e5 --- /dev/null +++ b/egs/voxceleb/v2/README.md @@ -0,0 +1,205 @@
+# VoxCeleb V2
+
+Recipe for the VoxCeleb Speaker Verification Task
+
+## Differences w.r.t VoxCeleb V1 recipe
+
+In recipe version V1:
+ - We compute speech augmentations and acoustic features offline and dump them to disk.
+ - Augmentation is performed using Kaldi scripts and the wav-reverberate tool.
+ - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files.
+
+In this recipe:
+ - Speech augmentations and acoustic features are always computed on-the-fly;
+ we don't dump any features to disk.
+ - Augmentation is performed using the Hyperion SpeechAugment class.
+ - The behavior of this class is controlled
+ by the configuration file `conf/reverb_noise_aug.yaml`,
+ which mimics the proportions of noise and RIR types, and the SNRs used in V1 of the recipe
+ (see the structural sketch after this list).
+ - Babble noise is created offline by mixing 3-10 single speaker files.
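For orientation, here is an abridged structural sketch of that augmentation config. It is not the recipe file itself; the complete `conf/reverb_noise_aug.yaml` is added later in this same patch, and only a subset of its RIR and noise entries is reproduced here:

```yaml
# Sketch of the SpeechAugment config layout; abridged from the full
# conf/reverb_noise_aug.yaml added later in this patch.
reverb_aug:
  reverb_prob: 0.45            # probability of convolving with a RIR
  max_reverb_context: 0.5
  rir_types:                   # RIR sources, sampled according to weight
    smallroom:
      weight: 1
      rir_path: scp:data/rirs_smallroom/rirs.scp
      rir_norm: max
noise_aug:
  noise_prob: 0.7              # probability of adding additive noise
  noise_types:
    noise:
      weight: 1
      noise_path: data/musan_noise_proc_audio/wav.scp
      min_snr: 0               # SNR range in dB
      max_snr: 18
```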
+
+
+## Citing
+
+## Training Data
+
+ - The x-vector network is trained on VoxCeleb2 dev + test with augmentations
+   - MUSAN noise
+   - RIR reverberation
+
+## Test Data
+
+ - Test data is VoxCeleb 1
+ - We evaluate 6 conditions:
+   - VoxCeleb-O (Original): original VoxCeleb test set with 40 speakers
+   - VoxCeleb-O-cleaned: VoxCeleb-O with some trial-list errors cleaned up
+   - VoxCeleb-E (Entire): list using all utterances of VoxCeleb1
+   - VoxCeleb-E-cleaned: VoxCeleb-E with some trial-list errors cleaned up
+   - VoxCeleb-H (Hard): list of hard trials between all utterances of VoxCeleb1; same-gender and same-nationality trials.
+   - VoxCeleb-H-cleaned: VoxCeleb-H with some trial-list errors cleaned up
+
+## Usage
+
+ - Run the run_0*.sh scripts in sequence
+ - By default it will use Light ResNet (16 base channels)
+ - For better performance, use the full ResNet (64 base channels) config file `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` as follows:
+```bash
+run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh
+run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true
+run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh
+```
+
+ - To train with mixed precision, use the config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh`
+
+## Recipe Steps:
+
+ - `run_001_prepare_data.sh`
+   - Data preparation script to generate Kaldi style data directories for
+     - VoxCeleb2 train+test
+     - VoxCeleb1 O/E/H eval sets
+
+ - `run_002_compute_evad.sh`
+   - Computes Energy VAD for all datasets
+
+ - `run_003_prepare_noises_rirs.sh`
+   - Prepares MUSAN noise and music to be used by the SpeechAugment class.
+   - Creates Babble noise from MUSAN speech to be used by the SpeechAugment class.
+   - Prepares RIRs by compacting them into HDF5 files, to be used by the SpeechAugment class.
+
+ - `run_010_prepare_xvec_train_data.sh`
+   - Transforms all the audio files used to train the x-vector into a common format, e.g., .flac.
+   - Removes silence from the audio
+   - Removes utterances shorter than 4 secs and speakers with less than 8 utterances.
+   - Creates training and validation lists for x-vector training
+
+ - `run_011_train_xvector.sh`
+   - Trains the x-vector network
+
+ - `run_030_extract_xvectors.sh`
+   - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training
+   - Extracts x-vectors for VoxCeleb1 test sets
+
+ - `run_040_eval_be.sh`
+   - Trains PLDA and evaluates PLDA and cosine scoring back-ends
+
+
+## Results
+
+### VoxCeleb 1 Original-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 |
+| | | | Cosine | 2.04 | 0.138 | 0.210 |
+| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 |
+| | | | Cosine | 1.22 | 0.082 | 0.129 |
+| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 |
+| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 |
+| | | | Cosine | 1.44 | 0.100 | 0.173 |
+| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 |
+| | | | Cosine | 1.17 | 0.081 | 0.110 |
+| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121<br>ArcFace s=30/m=0.3 | PLDA | 1.37 | 0.104 | 0.179 |
+| | | | Cosine | 1.31 | 0.080 | 0.139 |
+| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121<br>Instance-Norm with affine transform in Encoder<br>Layer-Norm in head<br>ArcFace s=30/m=0.3 | PLDA | 1.29 | 0.088 | 0.129 |
+| | | | Cosine | 1.23 | 0.083 | 0.136 |
+| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 |
+| | | | Cosine | 1.29 | 0.089 | 0.146 |
+| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 |
+| | | | Cosine | 1.18 | 0.078 | 0.115 |
+| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 |
+| | | | Cosine | 1.12 | 0.073 | 0.131 |
+| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16<br>ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 |
+| | | | Cosine | 1.31 | 0.084 | 0.132 |
+| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256<br>ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 |
+| | | | Cosine | 1.12 | 0.071 | 0.103 |
+| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 |
+| | | | Cosine | 0.96 | 0.065 | 0.110 |
+| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 |
+| | | | Cosine | 0.93 | 0.067 | 0.108 |
+| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 |
+| | | | Cosine | 0.85 | 0.060 | 0.094 |
+| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 |
+| | | | Cosine | 1.29 | 0.084 | 0.140 |
+| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 |
+| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 |
+| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 |
+
+
+### VoxCeleb 1 Entire-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 |
+| | | | Cosine | 1.93 | 0.122 | 0.201 |
+| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 |
+| | | | Cosine | 1.24 | 0.080 | 0.136 |
+| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 |
+| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 |
+| | | | Cosine | 1.30 | 0.082 | 0.150 |
+| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 |
+| | | | Cosine | 1.09 | 0.071 | 0.124 |
+| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121<br>ArcFace s=30/m=0.3 | PLDA | 1.45 | 0.097 | 0.165 |
+| | | | Cosine | 1.15 | 0.076 | 0.132 |
+| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121<br>Instance-Norm with affine transform in Encoder<br>Layer-Norm in head<br>ArcFace s=30/m=0.3 | PLDA | 1.47 | 0.094 | 0.165 |
+| | | | Cosine | 1.27 | 0.082 | 0.148 |
+| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 |
+| | | | Cosine | 1.22 | 0.079 | 0.134 |
+| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 |
+| | | | Cosine | 1.16 | 0.074 | 0.130 |
+| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 |
+| | | | Cosine | 1.11 | 0.071 | 0.125 |
+| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16<br>ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 |
+| | | | Cosine | 1.24 | 0.080 | 0.140 |
+| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256<br>ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 |
+| | | | Cosine | 1.05 | 0.067 | 0.117 |
+| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 |
+| | | | Cosine | 1.05 | 0.069 | 0.121 |
+| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 |
+| | | | Cosine | 0.98 | 0.063 | 0.110 |
+| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 |
+| | | | Cosine | 0.94 | 0.061 | 0.107 |
+| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 |
+| | | | Cosine | 1.27 | 0.079 | 0.142 |
+| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 |
+| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 |
+| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 |
+
+
+### VoxCeleb 1 Hard-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 |
+| | | | Cosine | 3.27 | 0.188 | 0.303 |
+| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 |
+| | | | Cosine | 2.32 | 0.139 | 0.232 |
+| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 |
+| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 |
+| | | | Cosine | 2.33 | 0.142 | 0.235 |
+| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 |
+| | | | Cosine | 2.14 | 0.126 | 0.203 |
+| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121<br>ArcFace s=30/m=0.3 | PLDA | 2.57 | 0.153 | 0.255 |
+| | | | Cosine | 2.11 | 0.127 | 0.205 |
+| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121<br>Instance-Norm with affine transform in Encoder<br>Layer-Norm in head<br>ArcFace s=30/m=0.3 | PLDA | 2.64 | 0.157 | 0.244 |
+| | | | Cosine | 2.33 | 0.141 | 0.232 |
+| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 |
+| | | | Cosine | 2.26 | 0.133 | 0.224 |
+| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 |
+| | | | Cosine | 2.17 | 0.128 | 0.215 |
+| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 |
+| | | | Cosine | 2.11 | 0.124 | 0.204 |
+| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16<br>ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 |
+| | | | Cosine | 2.45 | 0.141 | 0.225 |
+| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256<br>ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 |
+| | | | Cosine | 1.95 | 0.113 | 0.181 |
+| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 |
+| | | | Cosine | 1.99 | 0.119 | 0.196 |
+| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 |
+| | | | Cosine | 1.89 | 0.112 | 0.184 |
+| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 |
+| | | | Cosine | 1.84 | 0.110 | 0.186 |
+| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 |
+| | | | Cosine | 2.26 | 0.134 | 0.214 |
+| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 |
+| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 |
+| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 |
diff --git a/egs/voxceleb/v2/cmd.sh b/egs/voxceleb/v2/cmd.sh new file mode 100755 index 00000000..040f458b --- /dev/null +++ b/egs/voxceleb/v2/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
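As the header comment suggests, a local-machine setup only needs the queue commands swapped for `run.pl`. A minimal sketch, assuming the Kaldi-style `utils/run.pl` wrapper that these recipes link in via their `utils` symlink:

```bash
# Local fallback (sketch): run jobs directly on this host instead of
# submitting to GridEngine/SLURM; run.pl generally ignores resource
# options such as --mem, so run heavy stages one at a time.
export train_cmd="run.pl"
export cuda_cmd="run.pl"
export cuda_eval_cmd="run.pl"
```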
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v2/conf/clsp.conf b/egs/voxceleb/v2/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v2/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v2/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v2/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v2/conf/coe_gpu_long.conf b/egs/voxceleb/v2/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v2/conf/coe_gpu_rtx.conf b/egs/voxceleb/v2/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/v2/conf/coe_gpu_short.conf b/egs/voxceleb/v2/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_short.conf @@ 
-0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v2/conf/coe_gpu_v100.conf b/egs/voxceleb/v2/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v2/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v2/conf/hubertbase_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/hubertbase_ecapatdnn512x2.yaml new file mode 100644 index 00000000..94bb31cc --- /dev/null +++ b/egs/voxceleb/v2/conf/hubertbase_ecapatdnn512x2.yaml @@ -0,0 +1,37 @@ +hf_feats: + pretrained_model_path: facebook/hubert-base-ls960 +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/lrsched_exp_default.yaml b/egs/voxceleb/v2/conf/lrsched_exp_default.yaml new file mode 100644 index 00000000..fe08b704 --- /dev/null +++ b/egs/voxceleb/v2/conf/lrsched_exp_default.yaml @@ -0,0 +1,7 @@ +lrsch_type: exp_lr +decay_rate: 0.5 +decay_steps: 8000 +hold_steps: 40000 +min_lr: 1.0e-05 +update_lr_on_opt_step: true +warmup_steps: 1000 diff --git a/egs/voxceleb/v2/conf/optim_adam_default.yaml b/egs/voxceleb/v2/conf/optim_adam_default.yaml new file mode 100644 index 00000000..b6620069 --- /dev/null +++ b/egs/voxceleb/v2/conf/optim_adam_default.yaml @@ -0,0 +1,6 @@ +opt_type: adam +lr: 0.05 +amsgrad: true +beta1: 0.9 +beta2: 0.95 +weight_decay: 1.0e-05 diff --git a/egs/voxceleb/v2/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/v2/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: 
data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v2/conf/train_data_default.yaml b/egs/voxceleb/v2/conf/train_data_default.yaml new file mode 100644 index 00000000..72c77204 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_data_default.yaml @@ -0,0 +1,11 @@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_hubertbase_ecapatdnn512x2_default.yaml b/egs/voxceleb/v2/conf/train_hubertbase_ecapatdnn512x2_default.yaml new file mode 100644 index 00000000..6cec83c8 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_hubertbase_ecapatdnn512x2_default.yaml @@ -0,0 +1,6 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: hubertbase_ecapatdnn512x2.yaml +trainer: trainer_swa_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml b/egs/voxceleb/v2/conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml new file mode 100644 index 00000000..a7fc925e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml @@ -0,0 +1,6 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: wav2vec2base960h_ecapatdnn512x2.yaml +trainer: trainer_swa_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2base_ecapatdnn512x2_default.yaml b/egs/voxceleb/v2/conf/train_wav2vec2base_ecapatdnn512x2_default.yaml new file mode 100644 index 00000000..90f35805 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2base_ecapatdnn512x2_default.yaml @@ -0,0 +1,6 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: wav2vec2base_ecapatdnn512x2.yaml +trainer: trainer_swa_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbase_ecapatdnn512x2_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbase_ecapatdnn512x2_default.yaml new file mode 100644 index 00000000..424c9bd6 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbase_ecapatdnn512x2_default.yaml @@ -0,0 +1,6 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: wavlmbase_ecapatdnn512x2.yaml +trainer: trainer_swa_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/trainer_swa_default.yaml b/egs/voxceleb/v2/conf/trainer_swa_default.yaml new file mode 100644 index 00000000..c45e3eb5 --- /dev/null +++ b/egs/voxceleb/v2/conf/trainer_swa_default.yaml @@ -0,0 +1,9 @@ +optim: optim_adam_default.yaml +lrsched: lrsched_exp_default.yaml +use_amp: true +log_interval: 1000 +epochs: 63 +eff_batch_size: 512 +swa_start: 60 +swa_lr: 1e-4 +swa_anneal_epochs: 1 diff --git a/egs/voxceleb/v2/conf/vad_16k.yaml b/egs/voxceleb/v2/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/v2/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v2/conf/val_data_default.yaml b/egs/voxceleb/v2/conf/val_data_default.yaml new file mode 100644 index 00000000..72c77204 --- /dev/null +++ b/egs/voxceleb/v2/conf/val_data_default.yaml @@ -0,0 +1,11 @@ +dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + 
wav_scale: 1 +sampler: + batch_size: 32 + iters_per_epoch: 6 +data_loader: + num_workers: 8 + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/wav2vec2base960h_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base960h_ecapatdnn512x2.yaml new file mode 100644 index 00000000..85964372 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base960h_ecapatdnn512x2.yaml @@ -0,0 +1,37 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wav2vec2base_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_ecapatdnn512x2.yaml new file mode 100644 index 00000000..fb7c7cde --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2base_ecapatdnn512x2.yaml @@ -0,0 +1,37 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/conf/wavlmbase_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wavlmbase_ecapatdnn512x2.yaml new file mode 100644 index 00000000..b5d14412 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbase_ecapatdnn512x2.yaml @@ -0,0 +1,37 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/voxceleb/v2/datapath.sh b/egs/voxceleb/v2/datapath.sh new file mode 100644 index 00000000..9a2f7529 --- /dev/null +++ b/egs/voxceleb/v2/datapath.sh @@ -0,0 +1,22 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # 
voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/v2/default_config.sh b/egs/voxceleb/v2/default_config.sh new file mode 120000 index 00000000..65108e89 --- /dev/null +++ b/egs/voxceleb/v2/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh \ No newline at end of file diff --git a/egs/voxceleb/v2/global_conf/config_hubertbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_hubertbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh new file mode 100644 index 00000000..9ea07c9c --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_hubertbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh @@ -0,0 +1,48 @@ +# Hubert base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 + +# hugging face model +hf_model_name=hubertbase + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_hubert2resnet1d + +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +dropout=0 +embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 +nnet_num_epochs=70 + +lr=0.002 +xvec_train_base_cfg=conf/train_hubertbase_ecapatdnn512x2_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" + +nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 + +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=6 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base960h_ecapatdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base960h_ecapatdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh new file mode 100644 index 00000000..b6cbdf30 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2base960h_ecapatdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -0,0 +1,48 @@ +# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 + +# hugging face model +hf_model_name=wav2vec2base960h + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +dropout=0 +embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 +nnet_num_epochs=70 + +lr=0.002 +xvec_train_base_cfg=conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" + +nnet_name=${hf_model_name}_ecapatdnn512x3_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 + +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=6 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git 
a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh new file mode 100644 index 00000000..a021d5a1 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh @@ -0,0 +1,51 @@ +# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 + +# hugging face model +hf_model_name=wav2vec2base + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +dropout=0 +embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 +nnet_num_epochs=70 + +lr=0.002 +lr=0.001 +lr=0.005 +xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 10000 --trainer.lrsched.hold-steps 20000" + +nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v2 #v1 + +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +nnet=$nnet_dir/model_ep0060.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=6 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_wavlmbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh new file mode 100644 index 00000000..ba4272a2 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 + +# hugging face model +hf_model_name=wavlmbase + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +dropout=0 +embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 +nnet_num_epochs=70 + +lr=0.002 +lr=0.001 +xvec_train_base_cfg=conf/train_wavlmbase_ecapatdnn512x2_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" + +nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 + +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=6 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/hyp_utils b/egs/voxceleb/v2/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v2/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v2/local b/egs/voxceleb/v2/local new file mode 120000 index 00000000..740b697d --- /dev/null +++ b/egs/voxceleb/v2/local @@ -0,0 +1 @@ +../v1/local/ \ No newline at end of file diff --git a/egs/voxceleb/v2/path.sh b/egs/voxceleb/v2/path.sh new file mode 100755 index 00000000..6994fdab --- 
/dev/null +++ b/egs/voxceleb/v2/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/v2/run_001_prepare_data.sh b/egs/voxceleb/v2/run_001_prepare_data.sh new file mode 100755 index 00000000..7bf15448 --- /dev/null +++ b/egs/voxceleb/v2/run_001_prepare_data.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh + + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. + local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + # This script is for the old version of the dataset + # local/make_voxceleb1_oeh.pl $voxceleb1_root data + # Use this for the newer version of voxceleb1: + local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data +fi diff --git a/egs/voxceleb/v2/run_002_compute_evad.sh b/egs/voxceleb/v2/run_002_compute_evad.sh new file mode 100755 index 00000000..eeae00ac --- /dev/null +++ b/egs/voxceleb/v2/run_002_compute_evad.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +storage_name=$(date +'%m_%d_%H_%M') +vaddir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then + dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" + fi + fi +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? $num_spk:40)) + hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + + diff --git a/egs/voxceleb/v2/run_010_prepare_xvec_train_data.sh b/egs/voxceleb/v2/run_010_prepare_xvec_train_data.sh new file mode 100755 index 00000000..5936fbf4 --- /dev/null +++ b/egs/voxceleb/v2/run_010_prepare_xvec_train_data.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 2 ]; then + # This script preprocess audio for x-vector training + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil + hyp_utils/kaldi/utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 3 ]; then + # Now, we remove files with less than 4s + hyp_utils/remove_short_audios.sh --min-len 4 data/${nnet_data}_proc_audio_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 4 utterances. + hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 4 ]; then + # Prepare train and validation lists for x-vectors + local/make_train_lists_sup_embed_with_augm.sh \ + data/${nnet_data}_proc_audio_no_sil \ + data/${nnet_data}_proc_audio_no_sil/lists_xvec +fi + +exit diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh new file mode 100755 index 00000000..0b9a092e --- /dev/null +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +list_dir=data/${nnet_data}_proc_audio_no_sil + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_wav2vec2xvector.py $nnet_type --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ + --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir $args \ + --num-gpus $ngpu \ + +fi + diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh new file mode 100755 index 00000000..90186a42 --- /dev/null +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120 #seconds +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 6G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [ $stage -le 1 ]; then + # Extract xvectors for training LDA/PLDA + for name in voxceleb2cat_train + do + if [ $plda_num_augs -eq 0 ]; then + steps_xvec/extract_wav2vec2xvectors.sh --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + $nnet data/${name} \ + $xvector_dir/${name} + else + steps_xvec/extract_wav2vec2xvectors.sh --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ + --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --aug-config $plda_aug_config --num-augs $plda_num_augs \ + $nnet data/${name} \ + $xvector_dir/${name}_augx${plda_num_augs} \ + data/${name}_augx${plda_num_augs} + fi + done +fi + + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + for name in voxceleb1_test + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj $nj ${xvec_args} \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + +exit diff --git a/egs/voxceleb/v2/run_040_eval_be.sh b/egs/voxceleb/v2/run_040_eval_be.sh new file mode 100755 index 00000000..cd168180 --- /dev/null +++ b/egs/voxceleb/v2/run_040_eval_be.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name/${be_name} +score_plda_dir=$score_dir/plda +score_cosine_dir=exp/scores/$nnet_name/cosine + +if [ $stage -le 1 ]; then + + echo "Train PLDA on Voxceleb2" + steps_be/train_be_v1.sh --cmd "$train_cmd" \ + --lda_dim $lda_dim \ + --plda_type $plda_type \ + --y_dim $plda_y_dim --z_dim $plda_z_dim \ + $xvector_dir/$plda_data/xvector.scp \ + data/$plda_data \ + $be_dir & + + + wait + +fi + + +if [ $stage -le 2 ];then + + echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" + steps_be/eval_be_v1.sh --cmd "$train_cmd" --plda_type $plda_type \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $be_dir/lda_lnorm.h5 \ + $be_dir/plda.h5 \ + $score_plda_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + + +score_plda_dir=$score_cosine_dir + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_plda_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + +be_dir=exp/be/$nnet_name/cw +score_plda_dir=$score_dir/cw_cosine + +if [ $stage -le 4 ]; then + echo "Train centering+whitening 
on Voxceleb2" + steps_be/train_be_v2.sh --cmd "$train_cmd" \ + $xvector_dir/$plda_data/xvector.scp \ + data/$plda_data \ + $be_dir +fi + + +if [ $stage -le 5 ];then + + echo "Eval Voxceleb 1 with CentWhiten + Cosine scoring" + steps_be/eval_be_v2.sh --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $be_dir/cw.h5 \ + $score_plda_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + +exit + diff --git a/egs/voxceleb/v2/steps b/egs/voxceleb/v2/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/voxceleb/v2/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/voxceleb/v2/steps_be b/egs/voxceleb/v2/steps_be new file mode 120000 index 00000000..b2098c2a --- /dev/null +++ b/egs/voxceleb/v2/steps_be @@ -0,0 +1 @@ +../v1/steps_be \ No newline at end of file diff --git a/egs/voxceleb/v2/steps_pyfe b/egs/voxceleb/v2/steps_pyfe new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/voxceleb/v2/steps_pyfe @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/voxceleb/v2/steps_xvec b/egs/voxceleb/v2/steps_xvec new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/voxceleb/v2/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/egs/voxceleb/v2/utils b/egs/voxceleb/v2/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/voxceleb/v2/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 1ed39a7d..283a7a49 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -66,7 +66,7 @@ if [ $num_gpus -gt 0 ];then fi fi echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" - # export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters + export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters if [ $num_gpus -gt 1 ];then [[ $(type -P "$torchrun") ]] && command="torchrun" \ || command="python -m torch.distributed.run" diff --git a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh new file mode 100755 index 00000000..56ed6b56 --- /dev/null +++ b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +nj=30 +cmd="run.pl" + +chunk_length=0 # The chunk size over which the embedding is extracted. +use_gpu=false +write_utt2speech_dur=true # If true writes utt2speech_dur. +stage=0 +min_utt_length=5 +max_utt_length=120 +random_utt_length=false +aug_config="" +num_augs=0 +use_bin_vad=true + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --use-gpu # If true, use GPU." 
+ echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + echo " --use-bin-vad # If true, uses binary VAD from vad.scp" + echo " --write-utt2speech-dur # If true, write utt2speech_dur (in secs) file." + echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +data_out_dir=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$use_bin_vad" == "true" ];then + f=$data_dir/vad.scp + [ ! -f $f ] && echo "No such file $f" && exit 1; + args="${args} --vad scp:$f" +fi + +if [ -n "$aug_config" ];then + args="${args} --aug-cfg $aug_config --num-augs $num_augs --aug-info-path $output_dir/aug_info.JOB.csv" +fi + +if [ "$random_utt_length" == "true" ];then + args="${args} --random-utt-length --min-utt-length $min_utt_length --max-utt-length $max_utt_length" +fi + +if [ "$write_utt2speech_dur" == "true" ];then + write_speech_dur_opt="--write-speech-dur $output_dir/utt2speech_dur.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2vec2xvectors.py \ + ${args} $write_speech_dur_opt \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file --chunk-length $chunk_length \ + --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + set -e +fi + +if [ $stage -le 1 ];then + for((i=1;i<=$nj;i++)) + do + status=$(tail -n 1 $output_dir/log/extract_xvectors.$i.log | \ + awk '/status 0/ { print 0} + !/status 0/ { print 1}') + if [ $status -eq 1 ];then + echo "JOB $i failed, resubmitting" + if [ "$write_utt2speech_dur" == "true" ];then + write_speech_dur_opt="--write-speech-dur $output_dir/utt2speech_dur.$i" + fi + $cmd $output_dir/log/extract_xvectors.$i.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2vec2xvectors.py \ + ${args} $write_speech_dur_opt \ + --part-idx $i --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file --chunk-length $chunk_length \ + --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + fi + done + wait +fi + +if [ $stage -le 2 ]; then + echo "$0: combining xvectors across jobs" + for j in $(seq $nj); do cat $output_dir/xvector.$j.scp; done > $output_dir/xvector.scp || exit 1; + if [ "$write_utt2speech_dur" == "true" ];then + for n in $(seq $nj); do + cat $output_dir/utt2speech_dur.$n || exit 1; + done > $output_dir/utt2speech_dur || exit 1 + fi + + if [ -f $output_dir/aug_info.1.csv ];then + cat $output_dir/aug_info.1.csv > $output_dir/aug_info.csv + for j in $(seq 2 $nj); + do + tail -n +2 $output_dir/aug_info.$j.csv + done >> $output_dir/aug_info.csv + fi +fi + +if [ $stage -le 3 ]; then + if [ -n "$data_out_dir" ];then + echo "$0: creating data dir $data_out_dir for augmented x-vectors" + mkdir -p $data_out_dir + awk -F "," '$1 != "key_aug" { print $1,$2}' $output_dir/aug_info.csv \ + > $data_out_dir/augm2clean + awk -v u2s=$data_dir/utt2spk 'BEGIN{ 
+while(getline < u2s) +{ + spk[$1]=$2 +} +} +{ print $1,spk[$2]}' $data_out_dir/augm2clean > $data_out_dir/utt2spk + utils/utt2spk_to_spk2utt.pl $data_out_dir/utt2spk > $data_out_dir/spk2utt + cp $output_dir/utt2speech_dur $data_out_dir + else + cp $output_dir/utt2speech_dur $data_dir + fi +fi diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py new file mode 100755 index 00000000..5675ace1 --- /dev/null +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np +import pandas as pd + +import torch + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment + +from hyperion.torch.utils import open_device +from hyperion.torch import TorchModelLoader as TML + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): + utt_length = rng.randint(low=fs * min_utt_length, high=fs * max_utt_length + 1) + if utt_length < x.shape[1]: + first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_speech_dur, + scp_sep, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs +): + + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + model = load_model(model_path, device) + + if write_speech_dur is not None: + keys = [] + info = [] + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec, scp_sep=scp_sep) as writer: + + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + 
) + with AR(input_spec, **ar_args) as reader: + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create( + vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + ) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + fs = fs[0] + t2 = time.time() + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + + t5 = time.time() + tot_samples = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0)[0] + vad = torch.tensor( + vad[None, None, :], dtype=torch.float + ).to(device) + vad = torch.nn.functional.interpolate( + vad, size=x.size(-1), mode="nearest" + ).bool()[0, 0] + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech samples", + key, + x.shape[1], + tot_samples, + x.shape[1] / tot_samples * 100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, fs, min_utt_length, max_utt_length, rng + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + y = ( + model.extract_embed( + x, + chunk_length=fs * chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + + t7 = time.time() + writer.write([key], [y]) + if write_speech_dur is not None: + keys.append(key) + info.append(str(x.shape[1] / fs)) # duration in secs = samples / sample-freq + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x.shape[1] / fs / tot_time, + ) + + if write_speech_dur is not None: + logging.info("writing speech duration in secs to %s", write_speech_dur) + u2sd = Utt2Info.create(keys, info) + u2sd.save(write_speech_dur) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + "Extracts x-vectors from waveform computing " "acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument("--write-speech-dur", default=None) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + 
parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=5, + help=("minimum utterance length in secs when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=120, + help=("maximum utterance length in secs when using random utt length"), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py new file mode 100755 index 00000000..cbb37bb3 --- /dev/null +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ( + HFWav2Vec2ResNet1dXVector, + HFHubert2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, + "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, + "hf_wavlm2resnet1d": HFWavLM2ResNet1dXVector, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = Sampler.filter_args(**kwargs["sampler"]) + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + ad_args["is_val"] = partition == "val" + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = Sampler(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_model(num_classes, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + model_args["xvector"]["num_classes"] = num_classes + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + 
del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + Sampler.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + Sampler.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_file", "data.val.dataset.class_file" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + parser.link_arguments( + "data.train.sampler.batch_size", "data.val.sampler.batch_size" + ) + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, prefix="trainer") + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train Wav2Vec2XVector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py new file mode 100755 index 00000000..c09f15a4 --- /dev/null +++ b/hyperion/bin/train_xvector_from_feats.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + 
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.data import FeatSeqDataset as SD +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import TDNNXVector as TDXVec +from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "efficientnet": EXVec, + "tdnn": TDXVec, + "transformer": TFXVec, + "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + sd_args = SD.filter_args(**kwargs["dataset"]) + sampler_args = Sampler.filter_args(**kwargs["sampler"]) + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, sd_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + sd_args["is_val"] = partition == "val" + dataset = SD(**sd_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = Sampler(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_xvector(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = 
ArgumentParser(prog="") + + SD.add_class_args(train_parser, prefix="dataset", skip={}) + Sampler.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + SD.add_class_args(val_parser, prefix="dataset", skip={}) + Sampler.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_file", "data.val.dataset.class_file" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + parser.link_arguments( + "data.train.sampler.batch_size", "data.val.sampler.batch_size" + ) + + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, prefix="trainer") + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train XVector from audio files") + + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) diff --git a/hyperion/bin/torch_train_xvec_from_wav.py b/hyperion/bin/train_xvector_from_wav.py similarity index 99% rename from hyperion/bin/torch_train_xvec_from_wav.py rename to hyperion/bin/train_xvector_from_wav.py index c488b5c5..39b037ba 100755 --- a/hyperion/bin/torch_train_xvec_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -22,7 +22,6 @@ import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device from hyperion.torch.utils import ddp from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.data import AudioDataset as AD diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 38da8eb9..f86ad0a2 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -387,9 +387,6 @@ def add_class_args(parser, prefix=None, skip={"audio_file", "key_file"}): outer_parser = parser parser = ArgumentParser(prog="") - # parser.add_argument('--path-prefix', - # default='', - # help=('path prefix for rspecifier scp file')) if "audio_file" not in skip: parser.add_argument( "--audio-file", diff --git a/hyperion/torch/data/feat_seq_dataset.py b/hyperion/torch/data/feat_seq_dataset.py index 
2774c899..462bfe41 100644 --- a/hyperion/torch/data/feat_seq_dataset.py +++ b/hyperion/torch/data/feat_seq_dataset.py @@ -26,7 +26,7 @@ class FeatSeqDataset(Dataset): def __init__( self, - rspecifier, + feat_file, key_file, class_file=None, num_frames_file=None, @@ -39,8 +39,8 @@ def __init__( is_val=False, ): - logging.info("opening dataset %s" % rspecifier) - self.r = RF.create(rspecifier, path_prefix=path_prefix, scp_sep=" ") + logging.info("opening dataset %s", feat_file) + self.r = RF.create(feat_file, path_prefix=path_prefix, scp_sep=" ") logging.info("loading utt2info file %s" % key_file) self.u2c = Utt2Info.load(key_file, sep=" ") logging.info("dataset contains %d seqs" % self.num_seqs) @@ -264,6 +264,8 @@ def _get_random_chunk(self, index): @staticmethod def filter_args(**kwargs): valid_args = ( + "feat_file", + "key_file", "path_prefix", "class_file", "num_frames_file", @@ -276,11 +278,25 @@ def filter_args(**kwargs): return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix=None, skip={"feat_file", "key_file"}): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") + if "feat_file" not in skip: + parser.add_argument( + "--feat-file", + required=True, + help=("acoustic features manifest file"), + ) + + if "key_file" not in skip: + parser.add_argument( + "--key-file", + required=True, + help=("key manifest file"), + ) + parser.add_argument( "--path-prefix", default="", help=("path prefix for rspecifier scp file") ) diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index be4e0441..e953f58c 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -12,5 +12,11 @@ from .xvectors.spinenet_xvector import SpineNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector +from .wav2xvectors import ( + HFWav2Vec2ResNet1dXVector, + HFHubert2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) + from .vae.vae import VAE from .vae.vq_vae import VQVAE diff --git a/hyperion/torch/models/wav2xvectors/__init__.py b/hyperion/torch/models/wav2xvectors/__init__.py index d1e65dd0..015c8d0f 100644 --- a/hyperion/torch/models/wav2xvectors/__init__.py +++ b/hyperion/torch/models/wav2xvectors/__init__.py @@ -11,3 +11,7 @@ # from .wav2transformer_xvector_v1 import Wav2TransformerXVectorV1 # from .wav2spinenet_xvector import Wav2SpineNetXVector from .wav2resnet1d_xvector import Wav2ResNet1dXVector + +from .hf_wav2vec2resnet1d_xvector import HFWav2Vec2ResNet1dXVector +from .hf_hubert2resnet1d_xvector import HFHubert2ResNet1dXVector +from .hf_wavlm2resnet1d_xvector import HFWavLM2ResNet1dXVector diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py new file mode 100644 index 00000000..d585567f --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py @@ -0,0 +1,74 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from jsonargparse import ArgumentParser, ActionParser +from typing import Union, Dict, Optional + +import torch +import torch.nn as nn + +from ..xvectors import ResNet1dXVector +from ...tpm import HFHubert +from .hf_wav2xvector import HFWav2XVector + + +class HFHubert2ResNet1dXVector(HFWav2XVector): + """Class extracting Hubert + ResNet1d x-vectors from waveform. 
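+    The HuBERT hidden states are fused according to "feat_fusion_method" and fed to the ResNet1d x-vector encoder (see HFWav2XVector).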
+ + Attributes: + hf_feats: HFHubert configuration dictionary or object. + This is a wrapper over Hugging Face Hubert model. + xvector: ResNet1dXVector configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the Hubert layers from "feat_fusion_start" to + the Hubert "num_layers". + feat_fusion_method: method to fuse the hidden layers from the Hubert model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFHubert], + xvector: Union[Dict, ResNet1dXVector], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + hf_feats = HFHubert(**hf_feats) + else: + assert isinstance(hf_feats, HFHubert) + + if isinstance(xvector, dict): + xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size + xvector = ResNet1dXVector(**xvector) + else: + assert isinstance(xvector, ResNet1dXVector) + assert xvector.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + + base_args = HFWav2XVector.filter_args(**kwargs) + child_args = HFHubert.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dXVector.filter_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFHubert.add_class_args(parser, prefix="hf_feats") + ResNet1dXVector.add_class_args(parser, prefix="xvector") + HFWav2XVector.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py index 78724174..3b44c53f 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py @@ -4,37 +4,75 @@ """ import logging from jsonargparse import ArgumentParser, ActionParser +from typing import Union, Dict, Optional import torch import torch.nn as nn from ..xvectors import ResNet1dXVector -from ...tpm import HFWav2Vec +from ...tpm import HFWav2Vec2 from .hf_wav2xvector import HFWav2XVector class HFWav2Vec2ResNet1dXVector(HFWav2XVector): - """Class extracting ResNet1d x-vectors from waveform. - It contains acoustic feature extraction, feature normalization and - ResNet1dXVector extractor. + """Class extracting Wav2Vec2 + ResNet1d x-vectors from waveform. Attributes: hf_feats: HFWav2Vec configuration dictionary or object. This is a wrapper over Hugging Face Wav2Vec model. xvector: ResNet1dXVector configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used.
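+ + A construction sketch (values are illustrative, taken from the recipe configs in this patch): + + >>> model = HFWav2Vec2ResNet1dXVector( + ... hf_feats={"pretrained_model_path": "facebook/wav2vec2-base"}, + ... xvector={"resnet_enc": {...}, "embed_dim": 256}, + ... feat_fusion_start=2, + ... feat_fusion_method="weighted-avg")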
""" - def __init__(self, hf_feats, xvector): + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + xvector: Union[Dict, ResNet1dXVector], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): if isinstance(hf_feats, dict): - hf_feats = HFWav2Vec(**hf_feats) + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) else: - assert isinstance(hf_feats, HFWav2Vec) + assert isinstance(hf_feats, HFWav2Vec2) if isinstance(xvector, dict): + xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size + if "class_name" in xvector: + del xvector["class_name"] xvector = ResNet1dXVector(**xvector) else: assert isinstance(xvector, ResNet1dXVector) + assert xvector.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, xvector) + super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + + base_args = HFWav2XVector.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dXVector.filter_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + ResNet1dXVector.add_class_args(parser, prefix="xvector") + HFWav2XVector.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index a471343c..f5f2c840 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -8,8 +8,10 @@ import torch import torch.nn as nn +# import torch.nn.functional as nnf from ...torch_model import TorchModel +from ...utils import remove_silence class HFWav2XVector(TorchModel): @@ -18,9 +20,234 @@ class HFWav2XVector(TorchModel): Attributes: hf_feats: hugging face model wrapper object. xvector: x-vector model object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. """ - def __init__(self, hf_feats, xvector): + def __init__( + self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): + super().__init__() self.hf_feats = hf_feats self.xvector = xvector + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weights.data = torch.ones(num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. 
+ + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start :] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.xvector.update_loss_margin(epoch) + + def forward_feats(self, x, x_lengths, return_feat_layers=None): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) + hf_output = self.hf_feats(x, x_lengths, return_hid_states=return_hid_states) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. + hid_feats = [ + f.transpose(1, 2) + for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward( + self, + x, + x_lengths=None, + y=None, + return_feat_layers=None, + return_enc_layers=None, + return_classif_layers=None, + return_logits=True, + ): + """Forward function. It returns the class logits (posteriors). + It can also return the hidden representations of the wav2vec feature extractor, + the x-vector encoder and the + classification head. In that case the output variable is a dictionary. + + Args: + x: input features tensor with shape=(batch, in_feats, time) + x_lengths: time lengths of the features with shape=(batch,) + y: target classes torch.long tensor with shape=(batch,) + return_feat_layers: list of integers indicating which wav2vec layers + we should return. If None, no wav2vec layers are returned. + return_enc_layers: list of integers indicating which encoder layers + we should return. If None, no encoder layers are returned. + return_classif_layers: list of integers indicating which classification head layers + we should return. If None, no head layers are returned. + return_logits: if True, it adds the logits to the output dictionary.
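+ e.g., model(x, x_lengths, y, return_feat_layers=[4, 8]) returns a dictionary with "logits" and "h_feats" (the layer indices here are illustrative).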
+        Returns:
+          Tensor with class logits with shape=(batch, num_classes) or
+          Dictionary with "logits", "h_enc" (list of hidden encoder layers),
+          "h_classif" (list of hidden classification head layers), "h_feats" (wav2vec features)
+        """
+        feats, hid_feats, feat_lengths = self.forward_feats(
+            x, x_lengths, return_feat_layers
+        )
+        output = self.xvector(
+            feats,
+            feat_lengths,
+            y,
+            return_enc_layers=return_enc_layers,
+            return_classif_layers=return_classif_layers,
+            return_logits=return_logits,
+        )
+
+        if not return_feat_layers:
+            return output
+
+        if not isinstance(output, dict):
+            # if the xvector just returned the logits, we put them into a dictionary
+            # to append the hid feats later.
+            output = {"logits": output}
+
+        output["h_feats"] = hid_feats
+        return output
+
+    def extract_embed(
+        self,
+        x,
+        x_lengths=None,
+        vad_samples=None,
+        chunk_length=0,
+        embed_layer=None,
+        detach_chunks=False,
+    ):
+
+        if vad_samples is not None:
+            x, x_lengths = remove_silence(x, x_lengths)
+
+        feats, _, feat_lengths = self.forward_feats(x, x_lengths)
+        # rescale the chunk length from waveform samples to wav2vec frames
+        xvec_chunk_length = int(chunk_length * feats.size(-1) // x.size(-1))
+        return self.xvector.extract_embed(
+            feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks
+        )
+
+    @staticmethod
+    def filter_args(**kwargs):
+        valid_args = (
+            "hf_feats",
+            "xvector",
+            "feat_fusion_start",
+            "feat_fusion_method",
+        )
+        args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+        return args
+
+    def get_config(self):
+
+        hf_cfg = self.hf_feats.get_config()
+        xvec_cfg = self.xvector.get_config()
+        del hf_cfg["class_name"]
+        del xvec_cfg["class_name"]
+        config = {
+            "hf_feats": hf_cfg,
+            "xvector": xvec_cfg,
+            "feat_fusion_start": self.feat_fusion_start,
+            "feat_fusion_method": self.feat_fusion_method,
+        }
+
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--feat-fusion-start",
+            default=0,
+            type=int,
+            help=(
+                "the input to the x-vector model fuses the wav2vec layers from "
+                "feat_fusion_start to the wav2vec num_layers"
+            ),
+        )
+        parser.add_argument(
+            "--feat-fusion-method",
+            default="weighted-avg",
+            choices=["weighted-avg", "linear", "cat", "last"],
+            help=(
+                "method to fuse the hidden layers from the wav2vec model "
+                "in [weighted-avg, linear, cat, last]"
+            ),
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument(
+                "--" + prefix,
+                action=ActionParser(parser=parser),
+                help="xvector options",
+            )
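A quick worked example of the chunk-length rescaling done in extract_embed above; the numbers are illustrative (wav2vec-style convolutional front-ends downsample 16 kHz audio by roughly 320x):

    x_len = 160000          # 10 s of 16 kHz waveform, in samples
    feat_len = 499          # wav2vec frames produced for that waveform
    chunk_length = 48000    # requested chunk size, in samples
    xvec_chunk_length = int(chunk_length * feat_len // x_len)  # -> 149 frames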
diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py
new file mode 100644
index 00000000..89e7120e
--- /dev/null
+++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py
@@ -0,0 +1,74 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from jsonargparse import ArgumentParser, ActionParser
+from typing import Union, Dict, Optional
+
+import torch
+import torch.nn as nn
+
+from ..xvectors import ResNet1dXVector
+from ...tpm import HFWavLM
+from .hf_wav2xvector import HFWav2XVector
+
+
+class HFWavLM2ResNet1dXVector(HFWav2XVector):
+    """Class extracting WavLM + ResNet1d x-vectors from waveform.
+
+    Attributes:
+      hf_feats: HFWavLM configuration dictionary or object.
+        This is a wrapper over the Hugging Face WavLM model.
+      xvector: ResNet1dXVector configuration dictionary or object.
+      feat_fusion_start: the input to the x-vector model fuses the WavLM layers
+        from "feat_fusion_start" to the last WavLM layer ("num_layers").
+      feat_fusion_method: method used to fuse the hidden layers of the WavLM
+        model when more than one layer is used.
+    """
+
+    def __init__(
+        self,
+        hf_feats: Union[Dict, HFWavLM],
+        xvector: Union[Dict, ResNet1dXVector],
+        feat_fusion_start: int = 0,
+        feat_fusion_method: str = "weighted-avg",
+    ):
+
+        if isinstance(hf_feats, dict):
+            hf_feats = HFWavLM(**hf_feats)
+        else:
+            assert isinstance(hf_feats, HFWavLM)
+
+        if isinstance(xvector, dict):
+            xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size
+            xvector = ResNet1dXVector(**xvector)
+        else:
+            assert isinstance(xvector, ResNet1dXVector)
+            assert xvector.encoder_net.in_feats == hf_feats.hidden_size
+
+        super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        base_args = HFWav2XVector.filter_args(**kwargs)
+        child_args = HFWavLM.filter_args(**kwargs["hf_feats"])
+        base_args["hf_feats"] = child_args
+        child_args = ResNet1dXVector.filter_args(**kwargs["xvector"])
+        base_args["xvector"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFWavLM.add_class_args(parser, prefix="hf_feats")
+        ResNet1dXVector.add_class_args(parser, prefix="xvector")
+        HFWav2XVector.add_class_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py
index 0c5a1698..27268e44 100644
--- a/hyperion/torch/models/wav2xvectors/wav2xvector.py
+++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py
@@ -75,6 +75,7 @@ def extract_embed(
         if vad_feats is not None:
             feats, feat_lengths = remove_silence(feats, feat_lengths)

+        feats = feats.transpose(1, 2)
         return self.xvector.extract_embed(
             feats, feat_lengths, chunk_length, embed_layer, detach_chunks
         )
diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py
index 295824f3..706ee4ef 100644
--- a/hyperion/torch/models/xvectors/resnet1d_xvector.py
+++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py
@@ -133,7 +133,6 @@ def get_config(self):
     def load(cls, file_path=None, cfg=None, state_dict=None):

         cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict)
-
         try:
             del cfg["in_feats"]
         except:
@@ -145,6 +144,7 @@ def load(cls, file_path=None, cfg=None, state_dict=None):

         return model

+    @staticmethod
     def filter_args(**kwargs):

         base_args = XVector.filter_args(**kwargs)
diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py
index d11fb020..e07487d7 100644
--- a/hyperion/torch/models/xvectors/xvector.py
+++ b/hyperion/torch/models/xvectors/xvector.py
@@ -12,7 +12,7 @@
 from ...layer_blocks import TDNNBlock
 from ...narchs import ClassifHead, TorchNALoader
 from ...torch_model import TorchModel
-from ...utils import eval_nnet_by_chunks, scale_lengths
+from ...utils import eval_nnet_by_chunks, scale_seq_lengths


 class XVector(TorchModel):
@@ -209,7 +209,7 @@ def _post_enc(self, x, in_lengths=None, max_in_length=None):
             x = self.proj(x)

         if in_lengths is not None:
-            out_lengths = scale_lengths(in_lengths, x.size(-1), max_in_length)
+            out_lengths = scale_seq_lengths(in_lengths, x.size(-1), max_in_length)
         else:
             out_lengths = None

@@ -224,6 +224,24 @@ def forward(
         return_classif_layers=None,
         return_logits=True,
     ):
+        """Forward function. Returns the class logits and, optionally, the hidden
+        representations of the encoder and the classification head. When hidden
+        representations are requested, the output is a dictionary.
+
+        Args:
+          x: input features tensor with shape=(batch, in_feats, time).
+          x_lengths: time lengths of the features with shape=(batch,).
+          y: target classes torch.long tensor with shape=(batch,).
+          return_enc_layers: list of integers indicating which encoder layers
+            we should return. If None, no encoder layers are returned.
+          return_classif_layers: list of integers indicating which classification head
+            layers we should return. If None, no head layers are returned.
+          return_logits: if True, it adds the logits to the output dictionary.
+        Returns:
+          Tensor with class logits with shape=(batch, num_classes) or
+          Dictionary with "logits", "h_enc" (list of hidden encoder layers),
+          "h_classif" (list of hidden classification head layers).
+        """

         if return_enc_layers is None and return_classif_layers is None:
             return self.forward_logits(x, x_lengths, y)

@@ -236,11 +254,12 @@ def forward_logits(self, x, x_lengths=None, y=None):
         """Forward function

         Args:
-          x: input features tensor with shape=(batch, in_feats, time)
-          y: target classes torch.long tensor with shape=(batch,)
+          x: input features tensor with shape=(batch, in_feats, time).
+          x_lengths: time lengths of the features with shape=(batch,).
+          y: target classes torch.long tensor with shape=(batch,).

         Returns:
-          class logits tensor with shape=(batch, num_classes)
+          class logits tensor with shape=(batch, num_classes).
         """
         max_in_length = x.size(-1)
         x = self._pre_enc(x)
@@ -259,7 +278,21 @@ def forward_hid_feats(
         return_classif_layers=None,
         return_logits=False,
     ):
-        """forwards hidden representations in the x-vector network"""
+        """forwards hidden representations in the x-vector network
+
+        Args:
+          x: input features tensor with shape=(batch, in_feats, time).
+          x_lengths: time lengths of the features with shape=(batch,).
+          y: target classes torch.long tensor with shape=(batch,).
+          return_enc_layers: list of integers indicating which encoder layers
+            we should return. If None, no encoder layers are returned.
+          return_classif_layers: list of integers indicating which classification head
+            layers we should return. If None, no head layers are returned.
+          return_logits: if True, it adds the logits to the output dictionary.
+        Returns:
+          Dictionary with "logits", "h_enc" (list of hidden encoder layers),
+          "h_classif" (list of hidden classification head layers).
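+
+        Example:
+          A minimal usage sketch; the layer indices and the ``model`` handle
+          are illustrative::
+
+            out = model.forward_hid_feats(
+                x, return_enc_layers=[2, 5], return_classif_layers=[0],
+                return_logits=True,
+            )
+            h_enc = out["h_enc"]    # hidden feats of encoder layers 2 and 5
+            logits = out["logits"]  # class logits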
+ """ max_in_length = x.size(-1) x = self._pre_enc(x) h_enc, x = self.encoder_net.forward_hid_feats( diff --git a/hyperion/torch/tpm/__init__.py b/hyperion/torch/tpm/__init__.py new file mode 100644 index 00000000..dfa5c14b --- /dev/null +++ b/hyperion/torch/tpm/__init__.py @@ -0,0 +1,6 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .hf import HFWav2Vec2, HFHubert, HFWavLM diff --git a/hyperion/torch/tpm/hf/__init__.py b/hyperion/torch/tpm/hf/__init__.py new file mode 100644 index 00000000..4db1c95d --- /dev/null +++ b/hyperion/torch/tpm/hf/__init__.py @@ -0,0 +1,8 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .hf_wav2vec2 import HFWav2Vec2 +from .hf_hubert import HFHubert +from .hf_wavlm import HFWavLM diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py new file mode 100644 index 00000000..889aed03 --- /dev/null +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -0,0 +1,553 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import os +import logging +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +from typing import Optional, Tuple, Union, List, Callable + +import torch +import torch.nn as nn + +from transformers import HubertModel, HubertConfig + +from ...utils.ddp import ddp_wait_for_all_procs, ddp_get_rank +from .hf_wav2vec_base import HFWav2VecBase + + +class HFHubert(HFWav2VecBase): + r"""This is wrapper over HuggingFace Hubert model. + See documentation: https://huggingface.co/docs/transformers/main/en/model_doc/hubert + + This wrapper makes the HugginFace model to have the same interface + as other hyperion models. It also add extra functionalities. + + The config. parameters are the same as in the HuggingFace HubertConfig class. + + Attributes: + pretrained_model_path (`str`, defaults to None): file path or HuggingFace Hub path to + pre-trained model. + normalize_input (`bool`, defaults to True): whether or not to zero-mean unit-variance + normalize the input. + use_input_attention_mask (`bool`, defaults to False): whether we should input an + attention mask to the wav2vec model. + vocab_size (`int`, defaults to 32): vocabulary size of the + model. Defines the different tokens that can be represented by the + *inputs_ids* passed to the forward method. + hidden_size (`int`, defaults to 768): dimensionality of the encoder layers and + the pooler layer. + num_hidden_layers (`int`, defaults to 12): number of hidden layers in the + Transformer encoder. + num_attention_heads (`int`, defaults to 12): number of attention heads for + each attention layer in the Transformer encoder. + intermediate_size (`int`, defaults to 3072): dimensionality of the + feed-forward layer in the Transformer encoder. + hidden_act (`str` or `function`, defaults to `"gelu"`): the non-linear + activation function (function or string) in the encoder and pooler. + If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, defaults to 0.1): the dropout probability for all + fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, defaults to 0.1): the dropout probability for all + intermediate layer in feedforward transformer layers. 
+      attention_dropout (`float`, defaults to 0.1): the dropout ratio for the
+        attention probabilities.
+      layerdrop (`float`, defaults to 0.1): prob. of dropping a layer.
+      initializer_range (`float`, defaults to 0.02): the standard deviation of the
+        truncated_normal_initializer for initializing all weight matrices.
+      layer_norm_eps (`float`, defaults to 1e-12): the epsilon used by the layer
+        normalization layers.
+      feat_extract_norm (`str`, defaults to `"group"`):
+        the norm to be applied to 1D convolutional layers in feature encoder.
+        One of `"group"` for group normalization of only the first 1D convolutional
+        layer or `"layer"` for layer normalization of all 1D convolutional layers.
+      feat_proj_dropout (`float`, defaults to 0.0): the dropout probability for output
+        of the feature encoder.
+      feat_extract_activation (`str`, `optional`, defaults to `"gelu"`): the non-linear
+        activation function (function or string) in the 1D convolutional layers of the feature
+        extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+      conv_dim (`Tuple[int]`, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+        a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+        feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+      conv_stride (`Tuple[int]`, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+        a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+        of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+      conv_kernel (`Tuple[int]`, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+        length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+        *conv_dim*.
+      conv_bias (`bool`, defaults to `False`): whether the 1D convolutional layers have a bias.
+      num_conv_pos_embeddings (`int`, defaults to 128):
+        number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+        embeddings layer.
+      num_conv_pos_embedding_groups (`int`, defaults to 16):
+        number of groups of 1D convolutional positional embeddings layer.
+      do_stable_layer_norm (`bool`, defaults to `False`):
+        whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is
+        True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
+        False` corresponds to applying layer norm after the attention layer.
+      apply_spec_augment (`bool`, defaults to `True`):
+        whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+        [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+        Recognition](https://arxiv.org/abs/1904.08779).
+      mask_time_prob (`float`, defaults to 0.05):
+        percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+        procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+        reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+        masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+        actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+      mask_time_length (`int`, defaults to 10):
+        length of vector span along the time axis.
+      mask_time_min_masks (`int`, defaults to 2):
+        the minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+        irrespective of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+        mask_time_min_masks''
+      mask_feature_prob (`float`, defaults to 0.0):
+        percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+        masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+        the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
+        span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+        may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+        True`.
+      mask_feature_length (`int`, defaults to 10):
+        length of vector span along the feature axis.
+      mask_feature_min_masks (`int`, defaults to 0):
+        the minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+        step, irrespective of `mask_feature_prob`. Only relevant if
+        ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
+      cache_dir (str or os.PathLike): path to a directory in which a downloaded pretrained
+        model configuration should be cached if the standard cache should not be used.
+      force_download (`bool`, defaults to `False`): whether or not to force (re-)downloading
+        the model weights and configuration files, overriding the
+        cached versions if they exist.
+      resume_download (`bool`, defaults to `False`): whether or not to delete incompletely
+        received files. Will attempt to resume the download if such a file exists.
+      revision (`str`, defaults to `"main"`): the specific model version to use.
+        It can be a branch name, a tag name, or a commit id.
+      ignore_pretrained (`bool`, defaults to False): if True, it ignores the pretrained_model_path
+        and inits the model from the configuration. This is set to True for models that have already
+        been finetuned.
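+
+    Example:
+      A minimal construction sketch; the Hub path is illustrative, and any
+      HubertModel checkpoint id or local path should work::
+
+        hf_feats = HFHubert(pretrained_model_path="facebook/hubert-base-ls960")
+        hf_feats.hidden_size         # 768 for the base model
+        hf_feats.num_encoder_layers  # 12 for the base model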
+ """ + + def __init__( + self, + pretrained_model_path: Optional[Union[str, os.PathLike]] = None, + normalize_input: bool = True, + use_input_attention_mask: bool = False, + vocab_size: int = 32, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: Union[str, Callable] = "gelu", + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + layerdrop: float = 0.1, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + feat_extract_norm: str = "group", + feat_proj_dropout: float = 0.0, + feat_extract_activation: Union[str, Callable] = "gelu", + conv_dim: Tuple[int] = (512, 512, 512, 512, 512, 512, 512), + conv_stride: Tuple[int] = (5, 2, 2, 2, 2, 2, 2), + conv_kernel: Tuple[int] = (10, 3, 3, 3, 3, 3, 3), + conv_bias: bool = False, + num_conv_pos_embeddings: int = 128, + num_conv_pos_embedding_groups: int = 16, + do_stable_layer_norm: bool = False, + apply_spec_augment: bool = True, + mask_time_prob: float = 0.05, + mask_time_length: int = 10, + mask_time_min_masks: int = 2, + mask_feature_prob: float = 0.0, + mask_feature_length: int = 10, + mask_feature_min_masks: int = 0, + cache_dir: Union[str, os.PathLike] = "./.cache/hyperion_hf", + force_download: bool = False, + resume_download: bool = False, + revision: str = "main", + drop_layers_gt: Optional[int] = None, + ignore_pretrained: bool = False, + ): + + super().__init__( + pretrained_model_path=pretrained_model_path, + normalize_input=normalize_input, + use_input_attention_mask=use_input_attention_mask, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + drop_layers_gt=drop_layers_gt, + ignore_pretrained=ignore_pretrained, + ) + + if pretrained_model_path is not None and not ignore_pretrained: + logging.info(f"Downloading HF model from {pretrained_model_path}") + rank = ddp_get_rank() + if rank == 0: + # rank 0 downloads the model from HF web + self.hf_model = HubertModel.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + # all ranks wait until the model is downloaded + ddp_wait_for_all_procs() + if rank > 0: + # the rest of ranks should read the configuration from the cache. 
+                self.hf_model = HubertModel.from_pretrained(
+                    pretrained_model_path,
+                    cache_dir=cache_dir,
+                    force_download=False,
+                    resume_download=False,
+                    revision=revision,
+                )
+            ddp_wait_for_all_procs()
+            self.hf_model.config.layerdrop = 0.0
+        else:
+            hf_config = HubertConfig(
+                vocab_size=vocab_size,
+                hidden_size=hidden_size,
+                num_hidden_layers=num_hidden_layers,
+                num_attention_heads=num_attention_heads,
+                intermediate_size=intermediate_size,
+                hidden_act=hidden_act,
+                hidden_dropout=hidden_dropout,
+                activation_dropout=activation_dropout,
+                attention_dropout=attention_dropout,
+                feat_proj_dropout=feat_proj_dropout,
+                layerdrop=0.0,  # layerdrop,
+                initializer_range=initializer_range,
+                layer_norm_eps=layer_norm_eps,
+                feat_extract_norm=feat_extract_norm,
+                feat_extract_activation=feat_extract_activation,
+                conv_dim=conv_dim,
+                conv_stride=conv_stride,
+                conv_kernel=conv_kernel,
+                conv_bias=conv_bias,
+                num_conv_pos_embeddings=num_conv_pos_embeddings,
+                num_conv_pos_embedding_groups=num_conv_pos_embedding_groups,
+                do_stable_layer_norm=do_stable_layer_norm,
+                apply_spec_augment=apply_spec_augment,
+                mask_time_prob=mask_time_prob,
+                mask_time_length=mask_time_length,
+                mask_time_min_masks=mask_time_min_masks,
+                mask_feature_prob=mask_feature_prob,
+                mask_feature_length=mask_feature_length,
+                mask_feature_min_masks=mask_feature_min_masks,
+            )
+            self.hf_model = HubertModel(hf_config)
+
+        if drop_layers_gt is not None:
+            self.drop_upper_layers(drop_layers_gt)
+
+        self.ignore_pretrained = True
+
+    @property
+    def num_encoder_layers(self):
+        return self.hf_config.num_hidden_layers
+
+    @property
+    def hidden_size(self):
+        return self.hf_config.hidden_size
+
+    def drop_upper_layers(self, max_layers: int):
+        if max_layers >= self.hf_config.num_hidden_layers:
+            return
+
+        layers = self.hf_model.encoder.layers
+        self.hf_model.encoder.layers = nn.ModuleList(
+            [l for i, l in enumerate(layers) if i < max_layers]
+        )
+        self.hf_config.num_hidden_layers = max_layers
+
+    def get_config(self):
+        """Returns the configuration arguments for the object in a dictionary."""
+        config = self.hf_model.config.to_dict()
+        config = self.filter_args(**config)
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args_base = HFWav2VecBase.filter_args(**kwargs)
+        valid_args = (
+            "vocab_size",
+            "hidden_size",
+            "num_hidden_layers",
+            "num_attention_heads",
+            "intermediate_size",
+            "hidden_act",
+            "hidden_dropout",
+            "activation_dropout",
+            "attention_dropout",
+            "feat_proj_dropout",
+            "layerdrop",
+            "initializer_range",
+            "layer_norm_eps",
+            "feat_extract_norm",
+            "feat_extract_activation",
+            "conv_dim",
+            "conv_stride",
+            "conv_kernel",
+            "conv_bias",
+            "num_conv_pos_embeddings",
+            "num_conv_pos_embedding_groups",
+            "do_stable_layer_norm",
+            "apply_spec_augment",
+            "mask_time_prob",
+            "mask_time_length",
+            "mask_time_min_masks",
+            "mask_feature_prob",
+            "mask_feature_length",
+            "mask_feature_min_masks",
+        )
+        args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+        args.update(args_base)
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        HFWav2VecBase.add_class_args(parser)
+
+        parser.add_argument(
+            "--vocab-size",
+            default=32,
+            type=int,
+            help=(
+                "vocabulary size of the "
+                "model. Defines the different tokens that can be represented by the "
+                "*inputs_ids* passed to the forward method."
+ ), + ) + parser.add_argument( + "--hidden-size", + default=768, + type=int, + help=("dimensionality of the encoder layers and the pooler layer."), + ) + parser.add_argument( + "--num-hidden-layers", + default=12, + type=int, + help=("number of hidden layers in the Transformer encoder"), + ) + parser.add_argument( + "--num-attention-heads", + default=12, + type=int, + help=( + "number of attention heads for " + "each attention layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--intermediate-size", + default=3072, + type=int, + help=( + "dimensionality of the " "feed-forward layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--hidden-act", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear " + "activation function (function or string) in the encoder and pooler" + ), + ) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--layerdrop", + default=0.1, + type=float, + help=("prob. of dropping a layer"), + ) + parser.add_argument( + "--initializer-range", + default=0.02, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--layer-norm-eps", + default=1e-12, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--feat-extract-norm", + default="group", + choices=["group", "layer"], + help=( + "the norm to be applied to 1D convolutional layers in feature encoder. " + "One of `group` for group normalization of only the first 1D convolutional " + "layer or `layer` for layer normalization of all 1D convolutional layers" + ), + ) + parser.add_argument( + "--feat-proj-dropout", + default=0.1, + type=float, + help=("the dropout probability for output of the feature encoder"), + ) + parser.add_argument( + "--feat-extract-activation", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear activation function (function or string) in the 1D " + "convolutional layers of the feature extractor" + ), + ) + parser.add_argument( + "--conv-dim", + default=[512, 512, 512, 512, 512, 512, 512], + nargs="+", + type=int, + help=( + "a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the " + "feature encoder. 
+        parser.add_argument(
+            "--conv-stride",
+            default=[5, 2, 2, 2, 2, 2, 2],
+            nargs="+",
+            type=int,
+            help=(
+                "a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder"
+            ),
+        )
+        parser.add_argument(
+            "--conv-kernel",
+            default=[10, 3, 3, 3, 3, 3, 3],
+            nargs="+",
+            type=int,
+            help=(
+                "a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder"
+            ),
+        )
+        parser.add_argument(
+            "--conv-bias",
+            default=False,
+            action=ActionYesNo,
+            help=("whether the 1D convolutional layers have a bias"),
+        )
+        parser.add_argument(
+            "--num-conv-pos-embeddings",
+            default=128,
+            type=int,
+            help=(
+                "number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional "
+                "embeddings layer"
+            ),
+        )
+        parser.add_argument(
+            "--num-conv-pos-embedding-groups",
+            default=16,
+            type=int,
+            help=("number of groups of 1D convolutional positional embeddings layer"),
+        )
+        parser.add_argument(
+            "--do-stable-layer-norm",
+            default=False,
+            action=ActionYesNo,
+            help=(
+                "whether to apply *stable* layer norm architecture of the Transformer encoder"
+            ),
+        )
+        parser.add_argument(
+            "--apply-spec-augment",
+            default=True,
+            action=ActionYesNo,
+            help=(
+                "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder"
+            ),
+        )
+        parser.add_argument(
+            "--mask-time-prob",
+            default=0.05,
+            type=float,
+            help=(
+                "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked"
+            ),
+        )
+        parser.add_argument(
+            "--mask-time-length",
+            default=10,
+            type=int,
+            help=("length of vector span along the time axis"),
+        )
+        parser.add_argument(
+            "--mask-time-min-masks",
+            default=2,
+            type=int,
+            help=(
+                "the minimum number of masks of length `mask_time_length` generated along the time axis"
+            ),
+        )
+        parser.add_argument(
+            "--mask-feature-prob",
+            default=0.0,
+            type=float,
+            help=(
+                "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked"
+            ),
+        )
+        parser.add_argument(
+            "--mask-feature-length",
+            default=10,
+            type=int,
+            help=("length of vector span along the feature axis"),
+        )
+        parser.add_argument(
+            "--mask-feature-min-masks",
+            default=0,
+            type=int,
+            help=(
+                "the minimum number of masks of length `mask_feature_length` generated along the feature axis"
+            ),
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py
new file mode 100644
index 00000000..63a7cf99
--- /dev/null
+++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py
@@ -0,0 +1,668 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import os
+import logging
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+from typing import Optional, Tuple, Union, List, Callable
+
+import torch
+import torch.nn as nn
+
+from transformers import Wav2Vec2Model, Wav2Vec2Config
+
+from ...utils.ddp import ddp_wait_for_all_procs, ddp_get_rank
+from .hf_wav2vec_base import HFWav2VecBase
+
+
+class HFWav2Vec2(HFWav2VecBase):
+    r"""This is a wrapper over the HuggingFace Wav2Vec2 model.
+    See documentation: https://huggingface.co/docs/transformers/model_doc/wav2vec2
+
+    This wrapper makes the HuggingFace model have the same interface
+    as other hyperion models. It also adds extra functionality.
+
+    The config parameters are the same as in the HuggingFace Wav2Vec2Config class.
+
+    Attributes:
+      pretrained_model_path (`str`, defaults to None): file path or HuggingFace Hub path to
+        pre-trained model.
+      normalize_input (`bool`, defaults to True): whether or not to zero-mean unit-variance
+        normalize the input.
+      use_input_attention_mask (`bool`, defaults to False): whether we should input an
+        attention mask to the wav2vec model.
+      vocab_size (`int`, defaults to 32): vocabulary size of the
+        model. Defines the different tokens that can be represented by the
+        *inputs_ids* passed to the forward method.
+      hidden_size (`int`, defaults to 768): dimensionality of the encoder layers and
+        the pooler layer.
+      num_hidden_layers (`int`, defaults to 12): number of hidden layers in the
+        Transformer encoder.
+      num_attention_heads (`int`, defaults to 12): number of attention heads for
+        each attention layer in the Transformer encoder.
+      intermediate_size (`int`, defaults to 3072): dimensionality of the
+        feed-forward layer in the Transformer encoder.
+      hidden_act (`str` or `function`, defaults to `"gelu"`): the non-linear
+        activation function (function or string) in the encoder and pooler.
+        If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+      hidden_dropout (`float`, defaults to 0.1): the dropout probability for all
+        fully connected layers in the embeddings, encoder, and pooler.
+      activation_dropout (`float`, defaults to 0.1): the dropout probability for all
+        intermediate layers in the feedforward transformer layers.
+      attention_dropout (`float`, defaults to 0.1): the dropout ratio for the
+        attention probabilities.
+      layerdrop (`float`, defaults to 0.1): prob. of dropping a layer.
+      initializer_range (`float`, defaults to 0.02): the standard deviation of the
+        truncated_normal_initializer for initializing all weight matrices.
+      layer_norm_eps (`float`, defaults to 1e-12): the epsilon used by the layer
+        normalization layers.
+      feat_extract_norm (`str`, defaults to `"group"`):
+        the norm to be applied to 1D convolutional layers in feature encoder.
+        One of `"group"` for group normalization of only the first 1D convolutional
+        layer or `"layer"` for layer normalization of all 1D convolutional layers.
+      feat_proj_dropout (`float`, defaults to 0.0): the dropout probability for output
+        of the feature encoder.
+      feat_extract_activation (`str`, `optional`, defaults to `"gelu"`): the non-linear
+        activation function (function or string) in the 1D convolutional layers of the feature
+        extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+      conv_dim (`Tuple[int]`, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+        a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+        feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+      conv_stride (`Tuple[int]`, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+        a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+        of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+      conv_kernel (`Tuple[int]`, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+        length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+        *conv_dim*.
+      conv_bias (`bool`, defaults to `False`): whether the 1D convolutional layers have a bias.
+      num_conv_pos_embeddings (`int`, defaults to 128):
+        number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+        embeddings layer.
+      num_conv_pos_embedding_groups (`int`, defaults to 16):
+        number of groups of 1D convolutional positional embeddings layer.
+      do_stable_layer_norm (`bool`, defaults to `False`):
+        whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is
+        True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
+        False` corresponds to applying layer norm after the attention layer.
+      apply_spec_augment (`bool`, defaults to `True`):
+        whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+        [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+        Recognition](https://arxiv.org/abs/1904.08779).
+      mask_time_prob (`float`, defaults to 0.05):
+        percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+        procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+        reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+        masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+        actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+      mask_time_length (`int`, defaults to 10):
+        length of vector span along the time axis.
+      mask_time_min_masks (`int`, defaults to 2):
+        the minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+        irrespective of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+        mask_time_min_masks''
+      mask_feature_prob (`float`, defaults to 0.0):
+        percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+        masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+        the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
+        span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+        may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+        True`.
+      mask_feature_length (`int`, defaults to 10):
+        length of vector span along the feature axis.
+      mask_feature_min_masks (`int`, defaults to 0):
+        the minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+        step, irrespective of `mask_feature_prob`. Only relevant if
+        ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
+      add_adapter (`bool`, defaults to `False`):
+        whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for
+        warm-starting Wav2Vec2 for SpeechEncoderDecoder models.
+      adapter_kernel_size (`int`, defaults to 3):
+        kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+      adapter_stride (`int`, defaults to 2):
+        stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+      num_adapter_layers (`int`, defaults to 3):
+        number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+        True`.
+      output_hidden_size (`int`, defaults to None):
+        dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
+        if `add_adapter is True`.
+      cache_dir (str or os.PathLike): path to a directory in which a downloaded pretrained
+        model configuration should be cached if the standard cache should not be used.
+      force_download (`bool`, defaults to `False`): whether or not to force (re-)downloading
+        the model weights and configuration files, overriding the
+        cached versions if they exist.
+      resume_download (`bool`, defaults to `False`): whether or not to delete incompletely
+        received files. Will attempt to resume the download if such a file exists.
+      revision (`str`, defaults to `"main"`): the specific model version to use.
+        It can be a branch name, a tag name, or a commit id.
+      ignore_pretrained (`bool`, defaults to False): if True, it ignores the pretrained_model_path
+        and inits the model from the configuration. This is set to True for models that have already
+        been finetuned.
+    """
+
+    def __init__(
+        self,
+        pretrained_model_path: Optional[Union[str, os.PathLike]] = None,
+        normalize_input: bool = True,
+        use_input_attention_mask: bool = False,
+        vocab_size: int = 32,
+        hidden_size: int = 768,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: Union[str, Callable] = "gelu",
+        hidden_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        layerdrop: float = 0.1,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        feat_extract_norm: str = "group",
+        feat_proj_dropout: float = 0.0,
+        feat_extract_activation: Union[str, Callable] = "gelu",
+        conv_dim: Tuple[int] = (512, 512, 512, 512, 512, 512, 512),
+        conv_stride: Tuple[int] = (5, 2, 2, 2, 2, 2, 2),
+        conv_kernel: Tuple[int] = (10, 3, 3, 3, 3, 3, 3),
+        conv_bias: bool = False,
+        num_conv_pos_embeddings: int = 128,
+        num_conv_pos_embedding_groups: int = 16,
+        do_stable_layer_norm: bool = False,
+        apply_spec_augment: bool = True,
+        mask_time_prob: float = 0.05,
+        mask_time_length: int = 10,
+        mask_time_min_masks: int = 2,
+        mask_feature_prob: float = 0.0,
+        mask_feature_length: int = 10,
+        mask_feature_min_masks: int = 0,
+        add_adapter: bool = False,
+        adapter_kernel_size: int = 3,
+        adapter_stride: int = 2,
+        num_adapter_layers: int = 3,
+        output_hidden_size: Optional[int] = None,
+        cache_dir: Union[str, os.PathLike] = "./.cache/hyperion_hf",
+        force_download: bool = False,
+        resume_download: bool = False,
+        revision: str = "main",
+        drop_layers_gt: Optional[int] = None,
+        ignore_pretrained: bool = False,
+    ):
+
+        super().__init__(
+            pretrained_model_path=pretrained_model_path,
+            normalize_input=normalize_input,
+            use_input_attention_mask=use_input_attention_mask,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            resume_download=resume_download,
+            revision=revision,
+            drop_layers_gt=drop_layers_gt,
+            ignore_pretrained=ignore_pretrained,
+        )
+
+        if pretrained_model_path is not None and not ignore_pretrained:
logging.info(f"Downloading HF model from {pretrained_model_path}") + rank = ddp_get_rank() + if rank == 0: + # rank 0 downloads the model from HF web + self.hf_model = Wav2Vec2Model.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + # all ranks wait until the model is downloaded + ddp_wait_for_all_procs() + if rank > 0: + # the rest of ranks should read the configuration from the cache. + self.hf_model = Wav2Vec2Model.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=False, + resume_download=False, + revision=revision, + ) + ddp_wait_for_all_procs() + self.hf_model.config.layerdrop = 0.0 + else: + hf_config = Wav2Vec2Config( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + layerdrop=0.0, # layerdrop, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + feat_extract_norm=feat_extract_norm, + feat_extract_activation=feat_extract_activation, + conv_dim=conv_dim, + conv_stride=conv_stride, + conv_kernel=conv_kernel, + conv_bias=conv_bias, + num_conv_pos_embeddings=num_conv_pos_embeddings, + num_conv_pos_embedding_groups=num_conv_pos_embedding_groups, + do_stable_layer_norm=do_stable_layer_norm, + apply_spec_augment=apply_spec_augment, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + add_adapter=add_adapter, + adapter_kernel_size=adapter_kernel_size, + adapter_stride=adapter_stride, + num_adapter_layers=num_adapter_layers, + output_hidden_size=output_hidden_size, + ) + self.hf_model = Wav2Vec2Model(hf_config) + + if drop_layers_gt is not None: + self.drop_upper_layers(drop_layers_gt) + + self.ignore_pretrained = True + + @property + def num_encoder_layers(self): + return self.hf_config.num_hidden_layers + + @property + def hidden_size(self): + return self.hf_config.hidden_size + + def drop_upper_layers(self, max_layers: int): + if max_layers >= self.hf_config.num_hidden_layers: + return + + layers = self.hf_model.encoder.layers + self.hf_model.encoder.layers = nn.ModuleList( + [l for i, l in enumerate(layers) if i < max_layers] + ) + self.hf_config.num_hidden_layers = max_layers + + if self.hf_model.adapter is not None: + del self.hf_model.adapter + self.hf_model.adapter = None + self.hf_config.add_adapter = False + + def get_config(self): + """Returns the configuration arguments for the object in a dictionary.""" + config = self.hf_model.config.to_dict() + config = self.filter_args(**config) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + args_base = HFWav2VecBase.filter_args(**kwargs) + valid_args = ( + "vocab_size", + "hidden_size", + "num_hidden_layers", + "num_attention_heads", + "intermediate_size", + "hidden_act", + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "layerdrop", + "initializer_range", + "layer_norm_eps", + "feat_extract_norm", + "feat_extract_activation", + "conv_dim", + 
"conv_stride", + "conv_kernel", + "conv_bias", + "num_conv_pos_embeddings", + "num_conv_pos_embedding_groups", + "do_stable_layer_norm", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + "add_adapter", + "adapter_kernel_size", + "adapter_stride", + "num_adapter_layers", + "output_hidden_size", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_class_args(parser) + + parser.add_argument( + "--vocab-size", + default=32, + type=int, + help=( + "vocabulary size of the " + "model. Defines the different tokens that can be represented by the " + "*inputs_ids* passed to the forward method." + ), + ) + parser.add_argument( + "--hidden-size", + default=768, + type=int, + help=("dimensionality of the encoder layers and the pooler layer."), + ) + parser.add_argument( + "--num-hidden-layers", + default=12, + type=int, + help=("number of hidden layers in the Transformer encoder"), + ) + parser.add_argument( + "--num-attention-heads", + default=12, + type=int, + help=( + "number of attention heads for " + "each attention layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--intermediate-size", + default=3072, + type=int, + help=( + "dimensionality of the " "feed-forward layer in the Transformer encoder" + ), + ) + parser.add_argument( + "--hidden-act", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear " + "activation function (function or string) in the encoder and pooler" + ), + ) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--layerdrop", + default=0.1, + type=float, + help=("prob. of dropping a layer"), + ) + parser.add_argument( + "--initializer-range", + default=0.02, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--layer-norm-eps", + default=1e-12, + type=float, + help=( + "the standard deviation of the " + "truncated_normal_initializer for initializing all weight matrices" + ), + ) + parser.add_argument( + "--feat-extract-norm", + default="group", + choices=["group", "layer"], + help=( + "the norm to be applied to 1D convolutional layers in feature encoder. 
" + "One of `group` for group normalization of only the first 1D convolutional " + "layer or `layer` for layer normalization of all 1D convolutional layers" + ), + ) + parser.add_argument( + "--feat-proj-dropout", + default=0.1, + type=float, + help=("the dropout probability for output of the feature encoder"), + ) + parser.add_argument( + "--feat-extract-activation", + default="gelu", + choices=["gelu", "relu", "selu", "gelu_new"], + help=( + "the non-linear activation function (function or string) in the 1D " + "convolutional layers of the feature extractor" + ), + ) + parser.add_argument( + "--conv-dim", + default=[512, 512, 512, 512, 512, 512, 512], + nargs="+", + type=int, + help=( + "a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the " + "feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers" + ), + ) + parser.add_argument( + "--conv-stride", + default=[5, 2, 2, 2, 2, 2, 2], + nargs="+", + type=int, + help=( + "a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-kernel", + default=[10, 3, 3, 3, 3, 3, 3], + nargs="+", + type=int, + help=( + "a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-bias", + default=False, + action=ActionYesNo, + help=("whether the 1D convolutional layers have a bias"), + ) + parser.add_argument( + "--num-conv-pos-embeddings", + default=128, + type=int, + help=( + "number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional " + "embeddings layer" + ), + ) + parser.add_argument( + "--num-conv-pos-embedding-groups", + default=16, + type=int, + help=("number of groups of 1D convolutional positional embeddings layer"), + ) + parser.add_argument( + "--do-stable-layer-norm", + default=False, + action=ActionYesNo, + help=( + "whether to apply *stable* layer norm architecture of the Transformer encoder" + ), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` generated along the feature axis" + ), + ) + parser.add_argument( + "--add-adapter", + default=False, + action=ActionYesNo, + help=( + "whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder" + ), + ) + parser.add_argument( + "--adapter-kernel-size", + default=3, + 
+            type=int,
+            help=("kernel size of the convolutional layers in the adapter network"),
+        )
+        parser.add_argument(
+            "--adapter-stride",
+            default=2,
+            type=int,
+            help=("stride of the convolutional layers in the adapter network"),
+        )
+        parser.add_argument(
+            "--num-adapter-layers",
+            default=3,
+            type=int,
+            help=(
+                "number of convolutional layers that should be used in the adapter network"
+            ),
+        )
+        parser.add_argument(
+            "--output-hidden-size",
+            default=None,
+            type=int,
+            help=(
+                "dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*."
+                " Only relevant if `add_adapter is True`"
+            ),
+        )
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    """
+    Things I think I don't need:
+      feat_quantizer_dropout (`float`, defaults to 0.0): the dropout probability for quantized feature encoder states.
+      final_dropout (`float`, defaults to 0.1): the dropout probability for the
+        final projection layer of [`Wav2Vec2ForCTC`].
+      num_codevectors_per_group (`int`, defaults to 320):
+        number of entries in each quantization codebook (group).
+      num_codevector_groups (`int`, defaults to 2):
+        number of codevector groups for product codevector quantization.
+      contrastive_logits_temperature (`float`, defaults to 0.1):
+        the temperature *kappa* in the contrastive loss.
+      feat_quantizer_dropout (`float`, defaults to 0.0):
+        the dropout probability for the output of the feature encoder that's used by the quantizer.
+      num_negatives (`int`, defaults to 100):
+        number of negative samples for the contrastive loss.
+      codevector_dim (`int`, defaults to 256):
+        dimensionality of the quantized feature vectors.
+      proj_codevector_dim (`int`, defaults to 256):
+        dimensionality of the final projection of both the quantized and the transformer features.
+      diversity_loss_weight (`int`, defaults to 0.1):
+        the weight of the codebook diversity loss component.
+      ctc_loss_reduction (`str`, defaults to `"sum"`):
+        specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+        instance of [`Wav2Vec2ForCTC`].
+      ctc_zero_infinity (`bool`, defaults to `False`):
+        whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+        occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+        of [`Wav2Vec2ForCTC`].
+      use_weighted_layer_sum (`bool`, defaults to `False`):
+        whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+        instance of [`Wav2Vec2ForSequenceClassification`].
+      classifier_proj_size (`int`, defaults to 256):
+        dimensionality of the projection before token mean-pooling for classification.
+      tdnn_dim (`Tuple[int]`, defaults to `(512, 512, 512, 512, 1500)`):
+        a tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+        module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+      tdnn_kernel (`Tuple[int]`, defaults to `(5, 3, 3, 1, 1)`):
+        a tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+        *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+      tdnn_dilation (`Tuple[int]`, defaults to `(1, 2, 3, 1, 1)`):
+        a tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+        *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+      xvector_output_dim (`int`, defaults to 512):
+        dimensionality of the *XVector* embedding vectors.
+    """
diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py
new file mode 100644
index 00000000..0b862d62
--- /dev/null
+++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py
@@ -0,0 +1,331 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import os
+import logging
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+
+from typing import Optional, Tuple, Union, List
+
+import torch
+import torch.nn as nn
+
+from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor
+
+from ...torch_model import TorchModel
+from ...utils import seq_lengths_to_mask, scale_seq_lengths
+from ...utils.ddp import ddp_wait_for_all_procs, ddp_get_rank
+
+
+class HFWav2VecBase(TorchModel):
+    """Base class for Wav2Vec style models (Wav2Vec2, Hubert, WavLM, ...) in HuggingFace.
+
+    This class includes the preprocessing steps common to all models.
+
+    Attributes:
+      pretrained_model_path (`str`, or os.PathLike, defaults to None): file path or
+        HuggingFace Hub path to pre-trained model.
+      normalize_input (`bool`, defaults to True): whether or not to zero-mean unit-variance
+        normalize the input.
+      use_input_attention_mask (`bool`, defaults to False): whether we should input an
+        attention mask to the wav2vec model.
+      cache_dir (str or os.PathLike): path to a directory in which a downloaded pretrained
+        model configuration should be cached if the standard cache should not be used.
+      force_download (`bool`, defaults to `False`): whether or not to force (re-)downloading
+        the model weights and configuration files, overriding the
+        cached versions if they exist.
+      resume_download (`bool`, defaults to `False`): whether or not to delete incompletely
+        received files. Will attempt to resume the download if such a file exists.
+      revision (`str`, defaults to `"main"`): the specific model version to use.
+        It can be a branch name, a tag name, or a commit id.
+      drop_layers_gt (`int`, defaults to None): drop encoder layers greater than this value (in [1, num_encoder_layers]).
+        If None, the model is not changed.
+      ignore_pretrained (`bool`, defaults to False): if True, it ignores the pretrained_model_path
+        and inits the model from the configuration. This is set to True for models that have already
+        been finetuned.
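+
+    Example:
+      A minimal sketch of building a subclass from a pre-trained checkpoint and
+      truncating its encoder with drop_layers_gt; the Hub path is illustrative::
+
+        feats = HFWav2Vec2(
+            pretrained_model_path="facebook/wav2vec2-base", drop_layers_gt=6
+        )
+        feats.num_encoder_layers  # 6 after dropping the upper layers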
+ """ + + def __init__( + self, + pretrained_model_path: Optional[Union[str, os.PathLike]] = None, + normalize_input: bool = True, + use_input_attention_mask: bool = False, + cache_dir: Union[str, os.PathLike] = "./.cache/hyperion_hf", + force_download: bool = False, + resume_download: bool = False, + revision: str = "main", + drop_layers_gt: Optional[int] = None, + ignore_pretrained: bool = False, + ): + super().__init__() + self.pretrained_model_path = pretrained_model_path + self.cache_dir = cache_dir + self.force_download = force_download + self.resume_download = resume_download + self.revision = revision + self.drop_layers_gt = drop_layers_gt + self.ignore_pretrained = ignore_pretrained + + if pretrained_model_path is not None and not ignore_pretrained: + logging.info( + f"Downloading config for HF preprocessor from {pretrained_model_path}" + ) + rank = ddp_get_rank() + if rank == 0: + # rank 0 downloads the model from HF web + try: + # some models donot have config for processor because do not have + # tokenizer, first we try to donwload feature_extractor config + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + except: + # if fails, we try to download full processor config + processor = Wav2Vec2Processor.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + feature_extractor = processor.feature_extractor + + # all ranks wait until the model is downloaded + ddp_wait_for_all_procs() + if rank > 0: + # the rest of ranks should read the configuration from the cache. + try: + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=False, + resume_download=False, + revision=revision, + ) + except: + # if fails, we try to download full processor config + processor = Wav2Vec2Processor.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=False, + resume_download=False, + revision=revision, + ) + feature_extractor = processor.feature_extractor + + ddp_wait_for_all_procs() + normalize_input = feature_extractor.do_normalize + use_input_attention_mask = feature_extractor.return_attention_mask + + self.normalize_input = normalize_input + self.use_input_attention_mask = use_input_attention_mask + + def __deepcopy__(self, memo): + """Reimplementation of deepcopy for Hugging Face models. + The weight_norm in the Conv. Pos. Encoder of Wav2Vec models make the default deepcopy to fail. 
+ """ + cls = self.__class__ # Extract the class of the object + cfg = self.get_config() + del cfg["class_name"] + # Create a new instance of the object based on extracted class + new_obj = cls(**cfg) + memo[id(self)] = new_obj + new_obj.load_state_dict(self.state_dict()) + device = next(self.parameters()).device + new_obj.to(device) + print( + "deepcopy", + next(self.parameters()).device, + next(new_obj.parameters()).device, + flush=True, + ) + return new_obj + + @property + def hf_config(self): + return self.hf_model.config + + def _normalize(self, x, x_mask=None): + """Normalizes the audio to have zero mean and unit variance.""" + if x_mask is None: + x = x - x.mean(dim=1, keepdim=True) + std = torch.sqrt((x ** 2).mean(dim=1, keepdim=True) + 1e-7) + x = x / std + else: + x_mask = x_mask.to(dtype=x.dtype) + x_samples = torch.mean(x_mask, dim=1, keepdim=True) + x_mean = torch.mean(x * x_mask, dim=1, keepdim=True) / x_samples + x2_mean = torch.mean(x ** 2 * x_mask, dim=1, keepdim=True) / x_samples + std = torch.sqrt(x2_mean - x_mean ** 2 + 1e-7) + x = (x - x_mean) / std + return x + + def _preprocess(self, x, x_lengths=None): + """Prepares input audio to be used as input to wav2vec style model.""" + x_mask = seq_lengths_to_mask(x_lengths, x.size(-1), dtype=torch.long) + if self.normalize_input: + x = self._normalize(x, x_lengths) + + if self.use_input_attention_mask: + x_mask = None + + return x, x_mask + + def forward( + self, + x: torch.Tensor, + x_lengths: Optional[torch.LongTensor] = None, + return_attentions: bool = False, + return_hid_states: bool = False, + ): + r"""Forward function for wav2vec style models. + + Args: + x: input audio of shape = (batch, sequence_length). + x_lengths: lengths of the audio waveforms in samples with shape = (batch,). + return_attentions: whether or not to return the attentions tensors of + all attention layers. + return_hid_states: whether or not to return the hidden states of all layers. + + Returns: + Dictionary with: + last_hidden_state: sequence of hidden-states at the output of the last + layer of the model (torch.FloatTensor of shape + (batch_size, sequence_length, hidden_size)). + extract_features: sequence of extracted feature vectors of the last + convolutional layer of the model. (torch.FloatTensor of shape + (batch_size, sequence_length, conv_dim[-1]) + hidden_states: hidden-states of the model at the output of each layer + plus the initial embedding outputs (tuple(torch.FloatTensor)). + attentions: Attentions weights after the attention softmax, used to + compute the weighted average in the self-attention heads + (tuple(torch.FloatTensor)). 
+ """ + max_in_length = x.size(-1) + x, x_mask = self._preprocess(x, x_lengths) + output = self.hf_model( + x, + x_mask, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + max_out_length = output.last_hidden_state.size(1) + feat_lengths = ( + None + if x_lengths is None + else scale_seq_lengths(x_lengths, max_out_length, max_in_length) + ) + output["hidden_states_lengths"] = feat_lengths + + return output + + def get_config(self): + """Returns the configuration arguments for the object in a dictionary.""" + + config = { + "pretrained_model_path": self.pretrained_model_path, + "normalize_input": self.normalize_input, + "use_input_attention_mask": self.use_input_attention_mask, + "cache_dir": self.cache_dir, + "force_download": self.force_download, + "resume_download": self.resume_download, + "revision": self.revision, + "drop_layers_gt": self.drop_layers_gt, + "ignore_pretrained": self.ignore_pretrained, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def save(self, file_path: str): + """Saves the model to disk.""" + self.ignore_pretrained = True + self.save(file_path) + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "pretrained_model_path", + "normalize_input", + "use_input_attention_mask", + "cache_dir", + "force_download", + "resume_download", + "revision", + "drop_layers_gt", + "ignore_pretrained", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--pretrained-model-path", + default=None, + help=("file path or HuggingFace Hub path to pre-trained model"), + ) + parser.add_argument( + "--normalize-input", + default=True, + action=ActionYesNo, + help=("whether or not to zero-mean unit-variance normalize the input"), + ) + parser.add_argument( + "--use-input-attention-mask", + default=False, + action=ActionYesNo, + help=("whether we should input an attention mask to the wav2vec model"), + ) + parser.add_argument( + "--cache-dir", + default="./.cache/hyperion_hf", + help=( + "path to a directory in which a downloaded pretrained model " + "configuration should be cached if the standard cache should not be used" + ), + ) + parser.add_argument( + "--force-download", + default=False, + action=ActionYesNo, + help=( + "whether or not to force the (re-)download the model weights " + "and configuration files and override thecached versions if they exist" + ), + ) + parser.add_argument( + "--resume-download", + default=False, + action=ActionYesNo, + help=( + "whether or not to delete incompletely received files. " + "Will attempt to resume the download if such a file exists" + ), + ) + parser.add_argument( + "--revision", + default="main", + help=( + "the specific model version to use. It can be a branch name, " + "a tag name, or a commit id. 
" + ), + ) + parser.add_argument( + "--drop-layers-gt", + default=None, + type=int, + help=("drop encoder layers greater than this value."), + ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py new file mode 100644 index 00000000..1e8a5e8d --- /dev/null +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -0,0 +1,622 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import os +import logging +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +from typing import Optional, Tuple, Union, List, Callable + +import torch +import torch.nn as nn + +from transformers import WavLMModel, WavLMConfig + +from ...utils.ddp import ddp_wait_for_all_procs, ddp_get_rank +from .hf_wav2vec_base import HFWav2VecBase + + +class HFWavLM(HFWav2VecBase): + r"""This is wrapper over HuggingFace WavLM model. + See documentation: https://huggingface.co/docs/transformers/model_doc/wavlm + + This wrapper makes the HugginFace model to have the same interface + as other hyperion models. It also add extra functionalities. + + The config. parameters are the same as in the HuggingFace WavLMConfig class. + + Attributes: + pretrained_model_path (`str`, defaults to None): file path or HuggingFace Hub path to + pre-trained model. + normalize_input (`bool`, defaults to True): whether or not to zero-mean unit-variance + normalize the input. + use_input_attention_mask (`bool`, defaults to False): whether we should input an + attention mask to the wav2vec model. + vocab_size (`int`, defaults to 32): vocabulary size of the + model. Defines the different tokens that can be represented by the + *inputs_ids* passed to the forward method. + hidden_size (`int`, defaults to 768): dimensionality of the encoder layers and + the pooler layer. + num_hidden_layers (`int`, defaults to 12): number of hidden layers in the + Transformer encoder. + num_attention_heads (`int`, defaults to 12): number of attention heads for + each attention layer in the Transformer encoder. + intermediate_size (`int`, defaults to 3072): dimensionality of the + feed-forward layer in the Transformer encoder. + hidden_act (`str` or `function`, defaults to `"gelu"`): the non-linear + activation function (function or string) in the encoder and pooler. + If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, defaults to 0.1): the dropout probability for all + fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, defaults to 0.1): the dropout probability for all + intermediate layer in feedforward transformer layers. + attention_dropout (`float`, defaults to 0.1): the dropout ratio for the + attention probabilities. + layerdrop (`float`, defaults to 0.1): prob. of dropping a layer. + initializer_range (`float`, defaults to 0.02): the standard deviation of the + truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, defaults to 1e-12): the epsilon used by the layer + normalization layers. + feat_extract_norm (`str`, defaults to `"group"`): + the norm to be applied to 1D convolutional layers in feature encoder. + One of `"group"` for group normalization of only the first 1D convolutional + layer or `"layer"` for layer normalization of all 1D convolutional layers. 
+ feat_proj_dropout (`float`, defaults to 0.0): the dropout probability for output
+ of the feature encoder.
+ feat_extract_activation (`str`, defaults to `"gelu"`): the non-linear
+ activation function (function or string) in the 1D convolutional layers of the feature
+ extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ conv_dim (`Tuple[int]`, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+ a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+ feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+ conv_stride (`Tuple[int]`, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+ a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+ of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+ conv_kernel (`Tuple[int]`, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+ a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+ length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+ *conv_dim*.
+ conv_bias (`bool`, defaults to `False`): whether the 1D convolutional layers have a bias.
+ num_conv_pos_embeddings (`int`, defaults to 128):
+ number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+ embeddings layer.
+ num_conv_pos_embedding_groups (`int`, defaults to 16):
+ number of groups of 1D convolutional positional embeddings layer.
+ do_stable_layer_norm (`bool`, defaults to `False`):
+ whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is
+ True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
+ False` corresponds to applying layer norm after the attention layer.
+ apply_spec_augment (`bool`, defaults to `True`):
+ whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+ [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+ Recognition](https://arxiv.org/abs/1904.08779).
+ mask_time_prob (`float`, defaults to 0.05):
+ percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+ procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+ reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+ actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+ mask_time_length (`int`, defaults to 10):
+ length of vector span along the time axis.
+ mask_time_min_masks (`int`, defaults to 2):
+ the minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+ irrespective of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+ mask_time_min_masks''
+ mask_feature_prob (`float`, defaults to 0.0):
+ percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+ masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
+ the axis.
If reasoning from the probability of each feature vector to be chosen as the start of the vector
+ span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+ may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+ True`.
+ mask_feature_length (`int`, defaults to 10):
+ length of vector span along the feature axis.
+ mask_feature_min_masks (`int`, defaults to 0):
+ the minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+ step, irrespective of `mask_feature_prob`. Only relevant if
+ ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
+ add_adapter (`bool`, defaults to `False`):
+ whether a convolutional network should be stacked on top of the WavLM Encoder. Can be very useful for
+ warm-starting WavLM for SpeechEncoderDecoder models.
+ adapter_kernel_size (`int`, defaults to 3):
+ kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+ adapter_stride (`int`, defaults to 2):
+ stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+ num_adapter_layers (`int`, defaults to 3):
+ number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+ True`.
+ output_hidden_size (`int`, defaults to None):
+ dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
+ if `add_adapter is True`.
+ cache_dir (str or os.PathLike): path to a directory in which a downloaded pretrained
+ model configuration should be cached if the standard cache should not be used.
+ force_download (`bool`, defaults to `False`): whether or not to force the (re-)download
+ of the model weights and configuration files and override the
+ cached versions if they exist.
+ resume_download (`bool`, defaults to `False`): whether or not to delete incompletely
+ received files. Will attempt to resume the download if such a file exists.
+ revision(`str`, defaults to `"main"`): the specific model version to use.
+ It can be a branch name, a tag name, or a commit id.
+ ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path
+ and inits the model from the configuration. This is set to True for models that have already
+ been finetuned.
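+
+ Example (illustrative sketch; the model path and shapes are assumptions):
+
+ >>> model = HFWavLM(pretrained_model_path="microsoft/wavlm-base-plus")
+ >>> x = torch.randn(2, 32000)  # two 2-second waveforms at 16 kHz
+ >>> out = model(x)
+ >>> out.last_hidden_state.shape[-1] == model.hidden_size
+ True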
+ """ + + def __init__( + self, + pretrained_model_path: Optional[Union[str, os.PathLike]] = None, + normalize_input: bool = True, + use_input_attention_mask: bool = False, + vocab_size: int = 32, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: Union[str, Callable] = "gelu", + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + layerdrop: float = 0.1, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + feat_extract_norm: str = "group", + feat_proj_dropout: float = 0.0, + feat_extract_activation: Union[str, Callable] = "gelu", + conv_dim: Tuple[int] = (512, 512, 512, 512, 512, 512, 512), + conv_stride: Tuple[int] = (5, 2, 2, 2, 2, 2, 2), + conv_kernel: Tuple[int] = (10, 3, 3, 3, 3, 3, 3), + conv_bias: bool = False, + num_conv_pos_embeddings: int = 128, + num_conv_pos_embedding_groups: int = 16, + do_stable_layer_norm: bool = False, + apply_spec_augment: bool = True, + mask_time_prob: float = 0.05, + mask_time_length: int = 10, + mask_time_min_masks: int = 2, + mask_feature_prob: float = 0.0, + mask_feature_length: int = 10, + mask_feature_min_masks: int = 0, + add_adapter: bool = False, + adapter_kernel_size: int = 3, + adapter_stride: int = 2, + num_adapter_layers: int = 3, + output_hidden_size: Optional[int] = None, + cache_dir: Union[str, os.PathLike] = "./.cache/hyperion_hf", + force_download: bool = False, + resume_download: bool = False, + revision: str = "main", + drop_layers_gt: Optional[int] = None, + ignore_pretrained: bool = False, + ): + + super().__init__( + pretrained_model_path=pretrained_model_path, + normalize_input=normalize_input, + use_input_attention_mask=use_input_attention_mask, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + drop_layers_gt=drop_layers_gt, + ignore_pretrained=ignore_pretrained, + ) + + if pretrained_model_path is not None and not ignore_pretrained: + logging.info(f"Downloading HF model from {pretrained_model_path}") + rank = ddp_get_rank() + if rank == 0: + # rank 0 downloads the model from HF web + self.hf_model = WavLMModel.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + revision=revision, + ) + # all ranks wait until the model is downloaded + ddp_wait_for_all_procs() + if rank > 0: + # the rest of ranks should read the configuration from the cache. 
+ self.hf_model = WavLMModel.from_pretrained( + pretrained_model_path, + cache_dir=cache_dir, + force_download=False, + resume_download=False, + revision=revision, + ) + ddp_wait_for_all_procs() + self.hf_model.config.layerdrop = 0.0 + else: + hf_config = WavLMConfig( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + layerdrop=0.0, # layerdrop, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + feat_extract_norm=feat_extract_norm, + feat_extract_activation=feat_extract_activation, + conv_dim=conv_dim, + conv_stride=conv_stride, + conv_kernel=conv_kernel, + conv_bias=conv_bias, + num_conv_pos_embeddings=num_conv_pos_embeddings, + num_conv_pos_embedding_groups=num_conv_pos_embedding_groups, + do_stable_layer_norm=do_stable_layer_norm, + apply_spec_augment=apply_spec_augment, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + add_adapter=add_adapter, + adapter_kernel_size=adapter_kernel_size, + adapter_stride=adapter_stride, + num_adapter_layers=num_adapter_layers, + output_hidden_size=output_hidden_size, + ) + self.hf_model = WavLMModel(hf_config) + + if drop_layers_gt is not None: + self.drop_upper_layers(drop_layers_gt) + + self.ignore_pretrained = True + + @property + def num_encoder_layers(self): + return self.hf_config.num_hidden_layers + + @property + def hidden_size(self): + return self.hf_config.hidden_size + + def drop_upper_layers(self, max_layers: int): + if max_layers >= self.hf_config.num_hidden_layers: + return + + layers = self.hf_model.encoder.layers + self.hf_model.encoder.layers = nn.ModuleList( + [l for i, l in enumerate(layers) if i < max_layers] + ) + self.hf_config.num_hidden_layers = max_layers + + if self.hf_model.adapter is not None: + del self.hf_model.adapter + self.hf_model.adapter = None + self.hf_config.add_adapter = False + + def get_config(self): + """Returns the configuration arguments for the object in a dictionary.""" + config = self.hf_model.config.to_dict() + config = self.filter_args(**config) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + args_base = HFWav2VecBase.filter_args(**kwargs) + valid_args = ( + "vocab_size", + "hidden_size", + "num_hidden_layers", + "num_attention_heads", + "intermediate_size", + "hidden_act", + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "layerdrop", + "initializer_range", + "layer_norm_eps", + "feat_extract_norm", + "feat_extract_activation", + "conv_dim", + "conv_stride", + "conv_kernel", + "conv_bias", + "num_conv_pos_embeddings", + "num_conv_pos_embedding_groups", + "do_stable_layer_norm", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + "add_adapter", + "adapter_kernel_size", + "adapter_stride", + "num_adapter_layers", + "output_hidden_size", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return 
args
+
+ @staticmethod
+ def add_class_args(parser, prefix=None, skip=set()):
+ if prefix is not None:
+ outer_parser = parser
+ parser = ArgumentParser(prog="")
+
+ HFWav2VecBase.add_class_args(parser)
+
+ parser.add_argument(
+ "--vocab-size",
+ default=32,
+ type=int,
+ help=(
+ "vocabulary size of the "
+ "model. Defines the different tokens that can be represented by the "
+ "*inputs_ids* passed to the forward method."
+ ),
+ )
+ parser.add_argument(
+ "--hidden-size",
+ default=768,
+ type=int,
+ help=("dimensionality of the encoder layers and the pooler layer."),
+ )
+ parser.add_argument(
+ "--num-hidden-layers",
+ default=12,
+ type=int,
+ help=("number of hidden layers in the Transformer encoder"),
+ )
+ parser.add_argument(
+ "--num-attention-heads",
+ default=12,
+ type=int,
+ help=(
+ "number of attention heads for "
+ "each attention layer in the Transformer encoder"
+ ),
+ )
+ parser.add_argument(
+ "--intermediate-size",
+ default=3072,
+ type=int,
+ help=(
+ "dimensionality of the " "feed-forward layer in the Transformer encoder"
+ ),
+ )
+ parser.add_argument(
+ "--hidden-act",
+ default="gelu",
+ choices=["gelu", "relu", "selu", "gelu_new"],
+ help=(
+ "the non-linear "
+ "activation function (function or string) in the encoder and pooler"
+ ),
+ )
+ parser.add_argument(
+ "--hidden-dropout",
+ default=0.1,
+ type=float,
+ help=(
+ "the dropout probability for all "
+ "fully connected layers in the embeddings, encoder, and pooler"
+ ),
+ )
+ parser.add_argument(
+ "--activation-dropout",
+ default=0.1,
+ type=float,
+ help=(
+ "the dropout probability for all "
+ "intermediate layers in feedforward transformer layers"
+ ),
+ )
+ parser.add_argument(
+ "--attention-dropout",
+ default=0.1,
+ type=float,
+ help=("the dropout ratio for the attention probabilities"),
+ )
+ parser.add_argument(
+ "--layerdrop",
+ default=0.1,
+ type=float,
+ help=("prob. of dropping a layer"),
+ )
+ parser.add_argument(
+ "--initializer-range",
+ default=0.02,
+ type=float,
+ help=(
+ "the standard deviation of the "
+ "truncated_normal_initializer for initializing all weight matrices"
+ ),
+ )
+ parser.add_argument(
+ "--layer-norm-eps",
+ default=1e-12,
+ type=float,
+ help=("the epsilon used by the layer normalization layers"),
+ )
+ parser.add_argument(
+ "--feat-extract-norm",
+ default="group",
+ choices=["group", "layer"],
+ help=(
+ "the norm to be applied to 1D convolutional layers in feature encoder. "
+ "One of `group` for group normalization of only the first 1D convolutional "
+ "layer or `layer` for layer normalization of all 1D convolutional layers"
+ ),
+ )
+ parser.add_argument(
+ "--feat-proj-dropout",
+ default=0.0,
+ type=float,
+ help=("the dropout probability for output of the feature encoder"),
+ )
+ parser.add_argument(
+ "--feat-extract-activation",
+ default="gelu",
+ choices=["gelu", "relu", "selu", "gelu_new"],
+ help=(
+ "the non-linear activation function (function or string) in the 1D "
+ "convolutional layers of the feature extractor"
+ ),
+ )
+ parser.add_argument(
+ "--conv-dim",
+ default=[512, 512, 512, 512, 512, 512, 512],
+ nargs="+",
+ type=int,
+ help=(
+ "a tuple of integers defining the number of input and output channels of each 1D convolutional layer in the "
+ "feature encoder.
The length of *conv_dim* defines the number of 1D convolutional layers" + ), + ) + parser.add_argument( + "--conv-stride", + default=[5, 2, 2, 2, 2, 2, 2], + nargs="+", + type=int, + help=( + "a tuple of integers defining the stride of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-kernel", + default=[10, 3, 3, 3, 3, 3, 3], + nargs="+", + type=int, + help=( + "a tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder" + ), + ) + parser.add_argument( + "--conv-bias", + default=False, + action=ActionYesNo, + help=("whether the 1D convolutional layers have a bias"), + ) + parser.add_argument( + "--num-conv-pos-embeddings", + default=128, + type=int, + help=( + "number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional " + "embeddings layer" + ), + ) + parser.add_argument( + "--num-conv-pos-embedding-groups", + default=16, + type=int, + help=("number of groups of 1D convolutional positional embeddings layer"), + ) + parser.add_argument( + "--do-stable-layer-norm", + default=False, + action=ActionYesNo, + help=( + "whether to apply *stable* layer norm architecture of the Transformer encoder" + ), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` generated along the feature axis" + ), + ) + parser.add_argument( + "--add-adapter", + default=False, + action=ActionYesNo, + help=( + "whether a convolutional network should be stacked on top of the WavLM Encoder" + ), + ) + parser.add_argument( + "--adapter-kernel-size", + default=3, + type=int, + help=("kernel size of the convolutional layers in the adapter network"), + ) + parser.add_argument( + "--adapter-stride", + default=2, + type=int, + help=("stride of the convolutional layers in the adapter network"), + ) + parser.add_argument( + "--num-adapter-layers", + default=3, + type=int, + help=( + "number of convolutional layers that should be used in the adapter network" + ), + ) + parser.add_argument( + "--output-hidden-size", + default=None, + type=int, + help=( + "dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*." 
+ " Only relevant if `add_adapter is True" + ), + ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 2755bbbe..1821b674 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -99,7 +99,6 @@ def __init__( ): self.model = model - # self.optimizer = optim self.loss = loss self.epochs = epochs self.cur_epoch = cur_epoch @@ -334,7 +333,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_size = data.shape[0] with self.amp_autocast(): - output = self.model(data, **self.amp_args) + output = self.model(data) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() @@ -374,7 +373,7 @@ def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): ) def update_model(self): - + """Updates the model and does gradding clipping.""" if self.use_amp: if self.grad_clip > 0: self.grad_scaler.unscale_(self.optimizer) @@ -393,6 +392,7 @@ def update_model(self): self.optimizer.step() def _make_optimizer(self, optim, model, oss=False): + """Makes an optimizer object.""" if isinstance(optim, torch.optim.Optimizer): return optim @@ -405,6 +405,7 @@ def _make_optimizer(self, optim, model, oss=False): return optimizer def _make_lr_sched(self, lr_sched, optim): + """Makes a Learning Rate scheduler object.""" if lr_sched is None or isinstance(lr_sched, LRS): return lr_sched diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 3e704bd5..a643ca7f 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -12,6 +12,7 @@ from ..utils import MetricAcc from .torch_trainer import TorchTrainer +from torch.distributed.elastic.multiprocessing.errors import record class XVectorTrainer(TorchTrainer): @@ -107,6 +108,7 @@ def __init__( cpu_offload=cpu_offload, ) + @record def train_epoch(self, data_loader): """Training epoch loop @@ -129,18 +131,28 @@ def train_epoch(self, data_loader): batch_size = data.shape[0] with self.amp_autocast(): + # logging.info( + # f"in_model rank={self.rank} batch={batch} x={data} mxx={data.max()} avgx={data.mean()}" + # ) output = self.model(data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps + # logging.info( + # f"out_model rank={self.rank} batch={batch} y={output} loss={loss.item()}" + # ) if self.use_amp: + # logging.info("in_backward rank=%d batch=%d", self.rank, batch) self.grad_scaler.scale(loss).backward() + # logging.info("out_backward rank=%d batch=%d", self.rank, batch) else: loss.backward() if (batch + 1) % self.grad_acc_steps == 0: if self.lr_scheduler is not None and not self.in_swa: self.lr_scheduler.on_opt_step() + # logging.info("in_update rank=%d batch=%d", self.rank, batch) self.update_model() + # logging.info("out_update rank=%d batch=%d", self.rank, batch) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index 22af492c..3a4692dc 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -5,7 +5,7 @@ from .devices import open_device from .metric_acc import MetricAcc -from .masking import seq_lengths_to_mask, scale_lengths +from .masking import seq_lengths_to_mask, scale_seq_lengths from .collation import collate_seq_1d, collate_seq_2d, collate_seq_nd 
from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add from .vad_utils import remove_silence diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 48a8bcfe..7038cff3 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -4,7 +4,7 @@ """ import os import logging - +import datetime import torch import torch.nn as nn import torch.distributed as dist @@ -61,7 +61,11 @@ def ddp_init( logging.info( f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port}" ) - dist.init_process_group("nccl", rank=rank, world_size=world_size) + dist.init_process_group( + "nccl", + rank=rank, + world_size=world_size, + ) torch.tensor([0]).to(gpu_id) return gpu_id, rank, world_size @@ -73,6 +77,23 @@ def ddp_cleanup(): pass +def ddp_wait_for_all_procs(): + if dist.is_initialized(): + dist.barrier() + + +def ddp_get_rank_world_size(): + if dist.is_initialized(): + return dist.get_rank(), dist.get_world_size() + return 0, 1 + + +def ddp_get_rank(): + if dist.is_initialized(): + return dist.get_rank() + return 0 + + class TorchDDP(nn.parallel.DistributedDataParallel): def __getattr__(self, name): try: diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py index b6ccd5ef..1bb5a644 100644 --- a/hyperion/torch/utils/masking.py +++ b/hyperion/torch/utils/masking.py @@ -7,7 +7,7 @@ import torch.nn as nn -def scale_lengths(lengths, max_out_length, max_in_length=None): +def scale_seq_lengths(lengths, max_out_length, max_in_length=None): if lengths is None: return None From e9cb8a36975692c9ecd6c9a11171a4d719a826df Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 9 May 2022 19:23:27 -0400 Subject: [PATCH 010/154] added noam and triangular schedulers, added option to change dropout and specaugment options in wav2vec models --- ...dnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh | 51 +++++++++++++ ...dnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh | 2 +- hyperion/torch/lr_schedulers/__init__.py | 3 + hyperion/torch/lr_schedulers/factory.py | 43 ++++++++++- hyperion/torch/lr_schedulers/invpow_lr.py | 4 +- hyperion/torch/lr_schedulers/noam_lr.py | 44 +++++++++++ hyperion/torch/lr_schedulers/triangular_lr.py | 75 +++++++++++++++++++ hyperion/torch/torch_model.py | 6 ++ hyperion/torch/tpm/hf/hf_hubert.py | 20 +++++ hyperion/torch/tpm/hf/hf_wav2vec2.py | 46 ++++++++++++ hyperion/torch/tpm/hf/hf_wav2vec_base.py | 65 ++++++++++++++-- hyperion/torch/tpm/hf/hf_wavlm.py | 20 +++++ 12 files changed, 368 insertions(+), 11 deletions(-) create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh create mode 100644 hyperion/torch/lr_schedulers/noam_lr.py create mode 100644 hyperion/torch/lr_schedulers/triangular_lr.py diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh new file mode 100644 index 00000000..b40ff3d1 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh @@ -0,0 +1,51 @@ +# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 + +# hugging face model +hf_model_name=wav2vec2base + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +dropout=0 +embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 
+nnet_num_epochs=70
+
+
+lr=0.001
+#lr=0.005
+xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml
+xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75"
+
+nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v3 #v1
+
+nnet_dir=exp/xvector_nnets/$nnet_name
+nnet=$nnet_dir/model_ep0060.pth
+nnet=$nnet_dir/swa_model_ep0076.pth
+nnet=$nnet_dir/model_ep0060.pth
+
+# back-end
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=6
+if [ $plda_num_augs -eq 0 ]; then
+ plda_data=voxceleb2cat_train
+else
+ plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh
index a021d5a1..24bc799a 100644
--- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh
+++ b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh
@@ -33,7 +33,7 @@ nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${d
 nnet_dir=exp/xvector_nnets/$nnet_name
 nnet=$nnet_dir/model_ep0070.pth
-nnet=$nnet_dir/model_ep0060.pth
+nnet=$nnet_dir/swa_model_ep0064.pth
 # back-end
diff --git a/hyperion/torch/lr_schedulers/__init__.py b/hyperion/torch/lr_schedulers/__init__.py
index f0a3465e..be77dc15 100644
--- a/hyperion/torch/lr_schedulers/__init__.py
+++ b/hyperion/torch/lr_schedulers/__init__.py
@@ -8,4 +8,7 @@
 from .red_lr_on_plateau import ReduceLROnPlateau
 from .exp_lr import ExponentialLR
 from .cos_lr import CosineLR, AdamCosineLR
+from .invpow_lr import InvPowLR
+from .noam_lr import NoamLR
+from .triangular_lr import TriangularLR
 from .factory import LRSchedulerFactory
diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py
index 9e185a7c..10b47ab2 100644
--- a/hyperion/torch/lr_schedulers/factory.py
+++ b/hyperion/torch/lr_schedulers/factory.py
@@ -10,6 +10,8 @@
 from .exp_lr import ExponentialLR
 from .invpow_lr import InvPowLR
 from .cos_lr import CosineLR, AdamCosineLR
+from .noam_lr import NoamLR
+from .triangular_lr import TriangularLR
 class LRSchedulerFactory(object):
@@ -34,6 +36,8 @@ def create(
 eps=1e-8,
 min_lr=0,
 warmup_steps=0,
+ d_model=None,
+ lr_factor=1,
 update_lr_on_opt_step=False,
 ):
@@ -61,6 +65,15 @@
 update_lr_on_opt_step=update_lr_on_opt_step,
 )
+ if lrsch_type == "noam_lr":
+ return NoamLR(
+ optimizer,
+ d_model,
+ lr_factor,
+ min_lr=min_lr,
+ warmup_steps=warmup_steps,
+ )
+
 if lrsch_type == "cos_lr":
 return CosineLR(
 optimizer,
@@ -73,6 +86,16 @@
 update_lr_on_opt_step=update_lr_on_opt_step,
 )
+ if lrsch_type == "triangular_lr":
+ return TriangularLR(
+ optimizer,
+ t,
+ t_mul,
+ min_lr=min_lr,
+ gamma=gamma,
+ update_lr_on_opt_step=update_lr_on_opt_step,
+ )
+
 if lrsch_type == "adamcos_lr":
 return AdamCosineLR(
 optimizer,
@@ -122,6 +145,8 @@ def filter_args(**kwargs):
 "eps",
 "min_lr",
 "warmup_steps",
+ "lr_factor",
+ "d_model",
 "update_lr_on_opt_step",
 )
@@ -144,6 +169,8 @@ def add_class_args(parser, prefix=None):
 "cos_lr",
 "adamcos_lr",
 "red_lr_on_plateau",
+ "noam_lr",
+ "triangular_lr",
 ],
 help=(
 "Learning rate schedulers: None,
Exponential," @@ -173,13 +200,13 @@ def add_class_args(parser, prefix=None): "--t-mul", default=1, type=int, - help=("Period multiplicator for each restart in cos lr"), + help=("Period multiplicator for each restart in cos/triangular lr"), ) parser.add_argument( "--gamma", default=1 / 100, type=float, - help=("LR decay rate for each restart in cos lr"), + help=("LR decay rate for each restart in cos/triangular lr"), ) parser.add_argument( @@ -248,6 +275,18 @@ def add_class_args(parser, prefix=None): help=("Number of batches to warmup lr"), ) + parser.add_argument( + "--d-model", + default=None, + type=int, + help=("Transformer model hidden dimension"), + ) + parser.add_argument( + "--lr-factor", + default=1, + type=float, + help=("learning rate scaling factor for Noam schedule"), + ) parser.add_argument( "--update-lr-on-opt-step", default=False, diff --git a/hyperion/torch/lr_schedulers/invpow_lr.py b/hyperion/torch/lr_schedulers/invpow_lr.py index 53aa28dc..7590a64c 100644 --- a/hyperion/torch/lr_schedulers/invpow_lr.py +++ b/hyperion/torch/lr_schedulers/invpow_lr.py @@ -10,7 +10,7 @@ class InvPowLR(LRScheduler): - """inverse power learning rate scheduler.""" + """inverse power decay learning rate scheduler.""" def __init__( self, @@ -23,7 +23,7 @@ def __init__( step=0, update_lr_on_opt_step=False, ): - super(InvPowLR, self).__init__( + super().__init__( optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step ) self.power = power diff --git a/hyperion/torch/lr_schedulers/noam_lr.py b/hyperion/torch/lr_schedulers/noam_lr.py new file mode 100644 index 00000000..edce0605 --- /dev/null +++ b/hyperion/torch/lr_schedulers/noam_lr.py @@ -0,0 +1,44 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import math +from turtle import up +import torch + +from .invpow_lr import InvPowLR + + +class NoamLR(InvPowLR): + """Optimizer used for Transformers in + Attention is all You Need: https://arxiv.org/pdf/1706.03762.pdf + + This is Inverse Power Law decay scheduler with parameters that depend on + the transformer hidden dimension. + + Attributes: + + """ + + def __init__( + self, + optimizer, + d_model, + lr_factor=1, + min_lr=0, + warmup_steps=0, + epoch=0, + step=0, + ): + lr = lr_factor / math.sqrt(d_model * warmup_steps) + print("noam_lr", lr, flush=True) + for group in optimizer.param_groups: + group["lr"] = lr + super().__init__( + optimizer, + min_lr=min_lr, + warmup_steps=warmup_steps, + epoch=epoch, + step=step, + update_lr_on_opt_step=True, + ) diff --git a/hyperion/torch/lr_schedulers/triangular_lr.py b/hyperion/torch/lr_schedulers/triangular_lr.py new file mode 100644 index 00000000..add8a13c --- /dev/null +++ b/hyperion/torch/lr_schedulers/triangular_lr.py @@ -0,0 +1,75 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +import math +import logging + +import torch + +from .lr_scheduler import LRScheduler + + +class TriangularLR(LRScheduler): + r"""Sets cyclid triangular learning rate schedule as proposed in + .. Cyclical Learning Rates for Training Neural Networks: + https://arxiv.org/abs/1506.01186 + + .. 
math::
+ \mathrm{cycle} = \mathrm{floor}(1 + \frac{T_{cur}}{T_{max}})
+ x = \mathrm{abs}(2\frac{T_{cur}}{T_{max}}-2\mathrm{cycle}+1)
+ \eta_t = \eta_{min} + (\eta_{max} - \eta_{min})\max(0, 1-x)
+
+ """
+
+ def __init__(
+ self,
+ optimizer,
+ T,
+ T_mul=1,
+ min_lr=0,
+ gamma=1,
+ last_restart=0,
+ num_restarts=0,
+ epoch=0,
+ step=0,
+ update_lr_on_opt_step=False,
+ ):
+
+ super().__init__(optimizer, min_lr, 0, epoch, step, update_lr_on_opt_step)
+ self.T = T
+ self.T_mul = T_mul
+ self.last_restart = last_restart
+ self.num_restarts = num_restarts
+ self.gamma = gamma
+
+ def on_epoch_begin(self, epoch=None, epoch_updates=1, **kwargs):
+ super().on_epoch_begin(epoch)
+ if self.update_lr_on_opt_step:
+ # T has to correspond to an integer number of epochs
+ T = int(math.ceil(self.T / epoch_updates) * epoch_updates)
+ if self.T != T:
+ logging.info("readjusting triangular_lr T %d -> %d" % (self.T, T))
+ self.T = T
+
+ def get_lr(self, step):
+ x = step - self.last_restart
+
+ if x >= self.T:
+ self.last_restart = step
+ x = 0
+ self.T *= self.T_mul
+ self.num_restarts += 1
+ logging.info(
+ "triangular_lr warm-restart=%d T=%d" % (self.num_restarts, self.T)
+ )
+
+ alpha = self.gamma ** self.num_restarts
+ x = abs(2 * x / self.T - 1)
+
+ return [
+ eta_min + (alpha * eta_max - eta_min) * max(0, 1 - x)
+ for eta_max, eta_min in zip(self.base_lrs, self.min_lrs)
+ ]
diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py
index dc5de737..2e4deac6 100644
--- a/hyperion/torch/torch_model.py
+++ b/hyperion/torch/torch_model.py
@@ -8,8 +8,14 @@
 import torch
 import torch.nn as nn
+torch_model_registry = {}
+
 class TorchModel(nn.Module):
+ def __init_subclass__(cls, **kwargs):
+ super().__init_subclass__(**kwargs)
+ torch_model_registry[cls.__name__] = cls
+
 def get_config(self):
 config = {"class_name": self.__class__.__name__}
diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py
index 889aed03..34f40cc8 100644
--- a/hyperion/torch/tpm/hf/hf_hubert.py
+++ b/hyperion/torch/tpm/hf/hf_hubert.py
@@ -127,6 +127,10 @@ class HFHubert(HFWav2VecBase):
 ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path
 and inits the model from the configuration. This is set to True for models that have already
 been finetuned.
+ override_dropouts (`bool` defaults to False): if True, it ignores the dropout probs. in the pretrained model
+ and uses the ones passed as arguments.
+ override_spec_augment (`bool` defaults to False): if True, it ignores the spec. augment.
+ configuration in the pretrained model and uses the ones passed in the arguments.
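+
+ Example (illustrative sketch; the model path and values are assumptions):
+
+ >>> model = HFHubert(pretrained_model_path="facebook/hubert-base-ls960",
+ ... override_dropouts=True, hidden_dropout=0.0)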
""" def __init__( @@ -169,6 +173,8 @@ def __init__( revision: str = "main", drop_layers_gt: Optional[int] = None, ignore_pretrained: bool = False, + override_dropouts: bool = False, + override_spec_augment: bool = False, ): super().__init__( @@ -181,6 +187,8 @@ def __init__( revision=revision, drop_layers_gt=drop_layers_gt, ignore_pretrained=ignore_pretrained, + override_dropouts=override_dropouts, + override_spec_augment=override_spec_augment, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -208,6 +216,18 @@ def __init__( ) ddp_wait_for_all_procs() self.hf_model.config.layerdrop = 0.0 + self.change_hyperparams( + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + ) else: hf_config = HubertConfig( vocab_size=vocab_size, diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 63a7cf99..0b0302eb 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -140,6 +140,10 @@ class HFWav2Vec2(HFWav2VecBase): ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path and inits the model from the configuration. This is set to True for models that have already been finetuned. + override_dropouts (`bool` defaults to False): if True, it ingnores the dropout probs. in the pretrained model + and uses the ones passed as arguments. + override_spec_augment (`bool` defaults to False): if True, it ingnores the spec. augment. + configuration in the pretrained model and uses the ones passed in the arguments. 
""" def __init__( @@ -187,6 +191,8 @@ def __init__( revision: str = "main", drop_layers_gt: Optional[int] = None, ignore_pretrained: bool = False, + override_dropouts: bool = False, + override_spec_augment: bool = False, ): super().__init__( @@ -199,6 +205,8 @@ def __init__( revision=revision, drop_layers_gt=drop_layers_gt, ignore_pretrained=ignore_pretrained, + override_dropouts=override_dropouts, + override_spec_augment=override_spec_augment, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -226,6 +234,18 @@ def __init__( ) ddp_wait_for_all_procs() self.hf_model.config.layerdrop = 0.0 + self.change_hyperparams( + hidden_dropout=hidden_dropout, + activation_dropout=activation_dropout, + attention_dropout=attention_dropout, + feat_proj_dropout=feat_proj_dropout, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, + mask_time_min_masks=mask_time_min_masks, + mask_feature_prob=mask_feature_prob, + mask_feature_length=mask_feature_length, + mask_feature_min_masks=mask_feature_min_masks, + ) else: hf_config = Wav2Vec2Config( vocab_size=vocab_size, @@ -278,6 +298,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.0, + **kwargs, + ): + import transformers.models.wav2vec2.modeling_wav2vec2 as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + t.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.Wav2Vec2Attention): + module.dropout = activation_dropout + if isinstance(module, t.Wav2Vec2FeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 0b862d62..9f5c353b 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -45,6 +45,10 @@ class HFWav2VecBase(TorchModel): ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path and inits the model from the configuration. This is set to True for models that have already been finetuned. + override_dropouts (`bool` defaults to False): if True, it ingnores the dropout probs. in the pretrained model + and uses the ones passed as arguments. + override_spec_augment (`bool` defaults to False): if True, it ingnores the spec. augment. + configuration in the pretrained model and uses the ones passed in the arguments. 
""" def __init__( @@ -58,6 +62,8 @@ def __init__( revision: str = "main", drop_layers_gt: Optional[int] = None, ignore_pretrained: bool = False, + override_dropouts: bool = False, + override_spec_augment: bool = False, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -67,6 +73,8 @@ def __init__( self.revision = revision self.drop_layers_gt = drop_layers_gt self.ignore_pretrained = ignore_pretrained + self.override_dropouts = override_dropouts + self.override_spec_augment = override_spec_augment if pretrained_model_path is not None and not ignore_pretrained: logging.info( @@ -139,14 +147,37 @@ def __deepcopy__(self, memo): new_obj.load_state_dict(self.state_dict()) device = next(self.parameters()).device new_obj.to(device) - print( - "deepcopy", - next(self.parameters()).device, - next(new_obj.parameters()).device, - flush=True, - ) return new_obj + def change_hyperparams(self, **kwargs): + if self.override_spec_augment: + self.change_spec_augment(**kwargs) + + if self.override_dropouts: + self.change_dropouts(**kwargs) + + def change_spec_augment( + self, + apply_spec_augment: bool = True, + mask_time_prob: float = 0.05, + mask_time_length: int = 10, + mask_time_min_masks: int = 2, + mask_feature_prob: float = 0.0, + mask_feature_length: int = 10, + mask_feature_min_masks: int = 0, + **kwargs, + ): + self.hf_model.config.apply_spec_augment = apply_spec_augment + self.hf_model.config.mask_time_prob = mask_time_prob + self.hf_model.config.mask_time_length = mask_time_length + self.hf_model.config.mask_time_min_masks = mask_time_min_masks + self.hf_model.config.mask_feature_prob = mask_feature_prob + self.hf_model.config.mask_feature_length = mask_feature_length + self.hf_model.config.mask_feature_min_masks = mask_feature_min_masks + + def change_dropouts(self, **kwargs): + pass # needs to be overloaded + @property def hf_config(self): return self.hf_model.config @@ -238,6 +269,8 @@ def get_config(self): "revision": self.revision, "drop_layers_gt": self.drop_layers_gt, "ignore_pretrained": self.ignore_pretrained, + "override_dropouts": self.override_dropouts, + "override_spec_augment": self.override_spec_augment, } base_config = super().get_config() @@ -260,6 +293,8 @@ def filter_args(**kwargs): "revision", "drop_layers_gt", "ignore_pretrained", + "override_dropouts", + "override_spec_augment", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @@ -327,5 +362,23 @@ def add_class_args(parser, prefix=None, skip=set()): type=int, help=("drop encoder layers greater than this value."), ) + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + parser.add_argument( + "--override-spec-augment", + default=False, + action=ActionYesNo, + help=( + "whether to use the spec augment config. passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 1e8a5e8d..229c1871 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -140,6 +140,10 @@ class HFWavLM(HFWav2VecBase): ignore_pretrained (`bool` defaults to False): if True, it ignores the pretrained_model_path and inits the model from the configuration. 
This is set to True for models that have already been finetuned.
+ override_dropouts (`bool` defaults to False): if True, it ignores the dropout probs. in the pretrained model
+ and uses the ones passed as arguments.
+ override_spec_augment (`bool` defaults to False): if True, it ignores the spec. augment.
+ configuration in the pretrained model and uses the ones passed in the arguments.
 """
 def __init__(
@@ -187,6 +191,8 @@
 revision: str = "main",
 drop_layers_gt: Optional[int] = None,
 ignore_pretrained: bool = False,
+ override_dropouts: bool = False,
+ override_spec_augment: bool = False,
 ):
 super().__init__(
@@ -199,6 +205,8 @@
 revision=revision,
 drop_layers_gt=drop_layers_gt,
 ignore_pretrained=ignore_pretrained,
+ override_dropouts=override_dropouts,
+ override_spec_augment=override_spec_augment,
 )
 if pretrained_model_path is not None and not ignore_pretrained:
@@ -226,6 +234,18 @@
 )
 ddp_wait_for_all_procs()
 self.hf_model.config.layerdrop = 0.0
+ self.change_hyperparams(
+ hidden_dropout=hidden_dropout,
+ activation_dropout=activation_dropout,
+ attention_dropout=attention_dropout,
+ feat_proj_dropout=feat_proj_dropout,
+ mask_time_prob=mask_time_prob,
+ mask_time_length=mask_time_length,
+ mask_time_min_masks=mask_time_min_masks,
+ mask_feature_prob=mask_feature_prob,
+ mask_feature_length=mask_feature_length,
+ mask_feature_min_masks=mask_feature_min_masks,
+ )
 else:
 hf_config = WavLMConfig(
 vocab_size=vocab_size,
From 5c5ef7c0421ba0603d1dfa366e998fc5cb3613dd Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Mon, 16 May 2022 12:27:44 -0400
Subject: [PATCH 011/154] changed way that training modes work in TorchModels
---
 .../conf/wav2vec2base_do1_ecapatdnn512x2.yaml | 42 ++++++
 .../wav2vec2base_specaug1_ecapatdnn512x2.yaml | 39 ++++++
 .../wav2vec2base_specaug2_ecapatdnn512x2.yaml | 39 ++++++
 .../wav2vec2base_specaug3_ecapatdnn512x2.yaml | 39 ++++++
 .../wav2vec2base_specaug4_ecapatdnn512x2.yaml | 39 ++++++
 .../wav2vec2base_specaug5_ecapatdnn512x2.yaml | 40 ++++++
 ...nn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh | 56 ++++++++
 ...nn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh} | 16 ++-
 hyperion/torch/torch_model.py | 127 ++++++++++++++++--
 hyperion/torch/trainers/torch_trainer.py | 13 +-
 hyperion/torch/trainers/xvector_trainer.py | 10 --
 hyperion/utils/ext_segment_list.py | 10 +-
 12 files changed, 436 insertions(+), 34 deletions(-)
 create mode 100644 egs/voxceleb/v2/conf/wav2vec2base_do1_ecapatdnn512x2.yaml
 create mode 100644 egs/voxceleb/v2/conf/wav2vec2base_specaug1_ecapatdnn512x2.yaml
 create mode 100644 egs/voxceleb/v2/conf/wav2vec2base_specaug2_ecapatdnn512x2.yaml
 create mode 100644 egs/voxceleb/v2/conf/wav2vec2base_specaug3_ecapatdnn512x2.yaml
 create mode 100644 egs/voxceleb/v2/conf/wav2vec2base_specaug4_ecapatdnn512x2.yaml
 create mode 100644 egs/voxceleb/v2/conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml
 create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh
 rename egs/voxceleb/v2/global_conf/{config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh => config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh} (65%)
diff --git a/egs/voxceleb/v2/conf/wav2vec2base_do1_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_do1_ecapatdnn512x2.yaml
new file mode 100644
index 00000000..f616073c
--- /dev/null
+++ b/egs/voxceleb/v2/conf/wav2vec2base_do1_ecapatdnn512x2.yaml
@@ -0,0 +1,42 @@
+hf_feats:
+ pretrained_model_path: facebook/wav2vec2-base
+ override_dropouts:
+  activation_dropout: 0.1
+  attention_dropout: 0.2
+  hidden_dropout: 0.2
+  feat_proj_dropout: 0.2
+xvector:
+  resnet_enc:
+    in_feats: 80
+    in_conv_channels: 512
+    in_kernel_size: 5
+    in_stride: 1
+    resb_type: seres2bn
+    resb_repeats:
+    - 1
+    - 1
+    resb_channels:
+    - 512
+    resb_kernel_sizes:
+    - 3
+    resb_dilations:
+    - 2
+    - 3
+    resb_strides:
+    - 1
+    res2net_width_factor: 1
+    res2net_scale: 8
+    se_r: 4
+    multilayer: true
+    multilayer_concat: true
+    endpoint_channels: 1536
+  pool_net:
+    pool_type: ch-wise-att-mean+stddev
+    inner_feats: 128
+  embed_dim: 256
+  cos_scale: 30.0
+  margin: 0.3
+  margin_warmup_epochs: 20.0
+  dropout_rate: 0.0
+feat_fusion_start: 2
+feat_fusion_method: weighted-avg
diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug1_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug1_ecapatdnn512x2.yaml
new file mode 100644
index 00000000..921f21a6
--- /dev/null
+++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug1_ecapatdnn512x2.yaml
@@ -0,0 +1,39 @@
+hf_feats:
+  pretrained_model_path: facebook/wav2vec2-base
+  override_spec_augment: true
+  mask_time_prob: 0.5
+xvector:
+  resnet_enc:
+    in_feats: 80
+    in_conv_channels: 512
+    in_kernel_size: 5
+    in_stride: 1
+    resb_type: seres2bn
+    resb_repeats:
+    - 1
+    - 1
+    resb_channels:
+    - 512
+    resb_kernel_sizes:
+    - 3
+    resb_dilations:
+    - 2
+    - 3
+    resb_strides:
+    - 1
+    res2net_width_factor: 1
+    res2net_scale: 8
+    se_r: 4
+    multilayer: true
+    multilayer_concat: true
+    endpoint_channels: 1536
+  pool_net:
+    pool_type: ch-wise-att-mean+stddev
+    inner_feats: 128
+  embed_dim: 256
+  cos_scale: 30.0
+  margin: 0.3
+  margin_warmup_epochs: 20.0
+  dropout_rate: 0.0
+feat_fusion_start: 2
+feat_fusion_method: weighted-avg
diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug2_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug2_ecapatdnn512x2.yaml
new file mode 100644
index 00000000..410fd521
--- /dev/null
+++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug2_ecapatdnn512x2.yaml
@@ -0,0 +1,39 @@
+hf_feats:
+  pretrained_model_path: facebook/wav2vec2-base
+  override_spec_augment: true
+  mask_time_prob: 0.25
+xvector:
+  resnet_enc:
+    in_feats: 80
+    in_conv_channels: 512
+    in_kernel_size: 5
+    in_stride: 1
+    resb_type: seres2bn
+    resb_repeats:
+    - 1
+    - 1
+    resb_channels:
+    - 512
+    resb_kernel_sizes:
+    - 3
+    resb_dilations:
+    - 2
+    - 3
+    resb_strides:
+    - 1
+    res2net_width_factor: 1
+    res2net_scale: 8
+    se_r: 4
+    multilayer: true
+    multilayer_concat: true
+    endpoint_channels: 1536
+  pool_net:
+    pool_type: ch-wise-att-mean+stddev
+    inner_feats: 128
+  embed_dim: 256
+  cos_scale: 30.0
+  margin: 0.3
+  margin_warmup_epochs: 20.0
+  dropout_rate: 0.0
+feat_fusion_start: 2
+feat_fusion_method: weighted-avg
diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug3_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug3_ecapatdnn512x2.yaml
new file mode 100644
index 00000000..96c70f98
--- /dev/null
+++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug3_ecapatdnn512x2.yaml
@@ -0,0 +1,39 @@
+hf_feats:
+  pretrained_model_path: facebook/wav2vec2-base
+  override_spec_augment: true
+  mask_time_prob: 0.125
+xvector:
+  resnet_enc:
+    in_feats: 80
+    in_conv_channels: 512
+    in_kernel_size: 5
+    in_stride: 1
+    resb_type: seres2bn
+    resb_repeats:
+    - 1
+    - 1
+    resb_channels:
+    - 512
+    resb_kernel_sizes:
+    - 3
+    resb_dilations:
+    - 2
+    - 3
+    resb_strides:
+    - 1
+    res2net_width_factor: 1
+    res2net_scale: 8
+    se_r: 4
+    multilayer: true
+    multilayer_concat: true
+    endpoint_channels: 1536
+  pool_net:
+    pool_type: ch-wise-att-mean+stddev
+    inner_feats: 128
+  embed_dim: 256
+  cos_scale: 30.0
+  margin: 0.3
+  margin_warmup_epochs: 20.0
+  dropout_rate: 0.0
+feat_fusion_start: 2
+feat_fusion_method: weighted-avg
diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug4_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug4_ecapatdnn512x2.yaml
new file mode 100644
index 00000000..bb4613da
--- /dev/null
+++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug4_ecapatdnn512x2.yaml
@@ -0,0 +1,39 @@
+hf_feats:
+  pretrained_model_path: facebook/wav2vec2-base
+  override_spec_augment: true
+  mask_time_prob: 0.0625
+xvector:
+  resnet_enc:
+    in_feats: 80
+    in_conv_channels: 512
+    in_kernel_size: 5
+    in_stride: 1
+    resb_type: seres2bn
+    resb_repeats:
+    - 1
+    - 1
+    resb_channels:
+    - 512
+    resb_kernel_sizes:
+    - 3
+    resb_dilations:
+    - 2
+    - 3
+    resb_strides:
+    - 1
+    res2net_width_factor: 1
+    res2net_scale: 8
+    se_r: 4
+    multilayer: true
+    multilayer_concat: true
+    endpoint_channels: 1536
+  pool_net:
+    pool_type: ch-wise-att-mean+stddev
+    inner_feats: 128
+  embed_dim: 256
+  cos_scale: 30.0
+  margin: 0.3
+  margin_warmup_epochs: 20.0
+  dropout_rate: 0.0
+feat_fusion_start: 2
+feat_fusion_method: weighted-avg
diff --git a/egs/voxceleb/v2/conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml
new file mode 100644
index 00000000..bf67ce48
--- /dev/null
+++ b/egs/voxceleb/v2/conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml
@@ -0,0 +1,40 @@
+hf_feats:
+  pretrained_model_path: facebook/wav2vec2-base
+  override_spec_augment: true
+  mask_time_prob: 0.150
+  mask_feature_prob: 0.150
+xvector:
+  resnet_enc:
+    in_feats: 80
+    in_conv_channels: 512
+    in_kernel_size: 5
+    in_stride: 1
+    resb_type: seres2bn
+    resb_repeats:
+    - 1
+    - 1
+    resb_channels:
+    - 512
+    resb_kernel_sizes:
+    - 3
+    resb_dilations:
+    - 2
+    - 3
+    resb_strides:
+    - 1
+    res2net_width_factor: 1
+    res2net_scale: 8
+    se_r: 4
+    multilayer: true
+    multilayer_concat: true
+    endpoint_channels: 1536
+  pool_net:
+    pool_type: ch-wise-att-mean+stddev
+    inner_feats: 128
+  embed_dim: 256
+  cos_scale: 30.0
+  margin: 0.3
+  margin_warmup_epochs: 20.0
+  dropout_rate: 0.0
+feat_fusion_start: 2
+feat_fusion_method: weighted-avg
diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh
new file mode 100644
index 00000000..8e4e4d93
--- /dev/null
+++ b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh
@@ -0,0 +1,56 @@
+# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2
+
+# hugging face model
+hf_model_name=wav2vec2base
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+
+nnet_type=hf_wav2vec2resnet1d
+
+batch_size_1gpu=32
+eff_batch_size=512 # effective batch size
+dropout=0
+embed_dim=256
+lr=0.05
+s=30
+margin_warmup=20
+margin=0.3
+nnet_num_epochs=70
+
+
+lr=0.001
+#lr=0.005
+xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml
+xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --model conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2"
+
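+# Note: the dotted --section.key flags in xvec_train_args are jsonargparse-style
+# overrides (an assumption based on the ArgumentParser used by the hyperion
+# training scripts): they take precedence over the matching keys in
+# $xvec_train_base_cfg, so the base YAML keeps the defaults and this string
+# carries only the per-experiment changes.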
+nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v12 #v1
+
+nnet_dir=exp/xvector_nnets/$nnet_name
+nnet=$nnet_dir/model_ep0060.pth
+nnet=$nnet_dir/swa_model_ep0076.pth
+nnet=$nnet_dir/model_ep0060.pth
+nnet=$nnet_dir/model_ep0030.pth
+nnet=$nnet_dir/model_ep0040.pth
+nnet=$nnet_dir/model_ep0030.pth
+nnet=$nnet_dir/model_ep0020.pth
+
+
+# back-end
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=6
+if [ $plda_num_augs -eq 0 ]; then
+    plda_data=voxceleb2cat_train
+else
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh
similarity index 65%
rename from egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh
rename to egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh
index 24bc799a..1509e46e 100644
--- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh
+++ b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh
@@ -23,18 +23,20 @@ margin_warmup=20
 margin=0.3
 nnet_num_epochs=70

-lr=0.002
+
 lr=0.001
-lr=0.005
+#lr=0.005
 xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml
-xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 10000 --trainer.lrsched.hold-steps 20000"
+xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2"

-nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v2 #v1
+nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v4 #v1

 nnet_dir=exp/xvector_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0070.pth
-nnet=$nnet_dir/swa_model_ep0064.pth
-
+nnet=$nnet_dir/model_ep0060.pth
+nnet=$nnet_dir/swa_model_ep0076.pth
+nnet=$nnet_dir/model_ep0060.pth
+nnet=$nnet_dir/model_ep0030.pth
+nnet=$nnet_dir/model_ep0020.pth

 # back-end
 plda_aug_config=conf/reverb_noise_aug.yaml
diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py
index 2e4deac6..a9cffa33 100644
--- a/hyperion/torch/torch_model.py
+++ b/hyperion/torch/torch_model.py
@@ -3,7 +3,9 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import os
+from collections import OrderedDict as ODict
 from copy import deepcopy
+from enum import Enum

 import torch
 import torch.nn as nn
@@ -16,23 +18,29 @@ def __init_subclass__(cls, **kwargs):
         super().__init_subclass__(**kwargs)
         torch_model_registry[cls.__name__] = cls

+    def __init__(self):
+        super().__init__()
+        self._train_mode = "full"
+
     def get_config(self):
         config = {"class_name": self.__class__.__name__}
-
         return config

     def copy(self):
         return deepcopy(self)

-    def save(self, file_path):
-        file_dir = os.path.dirname(file_path)
-        if not (os.path.isdir(file_dir)):
-            os.makedirs(file_dir, exist_ok=True)
+    def clone(self):
+        return deepcopy(self)

-        config = self.get_config()
-        torch.save(
-            {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}
-        )
+    def trainable_parameters(self, recurse: bool = True):
+        for param in self.parameters(recurse=recurse):
+            if param.requires_grad:
+                yield param
+
+    def non_trainable_parameters(self, recurse: bool = True):
+        for param in self.parameters(recurse=recurse):
+            if not param.requires_grad:
+                yield param

     def freeze(self):
         for param in self.parameters():
@@ -42,6 +50,47 @@ def unfreeze(self):
         for param in self.parameters():
             param.requires_grad = True

+    @property
+    def train_mode(self):
+        return self._train_mode
+
+    @train_mode.setter
+    def train_mode(self, mode):
+        self.set_train_mode(mode)
+
+    def set_train_mode(self, mode):
+        if mode == self._train_mode:
+            return
+
+        if mode == "full":
+            self.unfreeze()
+        elif mode == "frozen":
+            self.freeze()
+
+        self._train_mode = mode
+
+    def train(self, mode=None):
+        train_mode = self.train_mode if mode is None else mode
+        if train_mode == "full":
+            super().train()
+        elif train_mode == "frozen":
+            super().eval()
+        else:
+            raise ValueError(f"invalid train_mode={train_mode}")
+
+    def valid_train_modes(self):
+        return ["full", "frozen"]
+
+    def save(self, file_path):
+        file_dir = os.path.dirname(file_path)
+        if not (os.path.isdir(file_dir)):
+            os.makedirs(file_dir, exist_ok=True)
+
+        config = self.get_config()
+        torch.save(
+            {"model_cfg": config, "model_state_dict": self.state_dict()},
+            file_path,
+        )
+
     @staticmethod
     def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None):
         model_data = None
@@ -86,3 +135,63 @@ def device(self):
         )
         return next(iter(devices))
+
+    @staticmethod
+    def _fix_cfg_compatibility(class_obj, cfg):
+        """Function that fixes compatibility issues with deprecated models
+
+        Args:
+          class_obj: class type of the model.
+          cfg: configuration dictionary that inits the model.
+
+        Returns:
+          Fixed configuration dictionary.
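+
+        A sketch of the fix applied to old x-vector configs
+        (hypothetical values):
+
+            cfg = {"s": 30.0}
+            cfg = TorchModel._fix_cfg_compatibility(XVector, cfg)
+            # cfg == {"cos_scale": 30.0}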
+        """
+        # for compatibility with older x-vector models
+        XVector = torch_model_registry["xvector"]
+        if issubclass(class_obj, XVector):
+            # We renamed the AM-softmax scale parameter s to cos_scale
+            if "s" in cfg:
+                cfg["cos_scale"] = cfg["s"]
+                del cfg["s"]
+
+        return cfg
+
+    @staticmethod
+    def auto_load(file_path, extra_objs={}, map_location=None):
+
+        if map_location is None:
+            map_location = torch.device("cpu")
+
+        model_data = torch.load(file_path, map_location=map_location)
+        cfg = model_data["model_cfg"]
+        class_name = cfg["class_name"]
+        del cfg["class_name"]
+        if class_name in torch_model_registry:
+            class_obj = torch_model_registry[class_name]
+        elif class_name in extra_objs:
+            class_obj = extra_objs[class_name]
+        else:
+            raise Exception("unknown object with class_name=%s" % (class_name))
+
+        state_dict = model_data["model_state_dict"]
+
+        if "n_averaged" in state_dict:
+            del state_dict["n_averaged"]
+
+        cfg = TorchModel._fix_cfg_compatibility(class_obj, cfg)
+
+        import re
+
+        p = re.compile(r"^module\.")
+        num_tries = 3
+        for tries in range(num_tries):
+            try:
+                return class_obj.load(cfg=cfg, state_dict=state_dict)
+            except RuntimeError as err:
+                if tries == num_tries - 1:
+                    # if it failed all the tries, raise the exception
+                    raise err
+                # remove the "module." prefix added when the model was trained
+                # with DataParallel
+                state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items())
diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py
index 1821b674..1c8c89e3 100644
--- a/hyperion/torch/trainers/torch_trainer.py
+++ b/hyperion/torch/trainers/torch_trainer.py
@@ -54,7 +54,7 @@ class TorchTrainer(object):
       loggers: LoggerList object, loggers write training progress to std. output and file.
       ddp: if True use distributed data parallel training
       ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp)
-      train_mode: training mode in ['train', 'ft-full', 'ft-last-layer']
+      train_mode: training mode in ['full', 'frozen']
       use_amp: uses mixed precision training.
       log_interval: number of optim. steps between log outputs
      use_tensorboard: use tensorboard logger
@@ -84,7 +84,7 @@ def __init__(
         self,
         loggers=None,
         ddp=False,
         ddp_type="ddp",
-        train_mode="train",
+        train_mode="full",
         use_amp=False,
         log_interval=10,
         use_tensorboard=False,
@@ -625,7 +625,7 @@ def filter_args(**kwargs):
         return args

     @staticmethod
-    def add_class_args(parser, prefix=None, skip=[]):
+    def add_class_args(parser, prefix=None, train_modes=None, skip=[]):
         if prefix is not None:
             outer_parser = parser
             parser = ArgumentParser(prog="")
@@ -649,6 +649,13 @@ def add_class_args(parser, prefix=None, skip=[]):
             help="effective total batch size, if given, it overrides grad_acc_steps",
         )
         parser.add_argument("--epochs", type=int, default=200, help="number of epochs")
+        if train_modes is not None:
+            parser.add_argument(
+                "--train-mode",
+                default="full",
+                choices=train_modes,
+                help=f"Available train modes for the model in {train_modes}",
+            )
         parser.add_argument(
             "--log-interval",
             type=int,
diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py
index a643ca7f..4fbbc895 100644
--- a/hyperion/torch/trainers/xvector_trainer.py
+++ b/hyperion/torch/trainers/xvector_trainer.py
@@ -131,28 +131,18 @@ def train_epoch(self, data_loader):
             batch_size = data.shape[0]

             with self.amp_autocast():
-                # logging.info(
-                #     f"in_model rank={self.rank} batch={batch} x={data} mxx={data.max()} avgx={data.mean()}"
-                # )
                 output = self.model(data, y=target)
                 loss = self.loss(output, target).mean() / self.grad_acc_steps
-                # logging.info(
-                #     f"out_model rank={self.rank} batch={batch} y={output} loss={loss.item()}"
-                # )

             if self.use_amp:
-                # logging.info("in_backward rank=%d batch=%d", self.rank, batch)
                 self.grad_scaler.scale(loss).backward()
-                # logging.info("out_backward rank=%d batch=%d", self.rank, batch)
             else:
                 loss.backward()

             if (batch + 1) % self.grad_acc_steps == 0:
                 if self.lr_scheduler is not None and not self.in_swa:
                     self.lr_scheduler.on_opt_step()
-                # logging.info("in_update rank=%d batch=%d", self.rank, batch)
                 self.update_model()
-                # logging.info("out_update rank=%d batch=%d", self.rank, batch)

             batch_metrics["loss"] = loss.item() * self.grad_acc_steps
             for k, metric in self.metrics.items():
diff --git a/hyperion/utils/ext_segment_list.py b/hyperion/utils/ext_segment_list.py
index 38a4a1b4..9c7d81d3 100644
--- a/hyperion/utils/ext_segment_list.py
+++ b/hyperion/utils/ext_segment_list.py
@@ -78,11 +78,11 @@ def create(
         )

         if series_id is None:
-            u_file_id = self.segments["file_id"].unique()
+            u_file_id = segments.segments["file_id"].unique()
             files = pd.DataFrame({"file_id": u_file_id, "series_id": u_file_id})
         else:
-            file_id = [f for f in v for k, v in series_id.items()]
-            series_id = [k for f in v for k, v in series_id.items()]
+            file_id = [f for k, v in series_id.items() for f in v]
+            series_id = [k for k, v in series_id.items() for f in v]
             files = pd.DataFrame({"file_id": file_id, "series_id": series_id})

         if isinstance(name, str):
@@ -128,8 +128,8 @@ def create_from_segment_list(
             u_file_id = segments["file_id"].unique()
             files = pd.DataFrame({"file_id": u_file_id, "series_id": u_file_id})
         else:
-            file_id = [f for f in v for k, v in series_id.items()]
-            series_id = [k for f in v for k, v in series_id.items()]
+            file_id = [f for k, v in series_id.items() for f in v]
+            series_id = [k for k, v in series_id.items() for f in v]
             files = pd.DataFrame({"file_id": file_id, "series_id": series_id})

         return cls(segments, ext_segments, files, index_column)

From d7599f9ed8f8b6ed881362897d8651736c9ce86a Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Mon, 16 May 2022 18:43:51 -0400
Subject: [PATCH 012/154] fixed bugs introduced in TorchModel.train()

---
 ...nn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh |  1 -
 hyperion/torch/models/xvectors/xvector.py     | 47 +++++++++++++++----
 hyperion/torch/torch_model.py                 | 15 ++++--
 hyperion/torch/trainers/ae_trainer.py         |  4 +-
 hyperion/torch/trainers/dvae_trainer.py       |  4 +-
 hyperion/torch/trainers/plda_trainer.py       |  4 +-
 hyperion/torch/trainers/torch_trainer.py      | 10 ++--
 hyperion/torch/trainers/vae_trainer.py        |  4 +-
 hyperion/torch/trainers/vq_dvae_trainer.py    |  4 +-
 hyperion/torch/trainers/vq_vae_trainer.py     |  4 +-
 .../torch/trainers/xvector_adv_trainer.py     | 14 ++----
 .../trainers/xvector_adv_trainer_from_wav.py  |  8 ++--
 hyperion/torch/trainers/xvector_trainer.py    |  2 +-
 .../trainers/xvector_trainer_deep_feat_reg.py |  2 +-
 .../xvector_trainer_deep_feat_reg_from_wav.py |  4 +-
 .../trainers/xvector_trainer_from_wav.py      |  5 +-
 16 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh
index 8e4e4d93..942fb336 100644
--- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh
+++ b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh
@@ -37,7 +37,6 @@ nnet=$nnet_dir/swa_model_ep0076.pth
 nnet=$nnet_dir/model_ep0060.pth
 nnet=$nnet_dir/model_ep0030.pth
 nnet=$nnet_dir/model_ep0040.pth
-nnet=$nnet_dir/model_ep0030.pth
 nnet=$nnet_dir/model_ep0020.pth


diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py
index e07487d7..008f595c 100644
--- a/hyperion/torch/models/xvectors/xvector.py
+++ b/hyperion/torch/models/xvectors/xvector.py
@@ -3,7 +3,9 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import logging
+from enum import Enum
 from jsonargparse import ArgumentParser, ActionParser
+from typing import Optional

 import torch
 import torch.nn as nn
@@ -15,6 +17,12 @@
 from ...utils import eval_nnet_by_chunks, scale_seq_lengths


+class XVectorTrainMode(Enum):
+    full = 0
+    frozen = 1
+    ft_embed_affine = 2
+
+
 class XVector(TorchModel):
     """x-Vector base class"""

@@ -543,19 +551,38 @@ def freeze_preembed_layers(self):
         layer_list = [l for l in range(self.embed_layer)]
         self.classif_net.freeze_layers(layer_list)

-    def train_mode(self, mode="ft-embed-affine"):
-        if mode == "ft-full" or mode == "train":
-            self.train()
+    def set_train_mode(self, mode):
+        if mode == self._train_mode:
             return

-        self.encoder_net.eval()
-        if self.proj is not None:
-            self.proj.eval()
+        if mode == "full":
+            self.unfreeze()
+        elif mode == "frozen":
+            self.freeze()
+        elif mode == "ft-embed-affine":
+            self.freeze_preembed_layers()
+        else:
+            raise ValueError(f"invalid train_mode={mode}")
+
+        self._train_mode = mode
+
+    def _train(self, train_mode: str):
+        if train_mode in ["full", "frozen"]:
+            super()._train(train_mode)
+        elif train_mode == "ft-embed-affine":
+            self.encoder_net.eval()
+            if self.proj is not None:
+                self.proj.eval()
+
+            self.pool_net.eval()
+            self.classif_net.train()
+            layer_list = [l for l in range(self.embed_layer)]
+            self.classif_net.put_layers_in_eval_mode(layer_list)
+        else:
+            raise ValueError(f"invalid train_mode={train_mode}")

-        self.pool_net.eval()
-        self.classif_net.train()
-        layer_list = [l for l in range(self.embed_layer)]
-        self.classif_net.put_layers_in_eval_mode(layer_list)
+    def valid_train_modes(self):
+        return ["full", "frozen", "ft-embed-affine"]

     @staticmethod
     def filter_args(**kwargs):
diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py
index a9cffa33..fdd17701 100644
--- a/hyperion/torch/torch_model.py
+++ b/hyperion/torch/torch_model.py
@@ -6,6 +6,7 @@
 from collections import OrderedDict as ODict
 from copy import deepcopy
 from enum import Enum
+from typing import Optional

 import torch
 import torch.nn as nn
@@ -69,15 +70,21 @@ def set_train_mode(self, mode):

         self._train_mode = mode

-    def train(self, mode=None):
-        train_mode = self.train_mode if mode is None else mode
+    def _train(self, train_mode: str):
         if train_mode == "full":
-            super().train()
+            super().train(True)
         elif train_mode == "frozen":
-            super().eval()
+            super().train(False)
         else:
             raise ValueError(f"invalid train_mode={train_mode}")

+    def train(self, mode: bool = True):
+        if not mode:
+            super().train(False)
+            return
+
+        self._train(self.train_mode)
+
     def valid_train_modes(self):
         return ["full", "frozen"]

diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py
index 4bd6790a..c1debdb6 100644
--- a/hyperion/torch/trainers/ae_trainer.py
+++ b/hyperion/torch/trainers/ae_trainer.py
@@ -117,7 +117,7 @@ def train_epoch(self, data_loader):
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, data in enumerate(data_loader):
             if isinstance(data, (tuple, list)):
@@ -167,7 +167,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py
index 5649cc01..02c4fb6e 100644
--- a/hyperion/torch/trainers/dvae_trainer.py
+++ b/hyperion/torch/trainers/dvae_trainer.py
@@ -113,7 +113,7 @@ def train_epoch(self, data_loader):
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, data in enumerate(data_loader):

@@ -174,7 +174,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py
index dd797996..ea5e57af 100644
--- a/hyperion/torch/trainers/plda_trainer.py
+++ b/hyperion/torch/trainers/plda_trainer.py
@@ -131,7 +131,7 @@ def train_epoch(self, data_loader):
         metric_acc = MetricAcc()
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, (data, target) in enumerate(data_loader):
             self.loggers.on_batch_begin(batch)
@@ -205,7 +205,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = ""
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py
index 1c8c89e3..4cf90d1a 100644
--- a/hyperion/torch/trainers/torch_trainer.py
+++ b/hyperion/torch/trainers/torch_trainer.py
@@ -214,6 +214,7 @@ def fit(self, train_data, val_data=None):
         """
         self.exp_path.mkdir(parents=True, exist_ok=True)
         self._compute_grad_acc_steps(train_data)
+        self.set_train_mode()

         if self.do_swa and self.cur_epoch >= self.swa_start:
            self.in_swa = True
@@ -260,10 +261,7 @@ def fit(self, train_data, val_data=None):
             self.save_swa_model(logs)

     def set_train_mode(self):
-        if self.train_mode == "train":
-            self.model.train()
-        else:
-            self.model.train_mode(self.train_mode)
+        self.model.train_mode = self.train_mode

     def train_epoch(self, data_loader):
         """Training epoch loop
@@ -273,7 +271,7 @@ def train_epoch(self, data_loader):
         """
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()
         for batch, (data, target) in enumerate(data_loader):
             self.loggers.on_batch_begin(batch)
             if batch % self.grad_acc_steps == 0:
@@ -323,7 +321,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py
index bc72bbe2..ae193209 100644
--- a/hyperion/torch/trainers/vae_trainer.py
+++ b/hyperion/torch/trainers/vae_trainer.py
@@ -108,7 +108,7 @@ def train_epoch(self, data_loader):
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, data in enumerate(data_loader):

@@ -162,7 +162,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py
index ac87ba5a..840d70d6 100644
--- a/hyperion/torch/trainers/vq_dvae_trainer.py
+++ b/hyperion/torch/trainers/vq_dvae_trainer.py
@@ -107,7 +107,7 @@ def train_epoch(self, data_loader):
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, data in enumerate(data_loader):

@@ -166,7 +166,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py
index 1b13bac1..cb09ba00 100644
--- a/hyperion/torch/trainers/vq_vae_trainer.py
+++ b/hyperion/torch/trainers/vq_vae_trainer.py
@@ -107,7 +107,7 @@ def train_epoch(self, data_loader):
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, data in enumerate(data_loader):

@@ -166,7 +166,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py
index 7dee1303..91c75823 100644
--- a/hyperion/torch/trainers/xvector_adv_trainer.py
+++ b/hyperion/torch/trainers/xvector_adv_trainer.py
@@ -128,19 +128,13 @@ def __init__(
                 % (p_attack, 1.0 / self.grad_acc_steps)
             )

-        # if data_parallel:
-        #     # change model in attack by the data parallel version
-        #     self.attack.model = self.model
-        #     # make loss function in attack data parallel
-        #     self.attack.make_data_parallel()
-
     def train_epoch(self, data_loader):

         self.model.update_loss_margin(self.cur_epoch)

         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, (data, target) in enumerate(data_loader):
             self.loggers.on_batch_begin(batch)
@@ -157,7 +151,7 @@ def train_epoch(self, data_loader):
                 max_delta = torch.max(torch.abs(data_adv - data)).item()
                 logging.info("adv attack max perturbation=%f" % (max_delta))
             data = data_adv
-            self.set_train_mode()
+            self.model.train()

             self.optimizer.zero_grad()

@@ -196,7 +190,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):

             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
@@ -210,7 +204,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
                 self.model.eval()
                 data = self.attack.generate(data, target)
                 if swa_update_bn:
-                    self.set_train_mode()
+                    self.model.train()

             with torch.no_grad():
                 with self.amp_autocast():
diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py
index 0719f350..2797e678 100644
--- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py
+++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py
@@ -137,7 +137,7 @@ def train_epoch(self, data_loader):

         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, (data, target) in enumerate(data_loader):
             self.loggers.on_batch_begin(batch)
@@ -156,7 +156,7 @@ def train_epoch(self, data_loader):
                 # logging.info('zz {} {}'.format(data[z], data_adv[z]))
                 # logging.info('adv attack max perturbation=%f' % (max_delta))
             data = data_adv
-            self.set_train_mode()
+            self.model.train()

             self.optimizer.zero_grad()

@@ -198,7 +198,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):

             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
@@ -212,7 +212,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
                 self.model.eval()
                 data = self.attack.generate(data, target)
                 if swa_update_bn:
-                    self.set_train_mode()
+                    self.model.train()

             with torch.no_grad():
                 feats = self.feat_extractor(data)
diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py
index 4fbbc895..8c39a345 100644
--- a/hyperion/torch/trainers/xvector_trainer.py
+++ b/hyperion/torch/trainers/xvector_trainer.py
@@ -120,7 +120,7 @@ def train_epoch(self, data_loader):

         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()
         for batch, (data, target) in enumerate(data_loader):
             self.loggers.on_batch_begin(batch)
diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py
index e6014750..58235961 100644
--- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py
+++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py
@@ -141,7 +141,7 @@ def train_epoch(self, data_loader):

         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()
         for batch, (data, target) in enumerate(data_loader):
             self.loggers.on_batch_begin(batch)
diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py
index dafeb0c5..da9d064a 100644
--- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py
+++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py
@@ -138,7 +138,7 @@ def train_epoch(self, data_loader):

         metric_acc = MetricAcc(device=self.device)
        batch_metrics = ODict()
-        self.set_train_mode()
+        self.model.train()

         for batch, (data, target) in enumerate(data_loader):
             self.loggers.on_batch_begin(batch)
@@ -239,7 +239,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()
diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py
index a8f9da99..6445ae3e 100644
--- a/hyperion/torch/trainers/xvector_trainer_from_wav.py
+++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py
@@ -125,8 +125,7 @@ def train_epoch(self, data_loader):
         metric_acc = MetricAcc(device=self.device)
         batch_metrics = ODict()
         self.feat_extractor.train()
-        self.set_train_mode()
-
+        self.model.train()
         for batch, (data, target) in enumerate(data_loader):
             self.loggers.on_batch_begin(batch)
             if batch % self.grad_acc_steps == 0:
@@ -177,7 +176,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False):
         with torch.no_grad():
             if swa_update_bn:
                 log_tag = "train_"
-                self.set_train_mode()
+                self.model.train()
             else:
                 log_tag = "val_"
                 self.model.eval()

From 8a7a6fdafbc565baa86decf1f210fa3d06a571a8 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Fri, 20 May 2022 16:20:43 -0400
Subject: [PATCH 013/154] added training modes to xvector and wav2vec2xvector
 models

---
 egs/voxceleb/v2/conf/train_data_default.yaml  |  4 +-
 hyperion/torch/lr_schedulers/cos_lr.py        | 52 +++++++++--
 hyperion/torch/lr_schedulers/exp_lr.py        | 18 +++-
 hyperion/torch/lr_schedulers/factory.py       | 47 ++++++++++
 hyperion/torch/lr_schedulers/invpow_lr.py     | 15 +++-
 hyperion/torch/lr_schedulers/lr_scheduler.py  | 13 ++-
 hyperion/torch/lr_schedulers/noam_lr.py       | 24 ++++-
 .../torch/lr_schedulers/red_lr_on_plateau.py  |  4 +-
 hyperion/torch/lr_schedulers/triangular_lr.py | 17 ++++
 .../models/wav2xvectors/hf_wav2xvector.py     | 90 ++++++++++++++++++-
 hyperion/torch/torch_model.py                 |  4 +-
 hyperion/torch/tpm/hf/hf_hubert.py            |  2 +-
 hyperion/torch/tpm/hf/hf_wav2vec2.py          |  2 +-
 hyperion/torch/tpm/hf/hf_wav2vec_base.py      |  9 +-
 hyperion/torch/tpm/hf/hf_wavlm.py             |  2 +-
 hyperion/torch/trainers/torch_trainer.py      | 12 ++-
 .../trainers/xvector_trainer_from_wav.py      |  3 +-
 17 files changed, 285 insertions(+), 33 deletions(-)

diff --git a/egs/voxceleb/v2/conf/train_data_default.yaml b/egs/voxceleb/v2/conf/train_data_default.yaml
index 72c77204..d41c1507 100644
--- a/egs/voxceleb/v2/conf/train_data_default.yaml
+++ b/egs/voxceleb/v2/conf/train_data_default.yaml
@@ -1,6 +1,6 @@
 dataset:
-  max_chunk_length: 4.0
-  min_chunk_length: 4.0
+  max_chunk_length: 3.0
+  min_chunk_length: 3.0
   aug_cfg: conf/reverb_noise_aug.yaml
   wav_scale: 1
 sampler:
diff --git a/hyperion/torch/lr_schedulers/cos_lr.py b/hyperion/torch/lr_schedulers/cos_lr.py
index 83b9206f..5caf12bb 100644
--- a/hyperion/torch/lr_schedulers/cos_lr.py
+++ b/hyperion/torch/lr_schedulers/cos_lr.py
@@ -25,16 +25,27 @@ class CosineLR(LRScheduler):
     When epoch=-1, sets initial lr as lr.

     It has been proposed in
     `SGDR: Stochastic Gradient Descent with Warm Restarts`_.

-    Args:
-        optimizer (Optimizer): Wrapped optimizer.
-        T_max (int): Maximum number of iterations.
-        eta_min (float): Minimum learning rate. Default: 0.
-        epoch (int): The index of last epoch. Default: -1.
-
     .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
         https://arxiv.org/abs/1608.03983
+
+    Attributes:
+      optimizer: Pytorch optimizer object.
+      T: period of the cycle.
+      T_mul: period multiplier, after each cycle the period is multiplied by T_mul.
+      hold_steps: number of steps until the lr starts decaying.
+      min_lr: minimum learning rate.
+      warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
+      warm_restarts: whether or not to do warm restarts.
+      gamma: after each period, the maximum lr is multiplied by gamma.
+      last_restart: the step at which the last restart happened, this is used
+        to restart the training from a checkpoint.
+      num_restarts: how many restarts we have done, this is used to restart the
+        training from a checkpoint.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
+      update_lr_on_opt_step: if True, updates the lr each time we update the model,
+        otherwise after each epoch.
     """

     def __init__(
@@ -53,7 +64,7 @@ def __init__(
         update_lr_on_opt_step=False,
     ):

-        super(CosineLR, self).__init__(
+        super().__init__(
             optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step
         )
         self.T = T
@@ -108,6 +119,29 @@ def get_lr(self, step):


 class AdamCosineLR(CosineLR):
+    r"""Set the learning rate of each parameter group using a cosine annealing
+    schedule when using adam optimizer

+    Attributes:
+      optimizer: Pytorch optimizer object.
+      T: period of the cycle.
+      T_mul: period multiplier, after each cycle the period is multiplied by T_mul.
+      hold_steps: number of steps until the lr starts decaying.
+      min_lr: minimum learning rate.
+      warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
+      warm_restarts: whether or not to do warm restarts.
+      gamma: after each period, the maximum lr is multiplied by gamma.
+      last_restart: the step at which the last restart happened, this is used
+        to restart the training from a checkpoint.
+      num_restarts: how many restarts we have done, this is used to restart the
+        training from a checkpoint.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
+      update_lr_on_opt_step: if True, updates the lr each time we update the model,
+        otherwise after each epoch.
+    """
+
     def __init__(
         self,
         optimizer,
diff --git a/hyperion/torch/lr_schedulers/exp_lr.py b/hyperion/torch/lr_schedulers/exp_lr.py
index cbe00a01..66edf436 100644
--- a/hyperion/torch/lr_schedulers/exp_lr.py
+++ b/hyperion/torch/lr_schedulers/exp_lr.py
@@ -10,7 +10,21 @@


 class ExponentialLR(LRScheduler):
-    """Exponential learning rate scheduler."""
+    """Exponential learning rate scheduler.
+
+    Attributes:
+      optimizer: Pytorch optimizer object.
+      decay_rate: the lr is multiplied by `decay_rate` after `decay_steps`.
+      decay_steps: number of decay steps.
+      hold_steps: number of steps until the lr starts decaying.
+      min_lr: minimum learning rate.
+      warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
+      update_lr_on_opt_step: if True, updates the lr each time we update the model,
+        otherwise after each epoch.
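+
+    A usage sketch, with values borrowed from the trainer configs in this
+    repo; `on_opt_step` is the hook the trainers call after each optimizer
+    update:
+
+        scheduler = ExponentialLR(optimizer, decay_rate=0.5, decay_steps=8000,
+                                  hold_steps=40000, warmup_steps=1000,
+                                  update_lr_on_opt_step=True)
+        scheduler.on_opt_step()  # warm up, hold, then halve the lr every 8k steps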
+    """

     def __init__(
         self,
         optimizer,
         decay_rate,
         decay_steps,
         hold_steps=0,
         min_lr=0,
         warmup_steps=0,
         epoch=0,
         step=0,
         update_lr_on_opt_step=False,
     ):
-        super(ExponentialLR, self).__init__(
+        super().__init__(
             optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step
         )
         self.decay_rate = decay_rate
diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py
index 10b47ab2..1a542bf2 100644
--- a/hyperion/torch/lr_schedulers/factory.py
+++ b/hyperion/torch/lr_schedulers/factory.py
@@ -40,6 +40,53 @@ def create(
         lr_factor=1,
         update_lr_on_opt_step=False,
     ):
+        """Creates a learning rate scheduler object.
+
+        Args:
+          optimizer: Pytorch optimizer object.
+          lrsch_type: type of scheduler in ["none", "exp_lr", "invpow_lr",
+            "cos_lr", "adamcos_lr", "red_lr_on_plateau", "noam_lr",
+            "triangular_lr"].
+          decay_rate: the lr is multiplied by `decay_rate` after `decay_steps`.
+          decay_steps: number of decay steps.
+          power: the step/epoch number is elevated to this power to compute the decay.
+          hold_steps: number of steps until the lr starts decaying.
+          t: period of the cycle.
+          t_mul: period multiplier, after each cycle the period is multiplied by T_mul.
+          warm_restarts: whether or not to do warm restarts.
+          gamma: after each period, the maximum lr is multiplied by gamma, in cyclic schedulers.
+          monitor: which metric to monitor in RedLROnPlateau scheduler.
+          mode (str): One of `min`, `max`. In `min` mode, lr will
+            be reduced when the quantity monitored has stopped
+            decreasing; in `max` mode it will be reduced when the
+            quantity monitored has stopped increasing. Default: 'min'.
+          factor (float): Factor by which the learning rate will be
+            reduced. new_lr = lr * factor. Default: 0.1.
+          patience (int): Number of epochs with no improvement after
+            which learning rate will be reduced. For example, if
+            `patience = 2`, then we will ignore the first 2 epochs
+            with no improvement, and will only decrease the LR after the
+            3rd epoch if the loss still hasn't improved then.
+          threshold (float): Threshold for measuring the new optimum,
+            to only focus on significant changes. Default: 1e-4.
+          threshold_mode (str): One of `rel`, `abs`. In `rel` mode,
+            dynamic_threshold = best * ( 1 + threshold ) in 'max'
+            mode or best * ( 1 - threshold ) in `min` mode.
+            In `abs` mode, dynamic_threshold = best + threshold in
+            `max` mode or best - threshold in `min` mode. Default: 'rel'.
+          cooldown (int): Number of epochs to wait before resuming
+            normal operation after lr has been reduced. Default: 0.
+          eps (float): Minimal decay applied to lr. If the difference
+            between new and old lr is smaller than eps, the update is
+            ignored. Default: 1e-8.
+          min_lr: minimum learning rate.
+          warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
+          d_model: hidden dimension of transformer model.
+          lr_factor: multiplies the Noam lr by this number.
+          update_lr_on_opt_step: if True, updates the lr each time we update the model,
+            otherwise after each epoch.
+        """
         if lrsch_type == "none":
             return None
diff --git a/hyperion/torch/lr_schedulers/invpow_lr.py b/hyperion/torch/lr_schedulers/invpow_lr.py
index 7590a64c..db420a0f 100644
--- a/hyperion/torch/lr_schedulers/invpow_lr.py
+++ b/hyperion/torch/lr_schedulers/invpow_lr.py
@@ -10,7 +10,20 @@


 class InvPowLR(LRScheduler):
-    """inverse power decay learning rate scheduler."""
+    """Inverse power decay learning rate scheduler.
+
+    Attributes:
+      optimizer: Pytorch optimizer object.
+      power: the step/epoch number is elevated to this power to compute the decay.
+      hold_steps: number of steps until the lr starts decaying.
+      min_lr: minimum learning rate.
+      warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
+      update_lr_on_opt_step: if True, updates the lr each time we update the model,
+        otherwise after each epoch.
+    """

     def __init__(
         self,
diff --git a/hyperion/torch/lr_schedulers/lr_scheduler.py b/hyperion/torch/lr_schedulers/lr_scheduler.py
index 2ad1740e..5cbb3ff1 100644
--- a/hyperion/torch/lr_schedulers/lr_scheduler.py
+++ b/hyperion/torch/lr_schedulers/lr_scheduler.py
@@ -9,7 +9,18 @@


 class LRScheduler(object):
-    """Base class for learning rate schedulers"""
+    """Base class for learning rate schedulers.
+
+    Attributes:
+      optimizer: Pytorch optimizer object.
+      min_lr: minimum learning rate.
+      warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
+      update_lr_on_opt_step: if True, updates the lr each time we update the model,
+        otherwise after each epoch.
+    """

     def __init__(
         self,
diff --git a/hyperion/torch/lr_schedulers/noam_lr.py b/hyperion/torch/lr_schedulers/noam_lr.py
index edce0605..4acdc3b9 100644
--- a/hyperion/torch/lr_schedulers/noam_lr.py
+++ b/hyperion/torch/lr_schedulers/noam_lr.py
@@ -3,8 +3,9 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import math
-from turtle import up
-import torch
+import logging
+
+# import torch

 from .invpow_lr import InvPowLR

@@ -17,6 +18,14 @@ class NoamLR(InvPowLR):
     the transformer hidden dimension.

     Attributes:
+      optimizer: Pytorch optimizer object.
+      d_model: hidden dimension of transformer model.
+      lr_factor: multiplies the Noam lr by this number.
+      min_lr: minimum learning rate.
+      warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
     """

     def __init__(
@@ -31,9 +40,16 @@ def __init__(
         step=0,
     ):
         lr = lr_factor / math.sqrt(d_model * warmup_steps)
-        print("noam_lr", lr, flush=True)
+        logging.info("Noam lr=%f", lr)
+        # we scale the lr taking into account the relative
+        # learning rates in the param_groups
+        # in order to be able to have different lr for
+        # different modules of the model
+        max_lr = 0
+        for group in optimizer.param_groups:
+            max_lr = max(group["lr"], max_lr)
         for group in optimizer.param_groups:
-            group["lr"] = lr
+            group["lr"] = lr * group["lr"] / max_lr
         super().__init__(
             optimizer,
             min_lr=min_lr,
diff --git a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py
index 8d9eb4bf..7a2e82f8 100644
--- a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py
+++ b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py
@@ -21,6 +21,7 @@ class ReduceLROnPlateau(LRScheduler):

     Attributes:
       optimizer (Optimizer): optimizer.
+      monitor: which metric to monitor.
       mode (str): One of `min`, `max`. In `min` mode, lr will
         be reduced when the quantity monitored has stopped
         decreasing; in `max` mode it will be reduced when the
@@ -45,6 +46,7 @@ class ReduceLROnPlateau(LRScheduler):
       min_lr (float or list): A scalar or a list of scalars.
        A lower bound on the learning rate of all param groups
        or each group respectively. Default: 0.
+      warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
       eps (float): Minimal decay applied to lr. If the difference
         between new and old lr is smaller than eps, the update is
         ignored. Default: 1e-8.
@@ -64,7 +66,7 @@ def __init__(
         warmup_steps=0,
         eps=1e-8,
     ):
-        super(ReduceLROnPlateau, self).__init__(
+        super().__init__(
             optimizer,
             min_lr,
             warmup_steps,
diff --git a/hyperion/torch/lr_schedulers/triangular_lr.py b/hyperion/torch/lr_schedulers/triangular_lr.py
index add8a13c..c2b66c42 100644
--- a/hyperion/torch/lr_schedulers/triangular_lr.py
+++ b/hyperion/torch/lr_schedulers/triangular_lr.py
@@ -22,6 +22,23 @@ class TriangularLR(LRScheduler):
     x = \mathrm{abs}(2\frac{T_{cur}}{T_{max}}-2\mathrm{cycle}+1)
     \eta_t = \eta_{min} + (\eta_{max} - \eta_{min})\max(0, 1-x)

+    Attributes:
+      optimizer: Pytorch optimizer object.
+      T: period of the cycle.
+      T_mul: period multiplier, after each cycle the period is multiplied by T_mul.
+      hold_steps: number of steps until the lr starts decaying.
+      min_lr: minimum learning rate.
+      warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr.
+      gamma: after each period, the maximum lr is multiplied by gamma.
+      last_restart: the step at which the last restart happened, this is used
+        to restart the training from a checkpoint.
+      num_restarts: how many restarts we have done, this is used to restart the
+        training from a checkpoint.
+      epoch: initial training epoch, this is needed to restart the model
+        training.
+      step: initial training step, this is needed to restart the model training.
+      update_lr_on_opt_step: if True, updates the lr each time we update the model,
+        otherwise after each epoch.
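+
+    A usage sketch (constructor keywords as documented above):
+
+        scheduler = TriangularLR(optimizer, T=10000, T_mul=2, gamma=0.9,
+                                 min_lr=1e-5, update_lr_on_opt_step=True)
+        # the lr ramps linearly up and back down once per cycle of T steps;
+        # each new cycle is T_mul times longer, with the max lr scaled by gamma.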
""" def __init__( diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index f5f2c840..cb8ff1d0 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging +import contextlib from jsonargparse import ArgumentParser, ActionParser import torch @@ -35,6 +36,7 @@ def __init__( self.xvector = xvector self.feat_fusion_start = feat_fusion_start self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext self._make_fuser() def _make_fuser(self): @@ -96,7 +98,8 @@ def forward_feats(self, x, x_lengths, return_feat_layers=None): if return_feat_layers is None and self.feat_fusion_method == "last" else True ) - hf_output = self.hf_feats(x, x_lengths, return_hid_states=return_hid_states) + with self._hf_context: + hf_output = self.hf_feats(x, x_lengths, return_hid_states=return_hid_states) feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] @@ -192,6 +195,91 @@ def extract_embed( feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks ) + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_feat_fuser() + self.freeze_hf_feats() + self.xvector.freeze_preembed_layers() + elif mode in ["ft-xvector", "ft-xvector-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode == "ft-embed-affine": + self.hf_feats.train() + self.xvector._train("ft-embed_affine") + elif train_mode in [ + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.xvector._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + @staticmethod def filter_args(**kwargs): valid_args = ( diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index fdd17701..af3a305c 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -57,6 +57,7 @@ def train_mode(self): 
@@ -85,7 +86,8 @@ def train(self, mode: bool = True):

         self._train(self.train_mode)

-    def valid_train_modes(self):
+    @staticmethod
+    def valid_train_modes():
         return ["full", "frozen"]

     def save(self, file_path):
diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py
index 34f40cc8..82ce70bd 100644
--- a/hyperion/torch/tpm/hf/hf_hubert.py
+++ b/hyperion/torch/tpm/hf/hf_hubert.py
@@ -192,9 +192,9 @@ def __init__(
         )

         if pretrained_model_path is not None and not ignore_pretrained:
-            logging.info(f"Downloading HF model from {pretrained_model_path}")
             rank = ddp_get_rank()
             if rank == 0:
+                logging.info(f"Downloading HF model from {pretrained_model_path}")
                 # rank 0 downloads the model from HF web
                 self.hf_model = HubertModel.from_pretrained(
                     pretrained_model_path,
diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py
index 0b0302eb..e91fe8c4 100644
--- a/hyperion/torch/tpm/hf/hf_wav2vec2.py
+++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py
@@ -210,10 +210,10 @@ def __init__(
         )

         if pretrained_model_path is not None and not ignore_pretrained:
-            logging.info(f"Downloading HF model from {pretrained_model_path}")
             rank = ddp_get_rank()
             if rank == 0:
                 # rank 0 downloads the model from HF web
+                logging.info(f"Downloading HF model from {pretrained_model_path}")
                 self.hf_model = Wav2Vec2Model.from_pretrained(
                     pretrained_model_path,
                     cache_dir=cache_dir,
diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py
index 9f5c353b..87f19835 100644
--- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py
+++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py
@@ -77,11 +77,11 @@ def __init__(
         self.override_spec_augment = override_spec_augment

         if pretrained_model_path is not None and not ignore_pretrained:
-            logging.info(
-                f"Downloading config for HF preprocessor from {pretrained_model_path}"
-            )
             rank = ddp_get_rank()
             if rank == 0:
+                logging.info(
+                    f"Downloading config for HF preprocessor from {pretrained_model_path}"
+                )
                 # rank 0 downloads the model from HF web
                 try:
                     # some models do not have config for processor because do not have
@@ -178,6 +178,9 @@ def change_spec_augment(
     def change_dropouts(self, **kwargs):
         pass  # needs to be overloaded

+    def freeze_feature_encoder(self):
+        self.hf_model.freeze_feature_encoder()
+
     @property
     def hf_config(self):
         return self.hf_model.config
diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py
index 229c1871..c75cb6e8 100644
--- a/hyperion/torch/tpm/hf/hf_wavlm.py
+++ b/hyperion/torch/tpm/hf/hf_wavlm.py
@@ -210,9 +210,9 @@ def __init__(
         )

         if pretrained_model_path is not None and not ignore_pretrained:
-            logging.info(f"Downloading HF model from {pretrained_model_path}")
             rank = ddp_get_rank()
             if rank == 0:
+                logging.info(f"Downloading HF model from {pretrained_model_path}")
                 # rank 0 downloads the model from HF web
                 self.hf_model = WavLMModel.from_pretrained(
                     pretrained_model_path,
diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py
index 4cf90d1a..8dfad9ce 100644
--- a/hyperion/torch/trainers/torch_trainer.py
+++ b/hyperion/torch/trainers/torch_trainer.py
@@ -127,6 +127,8 @@ def __init__(
         self.swa_anneal_epochs = swa_anneal_epochs
         self.amp_args = {}

+        self.set_train_mode()
+
         if device is not None:
             self.model.to(device)
         if loss is not None:
@@ -214,7 +216,6 @@ def fit(self, train_data, val_data=None):
""" self.exp_path.mkdir(parents=True, exist_ok=True) self._compute_grad_acc_steps(train_data) - self.set_train_mode() if self.do_swa and self.cur_epoch >= self.swa_start: self.in_swa = True @@ -261,7 +262,8 @@ def fit(self, train_data, val_data=None): self.save_swa_model(logs) def set_train_mode(self): - self.model.train_mode = self.train_mode + # self.model.train_mode = self.train_mode + self.model.set_train_mode(self.train_mode) def train_epoch(self, data_loader): """Training epoch loop @@ -313,7 +315,8 @@ def validation_epoch(self, data_loader, swa_update_bn=False): """Validation epoch loop Args: - data_loader: PyTorch data loader return input/output pairs + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. """ metric_acc = MetricAcc(self.device) @@ -607,6 +610,7 @@ def filter_args(**kwargs): "use_amp", "ddp_type", "grad_clip", + "grad_clip_norm", "swa_start", "swa_lr", "swa_anneal_epochs", @@ -617,9 +621,9 @@ def filter_args(**kwargs): "use_tensorboard", "use_wandb", "wandb", + "train_mode", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args @staticmethod diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 6445ae3e..d75936d8 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -168,7 +168,8 @@ def validation_epoch(self, data_loader, swa_update_bn=False): """Validation epoch loop Args: - data_loader: PyTorch data loader return input/output pairs + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. """ metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() From 90ef40a49c1845f53b9dd59f6c5179d542b9d3b8 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 2 Jun 2022 10:27:25 -0400 Subject: [PATCH 014/154] wavlm phase 1 --- ..._wavlmbaseplus_ecapatdnn512x3_default.yaml | 6 + ...aseplus_ecapatdnn512x3_phase1_default.yaml | 6 + .../v2/conf/trainer_phase1_adam_default.yaml | 20 + .../v2/conf/trainer_phase1_sgd_default.yaml | 18 + .../v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml | 38 ++ ...nfig_wavlmbaseplus_ecapatdnn512x3_v1.10.sh | 36 ++ ...onfig_wavlmbaseplus_ecapatdnn512x3_v1.9.sh | 36 ++ hyperion/bin/train_wav2vec2xvector.py | 4 +- hyperion/np/classifiers/linear_gbe1.py | 264 ------------- hyperion/np/np_model.py | 101 ++++- hyperion/np/pdfs/core/exp_family.py | 133 ++++++- hyperion/np/pdfs/core/normal.py | 156 ++++++-- hyperion/np/pdfs/core/normal_diag_cov.py | 125 ++++++- hyperion/np/pdfs/core/pdf.py | 30 +- hyperion/np/pdfs/hmm/hmm.py | 37 +- hyperion/np/pdfs/jfa/jfa_total.py | 169 +++++++-- .../np/pdfs/mixtures/exp_family_mixture.py | 335 +++++++++++++++-- hyperion/np/pdfs/mixtures/gmm.py | 161 +++++++- hyperion/np/pdfs/mixtures/gmm_diag_cov.py | 163 +++++++- .../np/pdfs/mixtures/gmm_tied_diag_cov.py | 102 ++++- hyperion/np/pdfs/plda/frplda.py | 132 ++++++- hyperion/np/pdfs/plda/plda.py | 172 +++++++-- hyperion/np/pdfs/plda/plda_base.py | 351 +++++++++++++----- hyperion/np/pdfs/plda/splda.py | 151 +++++++- hyperion/np/score_norm/adapt_s_norm.py | 24 +- hyperion/np/score_norm/s_norm.py | 14 +- hyperion/np/score_norm/score_norm.py | 16 +- hyperion/np/score_norm/t_norm.py | 8 + hyperion/np/score_norm/tz_norm.py | 16 +- hyperion/np/score_norm/z_norm.py | 8 + hyperion/np/score_norm/zt_norm.py | 18 +- hyperion/np/transforms/cent_whiten_up.py | 6 +- 
hyperion/np/transforms/lda.py | 8 - hyperion/torch/layers/margin_losses.py | 154 +++++++- .../models/wav2xvectors/hf_wav2xvector.py | 4 +- .../models/xvectors/efficient_net_xvector.py | 4 + .../torch/models/xvectors/resnet1d_xvector.py | 56 +-- .../torch/models/xvectors/resnet_xvector.py | 4 + .../torch/models/xvectors/spinenet_xvector.py | 4 + .../torch/models/xvectors/tdnn_xvector.py | 4 + .../models/xvectors/transformer_xvector_v1.py | 4 + hyperion/torch/models/xvectors/xvector.py | 50 ++- hyperion/torch/narchs/classif_head.py | 36 ++ hyperion/torch/trainers/torch_trainer.py | 7 +- notebooks/tutorial_jsalt22/ivectors.ipynb | 226 +++++++++++ 45 files changed, 2783 insertions(+), 634 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_default.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml create mode 100644 egs/voxceleb/v2/conf/trainer_phase1_adam_default.yaml create mode 100644 egs/voxceleb/v2/conf/trainer_phase1_sgd_default.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.9.sh delete mode 100644 hyperion/np/classifiers/linear_gbe1.py create mode 100644 notebooks/tutorial_jsalt22/ivectors.ipynb diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_default.yaml new file mode 100644 index 00000000..8574a1cf --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_default.yaml @@ -0,0 +1,6 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: wavlmbaseplus_ecapatdnn512x3.yaml +trainer: trainer_phase1_sgd_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml new file mode 100644 index 00000000..8574a1cf --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml @@ -0,0 +1,6 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: wavlmbaseplus_ecapatdnn512x3.yaml +trainer: trainer_phase1_sgd_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/trainer_phase1_adam_default.yaml b/egs/voxceleb/v2/conf/trainer_phase1_adam_default.yaml new file mode 100644 index 00000000..03c5cc84 --- /dev/null +++ b/egs/voxceleb/v2/conf/trainer_phase1_adam_default.yaml @@ -0,0 +1,20 @@ +optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 4e-4 +lrsched: + lrsch_type: exp_lr + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + decay_rate: 0.5 + warmup_steps: 1000 + update_lr_on_opt_step: true +use_amp: true +log_interval: 1000 +epochs: 30 +eff_batch_size: 1024 +train_mode: hf-feats-frozen-nograd diff --git a/egs/voxceleb/v2/conf/trainer_phase1_sgd_default.yaml b/egs/voxceleb/v2/conf/trainer_phase1_sgd_default.yaml new file mode 100644 index 00000000..7fc848a0 --- /dev/null +++ b/egs/voxceleb/v2/conf/trainer_phase1_sgd_default.yaml @@ -0,0 +1,18 @@ +optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 +lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 2100 + hold_steps: 1000 + min_lr: 4e-4 + warmup_steps: 1000 + update_lr_on_opt_step: true +use_amp: true +log_interval: 1000 
+epochs: 30 +eff_batch_size: 1024 +train_mode: hf-feats-frozen-nograd diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml new file mode 100644 index 00000000..787e3718 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml @@ -0,0 +1,38 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 3 + dropout_rate: 0.0 +feat_fusion_method: last diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh new file mode 100644 index 00000000..47af1f43 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh @@ -0,0 +1,36 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +xvec_train_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml +xvec_train_args="--model.xvector.margin-warmup-epochs 5 --trainer.lrsched.decay-steps 4200 --trainer.lrsched.warmup-steps 1500 --trainer.lrsched.hold-steps 1500 --trainer.epochs 60 --model.feat-fusion-method weighted-avg --model.feat-fusion-start 2 --model.xvector.intertop-margin 0.1" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.10 + +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0060.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.9.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.9.sh new file mode 100644 index 00000000..dccd01e1 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.9.sh @@ -0,0 +1,36 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +xvec_train_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml +xvec_train_args="--model.xvector.margin-warmup-epochs 5 --trainer.lrsched.decay-steps 3150 --trainer.lrsched.warmup-steps 1500 --trainer.lrsched.hold-steps 1500 --trainer.epochs 45 --model.feat-fusion-method weighted-avg --model.feat-fusion-start 2 --model.xvector.intertop-margin 0.1" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.9 + +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0045.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + 
plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index cbb37bb3..c673f5c9 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -154,7 +154,9 @@ def make_parser(model_class): ) model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, prefix="trainer") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) parser.add_argument("--seed", type=int, default=1123581321, help="random seed") parser.add_argument( diff --git a/hyperion/np/classifiers/linear_gbe1.py b/hyperion/np/classifiers/linear_gbe1.py deleted file mode 100644 index 8c5df381..00000000 --- a/hyperion/np/classifiers/linear_gbe1.py +++ /dev/null @@ -1,264 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import numpy as np - -from ...hyp_defs import float_cpu -from ..np_model import NPModel -from ...utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax - - -class LinearGBE(NPModel): - def __init__( - self, - mu=None, - W=None, - update_mu=True, - update_W=True, - x_dim=1, - num_classes=None, - balance_class_weight=True, - do_map=False, - r_mu=16, - r_W=16, - **kwargs - ): - super(LinearGBE, self).__init__(**kwargs) - if mu is not None: - num_classes = mu.shape[0] - x_dim = mu.shape[1] - - self.mu = mu - self.W = W - self.update_mu = update_mu - self.update_W = update_W - self.x_dim = x_dim - self.num_classes = num_classes - self.balance_class_weight = balance_class_weight - self.A = None - self.b = None - self.do_map = do_map - self.r_mu = r_mu - self.r_W = r_W - - self._compute_Ab() - - def get_config(self): - config = { - "update_mu": self.update_mu, - "update_W": self.update_W, - "x_dim": self.x_dim, - "num_classes": self.num_classes, - "balance_class_weight": self.balance_class_weight, - "do_map": self.do_map, - "r_mu": self.r_mu, - "r_W": self.r_W, - } - base_config = super(LinearGBE, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def predict(self, x, normalize=False, return_full_llk=False): - logp = np.dot(x, self.A) + self.b - - if return_full_llk: - K = 0.5 * logdet_pdmat(self.W) - 0.5 * self.x_dim * np.log(2 * np.pi) - K += -0.5 * np.sum(np.dot(x, self.W) * x, axis=1, keepdims=True) - logp += K - - if normalize: - logp = np.log(softmax(logp, axis=1)) - - return logp - - def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): - - assert class_ids is not None or p_theta is not None - - self.x_dim = x.shape[-1] - if self.num_classes is None: - if class_ids is not None: - self.num_classes = np.max(class_ids) + 1 - else: - self.num_classes = p_theta.shape[-1] - - if class_ids is not None: - p_theta = int2onehot(class_ids, self.num_classes) - - if sample_weight is not None: - p_theta = sample_weight[:, None] * p_theta - - N = np.sum(p_theta, axis=0) - - F = np.dot(p_theta.T, x) - - mu0 = self.mu - xbar = mu0 - if self.update_mu: - xbar = F / N[:, None] - if self.do_map: - alpha = (N / (N + self.r_mu))[:, None] - self.mu = (1 - alpha) * mu0 + alpha * xbar - else: - self.mu = xbar - - if self.update_W: - if self.do_map: - r_W = self.r_W - alpha = (N / (N + r_W))[:, None] - S0 = invert_pdmat(self.W, return_inv=True)[-1] - if 
self.balance_class_weight: - S = (self.num_classes - np.sum(alpha)) * S0 - else: - S = self.num_classes * self.r_W * S0 - else: - r_W = 0 - S = np.zeros((x.shape[1], x.shape[1]), dtype=float_cpu()) - - for k in range(self.num_classes): - delta = x - xbar[k] - S_k = np.dot(p_theta[:, k] * delta.T, delta) - if self.do_map: - mu_delta = xbar[k] - mu0[k] - S_k += self.r_W * alpha[k] * np.outer(mu_delta, mu_delta) - - if self.balance_class_weight: - S_k /= N[k] + r_W - - S += S_k - - if self.balance_class_weight: - S /= self.num_classes - else: - S /= self.num_classes * r_w + np.sum(N) - - self.W = invert_pdmat(S, return_inv=True)[-1] - - self._compute_Ab() - - def save_params(self, f): - params = {"mu": self.mu, "W": self.W} - self._save_params_from_dict(f, params) - - @classmethod - def load_params(cls, f, config): - param_list = ["mu", "W"] - params = cls._load_params_to_dict(f, config["name"], param_list) - kwargs = dict(list(config.items()) + list(params.items())) - return cls(**kwargs) - - def _compute_Ab(self): - if self.mu is not None and self.W is not None: - self.A = np.dot(self.W, self.mu.T) - self.b = -0.5 * np.sum(self.mu.T * self.A, axis=0) - - @staticmethod - def filter_args(**kwargs): - - valid_args = ( - "update_mu", - "update_W", - "balance_class_weight", - "do_map", - "r_mu", - "r_W", - "name", - ) - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - filter_train_args = filter_args - - @staticmethod - def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - p2 = "" - else: - p1 = "--" + prefix + "." - p2 = prefix + "." - - parser.add_argument( - p1 + "no-update-mu", - default=True, - action="store_false", - help="not update mu", - ) - parser.add_argument( - p1 + "no-update-W", - dest=(p2 + "update_W"), - default=True, - action="store_false", - help="not update W", - ) - parser.add_argument( - p1 + "balance-class-weight", - dest=(p2 + "balance_class_weight"), - default=False, - action="store_true", - help="Balances the weight of each class when computing W", - ) - parser.add_argument( - p1 + "do-map", - dest=(p2 + "do_map"), - default=False, - action="store_true", - help="does MAP adaptation", - ) - parser.add_argument( - p1 + "r-mu", - dest=(p2 + "r_mu"), - default=16, - type=float, - help="relevance factor for the means", - ) - parser.add_argument( - p1 + "r-w", - dest=(p2 + "r_W"), - default=16, - type=float, - help="relevance factor for the variances", - ) - - parser.add_argument( - p1 + "name", dest=(p2 + "name"), default="lgbe", help="model name" - ) - - @staticmethod - def filter_eval_args(**kwargs): - valid_args = ("model_file", "normalize", "return_full_llk") - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod - def add_argparse_eval_args(parser, prefix=None): - if prefix is None: - p1 = "--" - p2 = "" - else: - p1 = "--" + prefix + "." - p2 = prefix + "." 
-
-        parser.add_argument(
-            p1 + "model-file",
-            dest=(p2 + "model_file"),
-            required=True,
-            help=("model file"),
-        )
-        parser.add_argument(
-            p1 + "normalize",
-            dest=(p2 + "normalize"),
-            default=False,
-            action="store_true",
-            help=("normalizes the ouput probabilities to sum to one"),
-        )
-        parser.add_argument(
-            p1 + "return-full-llk",
-            dest=(p2 + "return_full_llk"),
-            default=False,
-            action="store_true",
-            help=("evaluates full gaussian likelihood instead of linear function"),
-        )
-
-    add_argparse_args = add_class_args
-    add_argparse_train_args = add_class_args
-    add_argparse_eval_args = add_eval_args
diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py
index 35717a82..a53135e6 100644
--- a/hyperion/np/np_model.py
+++ b/hyperion/np/np_model.py
@@ -2,7 +2,6 @@
  Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-from abc import ABCMeta, abstractmethod
 import os
 import json
 from copy import deepcopy
@@ -14,36 +13,62 @@


 class NPModel(object):
-    __metaclass__ = ABCMeta
+    """Base class for machine learning models based on numpy.
+
+    Attributes:
+      name: optional identifier for the model.
+    """

     def __init__(self, name=None, **kwargs):
         self.name = name
         self._is_init = False

     def copy(self):
+        """Returns a clone of the model."""
+        return deepcopy(self)
+
+    def clone(self):
+        """Returns a clone of the model."""
         return deepcopy(self)

     @property
     def is_init(self):
+        """Returns True if the model has been initialized."""
         return self._is_init

     def init_to_false(self):
+        """Sets the model as non-initialized."""
         self._is_init = False

-    @abstractmethod
     def initialize(self):
         pass

-    @abstractmethod
-    def fit(self, x, sample_weights=None, x_val=None, sample_weights_val=None):
-        pass
+    def fit(self, x, sample_weight=None, x_val=None, sample_weight_val=None):
+        """Trains the model.
+
+        Args:
+          x: train data matrix with shape (num_samples, x_dim).
+          sample_weight: weight of each sample in the training loss with shape (num_samples,).
+          x_val: validation data matrix with shape (num_val_samples, x_dim).
+          sample_weight_val: weight of each sample in the val. loss.
+        """
+        raise NotImplementedError()

-    @abstractmethod
     def fit_generator(self, x, x_val=None):
-        pass
+        """Trains the model from a data generator function.
+
+        Args:
+          x: train data generation function.
+          x_val: validation data generation function.
+        """
+        raise NotImplementedError()

-    @abstractmethod
     def save(self, file_path):
+        """Saves the model to file.
+
+        Args:
+          file_path: filename path.
+        """
         file_dir = os.path.dirname(file_path)
         if not (os.path.isdir(file_dir)):
             os.makedirs(file_dir, exist_ok=True)
@@ -52,11 +77,24 @@ def save(self, file_path):
             f.create_dataset("config", data=np.array(config, dtype="S"))
             self.save_params(f)

-    @abstractmethod
     def save_params(self, f):
-        assert True, "save_params method not defined for %s" % (self.__class__.__name__)
+        """Saves the model parameters into the file.
+
+        Args:
+          f: file handle.
+        """
+        raise NotImplementedError(
+            f"save_params method not defined for {self.__class__.__name__}"
+        )

     def _save_params_from_dict(self, f, params, dtypes=None):
+        """Saves a dictionary of model parameters into the file.
+
+        Args:
+          f: file handle.
+          params: dictionary of model parameters.
+          dtypes: dictionary indicating the dtypes of the model parameters.
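+
+        Example (illustrative sketch of how a subclass can call this helper;
+        ``Normal.save_params`` below uses the same pattern):
+
+            params = {"mu": self.mu, "Lambda": self.Lambda}
+            self._save_params_from_dict(f, params)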
+ """ if dtypes is None: dtypes = dict((k, float_save()) for k in params) @@ -74,6 +112,14 @@ def _save_params_from_dict(self, f, params, dtypes=None): @classmethod def load_config(cls, file_path): + """Loads the model configuration from file. + + Args: + file_path: path to the file where the model is stored. + + Returns: + Dictionary containing the model configuration. + """ try: with h5py.File(file_path, "r") as f: json_str = str(np.asarray(f["config"]).astype("U")) @@ -84,6 +130,14 @@ def load_config(cls, file_path): @classmethod def load(cls, file_path): + """Loads the model from file. + + Args: + file_path: path to the file where the model is stored. + + Returns: + Model object. + """ with h5py.File(file_path, "r") as f: json_str = str(np.asarray(f["config"]).astype("U")) config = cls.load_config_from_json(json_str) @@ -91,10 +145,31 @@ def load(cls, file_path): @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ return cls(name=config["name"]) @staticmethod def _load_params_to_dict(f, name, params, dtypes=None): + """Loads the model parameters from file to a dictionary. + + Args: + f: file handle. + name: model identifier or None. + params: parameter names. + dtypes: dictionary containing the dtypes of the parameters. + + Returns: + Dictionary with model parameters. + """ if dtypes is None: dtypes = dict((k, float_cpu()) for k in params) if name is None: @@ -113,12 +188,13 @@ def _load_params_to_dict(f, name, params, dtypes=None): param_dict[k] = None return param_dict - @abstractmethod def get_config(self): + """Returns the model configuration dict.""" config = {"class_name": self.__class__.__name__, "name": self.name} return config def to_json(self, **kwargs): + """Returns model config as json string.""" # Piece of code borrowed from keras def get_json_type(obj): # if obj is any numpy type @@ -136,4 +212,5 @@ def get_json_type(obj): @staticmethod def load_config_from_json(json_str): + """Converts json string into dict.""" return json.loads(json_str) diff --git a/hyperion/np/pdfs/core/exp_family.py b/hyperion/np/pdfs/core/exp_family.py index 44fc172c..c91469e7 100644 --- a/hyperion/np/pdfs/core/exp_family.py +++ b/hyperion/np/pdfs/core/exp_family.py @@ -5,20 +5,27 @@ import numpy as np -from abc import ABCMeta, abstractmethod from .pdf import PDF class ExpFamily(PDF): - __metaclass__ = ABCMeta + """Base class for exponential family distribution. + + p(x) = h(x) exp(\eta u(x) - A) + + Attributes: + eta: natural parameters of the distribution. + x_dim: data dimension. + """ def __init__(self, eta=None, **kwargs): - super(ExpFamily, self).__init__(**kwargs) + super().__init__(**kwargs) self.eta = eta self.A = None @property def is_init(self): + """Returns True if the model has been initialized.""" if not self._is_init: self._compute_nat_std() if self.eta is not None and self.A is not None: @@ -29,6 +36,21 @@ def is_init(self): def fit( self, x, sample_weight=None, x_val=None, sample_weight_val=None, batch_size=None ): + """Trains the model. + + Args: + x: train data matrix with shape (num_samples, x_dim). + sample_weight: weight of each sample in the training loss shape (num_samples,). + x_val: validation data matrix with shape (num_val_samples, x_dim). + sample_weight_val: weight of each sample in the val. loss. + batch_size: accumlates sufficient statistics in batch_size blocks. 
+ + Returns: + log p(X) of the training data. + log p(x) per sample. + log p(X) of the val. data, if present. + log p(x) of the val. data per sample, if present. + """ N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size) self.Mstep(N, u_x) @@ -44,23 +66,49 @@ def fit( return elbo def log_h(self, x): + """Computes log h(x) of the exp. family.""" return 0 def accum_log_h(self, x, sample_weight=None): + """Accumlates log h(x)""" if sample_weight is None: return np.sum(self.log_h(x)) return np.sum(sample_weight * self.log_h(x)) def compute_suff_stats(self, x): + """Computes sufficient stats for a data sample.""" return x def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): + """Accumlates sufficient statistis over several data samples. + + Args: + x: data samples of shape (num_samples, x_dim). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + N zero order sufficient statistics (number of samples). + Accumlated sufficient statistics \sum u(x) + """ if u_x is not None or batch_size is None: return self._accum_suff_stats_1batch(x, u_x, sample_weight) else: return self._accum_suff_stats_nbatches(x, sample_weight, batch_size) def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): + """Accumlates sufficient statistis over several data samples for a single batch. + + Args: + x: data samples of shape (num_samples, x_dim). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + + Returns: + N zero order sufficient statistics (number of samples). + Accumlated sufficient statistics \sum u(x) + """ if u_x is None: u_x = self.compute_suff_stats(x) if sample_weight is None: @@ -72,6 +120,18 @@ def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): return N, acc_u_x def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): + """Accumlates sufficient statistis over several data samples for multiple batches. + + Args: + x: data samples of shape (num_samples, x_dim). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. + batch_size: accumlates sufficient statistics in batch_size blocks. + + Returns: + N zero order sufficient statistics (number of samples). + Accumlated sufficient statistics \sum u(x) + """ sw_i = None for i1 in range(0, x.shape[0], batch_size): i2 = np.minimum(i1 + batch_size, x.shape[0]) @@ -87,23 +147,56 @@ def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): u_x += u_x_i return N, u_x - def add_suff_stats(self, N, u_x): + def sum_suff_stats(self, N, u_x): + """Sums suff. stats from muttiple sub-processes. + + Args: + N: zero order stats with shape = (num_proc,) + u_x: higher order stats with shape = (num_proc, u(x)_dim). + + Args: + Accumalted N and u_x. + """ assert len(N) == len(u_x) acc_N = N[1] acc_u_x = u_x[1] for i in range(1, len(N)): acc_N += N - acc_u_x += u[i] + acc_u_x += u_x[i] return acc_N, acc_u_x def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): + """Expectation step, accumlates suff. stats. + + Args: + x: data samples of shape (num_samples, x_dim). + u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional). + sample_weight: weight of each sample in the accumalation. 
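+
+        Example (minimal EM sketch mirroring what ``fit`` does internally;
+        ``model`` is assumed to be an instance of a subclass such as
+        ``Normal`` and ``x`` a (num_samples, x_dim) array):
+
+            N, u_x = model.Estep(x)
+            model.Mstep(N, u_x)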
+        """
         return self.accum_suff_stats(x, u_x, sample_weight, batch_size)

-    @abstractmethod
     def Mstep(self, stats):
+        """Maximization step."""
         pass

     def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None):
+        """Evidence lower bound.
+
+        Args:
+          x: data samples with shape = (num_samples, x_dim).
+          u_x: accumulated u(x) (optional).
+          log_h: accumulated log h(x) (optional).
+          sample_weight: weight of each sample in the loss function.
+          batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+          log p(X) of the data.
+        """
         assert self.is_init
         if u_x is None:
             N, u_x = self.accum_suff_stats(
@@ -114,12 +207,33 @@ def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None):
         return log_h + np.inner(u_x, self.eta) - N * self.A

     def log_prob(self, x, u_x=None, method="nat"):
+        """log p(x) of each data sample.
+
+        Args:
+          x: input data with shape (num_samples, x_dim).
+          u_x: sufficient stats u(x) with shape (num_samples, u_dim).
+          method: the probability is computed using standard ("std") or
+            natural parameters ("nat").
+
+        Returns:
+          log p(x) with shape (num_samples,)
+        """
         if method == "nat":
             return self.log_prob_nat(x, u_x)
         else:
             return self.log_prob_std(x)

     def log_prob_nat(self, x, u_x=None):
+        """log p(x) of each data sample computed using the
+        natural parameters of the distribution.
+
+        Args:
+          x: input data with shape (num_samples, x_dim).
+          u_x: sufficient stats u(x) with shape (num_samples, u_dim).
+
+        Returns:
+          log p(x) with shape (num_samples,)
+        """
         assert self.is_init
         if u_x is None:
             u_x = self.compute_suff_stats(x)
@@ -127,31 +241,32 @@ def log_prob_nat(self, x, u_x=None):

     @staticmethod
     def compute_A_nat(eta):
+        """Computes A_theta from the natural param."""
         raise NotImplementedError()

     @staticmethod
     def compute_A_std(params):
+        """Computes A_theta from the standard param."""
         raise NotImplementedError()

     @staticmethod
     def compute_eta(param):
+        """Computes the natural param. from the standard param."""
         raise NotImplementedError()

     @staticmethod
     def compute_std(eta):
+        """Computes the standard param. from the natural param."""
         raise NotImplementedError()

-    @abstractmethod
     def _compute_nat_params(self):
         pass

-    @abstractmethod
     def _compute_std_params(self):
         pass

     def _compute_nat_std(self):
         pass

-    @abstractmethod
     def validate(self):
         pass
diff --git a/hyperion/np/pdfs/core/normal.py b/hyperion/np/pdfs/core/normal.py
index ed60edb7..4c3c70cf 100644
--- a/hyperion/np/pdfs/core/normal.py
+++ b/hyperion/np/pdfs/core/normal.py
@@ -4,9 +4,7 @@
 """

 import numpy as np
-import h5py
 import scipy.linalg as la
-from scipy.special import erf

 from ....hyp_defs import float_cpu
 from ....utils.plotting import (
@@ -28,6 +26,17 @@


 class Normal(ExpFamily):
+    """Class for Normal distribution with full covariance.
+
+    Attributes:
+      mu: mean with shape (x_dim,) or None.
+      Lambda: precision with shape (x_dim, x_dim) or None.
+      var_floor: variance floor.
+      update_mu: whether or not to update mu when optimizing.
+      update_Lambda: whether or not to update Lambda when optimizing.
+      x_dim: data dim (inferred from mu if present)
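+
+    Example (illustrative usage sketch; ``x`` is assumed to be a
+    (num_samples, x_dim) numpy array of training data):
+
+        model = Normal(x_dim=x.shape[1])
+        elbo = model.fit(x)            # EM training
+        logp = model.log_prob(x)       # per-sample log-likelihoods
+        samples = model.sample(10)     # draw 10 samples from the model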
+    """
+
     def __init__(
         self,
         mu=None,
@@ -37,7 +46,7 @@ def __init__(
         update_Lambda=True,
         **kwargs
     ):
-        super(Normal, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.mu = mu
         self.Lambda = Lambda
         self.var_floor = var_floor
@@ -51,6 +60,7 @@ def __init__(
         self._Sigma = None

     def _compute_nat_std(self):
+        """Computes natural and standard parameters of the distribution."""
         if self.mu is not None and self.Lambda is not None:
             self._validate_mu()
             self._validate_Lambda()
@@ -62,6 +72,7 @@ def _compute_nat_std(self):

     @property
     def logLambda(self):
+        """log precision determinant."""
         if self._logLambda is None:
             assert self.is_init
             f, L, logL = invert_pdmat(self.Lambda, return_logdet=True)
@@ -71,6 +82,7 @@ def logLambda(self):

     @property
     def cholLambda(self):
+        """Cholesky decomp. of the precision."""
         if self._cholLambda is None:
             assert self.is_init
             f, L, logL = invert_pdmat(self.Lambda, return_logdet=True)
@@ -80,26 +92,42 @@ def cholLambda(self):

     @property
     def Sigma(self):
+        """Covariance."""
         if self._Sigma is None:
             assert self.is_init
             self._Sigma = invert_pdmat(self.Lambda, return_inv=True)[-1]
         return self._Sigma

     def initialize(self):
+        """Initializes the distribution."""
         self.validate()
         self._compute_nat_std()

     def stack_suff_stats(self, F, S=None):
+        """Stacks F and S suff stats into single vector."""
         if S is None:
             return F
         return np.hstack((F, S))

     def unstack_suff_stats(self, stats):
+        """Decomposes suff. stats vector into F and S."""
         F = stats[: self.x_dim]
         S = stats[self.x_dim :]
         return F, S

     def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None):
+        """Accumulates sufficient statistics over several data samples.
+
+        Args:
+          x: data samples of shape (num_samples, x_dim).
+          u_x: sufficient stats for x with shape = (num_samples, u(x)_dim) (optional).
+          sample_weight: weight of each sample in the accumulation.
+          batch_size: unused
+
+        Returns:
+          N zero order sufficient statistics (number of samples).
+          Accumulated sufficient statistics \sum u(x)
+        """
         if u_x is None:
             if sample_weight is None:
                 N = x.shape[0]
@@ -115,12 +143,23 @@ def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None):
         return self._accum_suff_stats_1batch(x, u_x, sample_weight)

     def norm_suff_stats(self, N, u_x, return_order2=False):
+        """Normalizes accumulated sufficient statistics with the
+        mean and covariance of the distribution.
+
+        Args:
+          N: zeroth order sufficient stats.
+          u_x: 1st and 2nd order stats.
+          return_order2: whether or not to return normalized 2nd order stats.
+
+        Returns:
+          Normalized N, F or N, [F, S].
+        """
         assert self.is_init

         F, S = self.unstack_suff_stats(u_x)
         F_norm = np.dot(F - N * self.mu, self.cholLambda.T)
         if return_order2:
-            SS = vec2symat(S)
+            SS = vec2symmat(S)
             Fmu = np.outer(F, self.mu)
             SS = SS - Fmu - Fmu.T + N * np.outer(self.mu, self.mu)
             SS = np.dot(self.cholLambda, np.dot(SS, self.cholLambda.T))
@@ -129,7 +168,13 @@ def norm_suff_stats(self, N, u_x, return_order2=False):
         return N, F_norm

     def Mstep(self, N, u_x):
+        """Maximization step.
+
+        Args:
+          N: zeroth order stats.
+          u_x: accumulated higher order stats.
+        """
         F, S = self.unstack_suff_stats(u_x)

         if self.update_mu:
@@ -147,6 +192,15 @@ def Mstep(self, N, u_x):
         self._compute_nat_params()

     def log_prob_std(self, x):
+        """log p(x) of each data sample computed using the
+        standard parameters of the distribution.
+
+        Args:
+          x: input data with shape (num_samples, x_dim).
+
+        Returns:
+          log p(x) with shape (num_samples,)
+        """
         assert self.is_init
         mah_dist2 = np.sum(np.dot(x - self.mu, self.cholLambda) ** 2, axis=1)
         return (
@@ -155,17 +209,17 @@ def log_prob_std(self, x):
             - 0.5 * mah_dist2
         )

-    # def eval_logcdf(self, x):
-    #     delta = np.dot((x-self.mu), self.cholLambda)
-    #     lk = 0.5*(1+erf(delta/np.sqrt(2)))
-    #     print(x-self.mu)
-    #     print(la.cholesky(self.Lambda,lower=True))
-    #     print(self.cholLambda)
-    #     print(delta)
-    #     print(lk)
-    #     return np.sum(np.log(lk+1e-20), axis=-1)
-
     def sample(self, num_samples, rng=None, seed=1024):
+        """Draws samples from the data distribution.
+
+        Args:
+          num_samples: number of samples.
+          rng: random number generator.
+          seed: random seed used if rng is None.
+
+        Returns:
+          Generated samples with shape (num_samples, x_dim).
+        """
         assert self.is_init

         if rng is None:
@@ -173,11 +227,9 @@ def sample(self, num_samples, rng=None, seed=1024):
         return rng.multivariate_normal(self.mu, self.Sigma, size=(num_samples,)).astype(
             float_cpu()
         )
-        # x=rng.normal(size=(num_samples, self.x_dim))
-        # cholS=la.cholesky(self.Sigma, lower=False, overwrite_a=True)
-        # return self.mu+np.dot(x, cholS)

     def get_config(self):
+        """Returns the model configuration dict."""
         config = {
             "var_floor": self.var_floor,
             "update_mu": self.update_mu,
@@ -187,7 +239,11 @@ def get_config(self):
         return dict(list(base_config.items()) + list(config.items()))

     def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+          f: file handle.
+        """
         assert self.is_init

         params = {"mu": self.mu, "Lambda": self.Lambda}
@@ -195,8 +251,18 @@ def save_params(self, f):

     @classmethod
     def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from file.
+
+        Args:
+          f: file handle.
+          config: configuration dictionary.
+
+        Returns:
+          Model object.
+        """
         param_list = ["mu", "Lambda"]
-        params = self._load_params_to_dict(f, config["name"], param_list)
+        params = cls._load_params_to_dict(f, config["name"], param_list)
         return cls(
             x_dim=config["x_dim"],
             mu=params["mu"],
@@ -217,6 +283,7 @@ def _validate_eta(self):
         assert self.eta.shape[0] == (self.x_dim ** 2 + 3 * self.x_dim) / 2

     def validate(self):
+        """Validates the parameters of the distribution."""
         if self.mu is not None and self.Lambda is not None:
             self._validate_mu()
             self._validate_Lambda()
@@ -226,18 +293,21 @@ def validate(self):

     @staticmethod
     def compute_eta(mu, Lambda):
+        """Computes nat param. from mean and precision."""
         Lmu = np.dot(mu, Lambda)
         eta = np.hstack((Lmu, -symmat2vec(Lambda, diag_factor=0.5)))
         return eta

     @staticmethod
     def compute_x_dim_from_eta(eta):
+        """Computes data dim. from natural param."""
         x_dim = 0.5 * (-3 + np.sqrt(9 + 8 * eta.shape[-1]))
         assert int(x_dim) == x_dim
         return int(x_dim)

     @staticmethod
     def compute_std(eta):
+        """Computes standard params. from the natural param."""
         x_dim = Normal.compute_x_dim_from_eta(eta)
         eta1 = eta[:x_dim]
         eta2 = vec2symmat(eta[x_dim:], diag_factor=2) / 2
@@ -248,6 +318,7 @@ def compute_std(eta):

     @staticmethod
     def compute_A_nat(eta):
+        """Computes A from the natural param."""
         x_dim = Normal.compute_x_dim_from_eta(eta)
         eta1 = eta[:x_dim]
         eta2 = vec2symmat(eta[x_dim:], diag_factor=2) / 2
@@ -259,6 +330,7 @@ def compute_A_nat(eta):

     @staticmethod
     def compute_A_std(mu, Lambda):
+        """Computes A from the standard params."""
         x_dim = mu.shape[0]
         r1 = 0.5 * x_dim * np.log(2 * np.pi)
         r2 = -0.5 * logdet_pdmat(Lambda)
@@ -266,15 +338,9 @@ def compute_A_std(mu, Lambda):
         return r1 + r2 + r3

     def _compute_nat_params(self):
+        """Computes all natural params from mean and precision."""
         self.eta = self.compute_eta(self.mu, self.Lambda)
         self.A = self.compute_A_std(self.mu, self.Lambda)
-        # self.A = self.compute_A_nat(self.eta)
-        # Lmu = np.dot(self.Lambda, self.mu[:, None])
-        # muLmu = np.dot(self.mu, Lmu)
-        # lnr = 0.5*self.lnLambda - 0.5*self.x_dim*np.log(2*np.pi)-0.5*muLmu
-        # Lambda=np.copy(self.Lambda)
-        # Lambda[np.diag_indices(self.x_dim)] /= 2
-        # self.eta=np.vstack((lnr, Lmu, symmat2vec(Lambda)[:, None]))

     def _compute_std_params(self):
         self.mu, self.Lambda = self.compute_std(self.eta)
@@ -284,6 +350,14 @@ def _compute_std_params(self):

     @staticmethod
     def compute_suff_stats(x):
+        """Computes the sufficient stats. for each sample.
+
+        Args:
+          x: data samples with shape (num_samples, x_dim).
+
+        Returns:
+          Sufficient stats. for each data sample with shape (num_samples, u_dim).
+        """
         d = x.shape[1]
         u = np.zeros((x.shape[0], int(d + d * (d + 1) / 2)), dtype=float_cpu())
         u[:, :d] = x
@@ -295,12 +369,28 @@ def compute_suff_stats(x):
         return u

     def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs):
+        """Plots one slice of the Gaussian in 1d.
+
+        Args:
+          feat_idx: feature index.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
         assert self.is_init
         mu = self.mu[feat_idx]
         C = invert_pdmat(self.Lambda, return_inv=True)[-1][feat_idx, feat_idx]
         plot_gaussian_1D(mu, C, num_sigmas, num_pts, **kwargs)

     def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of the Gaussian in 2d.
+
+        Args:
+          feat_idx: feature indices.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
         assert self.is_init
         mu = self.mu[feat_idx]
         j, i = np.meshgrid(feat_idx, feat_idx)
@@ -308,6 +398,14 @@ def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
         plot_gaussian_ellipsoid_2D(mu, C, num_sigmas, num_pts, **kwargs)

     def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of the Gaussian in 3d.
+
+        Args:
+          feat_idx: feature indices.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
+        """
         assert self.is_init
         mu = self.mu[feat_idx]
         j, i = np.meshgrid(feat_idx, feat_idx)
@@ -315,6 +413,14 @@ def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
         plot_gaussian_3D(mu, C, num_sigmas, num_pts, **kwargs)

     def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 3 dimensions of the Gaussian in 3d.
+
+        Args:
+          feat_idx: feature indices.
+          num_sigmas: size of the plot in number of standard devs.
+          num_pts: number of points in the graph.
+          **kwargs: pyplot options.
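+
+        Example (assuming an initialized model and an active matplotlib
+        figure; the arguments mirror the signature above):
+
+            model.plot3D_ellipsoid(feat_idx=[0, 1, 2], num_sigmas=2)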
+ """ assert self.is_init mu = self.mu[feat_idx] j, i = np.meshgrid(feat_idx, feat_idx) diff --git a/hyperion/np/pdfs/core/normal_diag_cov.py b/hyperion/np/pdfs/core/normal_diag_cov.py index cb21f84c..8a896cd5 100644 --- a/hyperion/np/pdfs/core/normal_diag_cov.py +++ b/hyperion/np/pdfs/core/normal_diag_cov.py @@ -4,12 +4,8 @@ """ import numpy as np -import h5py from scipy.special import erf -# import matplotlib.pyplot as plt -# import matplotlib.mlab as mlab - from ....hyp_defs import float_cpu from ....utils.plotting import ( plot_gaussian_1D, @@ -22,6 +18,17 @@ class NormalDiagCov(ExpFamily): + """Class for Normal distribution with diagonal covariance. + + Attributes: + mu: mean with shape (x_dim,) or None. + Lambda: precision with shape (x_dim, x_dim) or None. + var_floor: variance floor. + update_mu: whether or not update mu when optimizing. + update_Lambda: wether or not update Lambda when optimizing. + x_dim: data dim (infered from mu if present) + """ + def __init__( self, mu=None, @@ -31,7 +38,7 @@ def __init__( update_Lambda=True, **kwargs ): - super(NormalDiagCov, self).__init__(**kwargs) + super().__init__(**kwargs) self.mu = mu self.Lambda = Lambda self.var_floor = var_floor @@ -45,6 +52,7 @@ def __init__( self._Sigma = None def _compute_nat_std(self): + """Comptues natural and standard parameters of the distribution.""" if self.mu is not None and self.Lambda is not None: self._validate_mu() self._validate_Lambda() @@ -56,6 +64,7 @@ def _compute_nat_std(self): @property def logLambda(self): + """log precision determinant.""" if self._logLambda is None: assert self.is_init self._logLambda = np.sum(np.log(self.Lambda)) @@ -63,6 +72,7 @@ def logLambda(self): @property def cholLambda(self): + """Square root of precision.""" if self._cholLambda is None: assert self.is_init self._cholLambda = np.sqrt(self.Lambda) @@ -70,27 +80,43 @@ def cholLambda(self): @property def Sigma(self): + "Variance of the distribution." if self._Sigma is None: assert self.is_init self._Sigma = 1.0 / self.Lambda return self._Sigma def initialize(self): + """Initializes the distribution.""" self.validate() self._compute_nat_std() assert self.is_init def stack_suff_stats(self, F, S=None): + """Stacks F and S suff stats into single vector.""" + if S is None: return F return np.hstack((F, S)) def unstack_suff_stats(self, stats): + """Decomposes suff. stats vector into F and S.""" F = stats[: self.x_dim] S = stats[self.x_dim :] return F, S def norm_suff_stats(self, N, u_x=None, return_order2=False): + """Normalizes accumlated sufficient statistics with the + mean and covariance of the distribution. + + Args: + N: zeroth order sufficient stats. + u_x: 1st and 2nd order stats. + return_order2: whether or not return normalizes 2nd order stats. + + Return: + Normalized N, F or N, [F, S]. + """ assert self.is_init F, S = self.unstack_suff_stats(u_x) F_norm = self.cholLambda * (F - N * self.mu) @@ -101,7 +127,13 @@ def norm_suff_stats(self, N, u_x=None, return_order2=False): return N, F_norm def Mstep(self, N, u_x): + """Maximization step. + Args: + N: zeroth order stats. + u_x: accumlated higher order stats. + + """ F, S = self.unstack_suff_stats(u_x) if self.update_mu: @@ -118,6 +150,15 @@ def Mstep(self, N, u_x): self._compute_nat_params() def log_prob_std(self, x): + """log p(x) of each data sample computed using the + standard parameters of the distribution. + + Args: + x: input data with shape (num_samples, x_dim). 
+ + Returns: + log p(x) with shape (num_samples,) + """ assert self.is_init mah_dist2 = np.sum(((x - self.mu) * self.cholLambda) ** 2, axis=1) return ( @@ -127,12 +168,23 @@ def log_prob_std(self, x): ) def log_cdf(self, x): + """Log cumulative distribution function.""" assert self.is_init delta = (x - self.mu) * self.cholLambda lk = 0.5 * (1 + erf(delta / np.sqrt(2))) return np.sum(np.log(lk + 1e-10), axis=-1) def sample(self, num_samples, rng=None, seed=1024): + """Draws samples from the data distribution. + + Args: + num_samples: number of samples. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ assert self.is_init if rng is None: rng = np.random.RandomState(seed) @@ -140,6 +192,7 @@ def sample(self, num_samples, rng=None, seed=1024): return self.mu + 1.0 / self.cholLambda * x def get_config(self): + """Returns the model configuration dict.""" config = { "var_floor": self.var_floor, "update_mu": self.update_mu, @@ -149,14 +202,29 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ assert self.is_init params = {"mu": self.mu, "Lambda": self.Lambda} self._save_params_from_dict(f, params) @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ param_list = ["mu", "Lambda"] - params = self._load_params_to_dict(f, config["name"], param_list) + params = cls._load_params_to_dict(f, config["name"], param_list) return cls( x_dim=config["x_dim"], mu=params["mu"], @@ -178,6 +246,7 @@ def _validate_eta(self): assert self.eta.shape[0] == self.x_dim * 2 def validate(self): + """Validates the parameters of the distribution.""" if self.mu is not None and self.Lambda is not None: self._validate_mu() self._validate_Lambda() @@ -187,12 +256,14 @@ def validate(self): @staticmethod def compute_eta(mu, Lambda): + """Computes nat param. from mean and precision.""" Lmu = Lambda * mu eta = np.hstack((Lmu, -0.5 * Lambda)) return eta @staticmethod def compute_std(eta): + """Computes standard params. from the natural param.""" x_dim = int(eta.shape[0] / 2) eta1 = eta[:x_dim] eta2 = eta[x_dim:] @@ -202,6 +273,7 @@ def compute_std(eta): @staticmethod def compute_A_nat(eta): + """Computes A from the natural param.""" x_dim = int(eta.shape[0] / 2) eta1 = eta[:x_dim] eta2 = eta[x_dim:] @@ -212,6 +284,7 @@ def compute_A_nat(eta): @staticmethod def compute_A_std(mu, Lambda): + """Computes A from the standard params.""" x_dim = mu.shape[0] r1 = 0.5 * x_dim * np.log(2 * np.pi) r2 = -0.5 * np.sum(np.log(Lambda)) @@ -234,6 +307,14 @@ def _compute_std_params(self): @staticmethod def compute_suff_stats(x): + """Computes the sufficient stats. for each sample. + + Args: + x: data samples with shape (num_samples, x_dim). + + Returns: + Sufficient stats. for each data sample with shape (num_samples, u_dim). + """ d = x.shape[1] u = np.zeros((x.shape[0], 2 * d), dtype=float_cpu()) u[:, :d] = x @@ -241,21 +322,53 @@ def compute_suff_stats(x): return u def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): + """Plots one slice of the Gaussian in 1d. + + Args: + feat_idx: feature index. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. 
+ **kwargs: pyplot options. + """ mu = self.mu[feat_idx] C = 1 / self.Lambda[feat_idx] plot_gaussian_1D(mu, C, num_sigmas, num_pts, **kwargs) def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): + """Plots 2 dimensions of the Gaussian in 2d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ mu = self.mu[feat_idx] C = np.diag(1.0 / self.Lambda[feat_idx]) plot_gaussian_ellipsoid_2D(mu, C, num_sigmas, num_pts, **kwargs) def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): + """Plots 2 dimensions of the Gaussian in 3d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ mu = self.mu[feat_idx] C = np.diag(1.0 / self.Lambda[feat_idx]) plot_gaussian_3D(mu, C, num_sigmas, num_pts, **kwargs) def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): + """Plots 3 dimensions of the Gaussian in 3d. + + Args: + feat_idx: feature indeces. + num_sigmas: size of the plot in number of standard devs. + num_pts: number of points in the graph. + **kwargs: pyplot options. + """ mu = self.mu[feat_idx] C = np.diag(1.0 / self.Lambda[feat_idx]) plot_gaussian_ellipsoid_3D(mu, C, num_sigmas, num_pts, **kwargs) diff --git a/hyperion/np/pdfs/core/pdf.py b/hyperion/np/pdfs/core/pdf.py index acd26105..82f4330d 100644 --- a/hyperion/np/pdfs/core/pdf.py +++ b/hyperion/np/pdfs/core/pdf.py @@ -5,32 +5,44 @@ import numpy as np -from abc import ABCMeta, abstractmethod from ...np_model import NPModel class PDF(NPModel): - __metaclass__ = ABCMeta + """Base class for probability density functions. + + Attributes: + x_dim: data dimension. + """ def __init__(self, x_dim=1, **kwargs): - super(PDF, self).__init__(**kwargs) + super().__init__(**kwargs) self.x_dim = x_dim def get_config(self): + """Returns the model configuration dict.""" config = {"x_dim": self.x_dim} - base_config = super(PDF, self).get_config() + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - @abstractmethod def log_prob(self, x): - pass + """Computes log probability of the data.""" + raise NotImplementedError() def eval_llk(self, x): + """Computes log likelihood of the data.""" return self.log_prob(x) - @abstractmethod def sample(self, num_samples): - pass + """Draws samples from the data distribution.""" + raise NotImplementedError() def generate(self, num_samples, **kwargs): - return self.generate(num_samples, **kwargs) + """Draws samples from the data distribution. + Args: + num_samples: number of samples to generate. 
+ + Returns: + np.array of generated samples with shape=(num_samples, x_dim) + """ + return self.sample(num_samples, **kwargs) diff --git a/hyperion/np/pdfs/hmm/hmm.py b/hyperion/np/pdfs/hmm/hmm.py index b8497b38..704f0991 100644 --- a/hyperion/np/pdfs/hmm/hmm.py +++ b/hyperion/np/pdfs/hmm/hmm.py @@ -11,18 +11,19 @@ class HMM(PDF): - def __init__(self, **kwargs): - super(HMM, self).__init__( - num_states=1, - pi=None, - trans=None, - trans_mask=None, - update_pi=True, - update_trans=True, - tied_trans=False, - left_to_right=False, - **kwargs - ) + def __init__( + self, + num_states=1, + pi=None, + trans=None, + trans_mask=None, + update_pi=True, + update_trans=True, + tied_trans=False, + left_to_right=False, + **kwargs + ): + super().__init__(**kwargs) if pi is not None: num_states = len(pi) @@ -36,13 +37,13 @@ def __init__(self, **kwargs): self.tied_trans = tied_trans self.left_to_right = left_to_right - if left_to_rigth and (trans_mask is None): + if left_to_right and (trans_mask is None): self.trans_mask = np.triu(np.ones_like(self.trans)) self._log_pi = None self._log_trans = None - def reset_aux(): + def reset_aux(self): self._log_pi = None self._log_trans = None @@ -132,11 +133,11 @@ def compute_pz(self, x, return_Nzz=False, return_log_px=False): pz = softmax(log_alpha + log_beta, axis=-1) - if not (return_Nzz or return_elbo or return_log_px): + if not (return_Nzz or return_log_px): return pz r = [pz] - if return_pzz_acc: + if return_Nzz: x_e = np.expand_dims(axis=1) log_alpha_e = np.expand_dims(axis=-1) log_beta_e = np.expand_dims(axis=1) @@ -169,7 +170,7 @@ def Estep(self, x, stats_0=None): pz, Nzz = self.compute_pz(x, return_Nzz=True) Nz += pz[0] - Nzz += pzz + Nzz += Nzz stats = (Nz, Nzz) return pz, stats @@ -238,7 +239,7 @@ def sample(self, num_seqs, num_steps, rng=None, seed=1024): for t in range(1, num_steps): for k in range(self.num_states): index = x[:, t - 1, k] == 1 - n_k = num.sum(index) + n_k = np.sum(index) if n_k == 0: continue x[index] = rng.multinomial(1, self.trans[k], size=(n_k,)) diff --git a/hyperion/np/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py index 4a11b5cf..993da9d6 100644 --- a/hyperion/np/pdfs/jfa/jfa_total.py +++ b/hyperion/np/pdfs/jfa/jfa_total.py @@ -4,7 +4,7 @@ """ import numpy as np -from scipy import linalg as sla +from scipy import linalg as la from ....hyp_defs import float_cpu from ....utils.math import ( @@ -18,8 +18,17 @@ class JFATotal(PDF): + """Class for joint factor analysis with total variability matrix (i-vectors). + Args: + K: number of gaussian components. + y_dim: dimension of total variability sub-space. + T: Total variability matrix with shape (y_dim, K * x_dim). + x_dim: data dimension. + + """ + def __init__(self, K, y_dim=None, T=None, **kwargs): - super(JFATotal, self).__init__(**kwargs) + super().__init__(**kwargs) if T is not None: y_dim = T.shape[0] @@ -32,10 +41,12 @@ def __init__(self, K, y_dim=None, T=None, **kwargs): self.__upptr = None def reset_aux(self): + """Resets auxiliary variables.""" self._TT = None @property - def is_init(): + def is_init(self): + """Returns True if the model has been initialized.""" if self._is_init: return True if self.T is not None: @@ -43,15 +54,36 @@ def is_init(): return self._is_init def initialize(self, N, F): - assert N.shape[0] == self.K + """Initializes the model. + Args: + N: zero order statistics (num_utterances, K). 
+          F: first order statistics (num_utterances, K * x_dim)
+        """
+        assert N.shape[1] == self.K
         self.T = np.random.randn(self.y_dim, F.shape[1]).astype(float_cpu(), copy=False)

     def compute_py_g_x(
         self, N, F, G=None, return_cov=False, return_elbo=False, return_acc=False
     ):
+        """Computes the latent posterior P(Y|X).
+
+        Args:
+          N: zero order statistics (num_utterances, K).
+          F: first order statistics (num_utterances, K * x_dim).
+          G: logP(x| UBM, Z) to add to elbo (optional).
+          return_cov: whether or not to return the covariance of the posterior.
+          return_elbo: whether or not to return the ELBO.
+          return_acc: whether or not to return accumulated stats for EM algorithm.
+
+        Returns:
+          y: latent mean (i-vector).
+          Posterior covariances.
+          ELBO
+          Ry accumulator for ML step with shape (K, y_dim (y_dim+1)/2)
+          Py accumulator for MD step with shape (y_dim, y_dim)
+        """
         assert self.is_init

-        x_dim = int(F.shape[1] / self.K)
         M = F.shape[0]
         y_dim = self.y_dim
@@ -63,7 +95,7 @@ def compute_py_g_x(
         y = np.zeros((M, y_dim), dtype=float_cpu())

         if return_cov:
-            Sy = np.zeros((M, y_dim * (y_dim + 1) / 2), dtype=float_cpu())
+            Sy = np.zeros((M, int(y_dim * (y_dim + 1) // 2)), dtype=float_cpu())
         else:
             Sy = None

@@ -72,7 +104,7 @@ def compute_py_g_x(

         if return_acc:
             Py = np.zeros((y_dim, y_dim), dtype=float_cpu())
-            Ry = np.zeros((self.K, y_dim * (y_dim + 1) / 2), dtype=float_cpu())
+            Ry = np.zeros((self.K, int(y_dim * (y_dim + 1) // 2)), dtype=float_cpu())

         Li = np.zeros((self.y_dim, self.y_dim), dtype=float_cpu())
         for i in range(N.shape[0]):
@@ -94,7 +126,7 @@ def compute_py_g_x(
             if return_acc:
                 iL += np.outer(y[i], y[i])
                 Py += iL
-                Ry += iL[self.__uppr] * N[i][:, None]
+                Ry += iL[self.__upptr] * N[i][:, None]

         if not return_tuple:
             return y
@@ -107,7 +139,7 @@ def compute_py_g_x(
         if return_elbo:
             if G is not None:
                 elbo += G
-            elbo += 0.5 * np.sum(VF * y, axis=-1)
+            elbo += 0.5 * np.sum(TF * y, axis=-1)
             r += [elbo]

         if return_acc:
@@ -116,14 +148,24 @@ def compute_py_g_x(
         return tuple(r)

     def Estep(self, N, F, G=None):
+        """Computes the latent posterior P(Y|X).
+
+        Args:
+          N: zero order statistics (num_utterances, K).
+          F: first order statistics (num_utterances, K * x_dim).
+          G: logP(x| UBM, Z) to add to elbo (optional).
+
+        Returns:
+          Tuple with stats needed by the maximization step:
+          ELBO, number of utterances, accumulated y, Ry, Cy, Py
+        """
         y, elbo, Ry, Py = self.compute_py_g_x(
             N, F, G, return_elbo=True, return_acc=True
         )
         M = y.shape[0]
         y_acc = np.sum(y, axis=0)
-        Cy = np.dot(F, y)
+        Cy = np.dot(F.T, y)

         elbo = np.sum(elbo)

@@ -131,10 +173,15 @@ def Estep(self, N, F, G=None):
         return stats

     def MstepML(self, stats):
+        """Maximum likelihood step.
+
+        Args:
+          stats: tuple with statistics produced by the estimation step.
+        """
         _, M, y_acc, Ry, Cy, _ = stats
         T = np.zeros_like(self.T)
         Ryk = np.zeros((self.y_dim, self.y_dim), dtype=float_cpu())
-        x_dim = T.shape[1] / self.K
+        x_dim = T.shape[1] // self.K
         for k in range(self.K):
             idx = k * x_dim
             Ryk[self._upptr] = Ry[k]
@@ -145,9 +192,14 @@ def MstepML(self, stats):
         self.reset_aux()

     def MstepMD(self, stats):
+        """Minimum divergence step.
+
+        Args:
+          stats: tuple with statistics produced by the estimation step.
+        """
         _, M, y_acc, Ry, Cy, Py = stats
         mu_y = y_acc / M
-        Cy = Py / M - np.outer(my_y, mu_y)
+        Cy = Py / M - np.outer(mu_y, mu_y)
         chol_Cy = la.cholesky(Cy, lower=False, overwrite_a=True)
         self.T = np.dot(chol_Cy, self.T)

@@ -160,10 +212,30 @@ def fit(
         G=None,
         N_val=None,
         F_val=None,
+        G_val=None,
         epochs=20,
         ml_md="ml+md",
         md_epochs=None,
     ):
+        """Trains the model.
+
+        Args:
+          N: zero order sufficient statistics for training data with shape (num_utterances, K).
+          F: first order sufficient statistics for training data with shape (num_utterances, K*x_dim).
+          G: logP(x| UBM, Z) for training data to add to elbo (optional).
+          N_val: zero order sufficient statistics for val data with shape (num_utterances, K).
+          F_val: first order sufficient statistics for val data with shape (num_utterances, K*x_dim).
+          G_val: logP(x| UBM, Z) for val data to add to elbo (optional).
+          epochs: number of EM steps.
+          ml_md: whether to do maximum likelihood estimation ("ml"), minimum divergence ("md") or both ("ml+md").
+          md_epochs: in which epochs to do MD estimation, if None, MD is done in all epochs.
+
+        Returns:
+          log p(X) of the training data.
+          log p(x) per sample.
+          log p(X) of the val. data, if present.
+          log p(x) of the val. data per sample, if present.
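+
+        Example (illustrative sketch; ``N`` and ``F`` are assumed to be
+        sufficient statistics accumulated with an external UBM/GMM, and the
+        subspace dimension is arbitrary):
+
+            tv = JFATotal(K=N.shape[1], y_dim=400)
+            tv.initialize(N, F)
+            elbo, elbo_norm = tv.fit(N, F, epochs=20)
+            y = tv.compute_py_g_x(N, F)  # i-vectors, shape (num_utterances, y_dim)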
+ + Args: + N: zero order sufficient statistics for training data with shape (num_utterances, K). + F: first order sufficient statistics for training data with shape (num_utterances, K*x_dim). + G: logP(x| UBM, Z) for training data to add to elbo (optional). + N_val: zero order sufficient statistics for val data with shape (num_utterances, K). + F_val: first order sufficient statistics for val data with shape (num_utterances, K*x_dim). + G_val: logP(x| UBM, Z) for val data to add to elbo (optional). + epochs: number of EM steps. + ml_md: whether to do maximum likelihood estimation ("ml"), minimum divergence ("md") or both ("ml+md"). + md_epochs: in which epochs to do MD estimation, if None, MD is done in all epochs. + + Returns: + log p(X) of the training data. + log p(x) per sample. + log p(X) of the val. data, if present. + log p(x) of the val. data per sample, if present. + """ use_ml = False if ml_md == "md" else True use_md = False if ml_md == "ml" else True @@ -178,7 +250,7 @@ def fit( stats = self.Estep(N, F, G) elbo[epoch] = stats[0] if N_val is not None and F_val is not None: - _, elbo_val_e = self.compute_py_x(N, F, G, return_elbo=True) + _, elbo_val_e = self.compute_py_x(N_val, F_val, G_val, return_elbo=True) elbo_val[epoch] = np.sum(elbo_val_e) if use_ml: @@ -187,7 +259,7 @@ def fit( self.MstepMD(stats) elbo_norm = elbo / np.sum(N) - if x_val is None: + if N_val is None: return elbo, elbo_norm else: elbo_val_norm = elbo_val / np.sum(N_val) @@ -195,38 +267,72 @@ def fit( @property def TT(self): + """ + Returns: + Matrices T_k T_k.T for Gaussian component k. + Matrices are vectorized and keep the upper triangular matrix + with shape = (K, y_dim (y_dim-1)/2 ) + """ if self._TT is None: - self._TT = self.compute_TT(self.T, self.K) + self._TT = self.compute_TT(self.T, self.K, self._upptr) return self._TT @property def _upptr(self): + """Upper triangular mask.""" if self.__upptr is None: - I = np.eye(self.y_dim, dtype=float_cpu()) - self.__upptr = np.triu(I).ravel() + self.__upptr = np.triu(np.ones(self.y_dim, dtype=np.bool)) return self.__upptr @staticmethod - def compute_TT(self, T, K, upptr): + def compute_TT(T, K, upptr): + """Computes T_k T_k.T matrices. + + Args: + T: Total variability factor loading matrix. + K: number of Gaussian components. + upptr: upper triangular mask. + + Returns: + Matrices T_k T_k.T for Gaussian component k. + Matrices are vectorized and keep the upper triangular matrix + with shape = (K, y_dim (y_dim-1)/2 ) + """ x_dim = int(T.shape[1] / K) y_dim = T.shape[0] - TT = np.zeros((K, y_dim * (y_dim + 1) / 2), dtype=float_cpu()) + TT = np.zeros((K, int(y_dim * (y_dim + 1) / 2)), dtype=float_cpu()) for k in range(K): idx = k * x_dim T_k = T[:, idx : idx + x_dim] TT_k = np.dot(T_k, T_k.T) - TT[k] = TT_k[self._upptr] + TT[k] = TT_k[upptr] return TT @staticmethod def compute_L(TT, N, upptr): - y_dim = self._upptr.shape[0] - I = np.eye(y_dim, dtype=float_cpu())[self._upptr] + """Computes P(y|x) precision. + + Args: + TT: T_k T_k.T matrices. + N: zero order statistics. + upptr: upper triangular mask. + + Returns: + Posterior precision vectorized to keep just the upper triangular matrix. + """ + y_dim = upptr.shape[0] + I = np.eye(y_dim, dtype=float_cpu())[upptr] return I + np.dot(N, TT) @staticmethod def normalize_T(T, chol_prec): + """Normalizes T by the covariances of the GMM. + + Args: + T: original total variability matrix. + chol_prec: cholesqy decomp. of the precisions of the GMM components. 
+        """
         Tnorm = np.zeros_like(T)
         K = chol_prec.shape[0]
         x_dim = int(T.shape[1] / K)
@@ -239,20 +345,37 @@ def normalize_T(T, chol_prec):
         return Tnorm

     def get_config(self):
+        """Returns the model configuration dict."""
         config = {"K": self.K}
         base_config = super(JFATotal, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))

     def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+            f: file handle.
+        """
         params = {"T": self.T}
         self._save_params_from_dict(f, params)

     @classmethod
     def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from the file.
+
+        Args:
+            f: file handle.
+            config: configuration dictionary.
+
+        Returns:
+            Model object.
+        """
         param_list = ["T"]
         params = cls._load_params_to_dict(f, config["name"], param_list)
         kwargs = dict(list(config.items()) + list(params.items()))
         return cls(**kwargs)

     def sample(self, num_samples):
-        pass
+        """Draws samples from the i-vector model."""
+        raise NotImplementedError()
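With the file complete, a hedged end-to-end training sketch. The import path and constructor arguments are assumptions inferred from `get_config` and the attributes used above, not code from this patch:

    import numpy as np
    from hyperion.np.pdfs.jfa import JFATotal  # assumed export path

    K, x_dim = 256, 40
    N = np.random.rand(100, K)             # (num_utterances, K)
    F = np.random.randn(100, K * x_dim)    # (num_utterances, K * x_dim)

    model = JFATotal(y_dim=200, K=K)       # assumed constructor signature
    # fit runs EM, alternating the ML and MD steps defined above.
    elbo, elbo_norm = model.fit(N, F, epochs=10, ml_md="ml+md")
    ivecs = model.compute_py_g_x(N, F)     # extract i-vectors after training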
diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
index 143d7df5..f684e453 100644
--- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py
+++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
@@ -5,7 +5,6 @@
 import numpy as np
 import logging

-from abc import ABCMeta, abstractmethod

 from ....hyp_defs import float_cpu
 from ....utils.math import softmax, logsumexp
@@ -14,7 +13,18 @@

 class ExpFamilyMixture(PDF):
-    __metaclass__ = ABCMeta
+    """Base class for a mixture of exponential family distributions.
+
+    p(x) = \sum_k \pi_k h(x) \exp(\eta_k^T u(x) - A(\eta_k))
+
+    Attributes:
+      num_comp: number of components of the mixture.
+      pi: weights of the components.
+      eta: natural parameters of the distribution.
+      min_N: minimum number of samples for keeping a component.
+      update_pi: whether or not to update the weights when optimizing.
+      x_dim: data dimension.
+    """

     def __init__(
         self, num_comp=1, pi=None, eta=None, min_N=0, update_pi=True, **kwargs
@@ -32,6 +42,7 @@ def __init__(

     @property
     def is_init(self):
+        """Returns True if the model has been initialized."""
         if not self._is_init:
             if self.eta is not None and self.A is not None and self.pi is not None:
                 self.validate()
@@ -40,6 +51,7 @@ def is_init(self):

     @property
     def log_pi(self):
+        """Log weights."""
         if self._log_pi is None:
             self._log_pi = np.log(self.pi + 1e-15)
         return self._log_pi
@@ -56,6 +68,22 @@ def fit(
         epochs=10,
         batch_size=None,
     ):
+        """Trains the model.
+
+        Args:
+            x: train data matrix with shape (num_samples, x_dim).
+            sample_weight: weight of each sample in the training loss with shape (num_samples,).
+            x_val: validation data matrix with shape (num_val_samples, x_dim).
+            sample_weight_val: weight of each sample in the val. loss.
+            epochs: number of EM steps.
+            batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+            log p(X) of the training data.
+            log p(x) per sample.
+            log p(X) of the val. data, if present.
+            log p(x) of the val. data per sample, if present.
+        """
         if not self.is_init:
             self.initialize(x)
@@ -93,18 +121,35 @@ def fit_generator(
         workers=1,
         use_multiprocessing=False,
     ):
-
-        do_validation = bool(validation_data)
-        val_gen = (
-            hasattr(validation_data, "next")
-            or hasattr(validation_data, "__next__")
-            or isinstance(validation_data, Sequence)
-        )
-        if val_gen and not validation_steps:
+        """Trains the model from data read by a generator function.
+        This function is deprecated.
+
+        Args:
+            generator: train data generator function returning a tuple
+                (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x.
+            train_steps: number of training steps / epoch.
+            epochs: number of epochs.
+            val_data: val. data generator function returning a tuple
+                (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x.
+            val_steps: number of validation steps / epoch.
+            max_queue_size: max. size of the generator queue.
+            workers: number of workers in the generator.
+            use_multiprocessing: whether to use multi-processing in the generator queue.
+
+        Returns:
+            log p(X) of the training data.
+            log p(x) per sample.
+            log p(X) of the val. data, if present.
+            log p(x) of the val. data per sample, if present.
+        """
+
+        do_validation = bool(val_data)
+        val_gen = hasattr(val_data, "next") or hasattr(val_data, "__next__")
+        if val_gen and not val_steps:
             raise ValueError(
                 "When using a generator for validation data, "
                 "you must specify a value for "
-                "`validation_steps`."
+                "`val_steps`."
             )

         if do_validation and not val_gen:
@@ -129,7 +174,7 @@ def fit_generator(
             if val_data is not None:
                 if val_gen:
                     N, u_x, log_h_val = self.Estep_generator(
-                        generator,
-                        train_steps,
+                        val_data,
+                        val_steps,
                         return_log_h=True,
                         max_queue_size=max_queue_size,
@@ -137,52 +182,100 @@ def fit_generator(
                         use_multiprocessing=use_multiprocessing,
                     )
                 else:
-                    N, u_x = self.Estep(x_val, u_x_val, sample_weight_val)
+                    N, u_x = self.Estep(val_data, u_x_val, sample_weight_val)
                 elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val)

-        if x_val is None:
+        if val_data is None:
             return elbo, elbo / x.shape[0]
         else:
             return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0]

     def log_h(self, x):
+        """Computes log h(x) of the exp. family."""
         return 0

     def accum_log_h(self, x, sample_weight=None):
+        """Accumulates log h(x) over the data samples."""
         if sample_weight is None:
             return np.sum(self.log_h(x))
         return np.sum(sample_weight * self.log_h(x))

-    def compute_log_pz(self, x, u_x=None, mode="nat"):
-        if u_x is None:
-            u_x = self.compute_suff_stats(x)
-        return np.dot(u_x, self.eta.T) - self.A + self.log_pi
-
     def compute_pz(self, x, u_x=None, mode="nat"):
+        """Computes the responsibilities p(z|x).
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+            u_x: precomputed sufficient stats with shape (num_samples, u_dim).
+            mode: whether to use the natural ("nat") or standard ("std") parameters.
+
+        Returns:
+            p(z|x) with shape (num_samples, num_comp).
+        """
         if mode == "nat":
             return self.compute_pz_nat(x, u_x)
         else:
             return self.compute_pz_std(x)

     def compute_pz_nat(self, x, u_x=None):
+        """Computes p(z|x) using the natural parameters of the distribution.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+            u_x: precomputed sufficient stats with shape (num_samples, u_dim).
+
+        Returns:
+            p(z|x) with shape (num_samples, num_comp).
+        """
         if u_x is None:
             u_x = self.compute_suff_stats(x)
         logr = np.dot(u_x, self.eta.T) - self.A + self.log_pi
         return softmax(logr)

     def compute_pz_std(self, x):
+        """Computes p(z|x) using the standard parameters of the distribution.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+
+        Returns:
+            p(z|x) with shape (num_samples, num_comp).
+        """
         return self.compute_pz_nat(x)

     def compute_suff_stats(self, x):
+        """Computes the sufficient stats for each data sample."""
         return x
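`compute_pz_nat` above is the standard softmax over per-component log posteriors. A standalone numpy check of the same computation (a sketch, independent of hyperion's own `softmax` helper):

    import numpy as np

    def softmax(logr):
        logr = logr - logr.max(axis=-1, keepdims=True)  # numerical stability
        r = np.exp(logr)
        return r / r.sum(axis=-1, keepdims=True)

    u_x = np.random.randn(5, 8)        # sufficient stats (num_samples, u_dim)
    eta = np.random.randn(3, 8)        # natural params (num_comp, u_dim)
    A = np.random.rand(3)              # log-normalizers
    log_pi = np.log(np.ones(3) / 3)    # uniform weights
    pz = softmax(np.dot(u_x, eta.T) - A + log_pi)
    assert np.allclose(pz.sum(axis=-1), 1.0)  # responsibilities sum to one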
     def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None):
+        """Accumulates sufficient statistics over several data samples.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            u_x: sufficient stats for x with shape (num_samples, u(x)_dim) (optional).
+            sample_weight: weight of each sample in the accumulation.
+            batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+            N: zero order sufficient statistics (number of samples).
+            Accumulated sufficient statistics \sum u(x).
+        """
         if u_x is not None or batch_size is None:
             return self._accum_suff_stats_1batch(x, u_x, sample_weight)
         else:
             return self._accum_suff_stats_nbatches(x, sample_weight, batch_size)

     def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None):
+        """Accumulates sufficient statistics over several data samples in a single batch.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            u_x: sufficient stats for x with shape (num_samples, u(x)_dim) (optional).
+            sample_weight: weight of each sample in the accumulation.
+
+        Returns:
+            N: zero order sufficient statistics (number of samples).
+            Accumulated sufficient statistics \sum u(x).
+        """
         if u_x is None:
             u_x = self.compute_suff_stats(x)
         z = self.compute_pz_nat(x, u_x)
@@ -195,6 +288,18 @@ def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None):
         return N, acc_u_x

     def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size):
+        """Accumulates sufficient statistics over several data samples in multiple batches.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            sample_weight: weight of each sample in the accumulation.
+            batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+            N: zero order sufficient statistics (number of samples).
+            Accumulated sufficient statistics \sum u(x).
+        """
         sw_i = None
         for i1 in range(0, x.shape[0], batch_size):
             i2 = np.minimum(i1 + batch_size, x.shape[0])
@@ -213,6 +318,19 @@ def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size):

     def accum_suff_stats_segments(
         self, x, segments, u_x=None, sample_weight=None, batch_size=None
     ):
+        """Accumulates sufficient statistics for each segment in an utterance.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            segments: segment t_start and t_end with shape (num_segments, 2).
+            u_x: sufficient stats for x with shape (num_samples, u(x)_dim) (optional).
+            sample_weight: weight of each sample in the accumulation.
+            batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+            N: zero order sufficient statistics (number of samples).
+            Accumulated sufficient statistics \sum u(x).
+        """
         K = self.num_comp
         num_segments = len(segments)
         N = np.zeros((num_segments, K), dtype=float_cpu())
@@ -238,6 +356,21 @@ def accum_suff_stats_segments(
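Batching matters here because u(x) can be large (for the full-covariance GMM below, u_dim = d + d(d+1)/2). The batched path should match the single-batch path; a sketch, assuming `gmm` is an initialized mixture such as the GMM class below:

    import numpy as np

    x = np.random.randn(10000, 40).astype("float32")
    N1, u1 = gmm.accum_suff_stats(x)                   # single pass over all frames
    N2, u2 = gmm.accum_suff_stats(x, batch_size=1000)  # 10 blocks of 1000 frames
    assert np.allclose(N1, N2) and np.allclose(u1, u2, atol=1e-3)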
     def accum_suff_stats_segments_prob(
         self, x, prob, u_x=None, sample_weight=None, batch_size=None
     ):
+        """Accumulates sufficient statistics for each segment in an utterance.
+        Segments are defined by the probability of each frame belonging to the
+        segment.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            prob: probability of belonging to a segment with shape (num_samples, num_segments).
+            u_x: sufficient stats for x with shape (num_samples, u(x)_dim) (optional).
+            sample_weight: weight of each sample in the accumulation.
+            batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+            N: zero order sufficient statistics (number of samples).
+            Accumulated sufficient statistics \sum u(x).
+        """
         if u_x is not None or batch_size is None:
             return self._accum_suff_stats_segments_prob_1batch(
                 x, prob, u_x, sample_weight
@@ -299,6 +432,20 @@ def accum_suff_stats_sorttime(
         sample_weight=None,
         batch_size=None,
     ):
+        """Accumulates sufficient statistics over a sliding window.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            frame_length: frame length.
+            frame_shift: frame shift.
+            u_x: sufficient stats for x with shape (num_samples, u(x)_dim) (optional).
+            sample_weight: weight of each sample in the accumulation.
+            batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+            N: zero order sufficient statistics (number of samples).
+            Accumulated sufficient statistics \sum u(x).
+        """
         if u_x is not None or batch_size is None:
             return self._accum_suff_stats_sorttime_1batch(
                 x, frame_length, frame_shift, u_x, sample_weight
@@ -352,7 +499,7 @@ def _accum_suff_stats_sorttime_nbatches(
         num_frames = x.shape[0]
         num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1))
         if num_segments == 1:
-            return self._accum_suff_stats_1batch(self, x, u_x, sample_weight)
+            return self._accum_suff_stats_1batch(x, None, sample_weight)

         num_segments_per_batch = np.floor((num_frames - frame_length) / frame_shift + 1)
         batch_size = int((num_segments_per_batch - 1) * frame_shift + frame_length)
@@ -378,6 +525,18 @@ def _accum_suff_stats_sorttime_nbatches(
         return N, acc_u_x

     def Estep(self, x, u_x=None, sample_weight=None, batch_size=None):
+        """Expectation step: accumulates the sufficient statistics.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            u_x: sufficient stats for x with shape (num_samples, u(x)_dim) (optional).
+            sample_weight: weight of each sample in the accumulation.
+            batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+            N: zero order sufficient statistics (number of samples).
+            Accumulated sufficient statistics \sum u(x).
+        """
         return self.accum_suff_stats(x, u_x, sample_weight, batch_size)

     def Estep_generator(
         self,
         generator,
         num_steps,
         return_log_h,
         max_queue_size=10,
         workers=1,
-        use_multiprocessin=False,
+        use_multiprocessing=False,
     ):
+        """Expectation step where the data is read from a generator function.
+
+        Args:
+            generator: data generator function returning a tuple
+                (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x.
+            num_steps: number of steps / epoch.
+            return_log_h: whether to also return the accumulated log h(x).
+            max_queue_size: max. size of the generator queue.
+            workers: number of workers in the generator.
+            use_multiprocessing: whether to use multi-processing in the generator queue.
+
+        Returns:
+            N: zero order sufficient statistics (number of samples).
+            Accumulated sufficient statistics \sum u(x).
+            Accumulated log h(x) (optional).
+        """
         wait_time = 0.01  # in secs
         queue = None
         N = None
@@ -415,8 +590,8 @@ def Estep_generator(
             N += N_i
             acc_u_x += u_x_i
         finally:
-            if enqueuer is not None:
-                enqueuer.stop()
+            if queue is not None:
+                queue.stop()

         if return_log_h:
             return N, acc_u_x, log_h
@@ -424,19 +599,41 @@ def Estep_generator(
         return N, acc_u_x

     def sum_suff_stats(self, N, u_x):
+        """Sums the sufficient stats coming from multiple sub-processes.
+
+        Args:
+            N: zero order stats with shape (num_proc,).
+            u_x: higher order stats with shape (num_proc, u(x)_dim).
+
+        Returns:
+            Accumulated N and u_x.
+        """
         assert len(N) == len(u_x)
-        acc_N = N[1]
-        acc_u_x = u_x[1]
+        acc_N = N[0]
+        acc_u_x = u_x[0]
         for i in range(1, len(N)):
-            acc_N += N
-            acc_u_x += u[i]
+            acc_N += N[i]
+            acc_u_x += u_x[i]

         return acc_N, acc_u_x

-    @abstractmethod
     def Mstep(self, stats):
+        """Maximization step."""
         pass

     def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None):
+        """Evidence lower bound.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            u_x: accumulated u(x) (optional).
+            N: zero order statistics (optional).
+            log_h: accumulated log h(x) (optional).
+            sample_weight: weight of each sample in the loss function.
+            batch_size: accumulates sufficient statistics in batch_size blocks.
+
+        Returns:
+            log p(X) of the data.
+        """
         if u_x is None:
             N, u_x = self.accum_suff_stats(
                 x, sample_weight=sample_weight, batch_size=batch_size
@@ -446,30 +643,84 @@ def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None
         return log_h + np.sum(u_x * self.eta) + np.inner(N, self.log_pi - self.A)

     def log_prob(self, x, u_x=None, mode="nat"):
+        """log p(x) of each data sample.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+            u_x: sufficient stats u(x) with shape (num_samples, u_dim).
+            mode: whether the probability is computed with the standard ("std")
+                or the natural ("nat") parameters.
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
         if mode == "nat":
             return self.log_prob_nat(x, u_x)
         else:
             return self.log_prob_std(x)

     def log_prob_nat(self, x, u_x=None):
+        """log p(x) of each data sample computed using the
+        natural parameters of the distribution.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+            u_x: sufficient stats u(x) with shape (num_samples, u_dim).
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
         if u_x is None:
             u_x = self.compute_suff_stats(x)
         llk_k = np.dot(u_x, self.eta.T) - self.A + self.log_pi
         llk = logsumexp(llk_k)
         return self.log_h(x) + llk

-    @abstractmethod
     def log_prob_std(self, x):
-        pass
+        """log p(x) of each data sample computed using the
+        standard parameters of the distribution.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
+        raise NotImplementedError()

-    def log_prob_nbest(self, x, u_x=None, mode="nat", nbest_mode="master", nbest=1):
+    def log_prob_nbest(self, x, u_x=None, mode="nat", nbest_mode="ubm", nbest=1):
+        """log p(x) of each data sample computed using only the N best components.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+            u_x: sufficient stats u(x) with shape (num_samples, u_dim).
+            mode: whether the probability is computed with the standard ("std")
+                or the natural ("nat") parameters.
+            nbest_mode: if "ubm", it selects the best components.
+            nbest: number of best components, or selected components.
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
         if mode == "nat":
             return self.log_prob_nbest_nat(x, u_x, nbest_mode=nbest_mode, nbest=nbest)
         else:
-            return self.log_prob_std(x, nbest_mode=nbest_mode, nbest=nbest)
+            return self.log_prob_nbest_std(x, nbest_mode=nbest_mode, nbest=nbest)
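`sum_suff_stats` above is the reduction step for a sharded E-step. A sketch of the intended pattern, assuming `gmm` is an initialized mixture; the sharding is illustrative only:

    import numpy as np

    x = np.random.randn(8000, 20).astype("float32")
    shards = np.array_split(x, 4)                  # e.g. one shard per worker
    stats = [gmm.Estep(x_i) for x_i in shards]     # per-shard (N, u_x) pairs
    N, u_x = gmm.sum_suff_stats([s[0] for s in stats], [s[1] for s in stats])
    gmm.Mstep(N, u_x)                              # single maximization step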
     def log_prob_nbest_nat(self, x, u_x=None, nbest_mode="master", nbest=1):
-
+        """log p(x) of each data sample computed using only the N best components
+        and the natural parameters.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+            u_x: sufficient stats u(x) with shape (num_samples, u_dim).
+            nbest_mode: if "ubm", it selects the best components.
+            nbest: number of best components, or selected components.
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
         if u_x is None:
             u_x = self.compute_suff_stats(x)
         if nbest_mode == "master":
@@ -482,11 +733,23 @@ def log_prob_nbest_nat(self, x, u_x=None, nbest_mode="master", nbest=1):
         llk = logsumexp(llk_k)
         return self.log_h(x) + llk

-    @abstractmethod
     def log_prob_nbest_std(self, x, nbest_mode="master", nbest=1):
-        pass
+        """log p(x) of each data sample computed using only the N best components
+        and the standard parameters.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+            nbest_mode: if "ubm", it selects the best components.
+            nbest: number of best components, or selected components.
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
+        raise NotImplementedError()

     def get_config(self):
+        """Returns the model configuration dict."""
         config = {"min_n": self.min_N, "update_pi": self.update_pi}
         base_config = super(ExpFamilyMixture, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
@@ -515,24 +778,26 @@ def tuple2data(data):

     @staticmethod
     def compute_A_nat(eta):
+        """Computes A_theta from the natural parameters."""
         raise NotImplementedError()

     @staticmethod
     def compute_A_std(params):
+        """Computes A_theta from the standard parameters."""
         raise NotImplementedError()

     @staticmethod
     def compute_eta(param):
+        """Computes the natural parameters from the standard parameters."""
         raise NotImplementedError()

     @staticmethod
     def compute_std(eta):
+        """Computes the standard parameters from the natural parameters."""
         raise NotImplementedError()

-    @abstractmethod
     def _compute_nat_params(self):
         pass

-    @abstractmethod
     def _compute_std_params(self):
         pass
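The concrete mixtures that follow all fill in the same contract. A minimal skeleton of what a subclass must provide; the placeholder bodies below are illustrative and do not implement a real distribution:

    import numpy as np

    class MyMixture(ExpFamilyMixture):
        def compute_suff_stats(self, x):
            return x                      # u(x); identity as a placeholder

        def log_prob_std(self, x):
            raise NotImplementedError()   # closed-form density goes here

        def Mstep(self, N, u_x):
            self.pi = N / np.sum(N)       # weights from the zero order stats
            # ...update eta / A from the accumulated u_x...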
diff --git a/hyperion/np/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py
index 391c59ee..4f6d599e 100644
--- a/hyperion/np/pdfs/mixtures/gmm.py
+++ b/hyperion/np/pdfs/mixtures/gmm.py
@@ -32,8 +32,23 @@

 class GMM(ExpFamilyMixture):
+    """Class for GMM with full covariance.
+
+    Attributes:
+      num_comp: number of components of the mixture (inferred from pi).
+      pi: weights of the components.
+      mu: means with shape (num_comp, x_dim) or None.
+      Lambda: precisions with shape (num_comp, x_dim, x_dim) or None.
+      var_floor: variance floor.
+      update_mu: whether or not to update mu when optimizing.
+      update_Lambda: whether or not to update Lambda when optimizing.
+      x_dim: data dimension (inferred from mu if present).
+    """
+
     def __init__(
         self,
+        num_comp=1,
+        pi=None,
         mu=None,
         Lambda=None,
         var_floor=1e-3,
@@ -41,7 +56,10 @@ def __init__(
         update_Lambda=True,
         **kwargs
     ):
-        super().__init__(**kwargs)
+        if mu is not None:
+            assert mu.ndim == 2
+            kwargs["x_dim"] = mu.shape[1]
+        super().__init__(num_comp=num_comp, pi=pi, **kwargs)
         self.mu = mu
         self.Lambda = Lambda
         self.var_floor = var_floor
@@ -55,6 +73,7 @@ def __init__(
         self._Sigma = None

     def _compute_gmm_nat_std(self):
+        """Computes the natural and standard parameters of the distribution."""
         if self.mu is not None and self.Lambda is not None:
             self._validate_mu()
             self._validate_Lambda()
@@ -65,6 +84,7 @@ def _compute_gmm_nat_std(self):
             self._compute_std_params()

     def compute_Lambda_aux(self):
+        """Computes auxiliary variables derived from the precisions."""
         self._logLambda = np.zeros((self.num_comp,), dtype=float_cpu())
         self._cholLambda = np.zeros(
             (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu()
@@ -76,18 +96,21 @@ def compute_Lambda_aux(self):

     @property
     def logLambda(self):
+        """log determinants of the precisions."""
         if self._logLambda is None:
             self.compute_Lambda_aux()
         return self._logLambda

     @property
     def cholLambda(self):
+        """Cholesky decompositions of the precisions."""
         if self._cholLambda is None:
             self.compute_Lambda_aux()
         return self._cholLambda

     @property
     def Sigma(self):
+        """Covariances."""
         if self._Sigma is None:
             self._Sigma = np.zeros(
                 (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu()
@@ -97,6 +120,7 @@ def Sigma(self):
         return self._Sigma

     def initialize(self, x=None):
+        """Initializes the distribution."""
         if x is None and self.mu is None and self.eta is None:
             assert self.num_comp == 1
             self._initialize_stdnormal()
@@ -106,12 +130,19 @@ def initialize(self, x=None):
         self._compute_gmm_nat_std()

     def _initialize_stdnormal(self):
+        """Initializes a single component GMM with a standard Normal."""
         self.pi = np.array([1], dtype=float_cpu())
         self.mu = np.zeros((1, self.x_dim), dtype=float_cpu())
         self.Lambda = np.zeros((1, self.x_dim, self.x_dim), dtype=float_cpu())
         self.Lambda[0] = np.eye(self.x_dim, dtype=float_cpu())

     def _initialize_kmeans(self, num_comp, x):
+        """Initializes the GMM with k-means.
+
+        Args:
+            num_comp: number of components.
+            x: initialization data with shape (num_samples, x_dim).
+        """
         if num_comp == 1:
             self.pi = np.array([1], dtype=float_cpu())
             self.mu = np.mean(x, axis=0, keepdims=True)
@@ -138,22 +169,35 @@ def _initialize_kmeans(self, num_comp, x):
             self.Lambda[k] = invert_pdmat(S, return_inv=True)[-1]

     def stack_suff_stats(self, F, S=None):
+        """Stacks the F and S sufficient stats into a single vector."""
         if S is None:
             return F
         return np.hstack((F, S))

     def unstack_suff_stats(self, stats):
+        """Decomposes the sufficient stats vector into F and S."""
         F = stats[:, : self.x_dim]
         S = stats[:, self.x_dim :]
         return F, S

     def norm_suff_stats(self, N, u_x, return_order2=False):
+        """Normalizes the accumulated sufficient statistics with the
+        mean and covariance of the distribution.
+
+        Args:
+            N: zero order sufficient stats.
+            u_x: 1st and 2nd order stats.
+            return_order2: whether or not to return the normalized 2nd order stats.
+
+        Returns:
+            Normalized N, F or N, [F, S].
+        """
         F, S = self.unstack_suff_stats(u_x)
         F_norm = F - N[:, None] * self.mu
         for k in range(self.num_comp):
             F_norm[k] = np.dot(F_norm[k], self.cholLambda[k].T)
             if return_order2:
-                SS = vec2symat(S[k])
-                Fmu = np.outer(self.F[k], self.mu[k])
-                SS = SS - Fmu - Fmu.T + N * np.outer(self.mu[k], self.mu[k])
+                SS = vec2symmat(S[k])
+                Fmu = np.outer(F[k], self.mu[k])
+                SS = SS - Fmu - Fmu.T + N[k] * np.outer(self.mu[k], self.mu[k])
                 SS = np.dot(self.cholLambda[k], np.dot(SS, self.cholLambda[k].T))
@@ -163,7 +207,13 @@ def norm_suff_stats(self, N, u_x, return_order2=False):
         return N, F_norm

     def Mstep(self, N, u_x):
+        """Maximization step.
+
+        Args:
+            N: zero order stats.
+            u_x: accumulated higher order stats.
+        """
         F, S = self.unstack_suff_stats(u_x)

         if self.update_mu:
@@ -187,15 +237,22 @@ def Mstep(self, N, u_x):
         N0 = N < self.min_N
         if np.any(N0):
             N[N0] = 0
-            mu[N0] = 0
-            S[N0] = 1
+            self.mu[N0] = 0
+            self.Lambda[N0] = np.eye(self.x_dim)

         self.pi = N / np.sum(N)
         self._log_pi = None

         self._compute_nat_params()

     def split_comp(self, K=2):
+        """Creates a new GMM with K x num_components components.
+
+        Args:
+            K: multiplier for the number of components.
+
+        Returns:
+            GMM object.
+        """
         num_comp = self.num_comp * K
         pi = np.repeat(self.pi, K) / K
         Lambda = np.repeat(self.Lambda, K, axis=0) * (K ** 2)
@@ -218,6 +275,15 @@ def split_comp(self, K=2):
         return GMM(pi=pi, mu=mu, Lambda=Lambda, **config)

     def log_prob_std(self, x):
+        """log p(x) of each data sample computed using the
+        standard parameters of the distribution.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
         r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi)
         llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu())
         for k in range(self.num_comp):
@@ -226,11 +292,25 @@ def log_prob_std(self, x):

         return logsumexp(llk_k, axis=-1)

-    def sample(self, num_samples, rng=None, seed=1024):
+    def sample(self, num_samples, rng=None, seed=1024, r=None):
+        """Draws samples from the data distribution.
+
+        Args:
+            num_samples: number of samples.
+            rng: random number generator.
+            seed: random seed used if rng is None.
+            r: precomputed component assignments with shape (num_samples, num_comp) (optional).
+
+        Returns:
+            Generated samples with shape (num_samples, x_dim).
+        """
         if rng is None:
             rng = np.random.RandomState(seed)

-        r = rng.multinomial(1, self.pi, size=(num_samples,))
+        if r is None:
+            r = rng.multinomial(1, self.pi, size=(num_samples,))
+        else:
+            num_samples = len(r)
+
         x = np.zeros((num_samples, self.x_dim), dtype=float_cpu())
         for k in range(self.num_comp):
             index = r[:, k] == 1
@@ -244,6 +324,7 @@ def sample(self, num_samples, rng=None, seed=1024, r=None):
         return x

     def get_config(self):
+        """Returns the model configuration dict."""
         config = {
             "var_floor": self.var_floor,
             "update_mu": self.update_mu,
@@ -253,11 +334,26 @@ def get_config(self):
         return dict(list(base_config.items()) + list(config.items()))

     def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+            f: file handle.
+        """
         params = {"pi": self.pi, "mu": self.mu, "Lambda": self.Lambda}
         self._save_params_from_dict(f, params)

     @classmethod
     def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from the file.
+
+        Args:
+            f: file handle.
+            config: configuration dictionary.
+
+        Returns:
+            Model object.
+        """
         param_list = ["pi", "mu", "Lambda"]
         params = cls._load_params_to_dict(f, config["name"], param_list)
         return cls(
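An end-to-end usage sketch for the class above; the import path is an assumption based on the new package layout, and the constructor call mirrors the documented attributes:

    import numpy as np
    from hyperion.np.pdfs.mixtures import GMM  # assumed export path

    x = np.random.randn(5000, 10).astype("float32")
    gmm = GMM(num_comp=8)            # k-means init happens inside initialize/fit
    elbo, elbo_norm = gmm.fit(x, epochs=20)
    llk = gmm.log_prob(x)            # per-frame log-likelihoods, shape (5000,)
    x_gen = gmm.sample(100)          # draw new samples from the model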
@@ -275,6 +371,14 @@ def load_params(cls, f, config):

     @classmethod
     def load_from_kaldi(cls, file_path):
+        """Loads a GMM from a Kaldi file.
+
+        Args:
+            file_path: Kaldi file path.
+
+        Returns:
+            Model object.
+        """
         pi = None
         eta1 = None
         eta2 = None
@@ -337,6 +441,7 @@ def _validate_eta(self):
         assert self.eta.shape[1] == (self.x_dim ** 2 + 3 * self.x_dim) / 2

     def validate(self):
+        """Validates the parameters of the distribution."""
         if self.pi is not None:
             self._validate_pi()
@@ -349,6 +454,7 @@ def validate(self):

     @staticmethod
     def compute_eta(mu, Lambda):
+        """Computes the natural parameters from the mean and precision."""
         x_dim = mu.shape[-1]
         eta_dim = int((x_dim ** 2 + 3 * x_dim) / 2)
         eta = np.zeros((mu.shape[0], eta_dim), dtype=float_cpu())
@@ -359,6 +465,7 @@ def compute_eta(mu, Lambda):

     @staticmethod
     def compute_std(eta):
+        """Computes the standard parameters from the natural parameters."""
         x_dim = Normal.compute_x_dim_from_eta(eta)
         mu = np.zeros((eta.shape[0], x_dim), dtype=float_cpu())
         Lambda = np.zeros((eta.shape[0], x_dim, x_dim), dtype="float32")
@@ -369,6 +476,7 @@ def compute_std(eta):

     @staticmethod
     def compute_A_nat(eta):
+        """Computes A from the natural parameters."""
         A = np.zeros((eta.shape[0],), dtype=float_cpu())
         for k in range(eta.shape[0]):
             A[k] = Normal.compute_A_nat(eta[k])
@@ -377,6 +485,7 @@ def compute_A_nat(eta):

     @staticmethod
     def compute_A_std(mu, Lambda):
+        """Computes A from the standard parameters."""
         A = np.zeros((mu.shape[0],), dtype=float_cpu())
         for k in range(mu.shape[0]):
             A[k] = Normal.compute_A_std(mu[k], Lambda[k])
@@ -395,6 +504,14 @@ def _compute_std_params(self):

     @staticmethod
     def compute_suff_stats(x):
+        """Computes the sufficient stats for each sample.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+
+        Returns:
+            Sufficient stats for each data sample with shape (num_samples, u_dim).
+        """
         d = x.shape[1]
         u = np.zeros((x.shape[0], int(d + d * (d + 1) / 2)), dtype=float_cpu())
         u[:, :d] = x
@@ -406,12 +523,28 @@ def compute_suff_stats(x):
         return u

     def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs):
+        """Plots one slice of each GMM component in 1d.
+
+        Args:
+            feat_idx: feature index.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         for k in range(mu.shape[0]):
             C = invert_pdmat(self.Lambda[k], return_inv=True)[-1][feat_idx, feat_idx]
             plot_gaussian_1D(mu[k], C, num_sigmas, num_pts, **kwargs)

     def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of each GMM component in 2d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         j, i = np.meshgrid(feat_idx, feat_idx)
         for k in range(mu.shape[0]):
@@ -419,6 +552,14 @@ def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
             plot_gaussian_ellipsoid_2D(mu[k], C_k, num_sigmas, num_pts, **kwargs)

     def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of each GMM component in 3d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         j, i = np.meshgrid(feat_idx, feat_idx)
         for k in range(mu.shape[0]):
@@ -426,6 +567,14 @@ def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
             plot_gaussian_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs)

     def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 3 dimensions of each GMM component in 3d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         j, i = np.meshgrid(feat_idx, feat_idx)
         for k in range(mu.shape[0]):
diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
index 46a30f81..4a0ba27d 100644
--- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
+++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
@@ -21,8 +21,23 @@

 class GMMDiagCov(ExpFamilyMixture):
+    """Class for GMM with diagonal covariance.
+
+    Attributes:
+      num_comp: number of components of the mixture (inferred from pi).
+      pi: weights of the components.
+      mu: means with shape (num_comp, x_dim) or None.
+      Lambda: precisions with shape (num_comp, x_dim) or None.
+      var_floor: variance floor.
+      update_mu: whether or not to update mu when optimizing.
+      update_Lambda: whether or not to update Lambda when optimizing.
+      x_dim: data dimension (inferred from mu if present).
+    """
+
     def __init__(
         self,
+        num_comp=1,
+        pi=None,
         mu=None,
         Lambda=None,
         var_floor=1e-3,
@@ -30,7 +45,11 @@ def __init__(
         update_Lambda=True,
         **kwargs
     ):
-        super().__init__(**kwargs)
+        if mu is not None:
+            assert mu.ndim == 2
+            kwargs["x_dim"] = mu.shape[1]
+
+        super().__init__(num_comp=num_comp, pi=pi, **kwargs)
         self.mu = mu
         self.Lambda = Lambda
         self.var_floor = var_floor
@@ -55,23 +74,27 @@ def _compute_gmm_nat_std(self):

     @property
     def logLambda(self):
+        """log determinants of the precisions."""
         if self._logLambda is None:
             self._logLambda = np.sum(np.log(self.Lambda), axis=-1)
         return self._logLambda

     @property
     def cholLambda(self):
+        """Cholesky decomp. (square root) of the precisions."""
         if self._cholLambda is None:
             self._cholLambda = np.sqrt(self.Lambda)
         return self._cholLambda

     @property
     def Sigma(self):
+        """Covariances."""
         if self._Sigma is None:
             self._Sigma = 1.0 / self.Lambda
         return self._Sigma

     def initialize(self, x=None):
+        """Initializes the distribution."""
         if x is None and self.mu is None and self.eta is None:
             assert self.num_comp == 1
             self._initialize_stdnormal()
@@ -81,11 +104,18 @@ def initialize(self, x=None):
         self._compute_gmm_nat_std()

     def _initialize_stdnormal(self):
+        """Initializes a single component GMM with a standard Normal."""
         self.pi = np.array([1], dtype=float_cpu())
         self.mu = np.zeros((1, self.x_dim), dtype=float_cpu())
         self.Lambda = np.ones((1, self.x_dim), dtype=float_cpu())

     def _initialize_kmeans(self, num_comp, x):
+        """Initializes the GMM with k-means.
+
+        Args:
+            num_comp: number of components.
+            x: initialization data with shape (num_samples, x_dim).
+        """
         if num_comp == 1:
             self.pi = np.array([1], dtype=float_cpu())
             self.mu = np.mean(x, axis=0, keepdims=True)
@@ -104,17 +134,30 @@ def _initialize_kmeans(self, num_comp, x):
             self.Lambda[k] = 1 / np.std(x[r], axis=0) ** 2

     def stack_suff_stats(self, F, S=None):
+        """Stacks the F and S sufficient stats into a single vector."""
         if S is None:
             return F
         return np.hstack((F, S))

     def unstack_suff_stats(self, stats):
+        """Decomposes the sufficient stats vector into F and S."""
         F = stats[:, : self.x_dim]
         S = stats[:, self.x_dim :]
         return F, S

     def norm_suff_stats(self, N, u_x, return_order2=False):
-        F, S = self.unstack_suff_stats(acc_u_x)
+        """Normalizes the accumulated sufficient statistics with the
+        mean and covariance of the distribution.
+
+        Args:
+            N: zero order sufficient stats.
+            u_x: 1st and 2nd order stats.
+            return_order2: whether or not to return the normalized 2nd order stats.
+
+        Returns:
+            Normalized N, F or N, [F, S].
+        """
+        F, S = self.unstack_suff_stats(u_x)
         F_norm = self.cholLambda * (F - N[:, None] * self.mu)
         if return_order2:
             S = S - 2 * self.mu * F + N * self.mu ** 2
@@ -124,7 +167,13 @@ def norm_suff_stats(self, N, u_x, return_order2=False):
         return N, F_norm

     def Mstep(self, N, u_x):
+        """Maximization step.
+
+        Args:
+            N: zero order stats.
+            u_x: accumulated higher order stats.
+        """
         F, S = self.unstack_suff_stats(u_x)

         if self.update_mu:
@@ -143,15 +192,23 @@ def Mstep(self, N, u_x):
         N0 = N < self.min_N
         if np.any(N0):
             N[N0] = 0
-            mu[N0] = 0
-            S[N0] = 1
+            self.mu[N0] = 0
+            self._Sigma[N0] = 1
+            self.Lambda[N0] = 1

         self.pi = N / np.sum(N)
         self._log_pi = None

         self._compute_nat_params()

     def split_comp(self, K=2):
+        """Creates a new GMM with K x num_components components.
+
+        Args:
+            K: multiplier for the number of components.
+
+        Returns:
+            GMMDiagCov object.
+        """
         std_dev = 1 / self.cholLambda

         num_comp = self.num_comp * K
@@ -171,6 +228,15 @@ def split_comp(self, K=2):
         return GMMDiagCov(pi=pi, mu=mu, Lambda=Lambda, **config)

     def log_prob_std(self, x):
+        """log p(x) of each data sample computed using the
+        standard parameters of the distribution.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
         r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi)
         llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu())
         for k in range(self.num_comp):
@@ -179,6 +245,7 @@ def log_prob_std(self, x):
         return logsumexp(llk_k, axis=-1)

     def log_cdf(self, x):
+        """Log cumulative distribution function."""
         llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu())
         for k in range(self.num_comp):
             delta = (x - self.mu[k]) * self.cholLambda[k]
@@ -187,11 +254,24 @@ def log_cdf(self, x):

         return logsumexp(llk_k)

-    def sample(self, num_samples, rng=None, seed=1024):
+    def sample(self, num_samples=1, rng=None, seed=1024, r=None):
+        """Draws samples from the data distribution.
+
+        Args:
+            num_samples: number of samples.
+            rng: random number generator.
+            seed: random seed used if rng is None.
+            r: precomputed component assignments with shape (num_samples, num_comp) (optional).
+
+        Returns:
+            Generated samples with shape (num_samples, x_dim).
+        """
         if rng is None:
             rng = np.random.RandomState(seed)

-        r = rng.multinomial(1, self.pi, size=(num_samples,))
+        if r is None:
+            r = rng.multinomial(1, self.pi, size=(num_samples,))
+        else:
+            num_samples = len(r)

         x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu())
         for k in range(self.num_comp):
@@ -201,6 +281,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None):
         return x

     def get_config(self):
+        """Returns the model configuration dict."""
         config = {
             "var_floor": self.var_floor,
             "update_mu": self.update_mu,
@@ -210,13 +291,28 @@ def get_config(self):
         return dict(list(base_config.items()) + list(config.items()))

     def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+            f: file handle.
+        """
         params = {"pi": self.pi, "mu": self.mu, "Lambda": self.Lambda}
         self._save_params_from_dict(f, params)
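A save/restore sketch for the pair of methods above. The exact handle type is defined by the NPModel serialization helpers (`_save_params_from_dict` / `_load_params_to_dict`); an h5py-style handle is assumed here and may not match the real API:

    import h5py

    with h5py.File("gmm.h5", "w") as f:    # assumed handle type
        gmm.save_params(f)

    config = gmm.get_config()              # in practice stored alongside the params
    with h5py.File("gmm.h5", "r") as f:
        gmm2 = GMMDiagCov.load_params(f, config)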
     @classmethod
     def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from the file.
+
+        Args:
+            f: file handle.
+            config: configuration dictionary.
+
+        Returns:
+            Model object.
+        """
         param_list = ["pi", "mu", "Lambda"]
-        params = self._load_params_to_dict(f, config["name"], param_list)
+        params = cls._load_params_to_dict(f, config["name"], param_list)
         return cls(
             x_dim=config["x_dim"],
             pi=params["pi"],
@@ -232,6 +328,14 @@ def load_params(cls, f, config):

     @classmethod
     def load_from_kaldi(cls, file_path):
+        """Loads a GMM from a Kaldi file.
+
+        Args:
+            file_path: Kaldi file path.
+
+        Returns:
+            Model object.
+        """
         pi = None
         eta1 = None
         eta2 = None
@@ -284,6 +388,7 @@ def _validate_eta(self):
         assert self.eta.shape[1] == self.x_dim * 2

     def validate(self):
+        """Validates the parameters of the distribution."""
         if self.pi is not None:
             self._validate_pi()
@@ -296,12 +401,14 @@ def validate(self):

     @staticmethod
     def compute_eta(mu, Lambda):
+        """Computes the natural parameters from the mean and precision."""
         Lmu = Lambda * mu
         eta = np.hstack((Lmu, -0.5 * Lambda))
         return eta

     @staticmethod
     def compute_std(eta):
+        """Computes the standard parameters from the natural parameters."""
         x_dim = int(eta.shape[-1] / 2)
         eta1 = eta[:, :x_dim]
         eta2 = eta[:, x_dim:]
@@ -311,6 +418,7 @@ def compute_std(eta):

     @staticmethod
     def compute_A_nat(eta):
+        """Computes A from the natural parameters."""
         x_dim = int(eta.shape[-1] / 2)
         eta1 = eta[:, :x_dim]
         eta2 = eta[:, x_dim:]
@@ -321,6 +429,7 @@ def compute_A_nat(eta):

     @staticmethod
     def compute_A_std(mu, Lambda):
+        """Computes A from the standard parameters."""
         x_dim = mu.shape[1]
         r1 = 0.5 * x_dim * np.log(2 * np.pi)
         r2 = -0.5 * np.sum(np.log(Lambda), axis=-1)
@@ -339,6 +448,14 @@ def _compute_std_params(self):

     @staticmethod
     def compute_suff_stats(x):
+        """Computes the sufficient stats for each sample.
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+
+        Returns:
+            Sufficient stats for each data sample with shape (num_samples, u_dim).
+        """
         d = x.shape[-1]
         u = np.zeros((x.shape[0], 2 * d), dtype=float_cpu())
         u[:, :d] = x
@@ -346,12 +463,28 @@ def compute_suff_stats(x):
         return u

     def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs):
+        """Plots one slice of each GMM component in 1d.
+
+        Args:
+            feat_idx: feature index.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         C = 1 / self.Lambda[:, feat_idx]
         for k in range(mu.shape[0]):
             plot_gaussian_1D(mu[k], C[k], num_sigmas, num_pts, **kwargs)

     def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of each GMM component in 2d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         C = 1 / self.Lambda[:, feat_idx]
         for k in range(mu.shape[0]):
@@ -359,6 +492,14 @@ def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
             plot_gaussian_ellipsoid_2D(mu[k], C_k, num_sigmas, num_pts, **kwargs)

     def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of each GMM component in 3d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         C = 1 / self.Lambda[:, feat_idx]
         for k in range(mu.shape[0]):
@@ -366,6 +507,14 @@ def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
             plot_gaussian_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs)

     def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 3 dimensions of each GMM component in 3d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         C = 1 / self.Lambda[:, feat_idx]
         for k in range(mu.shape[0]):
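`split_comp` supports the classical binary-splitting UBM recipe: train a small model, split every component, retrain, and repeat. A sketch, assuming `x` is a training matrix and GMMDiagCov is exported as above:

    gmm = GMMDiagCov(num_comp=1)
    gmm.fit(x, epochs=10)          # start from a single Gaussian
    while gmm.num_comp < 512:      # target number of components
        gmm = gmm.split_comp(2)    # perturb each component into two
        gmm.fit(x, epochs=4)       # a few EM steps after each split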
diff --git a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py
index 87043cc4..ff02ec62 100644
--- a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py
+++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py
@@ -20,8 +20,23 @@

 class GMMTiedDiagCov(GMMDiagCov):
+    """Class for GMM with diagonal covariance tied across components.
+
+    Attributes:
+      num_comp: number of components of the mixture (inferred from pi).
+      pi: weights of the components.
+      mu: means with shape (num_comp, x_dim) or None.
+      Lambda: shared precision with shape (x_dim,) or None.
+      var_floor: variance floor.
+      update_mu: whether or not to update mu when optimizing.
+      update_Lambda: whether or not to update Lambda when optimizing.
+      x_dim: data dimension (inferred from mu if present).
+    """
+
     def __init__(
         self,
+        num_comp=1,
+        pi=None,
         mu=None,
         Lambda=None,
         var_floor=1e-3,
@@ -30,6 +45,8 @@ def __init__(
         **kwargs
     ):
         super().__init__(
+            num_comp=num_comp,
+            pi=pi,
             mu=mu,
             Lambda=Lambda,
             var_floor=var_floor,
@@ -49,11 +66,18 @@ def _compute_gmm_nat_std(self):
             self._compute_std_params()

     def _initialize_stdnormal(self):
+        """Initializes a single component GMM with a standard Normal."""
         self.pi = np.array([1], dtype=float_cpu())
         self.mu = np.zeros((1, self.x_dim), dtype=float_cpu())
         self.Lambda = np.ones((self.x_dim,), dtype=float_cpu())

     def _initialize_kmeans(self, num_comp, x):
+        """Initializes the GMM with k-means.
+
+        Args:
+            num_comp: number of components.
+            x: initialization data with shape (num_samples, x_dim).
+        """
         if num_comp == 1:
             self.pi = np.array([1], dtype=float_cpu())
             self.mu = np.mean(x, axis=0, keepdims=True)
@@ -75,7 +99,13 @@ def _initialize_kmeans(self, num_comp, x):
             self.Lambda = x.shape[0] / C

     def Mstep(self, N, u_x):
+        """Maximization step.
+
+        Args:
+            N: zero order stats.
+            u_x: accumulated higher order stats.
+        """
         F, S = self.unstack_suff_stats(u_x)

         if self.update_mu:
@@ -95,15 +125,22 @@ def Mstep(self, N, u_x):
         N0 = N < self.min_N
         if np.any(N0):
             N[N0] = 0
-            mu[N0] = 0
-            S[N0] = 1
+            self.mu[N0] = 0

         self.pi = N / np.sum(N)
         self._log_pi = None

         self._compute_nat_params()

     def split_comp(self, K=2):
+        """Creates a new GMM with K x num_components components.
+
+        Args:
+            K: multiplier for the number of components.
+
+        Returns:
+            GMMTiedDiagCov object.
+        """
         std_dev = 1 / self.cholLambda

         num_comp = self.num_comp * K
@@ -122,6 +159,15 @@ def split_comp(self, K=2):
-        return DiagGMMTiedCov(pi=pi, mu=mu, Lambda=self.Lambda, **config)
+        return GMMTiedDiagCov(pi=pi, mu=mu, Lambda=self.Lambda, **config)

     def log_prob_std(self, x):
+        """log p(x) of each data sample computed using the
+        standard parameters of the distribution.
+
+        Args:
+            x: input data with shape (num_samples, x_dim).
+
+        Returns:
+            log p(x) with shape (num_samples,).
+        """
         r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi)
         llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu())
         for k in range(self.num_comp):
@@ -130,6 +176,7 @@ def log_prob_std(self, x):
         return logsumexp(llk_k, axis=-1)

     def log_cdf(self, x):
+        """Log cumulative distribution function."""
         llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu())
         for k in range(self.num_comp):
             delta = (x - self.mu[k]) * self.cholLambda
@@ -138,11 +185,24 @@ def log_cdf(self, x):

         return logsumexp(llk_k)

-    def sample(self, num_samples, rng=None, seed=1024):
+    def sample(self, num_samples=1, rng=None, seed=1024, r=None):
+        """Draws samples from the data distribution.
+
+        Args:
+            num_samples: number of samples.
+            rng: random number generator.
+            seed: random seed used if rng is None.
+            r: precomputed component assignments with shape (num_samples, num_comp) (optional).
+
+        Returns:
+            Generated samples with shape (num_samples, x_dim).
+        """
         if rng is None:
             rng = np.random.RandomState(seed)

-        r = rng.multinomial(1, self.pi, size=(num_samples,))
+        if r is None:
+            r = rng.multinomial(1, self.pi, size=(num_samples,))
+        else:
+            num_samples = len(r)

         x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu())
         for k in range(self.num_comp):
@@ -157,12 +217,14 @@ def _validate_Lambda(self):

     @staticmethod
     def compute_eta(mu, Lambda):
+        """Computes the natural parameters from the mean and precision."""
         Lmu = Lambda * mu
         eta = np.hstack((Lmu, -0.5 * np.tile(Lambda, (mu.shape[0], 1))))
         return eta

     @staticmethod
     def compute_std(eta):
+        """Computes the standard parameters from the natural parameters."""
         x_dim = int(eta.shape[-1] / 2)
         eta1 = eta[:, :x_dim]
         eta2 = eta[:, x_dim:]
@@ -171,24 +233,56 @@ def compute_std(eta):
         return mu, Lambda

     def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs):
+        """Plots one slice of each GMM component in 1d.
+
+        Args:
+            feat_idx: feature index.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         C = 1 / self.Lambda[feat_idx]
         for k in range(mu.shape[0]):
             plot_gaussian_1D(mu[k], C, num_sigmas, num_pts, **kwargs)

     def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of each GMM component in 2d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         C = np.diag(1 / self.Lambda[feat_idx])
         for k in range(mu.shape[0]):
             plot_gaussian_ellipsoid_2D(mu[k], C, num_sigmas, num_pts, **kwargs)

     def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 2 dimensions of each GMM component in 3d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         C = np.diag(1 / self.Lambda[feat_idx])
         for k in range(mu.shape[0]):
             plot_gaussian_3D(mu[k], C, num_sigmas, num_pts, **kwargs)

     def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs):
+        """Plots 3 dimensions of each GMM component in 3d.
+
+        Args:
+            feat_idx: feature indices.
+            num_sigmas: size of the plot in number of standard deviations.
+            num_pts: number of points in the graph.
+            **kwargs: pyplot options.
+        """
         mu = self.mu[:, feat_idx]
         C = np.diag(1 / self.Lambda[feat_idx])
         for k in range(mu.shape[0]):
diff --git a/hyperion/np/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py
index 137276c7..183725a7 100644
--- a/hyperion/np/pdfs/plda/frplda.py
+++ b/hyperion/np/pdfs/plda/frplda.py
@@ -12,6 +12,21 @@

 class FRPLDA(PLDABase):
+    """Class for Full-rank PLDA (a.k.a. Two-Covariance Model) where
+    .. math::
+        \mathbf{x}_{ij} = \mathbf{y}_i + \boldsymbol{\varepsilon}_{ij}
+
+    Attributes:
+      mu: class-independent mean.
+      B: between-class precision.
+      W: within-class precision.
+      update_mu: whether to update mu or not when training the model.
+      update_B: whether to update B or not when training the model.
+      update_W: whether to update W or not when training the model.
+      x_dim: data dimension.
+    """
+
     def __init__(
         self,
         mu=None,
         B=None,
         W=None,
         update_mu=True,
         update_B=True,
         update_W=True,
         **kwargs
     ):
-        super(FRPLDA, self).__init__(mu=mu, update_mu=update_mu, **kwargs)
+        super().__init__(mu=mu, update_mu=update_mu, **kwargs)
         if mu is not None:
             self.y_dim = mu.shape[0]
         self.B = B
@@ -33,6 +48,7 @@ def __init__(
         self.update_W = update_W

     def validate(self):
+        """Validates the model parameters."""
         assert self.mu.shape[0] == self.B.shape[0]
         assert self.mu.shape[0] == self.B.shape[1]
         assert self.mu.shape[0] == self.W.shape[0]
@@ -40,6 +56,7 @@ def validate(self):

     @property
     def is_init(self):
+        """Returns True if the model has been initialized."""
         if self._is_init:
             return True
         if self.mu is not None and self.B is not None and self.W is not None:
@@ -48,6 +65,11 @@ def is_init(self):
         return self._is_init

     def initialize(self, D):
+        """Initializes the model.
+
+        Args:
+            D: tuple of sufficient statistics (N, F, S).
+        """
         N, F, S = D
         self.x_dim = F.shape[1]
         self.y_dim = F.shape[1]
@@ -77,6 +99,21 @@ def initialize(self, D):

     def compute_py_g_x(
         self, D, return_cov=False, return_logpy_0=False, return_acc=False
     ):
+        """Computes the posterior P(y|x).
+
+        Args:
+            D: tuple of sufficient statistics (N, F, S).
+            return_cov: whether or not to return the posterior covariances.
+            return_logpy_0: whether or not to return log P(y=0|x).
+            return_acc: whether or not to return the Ry and Py accumulators.
+
+        Returns:
+            Speaker factor posterior means with shape (num_speakers, y_dim).
+            Speaker factor posterior covariances with shape (num_speakers, y_dim, y_dim).
+            log P(y=0|x) with shape (num_speakers,).
+            Ry accumulator for the ML step with shape (y_dim, y_dim).
+            Py accumulator for the MD step with shape (y_dim, y_dim).
+        """
         assert self.is_init

@@ -160,6 +197,14 @@ def compute_py_g_x(
         return r

     def Estep(self, D):
+        """Expectation step.
+
+        Args:
+            D: tuple with sufficient statistics (N, F, S).
+
+        Returns:
+            Tuple of statistics with accumulated expectations.
+        """
         N, F, S = D
         y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True)

@@ -179,6 +224,14 @@ def Estep(self, D):
         return stats

     def elbo(self, stats):
+        """Computes the objective function.
+
+        Args:
+            stats: tuple of expectations computed at the Estep.
+
+        Returns:
+            log P(X).
+        """
         N, M, S, logpy_x = stats[:4]

         logW = logdet_pdmat(self.W)
@@ -201,19 +254,14 @@ def elbo(self, stats):
         elbo = logpx_y + logpy - logpy_x
         return elbo
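Spelling out the generative model implied by the attributes above, with B and W as precisions; the priors are the standard two-covariance ones, inferred from the attribute list rather than stated in this file:

.. math::
    \mathbf{y}_i \sim \mathcal{N}\left(\boldsymbol{\mu}, \mathbf{B}^{-1}\right), \qquad
    \mathbf{x}_{ij} \mid \mathbf{y}_i \sim \mathcal{N}\left(\mathbf{y}_i, \mathbf{W}^{-1}\right)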
     def MstepML(self, stats):
+        """Maximum likelihood estimation step.
+
+        Args:
+            stats: tuple of expectations computed at the Estep.
+        """
-    # N, M, sumy, yy, _, _, CW, logL = stats
-    # ymu = np.outer(sumy, mu)
-    # CB = yy - ymu -ymu.T + M*np.outer(self.mu, self.mu.T)
-    # logW = logdet_pdmat(self.W)
-    # logB = logdet_pdmat(self.B)
-    # elbo = 0.5*(-logL - N*self.x_dim*np.log(2*np.pi)
-    #             +N*logW - np.inner(self.W.ravel(), CW.ravel())
-    #             +M*logB - np.inner(self.B.ravel(), CB.ravel()))
-    # return elbo
-
-    def MstepML(self, stats):
         N, M, S, _, y_acc, Ry, Cy, Py = stats
         ybar = y_acc / M
         if self.update_mu:
@@ -236,6 +284,7 @@ def MstepMD(self, stats):
         pass

     def get_config(self):
+        """Returns the model configuration dict."""
         config = {
             "update_W": self.update_W,
             "update_B": self.update_B,
@@ -245,18 +294,42 @@ def get_config(self):
         return dict(list(base_config.items()) + list(config.items()))

     def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+            f: file handle.
+        """
         params = {"mu": self.mu, "B": self.B, "W": self.W}
         self._save_params_from_dict(f, params)

     @classmethod
     def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from the file.
+
+        Args:
+            f: file handle.
+            config: configuration dictionary.
+
+        Returns:
+            Model object.
+        """
         param_list = ["mu", "B", "W"]
         params = cls._load_params_to_dict(f, config["name"], param_list)
         kwargs = dict(list(config.items()) + list(params.items()))
         return cls(**kwargs)

     def llr_1vs1(self, x1, x2):
+        """log-likelihood ratio between the target and non-target hypotheses for
+        the case of one enrollment segment and one test segment.
+
+        Args:
+            x1: enrollment vectors with shape (num_enroll_segments, x_dim).
+            x2: test vectors with shape (num_test_segments, x_dim).
+
+        Returns:
+            Score matrix with shape (num_enroll_segments, num_test_segments).
+        """
         assert self.is_init

         Lnon = self.B + self.W
@@ -303,7 +376,17 @@ def llr_1vs1(self, x1, x2):
         return scores

     def llr_NvsM_book(self, D1, D2):
+        """log-likelihood ratio between the target and non-target hypotheses for
+        the case of N segments/enrollment-side and M segments/test-side,
+        evaluated with the exact formula (by the book).
+
+        Args:
+            D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1).
+            D2: tuple of sufficient statistics for the test sides (N2, F2, S2).
+
+        Returns:
+            Score matrix with shape (num_enroll_sides, num_test_sides).
+        """
         assert self.is_init

         N1, F1, _ = D1
@@ -368,7 +451,17 @@ def llr_NvsM_book(self, D1, D2):

     def sample(
         self, num_classes, num_samples_per_class, rng=None, seed=1024, return_y=False
     ):
+        """Draws samples from the PLDA model.
+
+        Args:
+            num_classes: number of classes to sample.
+            num_samples_per_class: number of samples per class.
+            rng: random number generator.
+            seed: random seed used if rng is None.
+            return_y: whether or not to also return the speaker factors.
+
+        Returns:
+            Generated samples with shape (num_samples, x_dim).
+        """
         assert self.is_init

         if rng is None:
@@ -394,7 +487,15 @@ def sample(
         return y + z

     def weighted_avg_params(self, mu, B, W, w_mu, w_B, w_W):
-        super(FRPLDA, self).weigthed_avg_params(mu, w_mu)
+        """Performs a weighted average of the model parameters
+        and some given parameters.
+
+        Args:
+            mu: other mean vector.
+            B: other between-class precision.
+            W: other within-class precision.
+            w_mu: weight of the given mean vector.
+            w_B: weight of the given between-class precision.
+            w_W: weight of the given within-class precision.
+        """
+        super().weigthed_avg_params(mu, w_mu)
         if w_B > 0:
             Sb0 = invert_pdmat(self.B, return_inv=True)[-1]
             Sb = invert_pdmat(B, return_inv=True)[-1]
@@ -407,4 +508,11 @@ def weighted_avg_params(self, mu, B, W, w_mu, w_B, w_W):
             self.W = invert_pdmat(Sw, return_inv=True)[-1]

     def weighted_avg_model(self, plda, w_mu, w_B, w_W):
+        """Performs a weighted average of the model parameters
+        and those of another model given as input.
+
+        Args:
+            plda: other PLDA model.
+            w_mu: weight of the other mean.
+            w_B: weight of the other between-class precision.
+            w_W: weight of the other within-class precision.
+        """
         self.weighted_avg_params(plda.mu, plda.B, plda.W, w_mu, w_B, w_W)
diff --git a/hyperion/np/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py
index 30c21361..fd2eb9a9 100644
--- a/hyperion/np/pdfs/plda/plda.py
+++ b/hyperion/np/pdfs/plda/plda.py
@@ -12,6 +12,25 @@

 class PLDA(PLDABase):
+    """Class for Probabilistic Linear Discriminant Analysis (PLDA) where
+    .. math::
+        \mathbf{x}_{ij} = \boldsymbol{\mu} + \mathbf{V} \mathbf{y}_i + \mathbf{U} \mathbf{z}_{ij} + \boldsymbol{\varepsilon}_{ij}
+
+    Attributes:
+      y_dim: speaker factor dimension.
+      z_dim: channel factor dimension.
+      mu: class-independent mean.
+      V: speaker factor loading matrix.
+      U: channel factor loading matrix.
+      D: precision of the additional channel noise.
+      floor_iD: floor for the inverse of D.
+      update_mu: whether to update mu or not when training the model.
+      update_V: whether to update V or not when training the model.
+      update_U: whether to update U or not when training the model.
+      update_D: whether to update D or not when training the model.
+      x_dim: data dimension.
+    """
+
     def __init__(
         self,
         y_dim=None,
         z_dim=None,
         mu=None,
         V=None,
         U=None,
         D=None,
         floor_iD=1e-5,
         update_mu=True,
         update_V=True,
         update_U=True,
         update_D=True,
         **kwargs
     ):
-        super(PLDA, self).__init__(y_dim=y_dim, mu=mu, update_mu=update_mu, **kwargs)
+        super().__init__(y_dim=y_dim, mu=mu, update_mu=update_mu, **kwargs)
         self.z_dim = z_dim
         if V is not None:
             self.y_dim = V.shape[0]
@@ -52,6 +71,7 @@ def __init__(
         self._VWV = None

     def validate(self):
+        """Validates the model parameters."""
         assert self.mu.shape[0] >= self.V.shape[0]
         assert self.mu.shape[0] == self.V.shape[1]
         assert self.mu.shape[0] >= self.U.shape[0]
@@ -60,6 +80,7 @@ def validate(self):

     @property
     def is_init(self):
+        """Returns True if the model has been initialized."""
         if self._is_init:
             return True
         if (
@@ -75,6 +96,7 @@ def is_init(self):
         return self._is_init

     def compute_aux(self):
+        """Computes auxiliary variables."""
         DV = self.V * self.D
         DU = self.U * self.D
         self._DU = DU
@@ -89,6 +111,11 @@ def compute_aux(self):
         self._VWV = np.dot(self.V, self._VW)

     def initialize(self, D):
+        """Initializes the model.
+
+        Args:
+            D: tuple of sufficient statistics (N, F, S).
+        """
         N, F, S = D
         self.x_dim = F.shape[1]
         M = F.shape[0]
@@ -118,7 +145,21 @@ def initialize(self, D):

     def compute_py_g_x(
         self, D, return_cov=False, return_logpy_0=False, return_acc=False
     ):
-
+        """Computes the posterior P(y|x).
+
+        Args:
+            D: tuple of sufficient statistics (N, F, S).
+            return_cov: whether or not to return the posterior covariances.
+            return_logpy_0: whether or not to return log P(y=0|x).
+            return_acc: whether or not to return the Ry and Py accumulators.
+
+        Returns:
+            Speaker factor posterior means with shape (num_speakers, y_dim).
+            Speaker factor posterior covariances with shape (num_speakers, y_dim, y_dim).
+            log P(y=0|x) with shape (num_speakers,).
+            Ry accumulator for the ML step with shape (y_dim, y_dim).
+            Py accumulator for the MD step with shape (y_dim, y_dim).
+        """
         assert self.is_init

         N, F, S = D
@@ -203,6 +244,14 @@ def compute_py_g_x(
         return tuple(r)

     def Estep(self, D):
+        """Expectation step.
+
+        Args:
+            D: tuple with sufficient statistics (N, F, S).
+
+        Returns:
+            Tuple of statistics with accumulated expectations.
+        """
         N, F, S = D
         y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True)

@@ -264,6 +313,14 @@ def Estep(self, D):
         return stats

     def elbo(self, stats):
+        """Computes the objective function.
+
+        Args:
+            stats: tuple of expectations computed at the Estep.
+
+        Returns:
+            log P(X).
+        """
         N, M, F, S, logpy_x = stats[:5]

         logD = np.sum(np.log(self.D))
@@ -281,6 +338,12 @@ def elbo(self, stats):
         return elbo

     def MstepML(self, stats):
+        """Maximum likelihood estimation step.
+
+        Args:
+            stats: tuple of expectations computed at the Estep.
+        """
         N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py, Rz1, Rz, Ryz, Cz = stats

         if self.update_mu and not self.update_V and not self.update_U:
@@ -357,6 +420,12 @@ def MstepML(self, stats):
         self.compute_aux()

     def MstepMD(self, stats):
+        """Minimum divergence estimation step.
+
+        Args:
+            stats: tuple of expectations computed at the Estep.
+        """
         N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py, Rz1, Rz, Ryz, Cz = stats
         mu_y = y_acc / M
         Cov_y = Py / M - np.outer(mu_y, mu_y)
@@ -384,6 +453,7 @@ def MstepMD(self, stats):
         self.compute_aux()

     def get_config(self):
+        """Returns the model configuration dict."""
         config = {
             "update_D": self.update_D,
             "update_U": self.update_U,
@@ -394,17 +464,41 @@ def get_config(self):
         return dict(list(base_config.items()) + list(config.items()))

     def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+            f: file handle.
+        """
         params = {"mu": self.mu, "V": self.V, "U": self.U, "D": self.D}
         self._save_params_from_dict(f, params)

     @classmethod
     def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from the file.
+
+        Args:
+            f: file handle.
+            config: configuration dictionary.
+
+        Returns:
+            Model object.
+        """
         param_list = ["mu", "V", "U", "D"]
         params = cls._load_params_to_dict(f, config["name"], param_list)
         kwargs = dict(list(config.items()) + list(params.items()))
         return cls(**kwargs)

     def log_probx_g_y(self, x, y):
+        """Computes log P(X|Y).
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            y: speaker factors for each sample with shape (num_samples, y_dim).
+
+        Returns:
+            log P(X|Y) array with shape (num_samples,).
+        """
         iW = np.diag(1 / self.D) + np.dot(self.U.T, self.U)
         mult_W, _, logiW = invert_pdmat(iW, return_logdet=True)
         delta = x - self.mu - np.dot(y, self.V)
@@ -417,6 +511,16 @@ def log_probx_g_y(self, x, y):
         return logp

     def log_probx_g_yz(self, x, y, z):
+        """Computes log P(X|Y,Z).
+
+        Args:
+            x: data samples with shape (num_samples, x_dim).
+            y: speaker factors for each sample with shape (num_samples, y_dim).
+            z: channel factors for each sample with shape (num_samples, z_dim).
+
+        Returns:
+            log P(X|Y,Z) array with shape (num_samples,).
+        """
         logD = np.sum(np.log(self.D))
         delta = x - self.mu - np.dot(y, self.V) - np.dot(z, self.U)
         logp = (
@@ -428,7 +532,16 @@ def log_probx_g_yz(self, x, y, z):
         return logp

     def llr_1vs1(self, x1, x2):
+        """log-likelihood ratio between the target and non-target hypotheses for
+        the case of one enrollment segment and one test segment.
+
+        Args:
+            x1: enrollment vectors with shape (num_enroll_segments, x_dim).
+            x2: test vectors with shape (num_test_segments, x_dim).
+
+        Returns:
+            Score matrix with shape (num_enroll_segments, num_test_segments).
+        """
         assert self.is_init
         WV = self._VW
         VV = self._VWV
@@ -472,7 +585,17 @@ def llr_1vs1(self, x1, x2):
         return scores
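A verification-scoring sketch for `llr_1vs1`; LNorm is imported by plda_base.py below, but its call interface here is an assumption:

    import numpy as np
    from hyperion.np.transforms import LNorm  # import used by plda_base.py

    lnorm = LNorm()
    # Length-normalize first (typical PLDA preprocessing); the exact LNorm
    # API (.predict vs. callable) depends on hyperion.np.transforms.
    x_enroll = lnorm.predict(np.random.randn(20, 400))
    x_test = lnorm.predict(np.random.randn(50, 400))
    scores = plda.llr_1vs1(x_enroll, x_test)  # (20, 50) log-likelihood ratios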
+ Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ assert self.is_init N1, F1, _ = D1 @@ -539,6 +662,17 @@ def llr_NvsM_book(self, D1, D2): return scores def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): + """Draws samples from the PLDA model. + + Args: + num_classes: number of classes to sample. + num_samples_per_class: number of samples to sample per each class. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ if rng is None: rng = np.random.RandomState(seed=seed) @@ -562,8 +696,15 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): return y + z1 + z2 def weighted_avg_params(self, mu, V, U, D, w_mu, w_B, w_W): + """Performs weighted average of the model parameters + and some given parameters. + + Args: + mu: other mean vector + w_mu: weight of the given mean vector. - super(PLDA, self).weigthed_avg_params(mu, w_mu) + """ + super().weigthed_avg_params(mu, w_mu) if w_B > 0: Sb0 = np.dot(self.V.T, self.V) Sb = np.dot(V.T, V) @@ -582,26 +723,15 @@ def weighted_avg_params(self, mu, V, U, D, w_mu, w_B, w_W): U = U[:, -self.z_dim :] self.U = U.T iD = np.diag(Sw - np.dot(self.U.T, self.U)).copy() - # print(Sw[:10,:10]) - # print(np.dot(self.U.T, self.U)) - # print(iD[:10]) iD[iD < self.floor_iD] = self.floor_iD self.D = 1 / iD - # if w_W > 0: - # Sw0 = np.dot(self.U.T, self.U) - # Sw = np.dot(U.T, U) - # Sw = w_W*Sw + (1-w_W)*Sw0 - # w, U = sla.eigh(Sw, overwrite_a=True) - # U = np.sqrt(w)*U - # U = U[:,-self.z_dim:] - # self.U = U.T - - # if w_D > 0: - # Sd0 = 1/self.D - # Sd = 1/D - # Sd = w_D*Sd + (1-w_D)*Sd0 - # self.D = 1/Sd - def weighted_avg_model(self, plda, w_mu, w_B, w_W): + """Performs weighted average of the model parameters + and those of another model given as input. + + Args: + plda: other PLDA model. + + """ self.weighted_avg_params(plda.mu, plda.V, plda.U, plda.D, w_mu, w_B, w_W) diff --git a/hyperion/np/pdfs/plda/plda_base.py b/hyperion/np/pdfs/plda/plda_base.py index 1d5d758c..72503965 100644 --- a/hyperion/np/pdfs/plda/plda_base.py +++ b/hyperion/np/pdfs/plda/plda_base.py @@ -5,30 +5,44 @@ import numpy as np -from abc import ABCMeta, abstractmethod - from ....hyp_defs import float_cpu from ..core.pdf import PDF from ...transforms import LNorm class PLDABase(PDF): - __metaclass__ = ABCMeta + """Abstract Base class for different versions of + Probabilistic Linear Discriminant Analysis (PLDA) models. + + Attributes: + y_dim: speaker factor dimension. + mu: class-independent mean. + update_mu: whether to update mu or not when training the model. + x_dim: data dimension. + """ def __init__(self, y_dim=None, mu=None, update_mu=True, **kwargs): - super(PLDABase, self).__init__(**kwargs) + super().__init__(**kwargs) self.mu = mu self.y_dim = y_dim self.update_mu = update_mu if mu is not None: self.x_dim = mu.shape[0] - @abstractmethod def initialize(self, D): + """initializes the model. + + Args: + D: tuple of sufficient statistics (N, F, S) + """ pass - @abstractmethod def compute_py_g_x(self, D): + """Computes the posterior P(y|x) + + Args: + D: tuple of sufficient statistics (N, F, S) + """ pass def fit( @@ -45,22 +59,47 @@ def fit( ml_md="ml+md", md_epochs=None, ): + """Trains the model. + + Args: + x: train data matrix with shape (num_samples, x_dim). + class_ids: class identifiers [0, num_clases-1] for training data. 
+ ptheta: probability of belonging to a class with shape (num_samples, num_classes) for training data. + sample_weight: weight of each sample in the training loss shape (num_samples,). + x_val: validation data matrix with shape (num_val_samples, x_dim). + class_ids_val: class identifiers [0, num_clases-1] for val data. + ptheta_val: probability of belonging to a class with shape (num_samples, num_classes) for val. data. + sample_weight_val: weight of each sample in the val. loss. + epochs: number of EM steps. + ml_md: whether to do maximum likelihood estimation ("ml"), minimum divergence ("md") or both ("ml+md"). + md_epochs: in which epochs to do MD estimation, if None, MD is done in all epochs. + + Returns: + log p(X) of the training data. + log p(x) per sample. + log p(X) of the val. data, if present. + log p(x) of the val. data per sample, if present. + """ use_ml = False if ml_md == "md" else True use_md = False if ml_md == "ml" else True assert not (class_ids is None and ptheta is None) if class_ids is None: - D = self.compute_stats_soft(x, ptheta) + D = self.compute_stats_soft(x, ptheta, sample_weight=sample_weight) else: - D = self.compute_stats_hard(x, class_ids) + D = self.compute_stats_hard(x, class_ids, sample_weight=sample_weight) if x_val is not None: assert not (class_ids_val is None and ptheta_val is None) if class_ids_val is None: - D_val = self.compute_stats_soft(x_val, ptheta_val) + D_val = self.compute_stats_soft( + x_val, ptheta_val, sample_weight=sample_weight_val + ) else: - D_val = self.compute_stats_hard(x_val, class_ids_val) + D_val = self.compute_stats_hard( + x_val, class_ids_val, sample_weight=sample_weight_val + ) if not self.is_init: self.initialize(D) @@ -87,24 +126,16 @@ def fit( elbo_val_norm = elbo_val / np.sum(D_val[0]) return elbo, elbo_norm, elbo_val, elbo_val_norm - @abstractmethod def Estep(self, x): + """Expectation step.""" pass - @abstractmethod def MstepML(self, x): + """Maximum likelihood step.""" pass - @abstractmethod def MstepMD(self, x): - pass - - @abstractmethod - def llr_1vs1(self, x1, x2): - pass - - @abstractmethod - def llr_NvsM_book(self, D1, D2): + """Minimum Divergence step.""" pass def fit_adapt_weighted_avg_model( @@ -125,6 +156,31 @@ def fit_adapt_weighted_avg_model( w_B=0.5, w_W=0.5, ): + """Adapts a PLDA model to new data. The adapted model is weighted averaged with the prior after each epoch. + + Args: + x: train data matrix with shape (num_samples, x_dim). + class_ids: class identifiers [0, num_clases-1] for training data. + ptheta: probability of belonging to a class with shape (num_samples, num_classes) for training data. + sample_weight: weight of each sample in the training loss shape (num_samples,). + x_val: validation data matrix with shape (num_val_samples, x_dim). + class_ids_val: class identifiers [0, num_clases-1] for val data. + ptheta_val: probability of belonging to a class with shape (num_samples, num_classes) for val. data. + sample_weight_val: weight of each sample in the val. loss. + epochs: number of EM steps. + ml_md: whether to do maximum likelihood estimation ("ml"), minimum divergence ("md") or both ("ml+md"). + md_epochs: in which epochs to do MD estimation, if None, MD is done in all epochs. + plda0: prior model. + w_mu: weigth of the prior on the mean. + w_B: weight of the prior on the between-class precision. + w_W: weight of the prior on the within-class precision. + + Returns: + log p(X) of the training data. + log p(x) per sample. + log p(X) of the val. data, if present. + log p(x) of the val. 
data per sample, if present. + """ assert self.is_init use_ml = False if ml_md == "md" else True @@ -132,16 +188,20 @@ def fit_adapt_weighted_avg_model( assert not (class_ids is None and ptheta is None) if class_ids is None: - D = self.compute_stats_soft(x, ptheta) + D = self.compute_stats_soft(x, ptheta, sample_weight=sample_weight) else: - D = self.compute_stats_hard(x, class_ids) + D = self.compute_stats_hard(x, class_ids, sample_weight=sample_weight) if x_val is not None: assert not (class_ids_val is None and ptheta_val is None) if class_ids_val is None: - D_val = self.compute_stats_soft(x_val, ptheta_val) + D_val = self.compute_stats_soft( + x_val, ptheta_val, sample_weight=sample_weight_val + ) else: - D_val = self.compute_stats_hard(x_val, class_ids_val) + D_val = self.compute_stats_hard( + x_val, class_ids_val, sample_weight=sample_weight_val + ) elbo = np.zeros((epochs,), dtype=float_cpu()) elbo_val = np.zeros((epochs,), dtype=float_cpu()) @@ -167,74 +227,21 @@ def fit_adapt_weighted_avg_model( elbo_val_norm = elbo_val / np.sum(D_val[0]) return elbo, elbo_norm, elbo_val, elbo_val_norm - def fit_adapt( - self, - x, - class_ids=None, - ptheta=None, - sample_weight=None, - x0=None, - class_ids0=None, - ptheta0=None, - sample_weight0=None, - x_val=None, - class_ids_val=None, - ptheta_val=None, - sample_weight_val=None, - epochs=20, - ml_md="ml+md", - md_epochs=None, - ): - - assert self.is_init - use_ml = False if ml_md == "md" else True - use_md = False if ml_md == "ml" else True - - assert not (class_ids is None and ptheta is None) - if class_ids is None: - D = self.compute_stats_soft(x, ptheta) - else: - D = self.compute_stats_hard(x, class_ids) - - if x0 is not None: - assert not (class_ids0 is None and ptheta0 is None) - if class_ids0 is None: - D0 = self.compute_stats_soft(x0, ptheta0) - else: - D0 = self.compute_stats_hard(x0, class_ids0) - - if x_val is not None: - assert not (class_ids_val is None and ptheta_val is None) - if class_ids_val is None: - D_val = self.compute_stats_soft(x_val, ptheta_val) - else: - D_val = self.compute_stats_hard(x_val, class_ids_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - - stats = self.Estep(D) - stats0 = self.Estep(D0) - elbo[epoch] = self.elbo(stats) - if x_val is not None: - stats_val = self.Estep(D_val) - elbo_val[epoch] = self.elbo(stats_val) - - if use_ml: - self.MstepML(stats) - if use_md and (md_epochs is None or epoch in md_epochs): - self.MstepMD(stats) - - elbo_norm = elbo / np.sum(D[0]) - if x_val is None: - return elbo, elbo_norm - else: - elbo_val_norm = elbo_val / np.sum(D_val[0]) - return elbo, elbo_norm, elbo_val, elbo_val_norm - @staticmethod def compute_stats_soft(x, p_theta, sample_weight=None, scal_factor=None): + """Computes sufficient statistics need by PLDA model using soft class assigments. 
+ + Args: + x: input data with shape (num_samples, x_dim) + p_theta: soft class assigments with shape (num_samples, num_classes) + sample_weight: weight of each sample for training with shape (num_samples, ) + scal_factor: scaling factor for sufficient statistics (Themos factor) + + Returns: + N: zero order stats with shape (num_classes,) + F: first order stats with shape (num_classes, x_dim) + S: Accumulated second order stats with sahpe (x_dim, x_dim) + """ if sample_weight is not None: p_theta = sample_weight[:, None] * p_theta if scal_factor is not None: @@ -247,6 +254,19 @@ def compute_stats_soft(x, p_theta, sample_weight=None, scal_factor=None): @staticmethod def compute_stats_hard(x, class_ids, sample_weight=None, scale_factor=None): + """Computes sufficient statistics need by PLDA model using soft class assigments. + + Args: + x: input data with shape (num_samples, x_dim) + class_ids: integer [0, num_classes-1] vector indicating the class of each sample. + sample_weight: weight of each sample for training with shape (num_samples, ) + scal_factor: scaling factor for sufficient statistics (Themos factor) + + Returns: + N: zero order stats with shape (num_classes,) + F: first order stats with shape (num_classes, x_dim) + S: Accumulated second order stats with sahpe (x_dim, x_dim) + """ x_dim = x.shape[1] num_classes = np.max(class_ids) + 1 N = np.zeros((num_classes,), dtype=float_cpu()) @@ -283,13 +303,66 @@ def compute_stats_hard_v0(x, class_ids, sample_weight=None, scal_factor=None): @staticmethod def center_stats(D, mu): + """Centers the sufficient statistics by the PLDA mean. + + Args: + D: tupe with sufficient stats (N, F, S). + mu: mean vector. + + Returns: + Centered N, F, S + """ N, F, S = D Fc = F - np.outer(N, mu) Fmu = np.outer(np.sum(F, axis=0), mu) Sc = S - Fmu - Fmu.T + np.sum(N) * np.outer(mu, mu) return N, Fc, Sc + def llr_1vs1(self, x1, x2): + """log-likelihood ratio between target and non-target hypothesis for + the case of one enrollment and one test segments. + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_enroll_segmens, x_dim). + + Returns: + Score matrix with shape (num_enrollment_segments, num_test_segments). + """ + pass + + def llr_NvsM_book(self, D1, D2): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with the exact formula (by the book). + + Args: + D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1). + D2: tuple of sufficient statistics for the test sides (N2, F2, S2). + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ + pass + def llr_NvsM(self, x1, x2, ids1=None, ids2=None, method="vavg-lnorm"): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_enroll_segmens, x_dim). + ids1: integer array mapping from segments to + enrollment-sides in [0, num_enroll_sides-1] + ids2: integer array mapping from segments to + test-sides in [0, num_test_sides-1] + method: evaluation method in ["book" (exact formula), + "vavg" (vector averaging), "vavg-lnorm" (vector averagin + lnorm), + "savg" (score averaging)] + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). 
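The hard-assignment statistics computed by compute_stats_hard above reduce to per-class counts, per-class sums, and one accumulated outer product. A minimal NumPy sketch (not part of the patch, and not the library's own implementation):

```python
import numpy as np

x = np.arange(12, dtype=float).reshape(6, 2)  # 6 samples, x_dim = 2
class_ids = np.array([0, 0, 1, 1, 1, 2])

num_classes = class_ids.max() + 1
N = np.bincount(class_ids, minlength=num_classes).astype(float)  # zero-order stats
F = np.zeros((num_classes, x.shape[1]))
np.add.at(F, class_ids, x)         # first-order stats: per-class sums
S = x.T @ x                        # accumulated second-order stats
print(N)                           # [2. 3. 1.]
print(F.shape)                     # (3, 2)
```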
+ """ if method == "savg": return self.llr_NvsM_savg(x1, ids1, x2, ids2) @@ -304,6 +377,18 @@ def llr_NvsM(self, x1, x2, ids1=None, ids2=None, method="vavg-lnorm"): return self.llr_NvsM_vavg(D1, D2, do_lnorm=True) def llr_NvsM_vavg(self, D1, D2, do_lnorm=True): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with vector averaging. + + Args: + D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1). + D2: tuple of sufficient statistics for the test sides (N2, F2, S2). + do_lnorm: whether or not to do length norm. after vector averaging. + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ x1 = D1[1] / np.expand_dims(D1[0], axis=-1) x2 = D2[1] / np.expand_dims(D2[0], axis=-1) if do_lnorm: @@ -314,6 +399,20 @@ def llr_NvsM_vavg(self, D1, D2, do_lnorm=True): return self.llr_1vs1(x1, x2) def llr_NvsM_savg(self, x1, ids1, x2, ids2): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_enroll_segmens, x_dim). + ids1: integer array mapping from segments to + enrollment-sides in [0, num_enroll_sides-1] + ids2: integer array mapping from segments to + test-sides in [0, num_test_sides-1] + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ scores_1vs1 = self.llr_1vs1(x1, x2) N, F, _ = self.compute_stats_hard(scores_1vs1, ids1) scores_Nvs1 = F / N[:, None] @@ -322,6 +421,21 @@ def llr_NvsM_savg(self, x1, ids1, x2, ids2): return scores def llr_Nvs1(self, x1, x2, ids1=None, method="vavg-lnorm"): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_test_segmens, x_dim). + ids1: integer array mapping from segments to + enrollment-sides in [0, num_enroll_sides-1] + method: evaluation method in ["book" (exact formula), + "vavg" (vector averaging), "vavg-lnorm" (vector averagin + lnorm), + "savg" (score averaging)] + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ if method == "savg": return self.llr_Nvs1_savg(x1, ids1, x2) @@ -336,6 +450,18 @@ def llr_Nvs1(self, x1, x2, ids1=None, method="vavg-lnorm"): return self.llr_Nvs1_vavg(D1, x2, do_lnorm=True) def llr_Nvs1_vavg(self, D1, x2, do_lnorm=True): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with vector averaging. + + Args: + D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1). + x2: test vectors with shape (num_test_segmens, x_dim). + do_lnorm: whether or not to do length norm. after vector averaging. + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ x1 = D1[1] / np.expand_dims(D1[0], axis=-1) if do_lnorm: lnorm = LNorm() @@ -345,23 +471,60 @@ def llr_Nvs1_vavg(self, D1, x2, do_lnorm=True): return self.llr_1vs1(x1, x2) def llr_Nvs1_savg(self, x1, ids1, x2): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). 
+ x2: test vectors with shape (num_enroll_segmens, x_dim). + ids1: integer array mapping from segments to + enrollment-sides in [0, num_enroll_sides-1] + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ scores_1vs1 = self.llr_1vs1(x1, x2) N, F, _ = self.compute_stats_hard(scores_1vs1, ids1) scores = F / N[:, None] return scores - @abstractmethod def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): + """Draws samples from the PLDA model. + + Args: + num_classes: number of classes to sample. + num_samples_per_class: number of samples to sample per each class. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ pass def get_config(self): + """Returns the model configuration dict.""" config = {"y_dim": self.y_dim, "update_mu": self.update_mu} - base_config = super(PLDABase, self).get_config() + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def weigthed_avg_params(self, mu, w_mu): + """Performs weighted average of the model parameters + and some given parameters. + + Args: + mu: other mean vector + w_mu: weight of the given mean vector. + + """ self.mu = w_mu * mu + (1 - w_mu) * self.mu - @abstractmethod def weigthed_avg_model(self, plda): + """Performs weighted average of the model parameters + and those of another model given as input. + + Args: + plda: other PLDA model. + + """ pass diff --git a/hyperion/np/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py index f10759cf..f9322d26 100644 --- a/hyperion/np/pdfs/plda/splda.py +++ b/hyperion/np/pdfs/plda/splda.py @@ -11,6 +11,22 @@ class SPLDA(PLDABase): + """Class for Simplied Probabilistic Discriminant Analysis (SPLDA). + .. math:: + \mathbf{x}_{ij} = \mu + \mathbf{V} \mathbf{y}_i + \varepsilon_{ij} + + Attributes: + y_dim: speaker factor dimension. + mu: class-independent mean. + V: speaker factor loading matrix. + W: within-class precision. + fullcov_W: whether W is full-precision matrix or not. + update_mu: whether to update mu or not when training the model. + update_V: whether to update V or not when training the model. + update_W: whether to update W or not when training the model. + x_dim: data dimension. + """ + def __init__( self, y_dim=None, @@ -33,6 +49,7 @@ def __init__( self.update_W = update_W def validate(self): + """Validates the model parameters.""" assert self.mu.shape[0] >= self.V.shape[0] assert self.mu.shape[0] == self.V.shape[1] assert self.mu.shape[0] == self.W.shape[0] @@ -40,6 +57,7 @@ def validate(self): @property def is_init(self): + """Returns True if the model has been initialized.""" if self._is_init: return True if self.mu is not None and self.V is not None and self.W is not None: @@ -48,6 +66,11 @@ def is_init(self): return self._is_init def initialize(self, D): + """initializes the model. + + Args: + D: tuple of sufficient statistics (N, F, S) + """ N, F, S = D self.x_dim = F.shape[1] M = F.shape[0] @@ -73,6 +96,21 @@ def initialize(self, D): def compute_py_g_x( self, D, return_cov=False, return_logpy_0=False, return_acc=False ): + """Computes the posterior P(y|x) + + Args: + D: tuple of sufficient statistics (N, F, S) + return_cov: whether or not to return the posterior covariances. + return_logpy_0: whether or not to return log P(y=0|x). + return_acc: whether or not to return Ry and Py accumulators. 
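A generative sketch (not part of the patch) of the Simplified PLDA equation above, with hypothetical toy dimensions; isotropic noise stands in for the full within-class precision W:

```python
import numpy as np

rng = np.random.default_rng(0)
x_dim, y_dim = 10, 3
mu = rng.normal(size=x_dim)
V = rng.normal(size=(y_dim, x_dim))  # rows are factors, matching validate() above

y = rng.normal(size=y_dim)           # one speaker factor
num_utts = 5
eps = 0.1 * rng.normal(size=(num_utts, x_dim))  # channel noise, stand-in for W^{-1}
x = mu + y @ V + eps                 # num_utts observations of one speaker
```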
+ + Returns: + Speaker factor posterior means with shape (num_speakers, y_dim) + Speaker factor posterior convariances with shape (num_speakers, y_dim, y_dim) + log P(y=0|x) with shape (num_spakers,) + Ry accumlator for ML step with shape (y_dim, y_dim) + Py accumlator for MD step with shape (y_dim, y_dim) + """ N, F, S = D Fc = F - self.mu @@ -158,6 +196,14 @@ def compute_py_g_x( return tuple(r) def Estep(self, D): + """Expectation step. + + Args: + D: tuple with sufficient statistics (N, F, S) + + Returns: + Tuple of statistics with accumlated expectations. + """ N, F, S = D y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True) @@ -179,6 +225,14 @@ def Estep(self, D): return stats def elbo(self, stats): + """Computes the objective function. + + Args: + stats: tuple of expectations computed at the Estep. + + Returns: + log P(X) + """ N, M, F, S, logpy_x = stats[:5] logW = logdet_pdmat(self.W) @@ -196,6 +250,12 @@ def elbo(self, stats): return elbo def MstepML(self, stats): + """Maximum likelihood estimation step. + + Args: + stats: tuple of expectations computed at the Estep. + + """ N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py = stats a = np.hstack((Ry, Ry1[:, None])) @@ -230,6 +290,12 @@ def MstepML(self, stats): self.W = np.diag(1 / np.diag(iW)) def MstepMD(self, stats): + """Minimum divergence estimation step. + + Args: + stats: tuple of expectations computed at the Estep. + + """ N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py = stats mu_y = y_acc / M @@ -242,26 +308,51 @@ def MstepMD(self, stats): self.V = np.dot(chol_Cov_y, self.V) def get_config(self): + """Returns the model configuration dict.""" config = { "update_W": self.update_W, "update_V": self.update_V, "fullcov_W": self.fullcov_W, } - base_config = super(SPLDA, self).get_config() + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ params = {"mu": self.mu, "V": self.V, "W": self.W} self._save_params_from_dict(f, params) @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ param_list = ["mu", "V", "W"] params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) def log_probx_g_y(self, x, y): + """Computes logP(X|Y) + + Args: + x: data samples with shape (num_samples, x_dim). + y: speaker factors for each sample with shape (num_samples, y_dim). + + Returns: + log P(X|Y) array with shape (num_samples,) + """ logW = logdet_pdmat(self.W) delta = x - self.mu - np.dot(y, self.V) logp = ( @@ -273,7 +364,16 @@ def log_probx_g_y(self, x, y): return logp def llr_1vs1(self, x1, x2): + """log-likelihood ratio between target and non-target hypothesis for + the case of one enrollment and one test segments. + Args: + x1: enrollment vectors with shape (num_enroll_segmens, x_dim). + x2: test vectors with shape (num_enroll_segmens, x_dim). + + Returns: + Score matrix with shape (num_enrollment_segments, num_test_segments). 
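An end-to-end usage sketch (not part of the patch). The import path assumes SPLDA is re-exported under hyperion.np.pdfs after the np/ move; the fit and llr_1vs1 signatures follow the docstrings in this patch:

```python
import numpy as np
import hyperion.np as hnp  # assumed: SPLDA exported like the other pdfs

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 10))            # toy training x-vectors
class_ids = np.repeat(np.arange(20), 10)  # 20 speakers, 10 utterances each

plda = hnp.pdfs.SPLDA(y_dim=5)
elbo, elbo_norm = plda.fit(x, class_ids=class_ids, epochs=10)  # EM training

x_e = rng.normal(size=(3, 10))            # enrollment vectors
x_t = rng.normal(size=(4, 10))            # test vectors
scores = plda.llr_1vs1(x_e, x_t)          # (3, 4) log-likelihood-ratio matrix
```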
+ """ WV = np.dot(self.W, self.V.T) VV = np.dot(self.V, WV) I = np.eye(self.y_dim, dtype=float_cpu()) @@ -316,6 +416,17 @@ def llr_1vs1(self, x1, x2): return scores def llr_NvsM_book(self, D1, D2): + """log-likelihood ratio between target and non-target hypothesis for + the case of N segments/enrollment-side and M segments/test-side + evaluated with the exact formula (by the book). + + Args: + D1: tuple of sufficient statistics for the enrollment sides (N1, F1, S1). + D2: tuple of sufficient statistics for the test sides (N2, F2, S2). + + Returns: + Score matrix with shape (num_enrollment_sides, num_test_sides). + """ N1, F1, _ = D1 N2, F2, _ = D2 @@ -379,6 +490,17 @@ def llr_NvsM_book(self, D1, D2): return scores def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): + """Draws samples from the PLDA model. + + Args: + num_classes: number of classes to sample. + num_samples_per_class: number of samples to sample per each class. + rng: random number generator. + seed: random seed used if rng is None. + + Returns: + Generated samples with shape (num_samples, x_dim). + """ if rng is None: rng = np.random.RandomState(seed=seed) @@ -399,7 +521,15 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): return y + z def weighted_avg_params(self, mu, V, W, w_mu, w_B, w_W): - super(SPLDA, self).weigthed_avg_params(mu, w_mu) + """Performs weighted average of the model parameters + and some given parameters. + + Args: + mu: other mean vector + w_mu: weight of the given mean vector. + + """ + super().weigthed_avg_params(mu, w_mu) if w_B > 0: Sb0 = np.dot(self.V.T, self.V) Sb = np.dot(V.T, V) @@ -416,9 +546,26 @@ def weighted_avg_params(self, mu, V, W, w_mu, w_B, w_W): self.W = invert_pdmat(Sw, return_inv=True)[-1] def weighted_avg_model(self, plda, w_mu, w_B, w_W): + """Performs weighted average of the model parameters + and those of another model given as input. + + Args: + plda: other PLDA model. + + """ self.weighted_avg_params(plda.mu, plda.V, plda.W, w_mu, w_B, w_W) def project(self, T, delta_mu=None): + """Transforms the PLDA parameters given an affine transformation + of the data. + + Args: + T: data projection matrix. + delta_mu: data shift vector. + + Returns: + Projected PLDA model. + """ mu = self.mu if mu is not None: mu -= delta_mu diff --git a/hyperion/np/score_norm/adapt_s_norm.py b/hyperion/np/score_norm/adapt_s_norm.py index 3f1a47c7..b213d653 100644 --- a/hyperion/np/score_norm/adapt_s_norm.py +++ b/hyperion/np/score_norm/adapt_s_norm.py @@ -11,10 +11,18 @@ class AdaptSNorm(ScoreNorm): - """Class for adaptive S-Norm""" + """Class for adaptive S-Norm. + + Attributes: + nbest: number of samples selected to compute the statistics for each trial + by the adaptive algorith + nbest_discard: discard the nbest trials with higher scores, which could + be actual target trials. + std_floor: floor for standard deviations. + """ def __init__(self, nbest=100, nbest_discard=0, **kwargs): - super(AdaptSNorm, self).__init__(*kwargs) + super().__init__(*kwargs) self.nbest = nbest self.nbest_discard = nbest_discard @@ -26,6 +34,18 @@ def predict( mask_coh_test=None, mask_enr_coh=None, ): + """Normalizes the scores. + + Args: + scores: score matrix enroll vs. test. + scores_coh_test: score matrix cohort vs. test. + scores_enr_coh: score matrix enroll vs cohort. + mask_coh_test: binary matrix to mask out target trials + from cohort vs test matrix. + mask_enr_coh: binary matrix to mask out target trials + from enroll vs. cohort matrix. 
+ + """ assert scores_enr_coh.shape[1] == scores_coh_test.shape[0] assert self.nbest_discard < scores_enr_coh.shape[1] diff --git a/hyperion/np/score_norm/s_norm.py b/hyperion/np/score_norm/s_norm.py index ee00a7e8..2cf81ffc 100644 --- a/hyperion/np/score_norm/s_norm.py +++ b/hyperion/np/score_norm/s_norm.py @@ -15,7 +15,7 @@ class SNorm(ScoreNorm): """Class for S-Norm, symmetric score normalization.""" def __init__(self, **kwargs): - super(SNorm, self).__init__(*kwargs) + super().__init__(*kwargs) self.t_norm = TNorm(**kwargs) self.z_norm = ZNorm(**kwargs) @@ -27,6 +27,18 @@ def predict( mask_coh_test=None, mask_enr_coh=None, ): + """Normalizes the scores. + + Args: + scores: score matrix enroll vs. test. + scores_coh_test: score matrix cohort vs. test. + scores_enr_coh: score matrix enroll vs cohort. + mask_coh_test: binary matrix to mask out target trials + from cohort vs test matrix. + mask_enr_coh: binary matrix to mask out target trials + from enroll vs. cohort matrix. + + """ scores_z_norm = self.z_norm.predict(scores, scores_enr_coh, mask_enr_coh) scores_t_norm = self.t_norm.predict(scores, scores_coh_test, mask_coh_test) diff --git a/hyperion/np/score_norm/score_norm.py b/hyperion/np/score_norm/score_norm.py index 45df0323..e2fa1814 100644 --- a/hyperion/np/score_norm/score_norm.py +++ b/hyperion/np/score_norm/score_norm.py @@ -9,10 +9,20 @@ class ScoreNorm(NPModel): - """ - Base class for score normalization + """Base class for score normalization + + Attributes: + std_floor: floor for standard deviations. """ def __init__(self, std_floor=1e-5, **kwargs): - super(ScoreNorm, self).__init__(*kwargs) + super().__init__(*kwargs) self.std_floor = std_floor + + def forward(self, **kwargs): + """Overloads predict function.""" + return self.predict(**kwargs) + + def __call__(self, *kwargs): + """Overloads predict function.""" + return self.predict(**kwargs) diff --git a/hyperion/np/score_norm/t_norm.py b/hyperion/np/score_norm/t_norm.py index 3fb92548..ac87c8ac 100644 --- a/hyperion/np/score_norm/t_norm.py +++ b/hyperion/np/score_norm/t_norm.py @@ -13,7 +13,15 @@ class TNorm(ScoreNorm): """Class for T-Norm score normalization.""" def predict(self, scores, scores_coh_test, mask=None): + """Normalizes the scores. + Args: + scores: score matrix enroll vs. test. + scores_coh_test: score matrix cohort vs. test. + mask: binary matrix to mask out target trials + from cohort vs test matrix. + + """ if mask is None: mu_t = np.mean(scores_coh_test, axis=0, keepdims=True) s_t = np.std(scores_coh_test, axis=0, keepdims=True) diff --git a/hyperion/np/score_norm/tz_norm.py b/hyperion/np/score_norm/tz_norm.py index d4bb1539..6127091d 100644 --- a/hyperion/np/score_norm/tz_norm.py +++ b/hyperion/np/score_norm/tz_norm.py @@ -14,7 +14,7 @@ class TZNorm(ScoreNorm): """Class for TZ-Norm score normalization.""" def __init__(self, **kwargs): - super(SNorm, self).__init__(*kwargs) + super().__init__(*kwargs) self.t_norm = TNorm(**kwargs) self.z_norm = ZNorm(**kwargs) @@ -28,6 +28,20 @@ def predict( mask_enr_coh=None, mask_coh_coh=None, ): + """Normalizes the scores. + + Args: + scores: score matrix enroll vs. test. + scores_coh_test: score matrix cohort vs. test. + scores_enr_coh: score matrix enroll vs cohort. + scores_coh_coh: score matrix cohort vs cohort. + mask_coh_test: binary matrix to mask out target trials + from cohort vs test matrix. + mask_enr_coh: binary matrix to mask out target trials + from enroll vs. cohort matrix. + mask_coh_coh: binary matrix to mask out target trials + from cohort vs. 
         scores_t_norm = self.t_norm.predict(scores, scores_coh_test, mask_coh_test)
         scores_enr_coh_t_norm = self.t_norm.predict(
diff --git a/hyperion/np/score_norm/z_norm.py b/hyperion/np/score_norm/z_norm.py
index f5350fb1..98189e06 100644
--- a/hyperion/np/score_norm/z_norm.py
+++ b/hyperion/np/score_norm/z_norm.py
@@ -14,7 +14,15 @@ class ZNorm(ScoreNorm):
     """
 
     def predict(self, scores, scores_enr_coh, mask=None):
+        """Normalizes the scores.
+
+        Args:
+          scores: score matrix enroll vs. test.
+          scores_enr_coh: score matrix enroll vs. cohort.
+          mask: binary matrix to mask out target trials
+            from the enroll vs. cohort matrix.
+
+        """
         if mask is None:
             mu_z = np.mean(scores_enr_coh, axis=1, keepdims=True)
             s_z = np.std(scores_enr_coh, axis=1, keepdims=True)
diff --git a/hyperion/np/score_norm/zt_norm.py b/hyperion/np/score_norm/zt_norm.py
index 4c5c8b5c..415ddca8 100644
--- a/hyperion/np/score_norm/zt_norm.py
+++ b/hyperion/np/score_norm/zt_norm.py
@@ -15,7 +15,7 @@ class ZTNorm(ScoreNorm):
     """Class ZT-Norm score-normalization."""
 
     def __init__(self, **kwargs):
-        super(SNorm, self).__init__(*kwargs)
+        super().__init__(**kwargs)
         self.t_norm = TNorm(**kwargs)
         self.z_norm = ZNorm(**kwargs)
 
@@ -29,10 +29,24 @@ def predict(
         mask_enr_coh=None,
         mask_coh_coh=None,
     ):
+        """Normalizes the scores.
+
+        Args:
+          scores: score matrix enroll vs. test.
+          scores_coh_test: score matrix cohort vs. test.
+          scores_enr_coh: score matrix enroll vs. cohort.
+          scores_coh_coh: score matrix cohort vs. cohort.
+          mask_coh_test: binary matrix to mask out target trials
+            from the cohort vs. test matrix.
+          mask_enr_coh: binary matrix to mask out target trials
+            from the enroll vs. cohort matrix.
+          mask_coh_coh: binary matrix to mask out target trials
+            from the cohort vs. cohort matrix.
+        """
         scores_z_norm = self.z_norm.predict(scores, scores_enr_coh, mask_enr_coh)
         scores_coh_test_z_norm = self.z_norm.predict(
-            scores_coh_test, scores_coh_coh, mask_enr_coh
+            scores_coh_test, scores_coh_coh, mask_coh_coh
         )
         scores_zt_norm = self.t_norm.predict(
             scores_z_norm, scores_coh_test_z_norm, mask_coh_test
diff --git a/hyperion/np/transforms/cent_whiten_up.py b/hyperion/np/transforms/cent_whiten_up.py
index f3793328..1200e61b 100644
--- a/hyperion/np/transforms/cent_whiten_up.py
+++ b/hyperion/np/transforms/cent_whiten_up.py
@@ -17,17 +17,17 @@ class CentWhitenUP(CentWhiten):
     """Class to do centering and whitening with uncertainty propagation."""
 
     def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs):
-        super(CentWhitenUP, self).__init__(mu, T, update_mu, update_T, **kwargs)
+        super().__init__(mu, T, update_mu, update_T, **kwargs)
 
     def predict(self, x):
         x_dim = int(x.shape[-1] / 2)
         m_x = x[:, :x_dim]
         s2_x = x[:, x_dim:]
-        m_x = super(CentWhitenUP, self).predict(m_x)
+        m_x = super().predict(m_x)
         for i in range(x.shape[0]):
             s2_x[i] = np.diag(np.dot(self.T.T * s2_x[i], self.T))
         return np.hstack((m_x, s2_x))
 
     def fit(self, x, sample_weight=None):
         x = x[:, : int(x.shape[-1] / 2)]
-        super(CentWhitenUP, self).fit(x, sample_weight=sample_weight)
+        super().fit(x, sample_weight=sample_weight)
diff --git a/hyperion/np/transforms/lda.py b/hyperion/np/transforms/lda.py
index 13c74fe8..5644a2a3 100644
--- a/hyperion/np/transforms/lda.py
+++ b/hyperion/np/transforms/lda.py
@@ -85,14 +85,6 @@ def load_params(cls, f, config):
         params = cls._load_params_to_dict(f, config["name"], param_list)
         return cls(mu=params["mu"], T=params["T"], name=config["name"])
 
-    # @classmethod
-    # def load(cls, file_path):
-    #     with h5py.File(file_path, 'r') as f:
-    #
config = self.load_config_from_json(f['config']) - # param_list = ['mu', 'T'] - # params = self._load_params_to_dict(f, config['name'], param_list) - # return cls(mu=params['mu'], T=params['T'], name=config['name']) - @classmethod def load_mat(cls, file_path): with h5py.File(file_path, "r") as f: diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index 63da2493..0d748249 100644 --- a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -22,6 +22,9 @@ def _l2_norm(x, axis=-1): class ArcLossOutput(nn.Module): """Additive angular margin softmax (ArcFace) output layer. + It includes the option to also use InterTopK penalty: + https://arxiv.org/abs/2109.01989 + Attributes: in_feats: input feature dimension. num_classes: number of output classes. @@ -29,10 +32,19 @@ class ArcLossOutput(nn.Module): margin: angular margin. margin_warmup_epochs: number of epochs to warm up the margin from 0 to its final value. + intertop_k: adds negative angular penalty to k largest negative scores. + intertop_margin: inter-top-k penalty. """ def __init__( - self, in_feats, num_classes, cos_scale=64, margin=0.3, margin_warmup_epochs=0 + self, + in_feats, + num_classes, + cos_scale=64, + margin=0.3, + margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0, ): super().__init__() self.in_feats = in_feats @@ -40,10 +52,14 @@ def __init__( self.cos_scale = cos_scale self.margin = margin self.margin_warmup_epochs = margin_warmup_epochs + self.intertop_k = intertop_k + self.intertop_margin = intertop_margin if margin_warmup_epochs == 0: self.cur_margin = margin + self.cur_intertop_margin = intertop_margin else: self.cur_margin = 0 + self.cur_intertop_margin = 0 self._compute_aux() @@ -54,20 +70,28 @@ def __repr__(self): return self.__str__() def __str__(self): - s = "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d)" % ( + s = "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" % ( self.__class__.__name__, self.in_feats, self.num_classes, self.cos_scale, self.margin, self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, ) return s def _compute_aux(self): - logging.info("updating arc-softmax margin=%.2f" % (self.cur_margin)) + logging.info( + "updating arc-softmax margin=%.2f intertop-margin=%.2f", + self.cur_margin, + self.cur_intertop_margin, + ) self.cos_m = math.cos(self.cur_margin) self.sin_m = math.sin(self.cur_margin) + self.intertop_cos_m = math.cos(self.cur_intertop_margin) + self.intertop_sin_m = math.sin(self.cur_intertop_margin) def update_margin(self, epoch): """Updates the value of the margin. 
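The _compute_aux and forward code above relies on the angle-addition identity cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m), which lets the layer apply the angular margin without ever computing theta. A quick numeric check (not part of the patch):

```python
import math

theta, m = 0.9, 0.3
lhs = math.cos(theta + m)
rhs = math.cos(theta) * math.cos(m) - math.sin(theta) * math.sin(m)
assert abs(lhs - rhs) < 1e-12  # identity used by cos_theta_m in forward()
```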
@@ -80,9 +104,13 @@ def update_margin(self, epoch): if epoch < self.margin_warmup_epochs: self.cur_margin = self.margin * epoch / self.margin_warmup_epochs + self.cur_intertop_margin = ( + self.intertop_margin * epoch / self.margin_warmup_epochs + ) else: if self.cur_margin != self.margin: self.cur_margin = self.margin + self.cur_intertop_margin = self.intertop_margin else: return @@ -117,7 +145,35 @@ def forward(self, x, y=None): cos_theta_m = cos_theta * self.cos_m - sin_theta * self.sin_m idx_ = torch.arange(0, batch_size, dtype=torch.long) + # if torch.distributed.get_rank() == 0: + # print("o1", output[idx_, y]) output[idx_, y] = cos_theta_m[idx_, y] + # if torch.distributed.get_rank() == 0: + # print("o2", output[idx_, y]) + if self.cur_intertop_margin > 0: + # implementation of intertop-K + # set positive scores to -inf so they don't appear in the top k + cos_aux = cos_theta * 1 + cos_aux[idx_, y] = -1e10 + # find topk indices for negative samples + topk = torch.topk(cos_aux, k=self.intertop_k, dim=-1, sorted=False) + idx_ = ( + idx_.unsqueeze(-1).expand(batch_size, self.intertop_k).flatten() + ) + topk_idx = topk.indices.flatten() + # compute cos(theta-m') + cos_theta_m = ( + cos_theta[idx_, topk_idx] * self.intertop_cos_m + + sin_theta[idx_, topk_idx] * self.intertop_sin_m + ) + # take the maximum for the cases where m' is larger than theta to get cos(max(0, theta-m')) + # if torch.distributed.get_rank() == 0: + # print("o3", output[idx_, topk_idx]) + output[idx_, topk_idx] = torch.maximum( + output[idx_, topk_idx], cos_theta_m + ) + # if torch.distributed.get_rank() == 0: + # print("o4", output[idx_, topk_idx], flush=True) output *= s # scale up in order to make softmax work return output @@ -133,10 +189,19 @@ class CosLossOutput(nn.Module): margin: angular margin. margin_warmup_epochs: number of epochs to warm up the margin from 0 to its final value. + intertop_k: adds negative angular penalty to k largest negative scores. + intertop_margin: inter-top-k penalty. """ def __init__( - self, in_feats, num_classes, cos_scale=64, margin=0.3, margin_warmup_epochs=0 + self, + in_feats, + num_classes, + cos_scale=64, + margin=0.3, + margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, ): super().__init__() self.in_feats = in_feats @@ -144,14 +209,34 @@ def __init__( self.cos_scale = cos_scale self.margin = margin self.margin_warmup_epochs = margin_warmup_epochs + self.intertop_k = intertop_k + self.intertop_margin = intertop_margin if margin_warmup_epochs == 0: self.cur_margin = margin + self.cur_intertop_margin = intertop_margin else: self.cur_margin = 0 + self.cur_intertop_margin = 0 self.kernel = nn.Parameter(torch.Tensor(in_feats, num_classes)) self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5) + def __repr__(self): + return self.__str__() + + def __str__(self): + s = "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" % ( + self.__class__.__name__, + self.in_feats, + self.num_classes, + self.cos_scale, + self.margin, + self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, + ) + return s + def update_margin(self, epoch): """Updates the value of the margin. 
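A toy illustration (not part of the patch) of the InterTopK selection used in the forward hunks above: mask out the target column, then take the k largest remaining cosines as the hardest negatives for each sample:

```python
import torch

cos_theta = torch.tensor([[0.9, 0.2, 0.8, 0.1],
                          [0.1, 0.7, 0.3, 0.6]])
y = torch.tensor([0, 1])                 # target class per sample
k = 2

cos_aux = cos_theta.clone()
cos_aux[torch.arange(2), y] = -1e10      # exclude target scores from the top-k
topk = torch.topk(cos_aux, k=k, dim=-1)
print(topk.indices)                      # hardest negatives per sample
# tensor([[2, 1],
#         [3, 2]])
```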
@@ -163,11 +248,23 @@ def update_margin(self, epoch): if epoch < self.margin_warmup_epochs: self.cur_margin = self.margin * epoch / self.margin_warmup_epochs - logging.info("updating cos-softmax margin=%.2f" % (self.cur_margin)) + logging.info( + "updating cos-softmax margin=%.2f intertop-margin=%.2f", + self.cur_margin, + self.cur_intertop_margin, + ) + self.cur_intertop_margin = ( + self.intertop_margin * epoch / self.margin_warmup_epochs + ) else: if self.cur_margin != self.margin: self.cur_margin = self.margin - logging.info("updating cos-softmax margin=%.2f" % (self.cur_margin)) + self.cur_intertop_margin = self.intertop_margin + logging.info( + "updating cos-softmax margin=%.2f intertop-margin=%.2f", + self.cur_margin, + self.cur_intertop_margin, + ) else: return @@ -198,6 +295,21 @@ def forward(self, x, y=None): cos_theta_m = cos_theta - self.cur_margin idx_ = torch.arange(0, batch_size, dtype=torch.long) output[idx_, y] = cos_theta_m[idx_, y] + if self.cur_intertop_margin > 0: + # implementation of intertop-K + # set positive scores to -inf so they don't appear in the top k + cos_aux = cos_theta * 1 + cos_aux[idx_, y] = -1e10 + # find topk indices for negative samples + topk = torch.topk(cos_aux, k=self.intertop_k, dim=-1, sorted=False) + idx_ = ( + idx_.unsqueeze(-1).expand(batch_size, self.intertop_k).flatten() + ) + topk_idx = topk.indices.flatten() + # compute cos(theta) + m' + cos_theta_m = cos_theta[idx_, topk_idx] + self.cur_intertop_margin + # clamp so cos cannt be larger than 1. + output[idx_, topk_idx] = cos_theta_m.clamp(max=1.0) output *= s # scale up in order to make softmax work return output @@ -214,6 +326,8 @@ class SubCenterArcLossOutput(ArcLossOutput): margin: angular margin. margin_warmup_epochs: number of epochs to warm up the margin from 0 to its final value. + intertop_k: adds negative angular penalty to k largest negative scores. + intertop_margin: inter-top-k penalty. 
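For the CosFace-style variant above, the inter-top-k penalty is additive and the result is clamped, since a cosine cannot exceed 1. A small numeric sketch (not part of the patch):

```python
import torch

cos_neg = torch.tensor([0.95, 0.40])     # selected hardest-negative cosines
intertop_margin = 0.1
penalized = (cos_neg + intertop_margin).clamp(max=1.0)
print(penalized)                         # tensor([1.0000, 0.5000])
```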
""" def __init__( @@ -224,6 +338,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, ): super().__init__( in_feats, @@ -231,12 +347,14 @@ def __init__( cos_scale, margin, margin_warmup_epochs, + intertop_k, + intertop_margin, ) self.num_classes = num_classes self.num_subcenters = num_subcenters def __str__(self): - s = "%s(in_feats=%d, num_classes=%d, num_subcenters=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d)" % ( + s = "%s(in_feats=%d, num_classes=%d, num_subcenters=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" % ( self.__class__.__name__, self.in_feats, self.num_classes, @@ -244,6 +362,8 @@ def __str__(self): self.cos_scale, self.margin, self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, ) return s @@ -283,6 +403,26 @@ def forward(self, x, y=None): idx_ = torch.arange(0, batch_size, dtype=torch.long) output[idx_, y] = cos_theta_m[idx_, y] + if self.cur_intertop_margin > 0: + # implementation of intertop-K + # set positive scores to -inf so they don't appear in the top k + cos_aux = cos_theta * 1 + cos_aux[idx_, y] = -1e10 + # find topk indices for negative samples + topk = torch.topk(cos_aux, k=self.intertop_k, dim=-1, sorted=False) + idx_ = ( + idx_.unsqueeze(-1).expand(batch_size, self.intertop_k).flatten() + ) + topk_idx = topk.indices.flatten() + # compute cos(theta-m') + cos_theta_m = ( + cos_theta[idx_, topk_idx] * self.intertop_cos_m + + sin_theta[idx_, topk_idx] * self.intertop_sin_m + ) + # take the maximum for the cases where m' is larger than theta to get cos(max(0, theta-m')) + output[idx_, topk_idx] = torch.maximum( + output[idx_, topk_idx], cos_theta_m + ) output *= s # scale up in order to make softmax work return output diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index cb8ff1d0..d79d5a26 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -36,7 +36,7 @@ def __init__( self.xvector = xvector self.feat_fusion_start = feat_fusion_start self.feat_fusion_method = feat_fusion_method - self._hf_context = contextlib.nullcontext + self._hf_context = contextlib.nullcontext() self._make_fuser() def _make_fuser(self): @@ -244,7 +244,7 @@ def set_train_mode(self, mode): logging.info("using torch.no_grad for hf_feats") self._hf_context = torch.no_grad() else: - self._hf_context = contextlib.nullcontext + self._hf_context = contextlib.nullcontext() self._train_mode = mode diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py b/hyperion/torch/models/xvectors/efficient_net_xvector.py index 606a9554..21eb9dbe 100644 --- a/hyperion/torch/models/xvectors/efficient_net_xvector.py +++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py @@ -42,6 +42,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, drop_connect_rate=0.2, dropout_rate=0, @@ -88,6 +90,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index 706ee4ef..e4495182 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ 
b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -26,6 +26,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0, norm_layer=None, @@ -52,6 +54,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, @@ -62,58 +66,6 @@ def __init__( proj_feats=proj_feats, ) - # @property - # def in_channels(self): - # return self.encoder_net.in_channels - - # @property - # def conv_channels(self): - # return self.encoder_net.conv_channels - - # @property - # def base_channels(self): - # return self.encoder_net.base_channels - - # @property - # def in_kernel_size(self): - # return self.encoder_net.in_kernel_size - - # @property - # def in_stride(self): - # return self.encoder_net.in_stride - - # @property - # def zero_init_residual(self): - # return self.encoder_net.zero_init_residual - - # @property - # def groups(self): - # return self.encoder_net.groups - - # @property - # def replace_stride_with_dilation(self): - # return self.encoder_net.replace_stride_with_dilation - - # @property - # def do_maxpool(self): - # return self.encoder_net.do_maxpool - - # @property - # def in_norm(self): - # return self.encoder_net.in_norm - - # @property - # def se_r(self): - # return self.encoder_net.se_r - - # @property - # def res2net_scale(self): - # return self.encoder_net.res2net_scale - - # @property - # def res2net_width_factor(self): - # return self.encoder_net.res2net_width_factor - def get_config(self): base_config = super().get_config() diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index 58a34c94..99385cae 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -36,6 +36,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0, norm_layer=None, @@ -84,6 +86,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index d3a22bce..676952da 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -40,6 +40,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0, norm_layer=None, @@ -92,6 +94,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index a0211f87..7816c7ea 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -33,6 +33,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0, norm_layer=None, @@ -73,6 +75,8 @@ def __init__( 
cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index 1eaa03b6..742fadc8 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -73,6 +73,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, dropout_rate=0.1, pos_dropout_rate=0.1, @@ -118,6 +120,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=norm_layer, head_norm_layer=head_norm_layer, diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 008f595c..8c2070b5 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -38,6 +38,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, norm_layer=None, head_norm_layer=None, @@ -120,6 +122,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, num_subcenters=num_subcenters, norm_layer=head_norm_layer, use_norm=use_norm, @@ -163,6 +167,14 @@ def margin(self): def margin_warmup_epochs(self): return self.classif_net.margin_warmup_epochs + @property + def intertop_k(self): + return self.classif_net.intertop_k + + @property + def intertop_margin(self): + return self.classif_net.intertop_margin + @property def num_subcenters(self): return self.classif_net.num_subcenters @@ -490,6 +502,8 @@ def get_config(self): "cos_scale": self.cos_scale, "margin": self.margin, "margin_warmup_epochs": self.margin_warmup_epochs, + "intertop_k": self.intertop_k, + "intertop_margin": self.intertop_margin, "num_subcenters": self.num_subcenters, "norm_layer": self.norm_layer, "head_norm_layer": self.head_norm_layer, @@ -560,6 +574,7 @@ def set_train_mode(self, mode): elif mode == "frozen": self.freeze() elif mode == "ft-embed-affine": + self.unfreeze() self.freeze_preembed_layers() else: raise ValueError(f"invalid train_mode={mode}") @@ -581,7 +596,8 @@ def _train(self, train_mode: str): else: raise ValueError(f"invalid train_mode={train_mode}") - def valid_train_modes(self): + @staticmethod + def valid_train_modes(): return ["full", "frozen", "ft-embed-affine"] @staticmethod @@ -607,6 +623,8 @@ def filter_args(**kwargs): "cos_scale", "margin", "margin_warmup_epochs", + "intertop_k", + "intertop_margin", "num_subcenters", "use_norm", "norm_before", @@ -670,6 +688,16 @@ def add_class_args(parser, prefix=None, skip=set()): help="number of epoch until we set the final margin", ) + parser.add_argument( + "--intertop-k", default=5, type=int, help="K for InterTopK penalty" + ) + parser.add_argument( + "--intertop-margin", + default=0.0, + type=float, + help="margin for InterTopK penalty", + ) + parser.add_argument( "--num-subcenters", default=2, @@ -760,9 +788,15 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - valid_args = ("loss_type", "cos_scale", "margin", "margin_warmup_epochs") + valid_args = ( + 
"loss_type", + "cos_scale", + "margin", + "margin_warmup_epochs", + "intertop_k", + "intertop_margin", + ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args @staticmethod @@ -793,6 +827,16 @@ def add_finetune_args(parser, prefix=None): help="number of epoch until we set the final margin", ) + parser.add_argument( + "--intertop-k", default=5, type=int, help="K for InterTopK penalty" + ) + parser.add_argument( + "--intertop-margin", + default=0.0, + type=float, + help="margin for InterTopK penalty", + ) + parser.add_argument( "--num-subcenters", default=2, diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 5824cb1b..06bd988c 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -29,6 +29,8 @@ class ClassifHead(NetArch): cos_scale: scale parameter for cos-softmax and arc-softmax margin: margin parameter for cos-softmax and arc-softmax margin_warmup_epochs: number of epochs to anneal the margin from 0 to margin + intertop_k: adds negative angular penalty to k largest negative scores. + intertop_margin: inter-top-k penalty. num_subcenters: number of subcenters in subcenter losses norm_layer: norm_layer object or str indicating type norm layer, if None it uses BatchNorm1d use_norm: it True it uses layer/batch-normalization @@ -46,6 +48,8 @@ def __init__( cos_scale=64, margin=0.3, margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, norm_layer=None, use_norm=True, @@ -78,6 +82,8 @@ def __init__( self.cos_scale = cos_scale self.margin = margin self.margin_warmup_epochs = margin_warmup_epochs + self.intertop_k = intertop_k + self.intertop_margin = intertop_margin self.num_subcenters = num_subcenters prev_feats = in_feats @@ -124,6 +130,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) elif loss_type == "arc-softmax": self.output = ArcLossOutput( @@ -132,6 +140,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) elif loss_type == "subcenter-arc-softmax": self.output = SubCenterArcLossOutput( @@ -141,6 +151,8 @@ def __init__( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) def rebuild_output_layer( @@ -150,6 +162,8 @@ def rebuild_output_layer( cos_scale, margin, margin_warmup_epochs, + intertop_k=5, + intertop_margin=0.0, num_subcenters=2, ): @@ -159,6 +173,8 @@ def rebuild_output_layer( self.cos_scale = cos_scale self.margin = margin self.margin_warmup_epochs = margin_warmup_epochs + self.intertop_margin = intertop_margin + self.num_subcenters = num_subcenters self.num_subcenters = num_subcenters if loss_type == "softmax": @@ -170,6 +186,8 @@ def rebuild_output_layer( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) elif loss_type == "arc-softmax": self.output = ArcLossOutput( @@ -178,6 +196,8 @@ def rebuild_output_layer( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, ) elif loss_type == "subcenter-arc-softmax": self.output = SubCenterArcLossOutput( @@ -187,6 +207,8 @@ def rebuild_output_layer( cos_scale=cos_scale, margin=margin, margin_warmup_epochs=margin_warmup_epochs, 
+ intertop_k=intertop_k, + intertop_margin=intertop_margin, ) def set_margin(self, margin): @@ -281,6 +303,8 @@ def get_config(self): "cos_scale": self.cos_scale, "margin": self.margin, "margin_warmup_epochs": self.margin_warmup_epochs, + "intertop_k": self.intertop_k, + "intertop_margin": self.intertop_margin, "num_subcenters": self.num_subcenters, "norm_layer": self.norm_layer, "use_norm": self.use_norm, @@ -311,6 +335,8 @@ def filter_args(**kwargs): "s", "margin", "margin_warmup_epochs", + "intertop_k", + "intertop_margin", "num_subcenters", "use_norm", "norm_before", @@ -362,6 +388,16 @@ def add_class_args(parser, prefix=None): help="number of epoch until we set the final margin", ) + parser.add_argument( + "--intertop-k", default=5, type=int, help="K for InterTopK penalty" + ) + parser.add_argument( + "--intertop-margin", + default=0.0, + type=float, + help="margin for InterTopK penalty", + ) + parser.add_argument( "--num-subcenters", default=2, diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 8dfad9ce..4e29dab5 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -586,8 +586,11 @@ def load_checkpoint(self, file_path): logs = checkpoint["logs"] del checkpoint - if self.device is not None: - torch.cuda.empty_cache() + # this was added before to try to release as much GPU memory as possible + # Recently has started to cause CUDA not available devices error + # Commenting for now. + # if self.device is not None: + # torch.cuda.empty_cache() return logs diff --git a/notebooks/tutorial_jsalt22/ivectors.ipynb b/notebooks/tutorial_jsalt22/ivectors.ipynb new file mode 100644 index 00000000..46d4eb61 --- /dev/null +++ b/notebooks/tutorial_jsalt22/ivectors.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# i-Vectors Tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hpath='/exp/jvillalba/hyperion/hyperion-persephone'\n", + "import sys\n", + "sys.path.append(hpath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import hyperion as hyp\n", + "import hyperion.np as hnp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_data(num_dims, num_spks=10, num_utts=10, num_units=10, unit_length=10, tv_dim=2):\n", + " \"\"\" Generate data following the i-vector model\n", + "\n", + " Args:\n", + " num_dims: number of dimensions of the features.\n", + " num_spks: number of speakers.\n", + " num_utts: number of utterances per speaker.\n", + " num_units: number of phonetic units per utterance.\n", + " unit_length: duration of each phonetic unit.\n", + " \"\"\"\n", + " rng = np.random.RandomState(seed=1234)\n", + " # we set the number of phonetic classes to 2^num_dim\n", + " num_comp = 2**num_dims\n", + " \n", + " # Define UBM\n", + " # Means of the GMM-UBM\n", + " ubm_means = np.zeros((num_comp, num_dims))\n", + " kernel=np.array([1.,-1.])[:,None]\n", + " ubm_means = kernel\n", + " for i in range(1,num_dims):\n", + " ubm_means = np.concatenate((np.repeat(kernel, int(2**i), axis=0), np.tile(ubm_means,(2,1))), axis=1)\n", + " \n", + " # Covariances of the GMM-UBM\n", + " ubm_cov = 0.1 * np.ones((num_comp, num_dims))\n", + " ubm_prec = 1./ubm_cov\n", + "\n", + " # Weights of 
the GMM-UBM\n", + " ubm_weights = np.ones((num_comp))/num_comp\n", + " \n", + "\n", + " # Define between and within speaker covariances\n", + " sb = 0.7\n", + " sw = 0.3\n", + "\n", + " # Define Total Variability sub-space\n", + " T = rng.randn(tv_dim, num_dims * num_comp)\n", + " T = 0.2 * T/np.max(T)\n", + " \n", + " # Sample speakers\n", + " spk_ids = np.arange(num_spks)\n", + " y = sb * rng.randn(num_spks, tv_dim)\n", + "\n", + " # Sample i-vectors\n", + " spk_ids = np.repeat(spk_ids, num_utts, axis=0)\n", + " y = np.repeat(y, num_utts, axis=0)\n", + " w = y + sw * rng.randn(num_spks*num_utts, tv_dim)\n", + "\n", + " x = []\n", + " r_idx = []\n", + " # Sample features\n", + " for i in range(w.shape[0]):\n", + " # For each utterance\n", + " # Compute the GMM mean of the utterance\n", + " means_i = ubm_means + np.dot(w[i],T).reshape(num_dims,num_comp).T\n", + "\n", + " # Create a GMM for the utterance.\n", + " gmm = hnp.pdfs.GMMDiagCov(pi=ubm_weights, mu=means_i, Lambda=ubm_prec)\n", + "\n", + " # Sample the Gaussian components\n", + " r_i = rng.multinomial(1, ubm_weights, size=(num_units,))\n", + " # Assume that we stay in the same component several time steps.\n", + " r_i = np.repeat(r_i, unit_length, axis=0)\n", + " # Draw samples from the GMM\n", + " x_i = gmm.sample(r=r_i)\n", + " x.append(x_i)\n", + " r_idx.append(r_i.argmax(axis=-1))\n", + "\n", + " return x, r_idx, spk_ids\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x, r_idx, spk_ids =generate_data(num_dims=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_cat=np.concatenate(x, axis=0)\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(projection='3d')\n", + "ax.scatter(x_cat[:,0], x_cat[:,1], x_cat[:,2], marker='o')\n", + "ax.set_xlabel('x1')\n", + "ax.set_ylabel('x2')\n", + "ax.set_zlabel('x3')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ubm_gmm = hnp.pdfs.GMMDiagCov(num_comp=8, x_dim=3)\n", + "elbo, elbo_norm = ubm_gmm.fit(x_cat, epochs=10)\n", + "fig = plt.figure()\n", + "plt.plot(elbo_norm)\n", + "plt.xlabel('iters')\n", + "plt.ylabel('log(p(x))')\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ubm_gmm.mu\n", + "ubm_gmm.pi\n", + "ubm_gmm.Sigma\n", + "fig=plt.figure()\n", + "ax=fig.add_subplot(111, projection=\"3d\")\n", + "ubm_gmm.plot3D_ellipsoid(num_sigmas=1, ax=ax)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "iv_model = hnp.pdfs.JFATotal(K=8, x_dim=3, y_dim=2)\n", + "N=[]\n", + "F=[]\n", + "for x_i in x:\n", + " N_i, u_x_i = ubm_gmm.accum_suff_stats(x_i)\n", + " N_i, F_i = ubm_gmm.norm_suff_stats(N_i, u_x_i)\n", + " N.append(N_i.reshape(1,-1))\n", + " F.append(F_i.reshape(1,-1))\n", + "\n", + "N = np.concatenate(N, axis=0)\n", + "F = np.concatenate(F, axis=0)\n", + "\n", + "elbo, elbo_norm = iv_model.fit(N, F)\n", + "fig = plt.figure()\n", + "plt.plot(elbo_norm)\n", + "plt.xlabel('iters')\n", + "plt.ylabel('log(p(x))')\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "F.shape\n", + "\n" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "488a239b304e646027d6710c3377746db4487e56624448f35f81edd765904a6d" + }, + "kernelspec": { + 
"display_name": "Python 3.8.12 ('py38_pt101_cu112')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 25bbd0e652c1de7cf1f3fb214604028cc2176df9 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 2 Jun 2022 18:57:48 -0400 Subject: [PATCH 015/154] documented most np models --- hyperion/np/transforms/cent_whiten.py | 68 +++++++++++++- hyperion/np/transforms/cent_whiten_up.py | 44 ++++++++- hyperion/np/transforms/coral.py | 80 ++++++++++++++-- hyperion/np/transforms/gaussianizer.py | 74 +++++++++++++-- hyperion/np/transforms/lda.py | 72 ++++++++++++++- hyperion/np/transforms/lnorm.py | 9 +- hyperion/np/transforms/lnorm_up.py | 11 ++- hyperion/np/transforms/mvn.py | 60 +++++++++++- hyperion/np/transforms/nap.py | 90 ++++++++++++++---- hyperion/np/transforms/nda.py | 111 +++++++++++++++++++++-- hyperion/np/transforms/pca.py | 72 ++++++++++++++- hyperion/np/transforms/sb_sw.py | 32 ++++++- hyperion/np/transforms/skl_tsne.py | 50 ++++++++++ hyperion/np/transforms/transform_list.py | 43 ++++++++- 14 files changed, 749 insertions(+), 67 deletions(-) diff --git a/hyperion/np/transforms/cent_whiten.py b/hyperion/np/transforms/cent_whiten.py index f1cdf227..e700dbe8 100644 --- a/hyperion/np/transforms/cent_whiten.py +++ b/hyperion/np/transforms/cent_whiten.py @@ -13,7 +13,14 @@ class CentWhiten(NPModel): - """Class to do centering and whitening of i-vectors.""" + """Class to do centering and whitening of i-vectors. + + Attributes: + mu: data mean vector + T: whitening projection. + update_mu: whether or not to update the mean when training. + update_T: wheter or not to update T when training. + """ def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): super().__init__(**kwargs) @@ -22,18 +29,55 @@ def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): self.update_mu = update_mu self.update_T = update_T + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ if self.mu is not None: x = x - self.mu if self.T is not None: if self.T.ndim == 1: - x = x * T + x = x * self.T else: x = np.dot(x, self.T) return x def fit(self, x=None, sample_weight=None, mu=None, S=None): - + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + sample_weight: weight for each training sample. + mu: precomputed mean (used if x is None). + S: precomputed convariances (used if x is None). 
+ """ if x is not None: if x.shape[0] > x.shape[1]: gauss = Normal(x_dim=x.shape[1]) @@ -62,19 +106,35 @@ def fit(self, x=None, sample_weight=None, mu=None, S=None): self.T = V def get_config(self): + """Returns the model configuration dict.""" config = {"update_mu": self.update_mu, "update_t": self.update_T} base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ params = {"mu": self.mu, "T": self.T} self._save_params_from_dict(f, params) @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ param_list = ["mu", "T"] params = cls._load_params_to_dict(f, config["name"], param_list) - return cls(mu=params["mu"], T=params["T"], name=config["name"]) + return cls(mu=params["mu"], T=params["T"], **config) @classmethod def load_mat(cls, file_path): diff --git a/hyperion/np/transforms/cent_whiten_up.py b/hyperion/np/transforms/cent_whiten_up.py index 1200e61b..9290eae6 100644 --- a/hyperion/np/transforms/cent_whiten_up.py +++ b/hyperion/np/transforms/cent_whiten_up.py @@ -14,12 +14,49 @@ class CentWhitenUP(CentWhiten): - """Class to do centering and whitening with uncertainty propagation.""" + """Class to do centering and whitening with uncertainty propagation. + + Attributes: + mu: data mean vector + T: whitening projection. + update_mu: whether or not to update the mean when training. + update_T: wheter or not to update T when training. + """ def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): super().__init__(mu, T, update_mu, update_T, **kwargs) + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ x_dim = int(x.shape[-1] / 2) m_x = x[:, :x_dim] s2_x = x[:, x_dim:] @@ -29,5 +66,10 @@ def predict(self, x): return np.hstack((m_x, s2_x)) def fit(self, x, sample_weight=None): + """Trains the transformation parameters. + + Args: + x: training samples with shape (num_samples, x_dim) + """ x = x[:, : int(x.shape[-1] / 2)] super().fit(x, sample_weight=sample_weight) diff --git a/hyperion/np/transforms/coral.py b/hyperion/np/transforms/coral.py index 9aee7579..54bd27bc 100644 --- a/hyperion/np/transforms/coral.py +++ b/hyperion/np/transforms/coral.py @@ -12,7 +12,19 @@ class CORAL(NPModel): - """Class to do CORAL""" + """Class to do CORAL. + + https://arxiv.org/abs/1612.01939 + + Attributes: + mu: mean shift between both domains. + T_col: recoloring projection. + T_white: whitening projection. + update_mu: whether or not to update mu when training. + update_T: wheter or not to update T_col and T_white when training. + alpha_mu: weight of the in-domain data when computing in-domain mean. + alpha_T: weight of the in-domain data when computing in-domain covariance. 
+ """ def __init__( self, @@ -25,7 +37,7 @@ def __init__( alpha_T=1, **kwargs ): - super(CORAL, self).__init__(**kwargs) + super().__init__(**kwargs) self.mu = mu self.T_col = T_col self.T_white = T_white @@ -36,19 +48,51 @@ def __init__( self.alpha_T = alpha_T def get_config(self): + """Returns the model configuration dict.""" config = { "update_mu": self.update_mu, "update_t": self.update_T, - "pca_dim": self.pca_dim, + "alpha_mu": self.alpha_mu, + "alpha_T": self.alpha_T, } - base_config = super(CORAL, self).get_config() + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def _compute_T(self): if self.T_col is not None and self.T_white is not None: self.T = np.dot(self.T_white, self.T_col) + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ if self.T is None: self._compute_T() if self.mu is not None: @@ -60,7 +104,14 @@ def predict(self, x): return x def fit(self, x, sample_weight=None, x_out=None, sample_weight_out=None): - + """Trains the model. + + Args: + x: in-domain data samples with shape (num_samples, x_dim). + sample_weight: weight for each in-domain training sample. + x_out: out-domain data samples with shape (num_samples, x_dim). + sample_weight_out: weight for each out-domain training sample. + """ if x_out is None: assert self.T_white is not None else: @@ -88,21 +139,34 @@ def fit(self, x, sample_weight=None, x_out=None, sample_weight_out=None): @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ param_list = ["mu", "T_col", "T_white"] params = cls._load_params_to_dict(f, config["name"], param_list) return cls( mu=params["mu"], T_col=params["T_col"], T_white=params["T_white"], - name=config["name"], + **config, ) def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ params = { "mu": self.mu, "T_col": self.T_col, "T_white": self.T_white, - "alpha_mu": self.alpha_mu, - "alpha_T": self.alpha_T, } self._save_params_from_dict(f, params) diff --git a/hyperion/np/transforms/gaussianizer.py b/hyperion/np/transforms/gaussianizer.py index 26294134..393364b6 100644 --- a/hyperion/np/transforms/gaussianizer.py +++ b/hyperion/np/transforms/gaussianizer.py @@ -15,18 +15,56 @@ class Gaussianizer(NPModel): - """Class to make i-vector distribution standard Normal.""" + """Class to make i-vector distribution standard Normal. + + Args: + max_vectors: maximum number of background vectors needed to + compute the Gaussianization. + r: background vector matrix obtained by fit function. + """ def __init__(self, max_vectors=None, r=None, **kwargs): - super(Gaussianizer, self).__init__(**kwargs) + super().__init__(**kwargs) self.max_vectors = max_vectors self.r = r + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. 
+ """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + def predict(self, x): - px_cum = np.linspace(0, 1, self.r.shape[0] + 2)[1:-1] + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + # px_cum = np.linspace(0, 1, self.r.shape[0] + 2)[1:-1] + px_cum = np.linspace(0, 1, self.r.shape[0] + 3)[1:-1] y_map = erfinv(2 * px_cum - 1) * np.sqrt(2) - r = self.r[1:] + # r = self.r[1:] + r = self.r y = np.zeros_like(x) for i in range(x.shape[1]): y_index = np.searchsorted(r[:, i], x[:, i]) @@ -36,10 +74,13 @@ def predict(self, x): return y def fit(self, x): + """Trains the model. + Args: + x: training data samples with shape (num_samples, x_dim). + """ r = np.sort(x, axis=0, kind="heapsort") - - x = np.zeros((1, x.shape[-1]), dtype=float_cpu()) + # x = np.zeros((1, x.shape[-1]), dtype=float_cpu()) if r.shape[0] > self.max_vectors: index = np.round( @@ -47,20 +88,37 @@ def fit(self, x): ).astype(int) r = r[index, :] - self.r = np.vstack((x, r)) + # self.r = np.vstack((x, r)) + self.r = r def get_config(self): + """Returns the model configuration dict.""" config = {"max_vectors": self.max_vectors} - base_config = super(Gaussianizer, self).get_config() + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ params = {"r": self.r} self._save_params_from_dict(f, params) @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ param_list = ["r"] params = cls._load_params_to_dict(f, config["name"], param_list) return cls( diff --git a/hyperion/np/transforms/lda.py b/hyperion/np/transforms/lda.py index 5644a2a3..b4f5cbc8 100644 --- a/hyperion/np/transforms/lda.py +++ b/hyperion/np/transforms/lda.py @@ -13,12 +13,20 @@ class LDA(NPModel): - """Class to do linear discriminant analysis.""" + """Class to do linear discriminant analysis. + + Attributes: + mu: data mean vector + T: LDA projection. + lda_dim: LDA dimension. + update_mu: whether or not to update the mean when training. + update_T: wheter or not to update T when training. + """ def __init__( self, mu=None, T=None, lda_dim=None, update_mu=True, update_T=True, **kwargs ): - super(LDA, self).__init__(**kwargs) + super().__init__(**kwargs) self.mu = mu self.T = T if T is None: @@ -28,13 +36,51 @@ def __init__( self.update_mu = update_mu self.update_T = update_T + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ if self.mu is not None: x = x - self.mu return np.dot(x, self.T) def fit(self, x, y, mu=None, Sb=None, Sw=None): - + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). 
+          y: training labels as integers in [0, num_classes-1] with shape (num_samples,).
+          mu: precomputed mean.
+          Sb: precomputed between-class covariance.
+          Sw: precomputed within-class covariance.
+        """
         if mu is None or Sb is None or Sw is None:
             sbsw = SbSw()
             sbsw.fit(x, y)
@@ -67,23 +113,39 @@ def fit(self, x, y, mu=None, Sb=None, Sw=None):
         self.T = V
 
     def get_config(self):
+        """Returns the model configuration dict."""
         config = {
             "lda_dim": self.lda_dim,
             "update_mu": self.update_mu,
             "update_t": self.update_T,
         }
-        base_config = super(LDA, self).get_config()
+        base_config = super().get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
     def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+          f: file handle.
+        """
         params = {"mu": self.mu, "T": self.T}
         self._save_params_from_dict(f, params)
 
     @classmethod
     def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from file.
+
+        Args:
+          f: file handle.
+          config: configuration dictionary.
+
+        Returns:
+          Model object.
+        """
         param_list = ["mu", "T"]
         params = cls._load_params_to_dict(f, config["name"], param_list)
-        return cls(mu=params["mu"], T=params["T"], name=config["name"])
+        return cls(mu=params["mu"], T=params["T"], **config)
 
     @classmethod
     def load_mat(cls, file_path):
diff --git a/hyperion/np/transforms/lnorm.py b/hyperion/np/transforms/lnorm.py
index 088748b2..9b4f36fe 100644
--- a/hyperion/np/transforms/lnorm.py
+++ b/hyperion/np/transforms/lnorm.py
@@ -9,7 +9,14 @@
 
 
 class LNorm(CentWhiten):
-    """Class to do length normalization."""
+    """Class to do length normalization.
+
+    Attributes:
+      mu: data mean vector.
+      T: whitening projection.
+      update_mu: whether or not to update the mean when training.
+      update_T: whether or not to update T when training.
+    """
 
     def predict(self, x):
         x = super().predict(x)
diff --git a/hyperion/np/transforms/lnorm_up.py b/hyperion/np/transforms/lnorm_up.py
index b6e211d5..0814f9fe 100644
--- a/hyperion/np/transforms/lnorm_up.py
+++ b/hyperion/np/transforms/lnorm_up.py
@@ -10,10 +10,17 @@
 
 
 class LNormUP(CentWhitenUP):
-    """Class to do Lenght Normalization with uncertainty propagation"""
+    """Class to do Length Normalization with uncertainty propagation.
+
+    Attributes:
+      mu: data mean vector.
+      T: whitening projection.
+      update_mu: whether or not to update the mean when training.
+      update_T: whether or not to update T when training.
+    """
 
     def predict(self, x):
-        x = super(LNormUP, self).predict(x)
+        x = super().predict(x)
         x_dim = int(x.shape[-1] / 2)
         m_x = x[:, :x_dim]
         s2_x = x[:, x_dim:]
diff --git a/hyperion/np/transforms/mvn.py b/hyperion/np/transforms/mvn.py
index 7f60206e..484a6913 100644
--- a/hyperion/np/transforms/mvn.py
+++ b/hyperion/np/transforms/mvn.py
@@ -12,14 +12,50 @@
 
 
 class MVN(NPModel):
-    """Class to do global mean and variance normalization."""
+    """Class to do global mean and variance normalization.
+
+    Attributes:
+      mu: data mean vector.
+      s: standard deviation vector.
+
+    """
 
     def __init__(self, mu=None, s=None, **kwargs):
-        super(MVN, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.mu = mu
         self.s = s
 
+    def __call__(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
+        return self.predict(x)
+
+    def forward(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
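+
+        Example (an illustrative sketch):
+
+            >>> import numpy as np
+            >>> x = 5.0 + 2.0 * np.random.randn(1000, 40)
+            >>> mvn = MVN()
+            >>> mvn.fit(x)
+            >>> y = mvn(x)  # approximately zero-mean, unit-variance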
+ """ + return self.predict(x) + def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ if self.mu is not None: x = x - self.mu if self.s is not None: @@ -27,15 +63,35 @@ def predict(self, x): return x def fit(self, x): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + """ self.mu = np.mean(x, axis=0) self.s = np.std(x, axis=0) def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ params = {"mu": self.mu, "s": self.s} self._save_params_from_dict(f, params) @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. + """ param_list = ["mu", "s"] params = cls._load_params_to_dict(f, config["name"], param_list) return cls(mu=params["mu"], s=params["s"], name=config["name"]) diff --git a/hyperion/np/transforms/nap.py b/hyperion/np/transforms/nap.py index ee13e7e0..c6f8f8de 100644 --- a/hyperion/np/transforms/nap.py +++ b/hyperion/np/transforms/nap.py @@ -12,47 +12,105 @@ class NAP(NPModel): - """Class to do nussance attribute projection.""" + """Class to do nuissance attribute projection. - def __init__(self, U=None, **kwargs): - super(NAP, self).__init__(**kwargs) + Attributes: + U: NAP projection. + """ + + def __init__(self, U=None, U_dim=None, **kwargs): + super().__init__(**kwargs) self.U = U + if U is None: + self.U_dim = U_dim + else: + self.U_dim = U.shape[0] + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ return x - np.dot(np.dot(x, self.U.T), self.U) - def fit(self, x, U_dim, class_ids): - x_hat = np.zeros_like(x) - u_ids = np.uniqe(class_ids) + def fit(self, x, y): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + y: training labels as integers in [0, num_classes-1] with shape (num_samples,) + """ + u_ids = np.unique(y) M = np.sqrt(len(u_ids)) for i in u_ids: - idx = np.nonzero(i == class_ids) + idx = y == i N = np.sqrt(len(idx)) mu_i = np.mean(x[idx, :], axis=0) xx[idx, :] = (x[idx, :] - mu_i) / N xx /= M _, s, Vt = np.svd(xx, full_matrices=False, overwrite_a=True) - idx = (np.argsort(s)[::-1])[:U_dim] + idx = (np.argsort(s)[::-1])[: self.U_dim] self.U = Vt[idx, :] + def get_config(self): + """Returns the model configuration dict.""" + config = { + "U_dim": self.U_dim, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ params = {"U": self.U} self._save_params_from_dict(f, params) @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. + config: configuration dictionary. + + Returns: + Model object. 
+ """ param_list = ["U"] params = cls._load_params_to_dict(f, config["name"], param_list) return cls(U=params["U"], name=config["name"]) - # @classmethod - # def load(cls, file_path): - # with h5py.File(file_path, 'r') as f: - # config = self.load_config_from_json(f['config']) - # param_list = ['U'] - # params = self._load_params_to_dict(f, config['name'], param_list) - # return cls(U=params['U'], name=config['name']) - @classmethod def load_mat(cls, file_path): with h5py.File(file_path, "r") as f: diff --git a/hyperion/np/transforms/nda.py b/hyperion/np/transforms/nda.py index c84a4527..71910c92 100644 --- a/hyperion/np/transforms/nda.py +++ b/hyperion/np/transforms/nda.py @@ -10,47 +10,140 @@ from ..np_model import NPModel from ...hyp_defs import float_cpu +from .sb_sw import NSbSw class NDA(NPModel): - """Class to do nearest-neighbors discriminant analysis""" + """Class to do nearest-neighbors discriminant analysis - def __init__(self, mu=None, T=None, **kwargs): + Attributes: + mu: data mean vector + T: NDA projection. + """ + + def __init__( + self, mu=None, T=None, nda_dim=None, update_mu=True, update_T=True, **kwargs + ): super().__init__(**kwargs) self.mu = mu self.T = T + if T is None: + self.nda_dim = nda_dim + else: + self.nda_dim = T.shape[1] + self.update_mu = update_mu + self.update_T = update_T + + def __call__(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) def predict(self, x): + """Applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ if self.mu is not None: x = x - self.mu return np.dot(x, self.T) - def fit(self, mu, Sb, Sw, nda_dim=None): - self.mu = mu + def fit(self, x, y, mu=None, Sb=None, Sw=None): + """Trains the model. + + Args: + x: training data samples with shape (num_samples, x_dim). + y: training labels as integers in [0, num_classes-1] with shape (num_samples,) + mu: precomputed mean. + Sb: precomputed between-class covariance. + Sw: precomputed within-class covariance. + """ + if mu is None or Sb is None or Sw is None: + sbsw = NSbSw() + sbsw.fit(x, y) + mu = sbsw.mu + Sb = sbsw.Sb + Sw = sbsw.Sw + + if self.update_mu: + self.mu = mu + + if not self.update_T: + return assert Sb.shape == Sw.shape - d, V = la.eigh(Sb, Sw) + try: + d, V = la.eigh(Sb, Sw) + except: + alpha = 1e-2 * np.max(np.diag(Sw)) + d, V = la.eigh(Sb, alpha * np.eye(Sw.shape[0]) + Sw) V = np.fliplr(V) p = V[0, :] < 0 V[:, p] *= -1 - if nda_dim is not None: - assert nda_dim <= V.shape[1] - V = V[:, :nda_dim] + if self.nda_dim is not None: + assert self.nda_dim <= V.shape[1] + V = V[:, : self.nda_dim] self.T = V + def get_config(self): + """Returns the model configuration dict.""" + config = { + "nda_dim": self.nda_dim, + "update_mu": self.update_mu, + "update_t": self.update_T, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + def save_params(self, f): + """Saves the model paramters into the file. + + Args: + f: file handle. + """ params = {"mu": self.mu, "T": self.T} self._save_params_from_dict(f, params) @classmethod def load_params(cls, f, config): + """Initializes the model from the configuration and loads the model + parameters from file. + + Args: + f: file handle. 
+          config: configuration dictionary.
+
+        Returns:
+          Model object.
+        """
         param_list = ["mu", "T"]
         params = cls._load_params_to_dict(f, config["name"], param_list)
-        return cls(mu=params["mu"], T=params["T"], name=config["name"])
+        return cls(mu=params["mu"], T=params["T"], **config)
 
     @classmethod
     def load_mat(cls, file_path):
diff --git a/hyperion/np/transforms/pca.py b/hyperion/np/transforms/pca.py
index 23477c84..6d6ff7b1 100644
--- a/hyperion/np/transforms/pca.py
+++ b/hyperion/np/transforms/pca.py
@@ -12,7 +12,18 @@
 
 
 class PCA(NPModel):
-    """Class to do principal component analysis"""
+    """Class to do principal component analysis.
+
+    Attributes:
+      mu: data mean vector.
+      T: PCA projection.
+      update_mu: whether or not to update the mean when training.
+      update_T: whether or not to update T when training.
+      pca_dim: pca dimension (optional).
+      pca_var_r: pca variance ratio to retain, overrides pca_dim (optional).
+      pca_min_dim: minimum dimension of PCA when using pca_var_r.
+      whiten: whitens the data after PCA.
+    """
 
     def __init__(
         self,
@@ -36,7 +47,37 @@ def __init__(
         self.pca_min_dim = pca_min_dim
         self.whiten = whiten
 
+    def __call__(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
+        return self.predict(x)
+
+    def forward(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
+        return self.predict(x)
+
     def predict(self, x):
+        """Applies the transformation to the data.
+
+        Args:
+          x: data samples.
+
+        Returns:
+          Transformed data samples.
+        """
         if self.mu is not None:
             x = x - self.mu
         return np.dot(x, self.T)
@@ -57,8 +98,15 @@ def get_pca_dim_for_var_ratio(x, var_r=1, min_dim=2):
         rank = max(min_dim, rank)
         return rank
 
-    def fit(self, x=None, sample_weight=None, mu=None, S=None):
+    def fit(self, x=None, mu=None, S=None):
+        """Trains the model.
+
+        Args:
+          x: training data samples with shape (num_samples, x_dim).
+          mu: precomputed mean.
+          S: precomputed total covariance.
+        """
         if x is not None:
             mu = np.mean(x, axis=0)
             delta = x - mu
@@ -104,28 +152,44 @@ def fit(self, x=None, mu=None, S=None):
         self.T = V
 
     def get_config(self):
+        """Returns the model configuration dict."""
         config = {
             "update_mu": self.update_mu,
             "update_t": self.update_T,
             "pca_dim": self.pca_dim,
             "pca_var_r": self.pca_var_r,
+            "pca_min_dim": self.pca_min_dim,
         }
         base_config = super().get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
     def save_params(self, f):
+        """Saves the model parameters into the file.
+
+        Args:
+          f: file handle.
+        """
         params = {"mu": self.mu, "T": self.T}
         self._save_params_from_dict(f, params)
 
     @classmethod
     def load_params(cls, f, config):
+        """Initializes the model from the configuration and loads the model
+        parameters from file.
+
+        Args:
+          f: file handle.
+          config: configuration dictionary.
+
+        Returns:
+          Model object.
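+
+        Example (an illustrative sketch; keeps the components explaining
+        90% of the training variance):
+
+            >>> import numpy as np
+            >>> x = np.random.randn(1000, 100)
+            >>> pca = PCA(pca_var_r=0.9)
+            >>> pca.fit(x)
+            >>> z = pca(x)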
+ """ param_list = ["mu", "T"] params = cls._load_params_to_dict(f, config["name"], param_list) return cls( mu=params["mu"], T=params["T"], - pca_dim=config["pca_dim"], - name=config["name"], + **config, ) @classmethod diff --git a/hyperion/np/transforms/sb_sw.py b/hyperion/np/transforms/sb_sw.py index 92cba594..6d013e55 100644 --- a/hyperion/np/transforms/sb_sw.py +++ b/hyperion/np/transforms/sb_sw.py @@ -13,7 +13,14 @@ class SbSw(NPModel): - """Class to compute between and within class matrices""" + """Class to compute between and within class covariance matrices. + + Args: + Sb: between-class cov. matrix. + Sw: within-class cov. matrix. + mu: data mean vector. + num_classes: number of classes. + """ def __init__(self, Sb=None, Sw=None, mu=None, num_classes=0, **kwargs): super(SbSw, self).__init__(**kwargs) @@ -22,7 +29,7 @@ def __init__(self, Sb=None, Sw=None, mu=None, num_classes=0, **kwargs): self.mu = None self.num_classes = num_classes - def fit(self, x, class_ids, sample_weight=None, class_weights=None, normalize=True): + def fit(self, x, class_ids, normalize=True): dim = x.shape[1] if self.Sb is None: self.Sb = np.zeros((dim, dim)) @@ -75,7 +82,7 @@ def save_params(self, f): @classmethod def load(cls, file_path): with h5py.File(file_path, "r") as f: - config = self.load_config_from_json(f["config"]) + config = cls.load_config_from_json(f["config"]) param_list = ["mu", "Sb", "Sw", "num_classes"] params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) @@ -83,12 +90,26 @@ def load(cls, file_path): class NSbSw(SbSw): + """Class to compute nearest neighbour between and within class + covariance matrices. + https://www.isca-speech.org/archive/pdfs/interspeech_2014/sadjadi14_interspeech.pdf + + Args: + K: number of neighbours. + alpha: distance exponent that determines how fast the weight of the samples decays + when they get far from the classification boundary. + Sb: between-class cov. matrix. + Sw: within-class cov. matrix. + mu: data mean vector. + num_classes: number of classes. + """ + def __init__(self, K=10, alpha=1, **kwargs): - super(NSbSw, self).__init__(**kwargs) + super().__init__(**kwargs) self.K = K self.alpha = alpha - def fit(self, x, class_ids, sample_weight=None, class_weights=None, normalize=True): + def fit(self, x, class_ids, normalize=True): dim = x.shape[1] self.Sb = np.zeros((dim, dim), dtype=float_cpu()) self.Sw = np.zeros((dim, dim), dtype=float_cpu()) @@ -139,6 +160,7 @@ def normalize(self): self.Sw /= self.num_classes def get_config(self): + """Returns the model configuration dict.""" config = {"K": self.K, "alpha": self.alpha} base_config = super(NSbSw, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/hyperion/np/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py index b5be0fac..71a3e084 100644 --- a/hyperion/np/transforms/skl_tsne.py +++ b/hyperion/np/transforms/skl_tsne.py @@ -121,10 +121,48 @@ def angle(self): def num_jobs(self): return self._tsne.n_jobs + def __call__(self, x): + """Trains and applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Trains and applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + def predict(self, x): + """Trains and applies the transformation to the data. + + Args: + x: data samples. 
+ + Returns: + Transformed data samples. + """ return self._tsne.fit_transform(x) def fit(self, x): + """Trains and applies the transformation to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ return self._tsne.fit_transform(x) def save_params(self, f): @@ -135,6 +173,7 @@ def load_params(cls, f, config): return cls(**config) def get_config(self): + """Returns the model configuration dict.""" config = { "tsne_dim": self.tsne_dim, "perplexity": self.perplexity, @@ -155,6 +194,11 @@ def get_config(self): @staticmethod def filter_args(**kwargs): + """Filters the arguments corresponding to this model from a dictionary. + + Returns + Dictionary containing valid options to initialize the model. + """ valid_args = ( "tsne_dim", "perplexity", @@ -174,6 +218,12 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): + """Adds model options to parser. + + Args: + parser: parser object. + prefix: prefix str to add to the argument names. + """ if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/np/transforms/transform_list.py b/hyperion/np/transforms/transform_list.py index 62bc802e..1ddceeaa 100644 --- a/hyperion/np/transforms/transform_list.py +++ b/hyperion/np/transforms/transform_list.py @@ -23,7 +23,11 @@ class TransformList(NPModel): - """Class to perform a list of transformations""" + """Class to perform a sequence of transformations + + Attributes: + transforms: list of transformation objects. + """ def __init__(self, transforms, **kwargs): super().__init__(**kwargs) @@ -34,11 +38,46 @@ def __init__(self, transforms, **kwargs): self.update_names() def append(self, t): + """Appends a transformation to the list. + + Args: + t: transformation object. + """ self.transforms.append(t) if self.name is not None: t.name = self.name + "/" + t.name + def __call__(self, x): + """Applies the list of transformations to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + + def forward(self, x): + """Applies the list of transformations to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. + """ + return self.predict(x) + def predict(self, x): + """Applies the list of transformations to the data. + + Args: + x: data samples. + + Returns: + Transformed data samples. 
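+
+        Example (an illustrative sketch; assumes PCA and LNorm from this
+        package are importable and the transforms are already fitted):
+
+            >>> import numpy as np
+            >>> x = np.random.randn(500, 100)
+            >>> pca = PCA(pca_dim=10)
+            >>> pca.fit(x)
+            >>> lnorm = LNorm()
+            >>> lnorm.fit(pca(x))
+            >>> preproc = TransformList([pca, lnorm])
+            >>> z = preproc(x)  # PCA projection followed by length-norm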
+ """ for t in self.transforms: x = t.predict(x) return x @@ -49,7 +88,7 @@ def update_names(self): t.name = self.name + "/" + t.name def get_config(self): - config = super(TransformList, self).get_config() + config = super().get_config() config_t = {} for i in range(len(self.transforms)): config_t[i] = self.transforms[i].get_config() From 0ecebc0e6524b74628f459896ccbcb46a3011f78 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 9 Jun 2022 09:06:36 -0400 Subject: [PATCH 016/154] wavlm phase3 --- egs/voxceleb/v2/cmd.sh | 2 +- ...aseplus_ecapatdnn512x3_phase2_default.yaml | 12 + ...aseplus_ecapatdnn512x3_phase3_default.yaml | 11 + .../v2/conf/trainer_phase2_sgd_default.yaml | 18 ++ .../v2/conf/trainer_phase3_sgd_default.yaml | 18 ++ ...ig_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh | 52 ++++ ...nfig_wavlmbaseplus_ecapatdnn512x3_v1.10.sh | 16 + egs/voxceleb/v2/run_011_train_xvector.sh | 56 +++- egs/voxceleb/v2/run_030_extract_xvectors.sh | 10 + egs/voxceleb/v2/run_040_eval_be.sh | 11 + hyperion/bin/finetune_wav2vec2xvector.py | 204 +++++++++++++ ....py => finetune_xvector_dfr_from_feats.py} | 0 ...av.py => finetune_xvector_dfr_from_wav.py} | 0 ...xvec.py => finetune_xvector_from_feats.py} | 0 hyperion/bin/finetune_xvector_from_wav.py | 190 ++++++++++++ hyperion/bin/torch-finetune-xvec-from-wav.py | 287 ------------------ hyperion/bin/train_wav2vec2xvector.py | 2 +- hyperion/bin/train_xvector_from_wav.py | 7 +- hyperion/torch/layers/margin_losses.py | 8 +- .../hf_hubert2resnet1d_xvector.py | 11 + .../hf_wav2vec2resnet1d_xvector.py | 11 + .../models/wav2xvectors/hf_wav2xvector.py | 22 ++ .../wav2xvectors/hf_wavlm2resnet1d_xvector.py | 11 + .../torch/models/wav2xvectors/wav2xvector.py | 22 ++ hyperion/torch/models/xvectors/xvector.py | 6 + hyperion/torch/narchs/classif_head.py | 21 ++ notebooks/tutorial_jsalt22/ivectors.ipynb | 33 +- 27 files changed, 736 insertions(+), 305 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml create mode 100644 egs/voxceleb/v2/conf/trainer_phase2_sgd_default.yaml create mode 100644 egs/voxceleb/v2/conf/trainer_phase3_sgd_default.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh create mode 100755 hyperion/bin/finetune_wav2vec2xvector.py rename hyperion/bin/{torch-finetune-xvec-dfr.py => finetune_xvector_dfr_from_feats.py} (100%) rename hyperion/bin/{torch-finetune-xvec-dfr-from-wav.py => finetune_xvector_dfr_from_wav.py} (100%) rename hyperion/bin/{torch-finetune-xvec.py => finetune_xvector_from_feats.py} (100%) create mode 100755 hyperion/bin/finetune_xvector_from_wav.py delete mode 100755 hyperion/bin/torch-finetune-xvec-from-wav.py diff --git a/egs/voxceleb/v2/cmd.sh b/egs/voxceleb/v2/cmd.sh index 040f458b..00f8d40a 100755 --- a/egs/voxceleb/v2/cmd.sh +++ b/egs/voxceleb/v2/cmd.sh @@ -17,7 +17,7 @@ if [ "$(hostname -d)" == "cm.gemini" ];then #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" - # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" diff --git 
a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml new file mode 100644 index 00000000..87b01a1f --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml @@ -0,0 +1,12 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: trainer_phase2_sgd_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml new file mode 100644 index 00000000..d13931e0 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml @@ -0,0 +1,11 @@ +data: + train: train_data_default.yaml + val: val_data_default.yaml +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0. +trainer: trainer_phase3_sgd_default.yaml + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/trainer_phase2_sgd_default.yaml b/egs/voxceleb/v2/conf/trainer_phase2_sgd_default.yaml new file mode 100644 index 00000000..ae708b62 --- /dev/null +++ b/egs/voxceleb/v2/conf/trainer_phase2_sgd_default.yaml @@ -0,0 +1,18 @@ +optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 +lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true +use_amp: true +log_interval: 1000 +epochs: 7 +eff_batch_size: 512 +train_mode: full diff --git a/egs/voxceleb/v2/conf/trainer_phase3_sgd_default.yaml b/egs/voxceleb/v2/conf/trainer_phase3_sgd_default.yaml new file mode 100644 index 00000000..2529e25a --- /dev/null +++ b/egs/voxceleb/v2/conf/trainer_phase3_sgd_default.yaml @@ -0,0 +1,18 @@ +optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 +lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true +use_amp: true +log_interval: 1000 +epochs: 7 +eff_batch_size: 192 +train_mode: full diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh new file mode 100644 index 00000000..b580508a --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh @@ -0,0 +1,52 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +xvec_train_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml +xvec_train_args="--model.xvector.margin-warmup-epochs 5 --trainer.lrsched.decay-steps 4200 --trainer.lrsched.warmup-steps 1500 --trainer.lrsched.hold-steps 1500 --trainer.epochs 60 --model.feat-fusion-method weighted-avg --model.feat-fusion-start 2 --model.xvector.intertop-margin 0.1" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.10 + +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0060.pth + +xvec_train_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml +xvec_train_s2_args="--trainer.epochs 20" +nnet_name_s2=${nnet_name}.s2 
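+# Sketch of the recipe flow: stage 2 fine-tunes the whole model
+# (train_mode: full) with margin 0.2 and inter-top-k margin 0.1 for 20
+# epochs; stage 3 below refines it on 6 s chunks with margin 0.4 and a
+# roughly 20x smaller learning rate (2.3e-4 vs 5.5e-3).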
+nnet_s2_dir=exp/xvector_nnets/$nnet_name_s2 +nnet_s2=$nnet_s2_dir/model_ep0007.pth +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +xvec_train_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml +xvec_train_s3_args="--trainer.epochs 10 --data.train.dataset.min-chunk-length 6 --data.train.dataset.max-chunk-length 6 --model.xvector.intertop-margin 0.1" +nnet_name_s3=${nnet_name}.s3.4 +nnet_s3_dir=exp/xvector_nnets/$nnet_name_s3 +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0006.pth +#nnet_s3=$nnet_s3_dir/model_ep0010.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh index 47af1f43..b84c1f15 100644 --- a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh @@ -21,6 +21,22 @@ nnet_name=${hf_model_name}_ecapatdnn512x3_v1.10 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0060.pth +xvec_train_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml +xvec_train_s2_args="--trainer.epochs 20" +nnet_name_s2=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_name_s2 +nnet_s2=$nnet_s2_dir/model_ep0007.pth +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +xvec_train_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml +xvec_train_s3_args="--trainer.epochs 10 --data.train.dataset.min-chunk-length 6 --data.train.dataset.max-chunk-length 6" +nnet_name_s3=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_name_s3 +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0006.pth +nnet_s3=$nnet_s3_dir/model_ep0010.pth + + # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh index 0b9a092e..b959936f 100755 --- a/egs/voxceleb/v2/run_011_train_xvector.sh +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -28,17 +28,17 @@ fi if [ "$use_tb" == "true" ];then extra_args="$extra_args --trainer.use-tensorboard" fi -if [ "$use_wandb" == "true" ];then - extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.2 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + # Network Training if [ $stage -le 1 ]; then - mkdir -p $nnet_dir/log $cuda_cmd \ @@ -53,6 +53,54 @@ if [ $stage -le 1 ]; then --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir $args \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_name_s2.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type --cfg $xvec_train_s2_base_cfg $xvec_train_s2_args $extra_args \ + 
--data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet \ + --trainer.exp-path $nnet_s2_dir $args \ + --num-gpus $ngpu \ + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_name_s3.$(date -Iminutes)" + fi + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type --cfg $xvec_train_s3_base_cfg $xvec_train_s3_args $extra_args \ + --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir $args \ --num-gpus $ngpu \ fi diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh index 90186a42..77c46672 100755 --- a/egs/voxceleb/v2/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -10,6 +10,7 @@ set -e stage=1 config_file=default_config.sh use_gpu=false +nnet_stage=1 xvec_chunk_length=120 #seconds . parse_options.sh || exit 1; . $config_file @@ -21,6 +22,15 @@ else xvec_cmd="$train_cmd --mem 12G" fi + +if [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_name_s2 +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_name_s3 +fi + xvector_dir=exp/xvectors/$nnet_name if [ $stage -le 1 ]; then diff --git a/egs/voxceleb/v2/run_040_eval_be.sh b/egs/voxceleb/v2/run_040_eval_be.sh index cd168180..d9c03bba 100755 --- a/egs/voxceleb/v2/run_040_eval_be.sh +++ b/egs/voxceleb/v2/run_040_eval_be.sh @@ -9,14 +9,25 @@ set -e stage=1 config_file=default_config.sh +nnet_stage=1 . parse_options.sh || exit 1; . $config_file . 
datapath.sh +if [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_name_s2 +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_name_s3 +fi + + plda_label=${plda_type}y${plda_y_dim}_v1 be_name=lda${lda_dim}_${plda_label}_${plda_data} + xvector_dir=exp/xvectors/$nnet_name be_dir=exp/be/$nnet_name/$be_name score_dir=exp/scores/$nnet_name/${be_name} diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py new file mode 100755 index 00000000..fda819ad --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ( + HFWav2Vec2ResNet1dXVector, + HFHubert2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) +from hyperion.torch import TorchModelLoader as TML + +model_dict = { + "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, + "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, + "hf_wavlm2resnet1d": HFWavLM2ResNet1dXVector, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = Sampler.filter_args(**kwargs["sampler"]) + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + ad_args["is_val"] = partition == "val" + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = Sampler(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_model(num_classes, in_model_file, rank, **kwargs): + xvec_args = kwargs["model"]["xvector"] + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = TML.load(in_model_file) + model.rebuild_output_layer(**xvec_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = 
init_model(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + Sampler.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + Sampler.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_file", "data.val.dataset.class_file" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + parser.link_arguments( + "data.train.sampler.batch_size", "data.val.sampler.batch_size" + ) + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Finetunes Wav2Vec2XVector model from audio files" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/torch-finetune-xvec-dfr.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py similarity index 100% rename from hyperion/bin/torch-finetune-xvec-dfr.py rename to hyperion/bin/finetune_xvector_dfr_from_feats.py diff --git a/hyperion/bin/torch-finetune-xvec-dfr-from-wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py similarity index 100% rename from hyperion/bin/torch-finetune-xvec-dfr-from-wav.py rename to hyperion/bin/finetune_xvector_dfr_from_wav.py diff --git a/hyperion/bin/torch-finetune-xvec.py b/hyperion/bin/finetune_xvector_from_feats.py 
similarity index 100% rename from hyperion/bin/torch-finetune-xvec.py rename to hyperion/bin/finetune_xvector_from_feats.py diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py new file mode 100755 index 00000000..5ddc4d82 --- /dev/null +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +import multiprocessing + +import torch + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.models import XVector as XVec +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch import TorchModelLoader as TML + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = Sampler.filter_args(**kwargs["sampler"]) + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + ad_args["is_val"] = partition == "val" + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = Sampler(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_feats(rank, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + if rank == 0: + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + if rank == 0: + logging.info("feat-extractor={}".format(feat_extractor)) + return feat_extractor + + +def init_xvector(num_classes, in_model_path, rank, **kwargs): + xvec_args = XVec.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = TML.load(in_model_path) + model.rebuild_output_layer(**xvec_args) + # if train_mode == "ft-embed-affine": + # model.freeze_preembed_layers() + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + feat_extractor = init_feats(**kwargs) + model = 
init_xvector(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + Sampler.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + Sampler.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_file", "data.val.dataset.class_file" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + parser.link_arguments( + "data.train.sampler.batch_size", "data.val.sampler.batch_size" + ) + + AF.add_class_args(parser, prefix="feats") + parser.add_argument("--in-model-path", required=True) + + XVec.add_finetune_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=XVec.valid_train_modes() + ) + ddp.add_ddp_args(parser) + + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) + + args = parser.parse_args() + gpu_id = args.local_rank + del args.local_rank + + if gpu_id == 0: + try: + config_file = Path(args.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args) diff --git a/hyperion/bin/torch-finetune-xvec-from-wav.py b/hyperion/bin/torch-finetune-xvec-from-wav.py deleted file mode 100755 index e33d9b8e..00000000 --- a/hyperion/bin/torch-finetune-xvec-from-wav.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time -import logging -import multiprocessing - -import numpy as np - -import torch -import torch.nn as nn - -from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.models import XVector as XVec -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.data import 
AudioDataset as AD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch import TorchModelLoader as TML - - -def init_data( - audio_path, - train_list, - val_list, - train_aug_cfg, - val_aug_cfg, - num_workers, - num_gpus, - rank, - **kwargs -): - - ad_args = AD.filter_args(**kwargs) - sampler_args = Sampler.filter_args(**kwargs) - if rank == 0: - logging.info("audio dataset args={}".format(ad_args)) - logging.info("sampler args={}".format(sampler_args)) - logging.info("init datasets") - - train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) - val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - - if rank == 0: - logging.info("init samplers") - train_sampler = Sampler(train_data, **sampler_args) - val_sampler = Sampler(val_data, **sampler_args) - - num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ( - {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} - ) - - train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler=train_sampler, **largs - ) - - test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler=val_sampler, **largs - ) - - return train_loader, test_loader - - -def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs["feats"]) - if rank == 0: - logging.info("feat args={}".format(feat_args)) - logging.info("initializing feature extractor") - feat_extractor = AF(trans=True, **feat_args) - if rank == 0: - logging.info("feat-extractor={}".format(feat_extractor)) - return feat_extractor - - -def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): - xvec_args = XVec.filter_finetune_args(**kwargs) - if rank == 0: - logging.info("xvector network ft args={}".format(xvec_args)) - xvec_args["num_classes"] = num_classes - model = TML.load(in_model_path) - model.rebuild_output_layer(**xvec_args) - if train_mode == "ft-embed-affine": - model.freeze_preembed_layers() - if rank == 0: - logging.info("x-vector-model={}".format(model)) - return model - - -def train_xvec(gpu_id, args): - - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - kwargs = namespace_to_dict(args) - torch.manual_seed(args.seed) - set_float_cpu("float32") - - train_mode = kwargs["train_mode"] - - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank - - train_loader, test_loader = init_data(**kwargs) - feat_extractor = init_feats(**kwargs) - model = init_xvector(train_loader.dataset.num_classes, **kwargs) - - trn_args = Trainer.filter_args(**kwargs) - if rank == 0: - logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} - trainer = Trainer( - model, - feat_extractor, - device=device, - metrics=metrics, - ddp=world_size > 1, - train_mode=train_mode, - **trn_args - ) - if args.resume: - trainer.load_last_checkpoint() - trainer.fit(train_loader, test_loader) - - ddp.ddp_cleanup() - - -# (audio_path, train_list, val_list, -# train_aug_cfg, val_aug_cfg, -# in_model_path, num_gpus, resume, num_workers, -# train_mode, **kwargs): - -# set_float_cpu('float32') -# logging.info('initializing devices num_gpus={}'.format(num_gpus)) -# device = open_device(num_gpus=num_gpus) - -# ad_args = AD.filter_args(**kwargs) -# sampler_args = Sampler.filter_args(**kwargs) -# feat_args = AFF.filter_args(prefix='feats', 
**kwargs) -# mvn_args = MVN.filter_args(prefix='mvn', **kwargs) -# xvec_args = XVec.filter_finetune_args(**kwargs) -# opt_args = OF.filter_args(prefix='opt', **kwargs) -# lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) -# trn_args = Trainer.filter_args(**kwargs) -# logging.info('audio dataset args={}'.format(ad_args)) -# logging.info('sampler args={}'.format(sampler_args)) -# logging.info('feat args={}'.format(feat_args)) -# logging.info('mvn args={}'.format(mvn_args)) -# logging.info('xvector finetune args={}'.format(xvec_args)) -# logging.info('optimizer args={}'.format(opt_args)) -# logging.info('lr scheduler args={}'.format(lrsch_args)) -# logging.info('trainer args={}'.format(trn_args)) - -# logging.info('initializing feature extractor args={}'.format(feat_args)) -# feat_extractor = AFF.create(**feat_args) -# mvn = None -# if mvn_args['norm_mean'] or mvn_args['norm_var']: -# logging.info('initializing short-time mvn') -# mvn = MVN(**mvn_args) - -# feat_extractor = FeatExtractor(feat_extractor, mvn) - -# logging.info('init datasets') -# train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) -# val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - -# logging.info('init samplers') -# train_sampler = Sampler(train_data, **sampler_args) -# val_sampler = Sampler(val_data, **sampler_args) - -# largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} - -# train_loader = torch.utils.data.DataLoader( -# train_data, batch_sampler = train_sampler, **largs) - -# test_loader = torch.utils.data.DataLoader( -# val_data, batch_sampler = val_sampler, **largs) - -# xvec_args['num_classes'] = train_data.num_classes -# model = TML.load(in_model_path) -# model.rebuild_output_layer(**xvec_args) -# if train_mode == 'ft-embed-affine': -# model.freeze_preembed_layers() - -# logging.info('feat-extractor={}'.format(feat_extractor)) -# logging.info('x-vector-model={}'.format(model)) - -# optimizer = OF.create(model.parameters(), **opt_args) -# lr_sch = LRSF.create(optimizer, **lrsch_args) -# metrics = { 'acc': CategoricalAccuracy() } - -# trainer = Trainer(model, feat_extractor, optimizer, -# device=device, metrics=metrics, lr_scheduler=lr_sch, -# data_parallel=(num_gpus>1), train_mode=train_mode, -# **trn_args) -# if resume: -# trainer.load_last_checkpoint() -# trainer.fit(train_loader, test_loader) - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Fine-tune x-vector model from audio files") - - parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--audio-path", required=True) - parser.add_argument("--train-list", required=True) - parser.add_argument("--val-list", required=True) - - AD.add_argparse_args(parser) - Sampler.add_argparse_args(parser) - - parser.add_argument("--train-aug-cfg", default=None) - parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument( - "--num-workers", type=int, default=5, help="num_workers of data loader" - ) - - AF.add_class_args(parser, prefix="feats") - parser.add_argument("--in-model-path", required=True) - - XVec.add_finetune_args(parser) - Trainer.add_class_args(parser) - ddp.add_ddp_args(parser) - - # parser.add_argument('--num-gpus', type=int, default=1, - # help='number of gpus, if 0 it uses cpu') - parser.add_argument("--seed", type=int, default=1123581321, help="random seed") - parser.add_argument( - "--resume", - action="store_true", - default=False, - help="resume training from checkpoint", - ) - parser.add_argument( - 
"--train-mode", - default="ft-embed-affine", - choices=["ft-full", "ft-embed-affine"], - help=( - "ft-full: adapt full x-vector network" - "ft-embed-affine: adapt affine transform before embedding" - ), - ) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - parser.add_argument("--local_rank", default=0, type=int) - - args = parser.parse_args() - gpu_id = args.local_rank - del args.local_rank - - if gpu_id == 0: - try: - config_file = Path(args.exp_path) / "config.yaml" - parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass - - # torch docs recommend using forkserver - multiprocessing.set_start_method("forkserver") - train_xvec(gpu_id, args) - - # args = parser.parse_args() - # config_logger(args.verbose) - # del args.verbose - # logging.debug(args) - - # torch.manual_seed(args.seed) - # del args.seed - - # train_xvec(**vars(args)) diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index c673f5c9..e92b9a1a 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ - Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import sys diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index 39b037ba..5eb871db 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -16,10 +16,7 @@ import logging import multiprocessing -import numpy as np - import torch -import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.utils import ddp @@ -223,7 +220,9 @@ def make_parser(xvec_class): AF.add_class_args(parser, prefix="feats") xvec_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, prefix="trainer") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) ddp.add_ddp_args(parser) parser.add_argument("--seed", type=int, default=1123581321, help="random seed") # parser.add_argument( diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index 0d748249..d7a086d1 100644 --- a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -99,8 +99,8 @@ def update_margin(self, epoch): Args: epoch: value of current epoch. """ - if self.margin_warmup_epochs == 0: - return + # if self.margin_warmup_epochs == 0: + # return if epoch < self.margin_warmup_epochs: self.cur_margin = self.margin * epoch / self.margin_warmup_epochs @@ -243,8 +243,8 @@ def update_margin(self, epoch): Args: epoch: value of current epoch. 
""" - if self.margin_warmup_epochs == 0: - return + # if self.margin_warmup_epochs == 0: + # return if epoch < self.margin_warmup_epochs: self.cur_margin = self.margin * epoch / self.margin_warmup_epochs diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py index d585567f..bf0552dc 100644 --- a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py @@ -72,3 +72,14 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py index 3b44c53f..3cabb1d5 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py @@ -76,3 +76,14 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index d79d5a26..85944fb9 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -92,6 +92,28 @@ def update_loss_margin(self, epoch): """ self.xvector.update_loss_margin(epoch) + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + ): + self.xvector.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + def forward_feats(self, x, x_lengths, return_feat_layers=None): return_hid_states = ( False diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py index 89e7120e..efac4e50 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py @@ -72,3 +72,14 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git 
a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 27268e44..83c95222 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -35,6 +35,28 @@ def __init__(self, feats, xvector): self.feats = feats self.xvector = xvector + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + ): + self.xvector.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + def forward( self, x, diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 8c2070b5..c35e6a4a 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -538,6 +538,9 @@ def rebuild_output_layer( cos_scale=64, margin=0.3, margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, ): if (self.num_classes is not None and self.num_classes != num_classes) or ( self.loss_type != loss_type @@ -553,6 +556,9 @@ def rebuild_output_layer( self.classif_net.set_margin(margin) self.classif_net.set_margin_warmup_epochs(margin_warmup_epochs) self.classif_net.set_cos_scale(cos_scale) + self.classif_net.set_intertop_k(intertop_k) + self.classif_net.set_intertop_margin(intertop_margin) + self.classif_net.set_num_subcenters(num_subcenters) def freeze_preembed_layers(self): self.encoder_net.freeze() diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 06bd988c..6a886e44 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -232,6 +232,27 @@ def set_cos_scale(self, cos_scale): self.cos_scale = cos_scale self.output.cos_scale = cos_scale + def set_intertop_k(self, intertop_k): + if self.loss_type == "softmax": + return + + self.intertop_k = intertop_k + self.output.intertop_k = intertop_k + + def set_intertop_margin(self, intertop_margin): + if self.loss_type == "softmax": + return + + self.intertop_margin = intertop_margin + self.output.intertop_margin = intertop_margin + + def set_num_subcenters(self, num_subcenters): + if not self.loss_type == "subcenter-arc-softmax": + return + + self.num_subcenters = num_subcenters + self.output.num_subcenters = num_subcenters + def update_margin(self, epoch): if hasattr(self.output, "update_margin"): self.output.update_margin(epoch) diff --git a/notebooks/tutorial_jsalt22/ivectors.ipynb b/notebooks/tutorial_jsalt22/ivectors.ipynb index 46d4eb61..91548ea8 100644 --- a/notebooks/tutorial_jsalt22/ivectors.ipynb +++ b/notebooks/tutorial_jsalt22/ivectors.ipynb @@ -113,7 +113,8 @@ "metadata": {}, "outputs": [], "source": [ - "x, r_idx, spk_ids =generate_data(num_dims=3)" + "x_dim=3\n", + "x, r_idx, spk_ids=generate_data(num_dims=x_dim)" ] }, { @@ -138,7 +139,17 @@ "metadata": {}, "outputs": [], "source": [ - "ubm_gmm = hnp.pdfs.GMMDiagCov(num_comp=8, x_dim=3)\n", + "num_comp=8\n", + "y_dim=2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ubm_gmm = hnp.pdfs.GMMDiagCov(num_comp=num_comp, x_dim=x_dim)\n", "elbo, elbo_norm = ubm_gmm.fit(x_cat, epochs=10)\n", "fig = plt.figure()\n", 
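
The rebuild_output_layer() additions above follow a standard fine-tuning pattern: keep the trained encoder, re-create the classification head for the new number of classes, and re-apply the large-margin settings (cos_scale, margin, intertop_k, intertop_margin, num_subcenters). A hedged, minimal sketch of the head-rebuild step using a toy module (not the hyperion classes):

    import torch.nn as nn

    class TinyXVector(nn.Module):
        def __init__(self, feat_dim=80, embed_dim=192, num_classes=5994):
            super().__init__()
            self.encoder = nn.Sequential(nn.Linear(feat_dim, embed_dim), nn.ReLU())
            self.output = nn.Linear(embed_dim, num_classes)

        def rebuild_output_layer(self, num_classes):
            # encoder weights are kept; only the classifier is re-created,
            # mirroring how XVector.rebuild_output_layer also resets the margin config
            self.output = nn.Linear(self.output.in_features, num_classes)

    model = TinyXVector(num_classes=5994)       # e.g., pre-trained on VoxCeleb2
    model.rebuild_output_layer(num_classes=100) # adapt to a 100-speaker set
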
"plt.plot(elbo_norm)\n", @@ -167,7 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "iv_model = hnp.pdfs.JFATotal(K=8, x_dim=3, y_dim=2)\n", + "iv_model = hnp.pdfs.JFATotal(K=num_comp, x_dim=x_dim, y_dim=y_dim)\n", "N=[]\n", "F=[]\n", "for x_i in x:\n", @@ -193,7 +204,21 @@ "metadata": {}, "outputs": [], "source": [ - "F.shape\n", + "num_utts=100\n", + "w = np.randn(num_utts, 2)\n", + "M = ubm_gmm.mu.ravel() + ubm_gmm.cholLambda.ravel() * np.dot(w, iv_model.T)\n", + "M = M.reshape(num_utts, num_comp, x_dim)\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(projection='3d')\n", + "colors = ['b', 'g','r','c','m','y','k','b']\n", + "for i in range(num_comp):\n", + " ax.plot_surface(M[:,i,0], M[:,i,1], M[:,i,2], alpha=0.2, color=colors[i])\n", + " ax.scatter(M[:,i,0], M[:,i,1], M[:,i,2], marker='o', color=colors[i])\n", + "ax.set_xlabel('x1')\n", + "ax.set_ylabel('x2')\n", + "ax.set_zlabel('x3')\n", + "plt.show()\n", + "\n", "\n" ] } From 81d4a7f88e944c17db63842d39f0eb3a2bab1c7f Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sun, 12 Jun 2022 17:44:15 -0400 Subject: [PATCH 017/154] added default config to voxceleb/v2 --- ...lmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml | 45 + ...lmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml | 49 + ...lmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml | 50 + .../v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml | 6 +- egs/voxceleb/v2/default_config.sh | 2 +- ...onfig_wavlmbaseplus_ecapatdnn512x3_v1.0.sh | 49 + egs/voxceleb/v2/run_011_train_xvector.sh | 26 +- .../models/wav2xvectors/hf_wav2xvector.py | 4 +- hyperion/torch/trainers/ae_trainer.py | 2 +- hyperion/torch/trainers/dvae_trainer.py | 2 +- hyperion/torch/trainers/vae_trainer.py | 2 +- hyperion/torch/trainers/vq_dvae_trainer.py | 2 +- hyperion/torch/trainers/vq_vae_trainer.py | 2 +- .../torch/trainers/xvector_adv_trainer.py | 2 +- .../trainers/xvector_adv_trainer_from_wav.py | 2 +- hyperion/torch/trainers/xvector_trainer.py | 2 +- .../trainers/xvector_trainer_deep_feat_reg.py | 2 +- .../xvector_trainer_deep_feat_reg_from_wav.py | 2 +- .../trainers/xvector_trainer_from_wav.py | 2 +- notebooks/tutorial_jsalt22/ivectors.ipynb | 4 +- notebooks/tutorial_jsalt22/xvector.ipynb | 3760 +++++++++++++++++ 21 files changed, 3988 insertions(+), 29 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh create mode 100644 notebooks/tutorial_jsalt22/xvector.ipynb diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..f62b2e14 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + 
lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml new file mode 100644 index 00000000..1298a056 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml @@ -0,0 +1,49 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..1721e337 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,50 @@ +data: + train: + dataset: + max_chunk_length: 6.0 + min_chunk_length: 6.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml index 787e3718..69b85d8d 100644 --- a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3.yaml @@ -33,6 +33,8 @@ xvector: embed_dim: 192 cos_scale: 32.0 margin: 0.2 - margin_warmup_epochs: 3 + margin_warmup_epochs: 5 + intertop_margin: 0.1 dropout_rate: 0.0 -feat_fusion_method: last +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/default_config.sh b/egs/voxceleb/v2/default_config.sh index 65108e89..abcc2a2e 120000 --- a/egs/voxceleb/v2/default_config.sh +++ b/egs/voxceleb/v2/default_config.sh @@ -1 +1 @@ -global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh \ No newline at end of file +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh \ No 
newline at end of file diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..7d39995d --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh index b959936f..dc4e1dee 100755 --- a/egs/voxceleb/v2/run_011_train_xvector.sh +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -34,17 +34,19 @@ if [ "$interactive" == "true" ];then fi if [ "$use_wandb" == "true" ];then - extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.2 --trainer.wandb.name $nnet_name.$(date -Iminutes)" + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" fi + # Network Training if [ $stage -le 1 ]; then - - mkdir -p $nnet_dir/log + + mkdir -p $nnet_s1_dir/log $cuda_cmd \ - --gpu $ngpu $nnet_dir/log/train.log \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - train_wav2vec2xvector.py $nnet_type --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ + train_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ @@ -52,7 +54,7 @@ if [ $stage -le 1 ]; then --data.val.dataset.audio-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ - --trainer.exp-path $nnet_dir $args \ + --trainer.exp-path $nnet_s1_dir $args \ --num-gpus $ngpu fi @@ -60,14 +62,15 @@ fi if [ $stage -le 2 ]; then if [ "$use_wandb" == "true" ];then - extra_args="$extra_args --trainer.wandb.name $nnet_name_s2.$(date -Iminutes)" + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" fi mkdir -p $nnet_s2_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2xvector.py $nnet_type --cfg $xvec_train_s2_base_cfg 
$xvec_train_s2_args $extra_args \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ @@ -75,7 +78,7 @@ if [ $stage -le 2 ]; then --data.val.dataset.audio-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ - --in-model-file $nnet \ + --in-model-file $nnet_s1 \ --trainer.exp-path $nnet_s2_dir $args \ --num-gpus $ngpu \ @@ -84,14 +87,15 @@ fi if [ $stage -le 3 ]; then if [ "$use_wandb" == "true" ];then - extra_args="$extra_args --trainer.wandb.name $nnet_name_s3.$(date -Iminutes)" + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" fi mkdir -p $nnet_s3_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s3_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2xvector.py $nnet_type --cfg $xvec_train_s3_base_cfg $xvec_train_s3_args $extra_args \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 85944fb9..d75a257b 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -50,7 +50,7 @@ def _make_fuser(self): self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weights.data = torch.ones(num_layers) / num_layers + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers elif self.feat_fusion_method == "cat": self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) @@ -74,7 +74,7 @@ def _fuse_hid_feats(self, hid_feats): feats = torch.sum(hid_feats * norm_weights, dim=-1) elif self.feat_fusion_method == "linear": hid_feats = torch.stack(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) elif self.feat_fusion_method == "cat": hid_feats = torch.cat(hid_feats, dim=-1) feats = self.feat_fuser(hid_feats) diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index c1debdb6..21d53d32 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -63,7 +63,7 @@ def __init__( loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index 02c4fb6e..b75a94ab 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -61,7 +61,7 @@ def __init__( loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index ae193209..284f07d0 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -61,7 +61,7 @@ def __init__( loggers=None, ddp=False, 
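
The hf_wav2xvector.py hunk above fixes two real bugs in the "linear" fusion path: nn.Linear stores its matrix in .weight with shape (out_features, in_features) = (1, num_layers), not in a flat .weights vector, and its output keeps a trailing singleton dimension that must be squeezed. A shape-only sketch of both fusion modes over the transformer hidden states (dimensions here are illustrative):

    import torch
    import torch.nn as nn

    L, B, T, D = 12, 2, 50, 768                       # layers, batch, time, feat dim
    hid_feats = [torch.randn(B, T, D) for _ in range(L)]
    stacked = torch.stack(hid_feats, dim=-1)          # (B, T, D, L)

    # weighted-avg: one learnable scalar per layer, softmax-normalized
    w = nn.Parameter(torch.zeros(L))
    feats_wavg = torch.sum(stacked * torch.softmax(w, dim=0), dim=-1)

    # linear: Linear over the layer axis, initialized to a uniform average
    fuser = nn.Linear(L, 1, bias=False)
    fuser.weight.data = torch.ones(1, L) / L          # (out=1, in=L), per the fix
    feats_lin = fuser(stacked).squeeze(dim=-1)        # drop the trailing dim

    assert feats_lin.shape == (B, T, D)
    assert torch.allclose(feats_wavg, feats_lin, atol=1e-5)  # both start as the mean

At initialization both modes reduce to a plain average over layers; training then lets the weights pick out the layers most useful for speaker discrimination.
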
ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index 840d70d6..30d2d3b3 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -61,7 +61,7 @@ def __init__( loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index cb09ba00..c484b5c7 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -61,7 +61,7 @@ def __init__( loggers=None, ddp=False, ddp_type="ddp", - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 91c75823..961597e5 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -69,7 +69,7 @@ def __init__( ddp=False, ddp_type="ddp", loss=None, - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 2797e678..036ee46e 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -71,7 +71,7 @@ def __init__( ddp=False, ddp_type="ddp", loss=None, - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 8c39a345..4cc4bc8c 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -63,7 +63,7 @@ def __init__( ddp=False, ddp_type="ddp", loss=None, - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 58235961..1c0c26b7 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -74,7 +74,7 @@ def __init__( ddp_type="ddp", loss=None, reg_loss=None, - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index da9d064a..1ad4d24a 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -75,7 +75,7 @@ def __init__( ddp_type="ddp", loss=None, reg_loss=None, - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, use_tensorboard=False, diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index d75936d8..64a1d187 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -63,7 +63,7 @@ def __init__( ddp=False, ddp_type="ddp", loss=None, - train_mode="train", + train_mode="full", use_amp=False, log_interval=10, 
use_tensorboard=False, diff --git a/notebooks/tutorial_jsalt22/ivectors.ipynb b/notebooks/tutorial_jsalt22/ivectors.ipynb index 91548ea8..7f2780d9 100644 --- a/notebooks/tutorial_jsalt22/ivectors.ipynb +++ b/notebooks/tutorial_jsalt22/ivectors.ipynb @@ -76,12 +76,12 @@ " \n", " # Sample speakers\n", " spk_ids = np.arange(num_spks)\n", - " y = sb * rng.randn(num_spks, tv_dim)\n", + " y = np.sqrt(sb) * rng.randn(num_spks, tv_dim)\n", "\n", " # Sample i-vectors\n", " spk_ids = np.repeat(spk_ids, num_utts, axis=0)\n", " y = np.repeat(y, num_utts, axis=0)\n", - " w = y + sw * rng.randn(num_spks*num_utts, tv_dim)\n", + " w = y + np.sqrt(sw) * rng.randn(num_spks*num_utts, tv_dim)\n", "\n", " x = []\n", " r_idx = []\n", diff --git a/notebooks/tutorial_jsalt22/xvector.ipynb b/notebooks/tutorial_jsalt22/xvector.ipynb new file mode 100644 index 00000000..70f01057 --- /dev/null +++ b/notebooks/tutorial_jsalt22/xvector.ipynb @@ -0,0 +1,3760 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "hpath='/exp/jvillalba/hyperion/hyperion-persephone'\n", + "import sys\n", + "sys.path.append(hpath)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "import logging\n", + "import numpy as np\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import Dataset, DataLoader\n", + "\n", + "import hyperion as hyp\n", + "import hyperion.np as hnp\n", + "import hyperion.torch as ht\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class IVDataset(Dataset):\n", + " \"\"\"Datasets that generates utterances following the i-vector model.\n", + " \n", + " Attributes:\n", + " num_samples: num. of utterances in the dataset.\n", + " num_spks: num. 
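
The ivectors.ipynb hunk above (sb * randn changed to np.sqrt(sb) * randn) fixes a classic sampling slip: randn draws from N(0, 1), so scaling by sb would give variance sb squared rather than sb; the scale factor must be the standard deviation. A quick standalone check, using the same RandomState style as the notebook:

    import numpy as np

    rng = np.random.RandomState(1234)
    sb = 0.7
    y = np.sqrt(sb) * rng.randn(200000)   # std = sqrt(variance)
    assert abs(y.var() - sb) < 1e-2       # empirical variance is close to sb
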
of speakers in the dataset.\n", + " x_dim: feature dimension.\n", + " num_gauss: number of Gaussian components in UBM-GMM\n", + " w_dim: i-vector dimension.\n", + " sb: isotropic between-class cov.\n", + " sw: isotropic within-class cov.\n", + " \n", + " \"\"\"\n", + " \n", + " def __init__(self, num_samples=10000, num_spks=100, x_dim=16, num_gauss=32, w_dim=16, sb=0.7, sw=0.3, utt_length=200, unit_length=25, seed=1234):\n", + " self.rng = np.random.RandomState(seed=seed)\n", + " self.num_samples = num_samples\n", + " self.num_spks = num_spks\n", + " self.x_dim = x_dim\n", + " self.w_dim = w_dim\n", + " self.num_gauss = num_gauss\n", + " self.utt_length = utt_length\n", + " self.unit_length = unit_length\n", + " self.sb = sb\n", + " self.sw = sw\n", + " self.y = self._make_spks(num_spks, w_dim, sb, self.rng)\n", + " self.gmm_ubm = self._make_ubm(x_dim, num_gauss, self.rng)\n", + " self.T = self._make_ivector(x_dim, num_gauss, w_dim, self.rng)\n", + "\n", + " def __len__(self):\n", + " return self.num_samples\n", + "\n", + " def __getitem__(self, idx):\n", + " spk_idx = idx % self.y.shape[0]\n", + " x = self.sample_utterance(spk_idx)\n", + " x = x.astype('float32')\n", + " return x, spk_idx\n", + "\n", + "\n", + " @staticmethod\n", + " def _make_spks(num_spks, w_dim, sb, rng):\n", + " \"\"\"Creates the speaker identity vectors\"\"\"\n", + " return np.sqrt(sb) * rng.randn(num_spks, w_dim)\n", + "\n", + " @staticmethod\n", + " def _make_ubm(x_dim, num_gauss, rng):\n", + " \"\"\"Creates the UBM GMM\"\"\"\n", + " # Define UBM\n", + " # Means of the GMM-UBM\n", + " ubm_means = rng.randn(num_gauss, x_dim)\n", + " ubm_means = np.sqrt(x_dim) * ubm_means/np.linalg.norm(ubm_means, axis=-1, keepdims=True)\n", + "\n", + " # Covariances of the GMM-UBM\n", + " ubm_cov = 0.1 * np.ones((num_gauss, x_dim))\n", + " ubm_prec = 1./ubm_cov\n", + "\n", + " # Weights of the GMM-UBM\n", + " ubm_weights = np.ones((num_gauss))/num_gauss\n", + "\n", + " return hnp.pdfs.GMMDiagCov(pi=ubm_weights, mu=ubm_means, Lambda=ubm_prec)\n", + "\n", + " @staticmethod\n", + " def _make_ivector(x_dim, num_gauss, w_dim, rng):\n", + " # Define Total Variability sub-space\n", + " T = rng.randn(w_dim, x_dim * num_gauss)\n", + " T = 0.2 * T/np.max(T)\n", + " return T\n", + "\n", + " def sample_utterance(self, spk_idx):\n", + " # generate i-vector\n", + " y = self.y[spk_idx] # spk factor\n", + " w = y + np.sqrt(self.sw) * self.rng.randn(self.w_dim)\n", + "\n", + " # For each utterance\n", + " # Compute the GMM mean of the utterance\n", + " means_i = self.gmm_ubm.mu + np.dot(w, self.T).reshape(self.x_dim, self.num_gauss).T\n", + "\n", + " # Create a GMM for the utterance.\n", + " gmm = self.gmm_ubm.copy()\n", + " gmm.mu = means_i\n", + " \n", + " # Sample the Gaussian components\n", + " num_units = self.utt_length // self.unit_length\n", + " r = self.rng.multinomial(1, gmm.pi, size=(num_units,))\n", + " # Assume that we stay in the same component several time steps.\n", + " r = np.repeat(r, self.unit_length, axis=0)\n", + " # Draw samples from the GMM\n", + " x = gmm.sample(r=r)\n", + " return x\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the Training dataset\n", + "train_data = IVDataset()\n", + "# The Validation dataset is a copy of the training data but with less samples\n", + "val_data = deepcopy(train_data)\n", + "val_data.num_samples = 100\n", + "\n", + "# Create data loaders\n", + "train_loader = DataLoader(train_data, batch_size=32, 
shuffle=True)\n", + "val_loader = DataLoader(val_data, batch_size=32, shuffle=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class StemBlock(nn.Module):\n", + " \"\"\"Build block input layer of x-vector model\n", + "\n", + " Args:\n", + " in_channels: input channels.\n", + " out_channels: output channels.\n", + " kernel_size: kernels size for the convolution.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " in_channels,\n", + " out_channels,\n", + " kernel_size,\n", + " ):\n", + "\n", + " super().__init__()\n", + "\n", + " self.activation = nn.ReLU()\n", + " padding = int((kernel_size - 1) // 2)\n", + " self.bn = nn.BatchNorm1d(out_channels)\n", + " self.conv = nn.Conv1d(\n", + " in_channels,\n", + " out_channels,\n", + " bias=False,\n", + " kernel_size=kernel_size,\n", + " padding=padding,\n", + " )\n", + "\n", + " def forward(self, x, x_mask=None):\n", + " \"\"\"Forward function.\n", + "\n", + " Args:\n", + " x: input tensor with shape = (batch, in_channels, in_time).\n", + " x_mask: unused.\n", + "\n", + " Returns:\n", + " Tensor with shape = (batch, out_channels, out_time).\n", + " \"\"\"\n", + "\n", + " x = self.conv(x)\n", + " x = self.bn(x)\n", + " x = self.activation(x)\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "class TDNNBlock(nn.Module):\n", + " \"\"\"TDNN Layer block.\n", + "\n", + " Attributes:\n", + " in_channels: input channels.\n", + " out_channels: output channels.\n", + " kernel_size: kernel size.\n", + " dilation: dilation factor of the conv. kernels.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " in_channels,\n", + " out_channels,\n", + " kernel_size=3,\n", + " dilation=1,\n", + " ):\n", + "\n", + " super().__init__()\n", + " self.in_channels = in_channels\n", + " self.out_channels = out_channels\n", + " self.activation = nn.ReLU()\n", + " self.bn = nn.BatchNorm1d(out_channels)\n", + "\n", + " padding = int(dilation * (kernel_size - 1) // 2)\n", + " self.conv = nn.Conv1d(\n", + " in_channels,\n", + " out_channels,\n", + " bias=False,\n", + " kernel_size=kernel_size,\n", + " dilation=dilation,\n", + " padding=padding,\n", + " )\n", + "\n", + "\n", + " def forward(self, x):\n", + " \"\"\"Forward function.\n", + "\n", + " Args:\n", + " x: input tensor with shape = (batch, in_channels, time).\n", + " x_mask: unused.\n", + "\n", + " Returns:\n", + " Tensor with shape = (batch, out_channels, time).\n", + " \"\"\"\n", + " x = self.conv(x)\n", + " x = self.bn(x)\n", + " x = self.activation(x)\n", + " return x\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "class StatsPooling(nn.Module):\n", + " \"\"\"mean + stddev pooling layer.\"\"\"\n", + "\n", + " def forward(self, x):\n", + " x_mean = torch.mean(x, dim=-1)\n", + " x2_mean = torch.mean(x**2, dim=-1)\n", + " x_std = torch.sqrt((x2_mean-x_mean**2).clamp(min=1e-5))\n", + " return torch.cat((x_mean, x_std), dim=-1)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "class TDNNXVec(ht.TorchModel):\n", + "\n", + " def __init__(self, feat_dim, num_layers, layer_dim, embed_dim, num_classes):\n", + " super().__init__()\n", + " self.in_block = StemBlock(feat_dim, layer_dim, kernel_size=5)\n", + " tdnn_layers = []\n", + " for i in range(num_layers):\n", + " layer_i = TDNNBlock(layer_dim, layer_dim, 
kernel_size=3, dilation=i+2)\n", + " tdnn_layers.append(layer_i)\n", + "\n", + " self.tdnn_layers = nn.ModuleList(tdnn_layers)\n", + " self.pooling = StatsPooling()\n", + " self.projection = nn.Linear(2*layer_dim, embed_dim)\n", + " self.output = nn.Linear(embed_dim, num_classes)\n", + "\n", + " def update_loss_margin(self, epoch):\n", + " pass\n", + "\n", + " def forward(self, x, y=None, infer=False):\n", + " x = x.transpose(1,2)\n", + " x = self.in_block(x)\n", + " for i, layer in enumerate(self.tdnn_layers):\n", + " x = layer(x)\n", + "\n", + " x = self.pooling(x)\n", + " z = self.projection(x)\n", + " if infer:\n", + " return z\n", + "\n", + " return self.output(z)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "model = TDNNXVec(16, 2, 32, 16, 100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:optimizer args={'opt_type': 'sgd', 'lr': 0.01, 'momentum': 0.9, 'oss': False}\n", + "INFO:root:lr scheduler args={'lrsch_type': 'exp_lr', 'decay_rate': 0.5, 'decay_steps': 4000, 'hold_steps': 2000, 'warmup_steps': 1000, 'update_lr_on_opt_step': True}\n" + ] + } + ], + "source": [ + "from hyperion.torch.trainers import XVectorTrainer\n", + "from hyperion.torch.metrics import CategoricalAccuracy\n", + "\n", + "optim = {\"opt_type\": \"sgd\", \"lr\": 0.01, \"momentum\": 0.9}\n", + "lrsched = {\"lrsch_type\": \"exp_lr\", \"decay_rate\": 0.5, \"decay_steps\": 4000, \"hold_steps\": 2000, \"warmup_steps\": 1000, \"update_lr_on_opt_step\": True}\n", + "metrics = {\"acc\": CategoricalAccuracy()}\n", + "trainer = XVectorTrainer(model, optim=optim, lrsched=lrsched, exp_path='./tdnn_xvec', device=torch.device('cpu'), train_mode=\"full\", metrics=metrics )" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:epoch: 1/100 starts\n", + "INFO:root:epoch: 1/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 4.601476 acc: 0.018750 lr: 0.000090\n", + "INFO:root:epoch: 1/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 4.598933 acc: 0.018750 lr: 0.000190\n", + "INFO:root:epoch: 1/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 4.598896 acc: 0.018750 lr: 0.000290\n", + "INFO:root:epoch: 1/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 4.598870 acc: 0.016406 lr: 0.000390\n", + "INFO:root:epoch: 1/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 4.598446 acc: 0.016250 lr: 0.000490\n", + "INFO:root:epoch: 1/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 4.596868 acc: 0.015625 lr: 0.000590\n", + "INFO:root:epoch: 1/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 4.596662 acc: 0.016071 lr: 0.000690\n", + "INFO:root:epoch: 1/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 4.596169 acc: 0.016797 lr: 0.000790\n", + "INFO:root:epoch: 1/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 4.596366 acc: 0.017361 lr: 0.000890\n", + "INFO:root:epoch: 1/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 4.598048 acc: 0.016562 lr: 0.000990\n", + "INFO:root:epoch: 1/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 4.597743 acc: 0.017330 lr: 0.001090\n", + "INFO:root:epoch: 1/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 4.597656 acc: 0.017969 lr: 0.001190\n", + "INFO:root:epoch: 1/100 et: 
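
One detail of the StatsPooling cell above deserves a note: the clamp(min=1e-5) on the variance is what keeps the sqrt differentiable in practice, since a constant (or silent) input drives the variance to zero, where the gradient of sqrt blows up. A standalone check with a toy tensor (the threshold value is the notebook's; without the clamp this backward pass produces non-finite gradients):

    import torch

    x = torch.full((1, 4, 10), 3.0, requires_grad=True)  # constant signal -> zero variance
    x_mean = x.mean(dim=-1)
    x2_mean = (x ** 2).mean(dim=-1)
    x_std = torch.sqrt((x2_mean - x_mean ** 2).clamp(min=1e-5))
    x_std.sum().backward()
    assert torch.isfinite(x.grad).all()
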
5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 4.597299 acc: 0.018510 lr: 0.001290\n",
+ "[... per-batch progress lines omitted; end-of-epoch summaries for epochs 1-19 follow ...]\n",
+ "INFO:root:epoch: 1/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 4.597106 acc: 0.018548 lr: 0.003090\n",
+ "INFO:root:epoch: 2/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 4.576722 acc: 0.024798 lr: 0.006220\n",
+ "INFO:root:epoch: 3/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 4.431607 acc: 0.033165 lr: 0.009350\n",
+ "INFO:root:epoch: 4/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 3.937246 acc: 0.076512 lr: 0.010000\n",
+ "INFO:root:epoch: 5/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 3.254701 acc: 0.165625 lr: 0.010000\n",
+ "INFO:root:epoch: 6/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 2.823910 acc: 0.238810 lr: 0.010000\n",
+ "INFO:root:epoch: 7/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 2.570780 acc: 0.290524 lr: 0.009681\n",
+ "INFO:root:epoch: 8/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 2.345128 acc: 0.336492 lr: 0.009170\n",
+ "INFO:root:epoch: 9/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 2.178883 acc: 0.372177 lr: 0.008686\n",
+ "INFO:root:epoch: 10/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 2.013963 acc: 0.417036 lr: 0.008227\n",
+ "INFO:root:epoch: 11/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.895415 acc: 0.441230 lr: 0.007793\n",
+ "INFO:root:epoch: 12/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.773571 acc: 0.480645 lr: 0.007382\n",
+ "INFO:root:epoch: 13/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.700123 acc: 0.495968 lr: 0.006992\n",
+ "INFO:root:epoch: 14/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.618532 acc: 0.527319 lr: 0.006623\n",
+ "INFO:root:epoch: 15/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.560062 acc: 0.536895 lr: 0.006273\n",
+ "INFO:root:epoch: 16/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.491041 acc: 0.555141 lr: 0.005942\n",
+ "INFO:root:epoch: 17/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.412640 acc: 0.577520 lr: 0.005628\n",
+ "INFO:root:epoch: 18/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.379968 acc: 0.586996 lr: 0.005331\n",
+ "INFO:root:epoch: 19/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.325314 acc: 0.598690 lr: 0.005050\n",
+ "INFO:root:epoch: 20/100 starts\n",
+ "INFO:root:epoch: 20/100 et: 9s eta: 4s 
batches: 210/313(67%) samples: 6720 loss: 1.274672 acc: 0.618006 lr: 0.004867\n", + "INFO:root:epoch: 20/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.278945 acc: 0.617045 lr: 0.004858\n", + "INFO:root:epoch: 20/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.284810 acc: 0.616168 lr: 0.004850\n", + "INFO:root:epoch: 20/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.287501 acc: 0.615755 lr: 0.004841\n", + "INFO:root:epoch: 20/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.283725 acc: 0.616625 lr: 0.004833\n", + "INFO:root:epoch: 20/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.285955 acc: 0.615505 lr: 0.004825\n", + "INFO:root:epoch: 20/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.286474 acc: 0.614583 lr: 0.004816\n", + "INFO:root:epoch: 20/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.281536 acc: 0.615737 lr: 0.004808\n", + "INFO:root:epoch: 20/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.279781 acc: 0.616272 lr: 0.004800\n", + "INFO:root:epoch: 20/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.278089 acc: 0.617500 lr: 0.004791\n", + "INFO:root:epoch: 20/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.275513 acc: 0.617641 lr: 0.004783\n", + "INFO:root:epoch: 21/100 starts\n", + "INFO:root:epoch: 21/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.225873 acc: 0.621875 lr: 0.004772\n", + "INFO:root:epoch: 21/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.342575 acc: 0.593750 lr: 0.004764\n", + "INFO:root:epoch: 21/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.354508 acc: 0.590625 lr: 0.004756\n", + "INFO:root:epoch: 21/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.306069 acc: 0.599219 lr: 0.004748\n", + "INFO:root:epoch: 21/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.305087 acc: 0.601875 lr: 0.004739\n", + "INFO:root:epoch: 21/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.281034 acc: 0.611458 lr: 0.004731\n", + "INFO:root:epoch: 21/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.252286 acc: 0.620536 lr: 0.004723\n", + "INFO:root:epoch: 21/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.254262 acc: 0.621094 lr: 0.004715\n", + "INFO:root:epoch: 21/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.265275 acc: 0.620833 lr: 0.004707\n", + "INFO:root:epoch: 21/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.276074 acc: 0.618125 lr: 0.004698\n", + "INFO:root:epoch: 21/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.272756 acc: 0.617045 lr: 0.004690\n", + "INFO:root:epoch: 21/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.277829 acc: 0.615104 lr: 0.004682\n", + "INFO:root:epoch: 21/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.274355 acc: 0.615625 lr: 0.004674\n", + "INFO:root:epoch: 21/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.280641 acc: 0.613616 lr: 0.004666\n", + "INFO:root:epoch: 21/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.284520 acc: 0.613333 lr: 0.004658\n", + "INFO:root:epoch: 21/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.276668 acc: 0.614648 lr: 0.004650\n", + "INFO:root:epoch: 21/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.276837 acc: 0.612500 lr: 0.004642\n", + "INFO:root:epoch: 21/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 
loss: 1.281698 acc: 0.612674 lr: 0.004634\n", + "INFO:root:epoch: 21/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.273948 acc: 0.613816 lr: 0.004626\n", + "INFO:root:epoch: 21/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.263855 acc: 0.616250 lr: 0.004618\n", + "INFO:root:epoch: 21/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.266876 acc: 0.616815 lr: 0.004610\n", + "INFO:root:epoch: 21/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.269532 acc: 0.615625 lr: 0.004602\n", + "INFO:root:epoch: 21/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.269190 acc: 0.616576 lr: 0.004594\n", + "INFO:root:epoch: 21/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.270316 acc: 0.614844 lr: 0.004586\n", + "INFO:root:epoch: 21/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.269163 acc: 0.614125 lr: 0.004578\n", + "INFO:root:epoch: 21/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.272235 acc: 0.613702 lr: 0.004570\n", + "INFO:root:epoch: 21/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.272532 acc: 0.614583 lr: 0.004562\n", + "INFO:root:epoch: 21/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.270373 acc: 0.614955 lr: 0.004554\n", + "INFO:root:epoch: 21/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.268165 acc: 0.614978 lr: 0.004546\n", + "INFO:root:epoch: 21/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.272549 acc: 0.614479 lr: 0.004538\n", + "INFO:root:epoch: 21/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.274493 acc: 0.613609 lr: 0.004531\n", + "INFO:root:epoch: 22/100 starts\n", + "INFO:root:epoch: 22/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.269285 acc: 0.631250 lr: 0.004520\n", + "INFO:root:epoch: 22/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.262039 acc: 0.620312 lr: 0.004513\n", + "INFO:root:epoch: 22/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.256257 acc: 0.618750 lr: 0.004505\n", + "INFO:root:epoch: 22/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.254888 acc: 0.620312 lr: 0.004497\n", + "INFO:root:epoch: 22/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.241882 acc: 0.625000 lr: 0.004489\n", + "INFO:root:epoch: 22/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.238519 acc: 0.621875 lr: 0.004481\n", + "INFO:root:epoch: 22/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.222724 acc: 0.625446 lr: 0.004474\n", + "INFO:root:epoch: 22/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.206227 acc: 0.632812 lr: 0.004466\n", + "INFO:root:epoch: 22/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.200030 acc: 0.632292 lr: 0.004458\n", + "INFO:root:epoch: 22/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.195600 acc: 0.635313 lr: 0.004450\n", + "INFO:root:epoch: 22/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.191617 acc: 0.640341 lr: 0.004443\n", + "INFO:root:epoch: 22/100 et: 5s eta: 9s batches: 120/313(38%) samples: 3840 loss: 1.192565 acc: 0.638802 lr: 0.004435\n", + "INFO:root:epoch: 22/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.197187 acc: 0.639183 lr: 0.004427\n", + "INFO:root:epoch: 22/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 1.200389 acc: 0.637946 lr: 0.004420\n", + "INFO:root:epoch: 22/100 et: 7s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.200150 acc: 0.636042 lr: 
0.004412\n", + "INFO:root:epoch: 22/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.198456 acc: 0.636719 lr: 0.004404\n", + "INFO:root:epoch: 22/100 et: 8s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.194059 acc: 0.638419 lr: 0.004397\n", + "INFO:root:epoch: 22/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.188364 acc: 0.640625 lr: 0.004389\n", + "INFO:root:epoch: 22/100 et: 9s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.187410 acc: 0.640296 lr: 0.004382\n", + "INFO:root:epoch: 22/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.189615 acc: 0.640469 lr: 0.004374\n", + "INFO:root:epoch: 22/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.187115 acc: 0.642411 lr: 0.004366\n", + "INFO:root:epoch: 22/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.193034 acc: 0.640767 lr: 0.004359\n", + "INFO:root:epoch: 22/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.191524 acc: 0.640897 lr: 0.004351\n", + "INFO:root:epoch: 22/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.191986 acc: 0.641536 lr: 0.004344\n", + "INFO:root:epoch: 22/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.194074 acc: 0.641000 lr: 0.004336\n", + "INFO:root:epoch: 22/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.187428 acc: 0.643990 lr: 0.004329\n", + "INFO:root:epoch: 22/100 et: 12s eta: 2s batches: 270/313(86%) samples: 8640 loss: 1.182578 acc: 0.645949 lr: 0.004321\n", + "INFO:root:epoch: 22/100 et: 13s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.183534 acc: 0.646652 lr: 0.004314\n", + "INFO:root:epoch: 22/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.182079 acc: 0.647306 lr: 0.004306\n", + "INFO:root:epoch: 22/100 et: 14s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.181875 acc: 0.646979 lr: 0.004299\n", + "INFO:root:epoch: 22/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.180883 acc: 0.647379 lr: 0.004291\n", + "INFO:root:epoch: 23/100 starts\n", + "INFO:root:epoch: 23/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.335497 acc: 0.603125 lr: 0.004282\n", + "INFO:root:epoch: 23/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.256130 acc: 0.623437 lr: 0.004274\n", + "INFO:root:epoch: 23/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.220576 acc: 0.625000 lr: 0.004267\n", + "INFO:root:epoch: 23/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.215305 acc: 0.626563 lr: 0.004259\n", + "INFO:root:epoch: 23/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.196881 acc: 0.626250 lr: 0.004252\n", + "INFO:root:epoch: 23/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.220484 acc: 0.626042 lr: 0.004245\n", + "INFO:root:epoch: 23/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.205832 acc: 0.625893 lr: 0.004237\n", + "INFO:root:epoch: 23/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.209833 acc: 0.628125 lr: 0.004230\n", + "INFO:root:epoch: 23/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.212374 acc: 0.626736 lr: 0.004223\n", + "INFO:root:epoch: 23/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.205179 acc: 0.629688 lr: 0.004215\n", + "INFO:root:epoch: 23/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.206868 acc: 0.629261 lr: 0.004208\n", + "INFO:root:epoch: 23/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.201353 acc: 0.630990 lr: 0.004201\n", + "INFO:root:epoch: 23/100 
et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.207954 acc: 0.630769 lr: 0.004194\n", + "INFO:root:epoch: 23/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.211173 acc: 0.630134 lr: 0.004186\n", + "INFO:root:epoch: 23/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.209848 acc: 0.632083 lr: 0.004179\n", + "INFO:root:epoch: 23/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.210798 acc: 0.630469 lr: 0.004172\n", + "INFO:root:epoch: 23/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.209039 acc: 0.630882 lr: 0.004165\n", + "INFO:root:epoch: 23/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.211289 acc: 0.631771 lr: 0.004157\n", + "INFO:root:epoch: 23/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.219773 acc: 0.628783 lr: 0.004150\n", + "INFO:root:epoch: 23/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.214747 acc: 0.629375 lr: 0.004143\n", + "INFO:root:epoch: 23/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.218350 acc: 0.628571 lr: 0.004136\n", + "INFO:root:epoch: 23/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.219696 acc: 0.629545 lr: 0.004129\n", + "INFO:root:epoch: 23/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.218164 acc: 0.629755 lr: 0.004122\n", + "INFO:root:epoch: 23/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.224656 acc: 0.627734 lr: 0.004114\n", + "INFO:root:epoch: 23/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.228053 acc: 0.625875 lr: 0.004107\n", + "INFO:root:epoch: 23/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.226239 acc: 0.625841 lr: 0.004100\n", + "INFO:root:epoch: 23/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.223572 acc: 0.625810 lr: 0.004093\n", + "INFO:root:epoch: 23/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.221086 acc: 0.626786 lr: 0.004086\n", + "INFO:root:epoch: 23/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.223270 acc: 0.626401 lr: 0.004079\n", + "INFO:root:epoch: 23/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.221303 acc: 0.626979 lr: 0.004072\n", + "INFO:root:epoch: 23/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.218277 acc: 0.627419 lr: 0.004065\n", + "INFO:root:epoch: 24/100 starts\n", + "INFO:root:epoch: 24/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.220275 acc: 0.634375 lr: 0.004056\n", + "INFO:root:epoch: 24/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.152032 acc: 0.654688 lr: 0.004049\n", + "INFO:root:epoch: 24/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.201902 acc: 0.634375 lr: 0.004042\n", + "INFO:root:epoch: 24/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.187698 acc: 0.639063 lr: 0.004035\n", + "INFO:root:epoch: 24/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.199120 acc: 0.638750 lr: 0.004028\n", + "INFO:root:epoch: 24/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.171957 acc: 0.648438 lr: 0.004021\n", + "INFO:root:epoch: 24/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.179553 acc: 0.645536 lr: 0.004014\n", + "INFO:root:epoch: 24/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.193809 acc: 0.642578 lr: 0.004007\n", + "INFO:root:epoch: 24/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.189958 acc: 0.641667 lr: 0.004000\n", + "INFO:root:epoch: 24/100 et: 4s eta: 9s batches: 100/313(31%) 
samples: 3200 loss: 1.191840 acc: 0.638438 lr: 0.003993\n", + "INFO:root:epoch: 24/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.175894 acc: 0.644602 lr: 0.003986\n", + "INFO:root:epoch: 24/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.177396 acc: 0.642708 lr: 0.003979\n", + "INFO:root:epoch: 24/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.178631 acc: 0.640865 lr: 0.003972\n", + "INFO:root:epoch: 24/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.181614 acc: 0.639732 lr: 0.003965\n", + "INFO:root:epoch: 24/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.179924 acc: 0.639375 lr: 0.003958\n", + "INFO:root:epoch: 24/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.183492 acc: 0.637305 lr: 0.003952\n", + "INFO:root:epoch: 24/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.179016 acc: 0.638419 lr: 0.003945\n", + "INFO:root:epoch: 24/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.174863 acc: 0.640278 lr: 0.003938\n", + "INFO:root:epoch: 24/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.176255 acc: 0.641118 lr: 0.003931\n", + "INFO:root:epoch: 24/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.176602 acc: 0.640625 lr: 0.003924\n", + "INFO:root:epoch: 24/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.174983 acc: 0.641220 lr: 0.003917\n", + "INFO:root:epoch: 24/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.178939 acc: 0.640199 lr: 0.003911\n", + "INFO:root:epoch: 24/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.180348 acc: 0.639946 lr: 0.003904\n", + "INFO:root:epoch: 24/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.179358 acc: 0.641536 lr: 0.003897\n", + "INFO:root:epoch: 24/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.172712 acc: 0.643625 lr: 0.003890\n", + "INFO:root:epoch: 24/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.169369 acc: 0.642668 lr: 0.003884\n", + "INFO:root:epoch: 24/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.169085 acc: 0.642708 lr: 0.003877\n", + "INFO:root:epoch: 24/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.167223 acc: 0.643527 lr: 0.003870\n", + "INFO:root:epoch: 24/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.164962 acc: 0.643750 lr: 0.003864\n", + "INFO:root:epoch: 24/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.163042 acc: 0.643958 lr: 0.003857\n", + "INFO:root:epoch: 24/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.163734 acc: 0.643750 lr: 0.003850\n", + "INFO:root:epoch: 25/100 starts\n", + "INFO:root:epoch: 25/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 1.140937 acc: 0.643750 lr: 0.003842\n", + "INFO:root:epoch: 25/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.089915 acc: 0.660937 lr: 0.003835\n", + "INFO:root:epoch: 25/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 1.104070 acc: 0.659375 lr: 0.003828\n", + "INFO:root:epoch: 25/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.111904 acc: 0.656250 lr: 0.003822\n", + "INFO:root:epoch: 25/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.104860 acc: 0.661250 lr: 0.003815\n", + "INFO:root:epoch: 25/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.106434 acc: 0.659896 lr: 0.003808\n", + "INFO:root:epoch: 25/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.104390 acc: 
0.660714 lr: 0.003802\n", + "INFO:root:epoch: 25/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.096111 acc: 0.663281 lr: 0.003795\n", + "INFO:root:epoch: 25/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.104070 acc: 0.662847 lr: 0.003789\n", + "INFO:root:epoch: 25/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.088546 acc: 0.667812 lr: 0.003782\n", + "INFO:root:epoch: 25/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.088519 acc: 0.669886 lr: 0.003776\n", + "INFO:root:epoch: 25/100 et: 5s eta: 9s batches: 120/313(38%) samples: 3840 loss: 1.094578 acc: 0.669010 lr: 0.003769\n", + "INFO:root:epoch: 25/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.099351 acc: 0.667548 lr: 0.003762\n", + "INFO:root:epoch: 25/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 1.101559 acc: 0.664509 lr: 0.003756\n", + "INFO:root:epoch: 25/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.098943 acc: 0.666458 lr: 0.003749\n", + "INFO:root:epoch: 25/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.095799 acc: 0.667578 lr: 0.003743\n", + "INFO:root:epoch: 25/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.085671 acc: 0.671324 lr: 0.003736\n", + "INFO:root:epoch: 25/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.087780 acc: 0.671354 lr: 0.003730\n", + "INFO:root:epoch: 25/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.086485 acc: 0.671217 lr: 0.003724\n", + "INFO:root:epoch: 25/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.092644 acc: 0.669219 lr: 0.003717\n", + "INFO:root:epoch: 25/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.095882 acc: 0.669643 lr: 0.003711\n", + "INFO:root:epoch: 25/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.095610 acc: 0.670312 lr: 0.003704\n", + "INFO:root:epoch: 25/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.094495 acc: 0.670516 lr: 0.003698\n", + "INFO:root:epoch: 25/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.093734 acc: 0.671484 lr: 0.003691\n", + "INFO:root:epoch: 25/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.091036 acc: 0.672125 lr: 0.003685\n", + "INFO:root:epoch: 25/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.096078 acc: 0.670673 lr: 0.003679\n", + "INFO:root:epoch: 25/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.102640 acc: 0.668287 lr: 0.003672\n", + "INFO:root:epoch: 25/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.105674 acc: 0.666853 lr: 0.003666\n", + "INFO:root:epoch: 25/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.107556 acc: 0.666272 lr: 0.003660\n", + "INFO:root:epoch: 25/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.108547 acc: 0.665000 lr: 0.003653\n", + "INFO:root:epoch: 25/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.110510 acc: 0.663810 lr: 0.003647\n", + "INFO:root:epoch: 26/100 starts\n", + "INFO:root:epoch: 26/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.032128 acc: 0.675000 lr: 0.003639\n", + "INFO:root:epoch: 26/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.131578 acc: 0.660937 lr: 0.003632\n", + "INFO:root:epoch: 26/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.152281 acc: 0.651042 lr: 0.003626\n", + "INFO:root:epoch: 26/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.123092 acc: 0.660156 lr: 0.003620\n", + 
"INFO:root:epoch: 26/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.104351 acc: 0.666250 lr: 0.003614\n", + "INFO:root:epoch: 26/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.095404 acc: 0.663542 lr: 0.003607\n", + "INFO:root:epoch: 26/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.109545 acc: 0.657589 lr: 0.003601\n", + "INFO:root:epoch: 26/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.099196 acc: 0.661719 lr: 0.003595\n", + "INFO:root:epoch: 26/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.082922 acc: 0.669444 lr: 0.003589\n", + "INFO:root:epoch: 26/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.089567 acc: 0.666250 lr: 0.003582\n", + "INFO:root:epoch: 26/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.100713 acc: 0.662500 lr: 0.003576\n", + "INFO:root:epoch: 26/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.100876 acc: 0.664844 lr: 0.003570\n", + "INFO:root:epoch: 26/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.099064 acc: 0.666106 lr: 0.003564\n", + "INFO:root:epoch: 26/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.102101 acc: 0.665402 lr: 0.003558\n", + "INFO:root:epoch: 26/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.097139 acc: 0.667083 lr: 0.003551\n", + "INFO:root:epoch: 26/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 1.098766 acc: 0.666406 lr: 0.003545\n", + "INFO:root:epoch: 26/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.102775 acc: 0.664890 lr: 0.003539\n", + "INFO:root:epoch: 26/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.102847 acc: 0.665104 lr: 0.003533\n", + "INFO:root:epoch: 26/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.104444 acc: 0.663158 lr: 0.003527\n", + "INFO:root:epoch: 26/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.108503 acc: 0.662656 lr: 0.003521\n", + "INFO:root:epoch: 26/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.100149 acc: 0.663542 lr: 0.003515\n", + "INFO:root:epoch: 26/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.092307 acc: 0.666619 lr: 0.003509\n", + "INFO:root:epoch: 26/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.098929 acc: 0.663859 lr: 0.003503\n", + "INFO:root:epoch: 26/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.098367 acc: 0.664453 lr: 0.003497\n", + "INFO:root:epoch: 26/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.101024 acc: 0.664125 lr: 0.003490\n", + "INFO:root:epoch: 26/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.100228 acc: 0.664303 lr: 0.003484\n", + "INFO:root:epoch: 26/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.100727 acc: 0.663426 lr: 0.003478\n", + "INFO:root:epoch: 26/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.097767 acc: 0.663504 lr: 0.003472\n", + "INFO:root:epoch: 26/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.094813 acc: 0.664547 lr: 0.003466\n", + "INFO:root:epoch: 26/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.095614 acc: 0.665312 lr: 0.003460\n", + "INFO:root:epoch: 26/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.097389 acc: 0.665020 lr: 0.003454\n", + "INFO:root:epoch: 27/100 starts\n", + "INFO:root:epoch: 27/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.184505 acc: 0.662500 lr: 0.003447\n", + "INFO:root:epoch: 27/100 et: 0s eta: 
13s batches: 20/313(6%) samples: 640 loss: 1.138632 acc: 0.659375 lr: 0.003441\n", + "INFO:root:epoch: 27/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.120402 acc: 0.669792 lr: 0.003435\n", + "INFO:root:epoch: 27/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.113013 acc: 0.670312 lr: 0.003429\n", + "INFO:root:epoch: 27/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.081761 acc: 0.670625 lr: 0.003423\n", + "INFO:root:epoch: 27/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.101537 acc: 0.664062 lr: 0.003417\n", + "INFO:root:epoch: 27/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.086233 acc: 0.666518 lr: 0.003411\n", + "INFO:root:epoch: 27/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.086780 acc: 0.663281 lr: 0.003405\n", + "INFO:root:epoch: 27/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.085530 acc: 0.664931 lr: 0.003399\n", + "INFO:root:epoch: 27/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.083584 acc: 0.665000 lr: 0.003393\n", + "INFO:root:epoch: 27/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.077050 acc: 0.667898 lr: 0.003387\n", + "INFO:root:epoch: 27/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.086782 acc: 0.665885 lr: 0.003382\n", + "INFO:root:epoch: 27/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.092208 acc: 0.663462 lr: 0.003376\n", + "INFO:root:epoch: 27/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 1.100480 acc: 0.661830 lr: 0.003370\n", + "INFO:root:epoch: 27/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.097543 acc: 0.662708 lr: 0.003364\n", + "INFO:root:epoch: 27/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.103326 acc: 0.661914 lr: 0.003358\n", + "INFO:root:epoch: 27/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.094556 acc: 0.664154 lr: 0.003352\n", + "INFO:root:epoch: 27/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.097146 acc: 0.662674 lr: 0.003347\n", + "INFO:root:epoch: 27/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.097222 acc: 0.662829 lr: 0.003341\n", + "INFO:root:epoch: 27/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.095615 acc: 0.663281 lr: 0.003335\n", + "INFO:root:epoch: 27/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.095298 acc: 0.662649 lr: 0.003329\n", + "INFO:root:epoch: 27/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.093007 acc: 0.662358 lr: 0.003323\n", + "INFO:root:epoch: 27/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.097329 acc: 0.660462 lr: 0.003318\n", + "INFO:root:epoch: 27/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.101940 acc: 0.659635 lr: 0.003312\n", + "INFO:root:epoch: 27/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.101342 acc: 0.658500 lr: 0.003306\n", + "INFO:root:epoch: 27/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.098830 acc: 0.660096 lr: 0.003300\n", + "INFO:root:epoch: 27/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.104528 acc: 0.659838 lr: 0.003295\n", + "INFO:root:epoch: 27/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.100905 acc: 0.661272 lr: 0.003289\n", + "INFO:root:epoch: 27/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.102857 acc: 0.660560 lr: 0.003283\n", + "INFO:root:epoch: 27/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.105049 acc: 0.659792 lr: 
0.003278\n", + "INFO:root:epoch: 27/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.103576 acc: 0.660081 lr: 0.003272\n", + "INFO:root:epoch: 28/100 starts\n", + "INFO:root:epoch: 28/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.227883 acc: 0.637500 lr: 0.003265\n", + "INFO:root:epoch: 28/100 et: 0s eta: 14s batches: 20/313(6%) samples: 640 loss: 1.145105 acc: 0.670313 lr: 0.003259\n", + "INFO:root:epoch: 28/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 1.129914 acc: 0.670833 lr: 0.003253\n", + "INFO:root:epoch: 28/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.166356 acc: 0.654687 lr: 0.003248\n", + "INFO:root:epoch: 28/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.128786 acc: 0.663750 lr: 0.003242\n", + "INFO:root:epoch: 28/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.114269 acc: 0.670833 lr: 0.003236\n", + "INFO:root:epoch: 28/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.103913 acc: 0.672321 lr: 0.003231\n", + "INFO:root:epoch: 28/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.088877 acc: 0.677344 lr: 0.003225\n", + "INFO:root:epoch: 28/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.089763 acc: 0.677083 lr: 0.003220\n", + "INFO:root:epoch: 28/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.081860 acc: 0.675000 lr: 0.003214\n", + "INFO:root:epoch: 28/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.081396 acc: 0.676705 lr: 0.003209\n", + "INFO:root:epoch: 28/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.083598 acc: 0.672656 lr: 0.003203\n", + "INFO:root:epoch: 28/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.074400 acc: 0.675000 lr: 0.003197\n", + "INFO:root:epoch: 28/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.066627 acc: 0.675223 lr: 0.003192\n", + "INFO:root:epoch: 28/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.074734 acc: 0.672083 lr: 0.003186\n", + "INFO:root:epoch: 28/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.073389 acc: 0.674219 lr: 0.003181\n", + "INFO:root:epoch: 28/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.073848 acc: 0.675184 lr: 0.003175\n", + "INFO:root:epoch: 28/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.065460 acc: 0.676562 lr: 0.003170\n", + "INFO:root:epoch: 28/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.056416 acc: 0.679605 lr: 0.003164\n", + "INFO:root:epoch: 28/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.055903 acc: 0.677656 lr: 0.003159\n", + "INFO:root:epoch: 28/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.056110 acc: 0.676637 lr: 0.003153\n", + "INFO:root:epoch: 28/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.054593 acc: 0.677131 lr: 0.003148\n", + "INFO:root:epoch: 28/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.055392 acc: 0.675815 lr: 0.003143\n", + "INFO:root:epoch: 28/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.053244 acc: 0.675911 lr: 0.003137\n", + "INFO:root:epoch: 28/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.051967 acc: 0.675625 lr: 0.003132\n", + "INFO:root:epoch: 28/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.052468 acc: 0.675361 lr: 0.003126\n", + "INFO:root:epoch: 28/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.052359 acc: 0.676042 lr: 0.003121\n", + "INFO:root:epoch: 28/100 et: 
12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.051918 acc: 0.676451 lr: 0.003115\n", + "INFO:root:epoch: 28/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.052795 acc: 0.675754 lr: 0.003110\n", + "INFO:root:epoch: 28/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.060497 acc: 0.674167 lr: 0.003105\n", + "INFO:root:epoch: 28/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.061997 acc: 0.673286 lr: 0.003099\n", + "INFO:root:epoch: 29/100 starts\n", + "INFO:root:epoch: 29/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.970789 acc: 0.700000 lr: 0.003092\n", + "INFO:root:epoch: 29/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.958573 acc: 0.709375 lr: 0.003087\n", + "INFO:root:epoch: 29/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.977617 acc: 0.702083 lr: 0.003082\n", + "INFO:root:epoch: 29/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.991939 acc: 0.691406 lr: 0.003076\n", + "INFO:root:epoch: 29/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.050411 acc: 0.686250 lr: 0.003071\n", + "INFO:root:epoch: 29/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.055734 acc: 0.684375 lr: 0.003066\n", + "INFO:root:epoch: 29/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.057889 acc: 0.683929 lr: 0.003060\n", + "INFO:root:epoch: 29/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.046052 acc: 0.686719 lr: 0.003055\n", + "INFO:root:epoch: 29/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.056212 acc: 0.683681 lr: 0.003050\n", + "INFO:root:epoch: 29/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.045250 acc: 0.687813 lr: 0.003044\n", + "INFO:root:epoch: 29/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.047811 acc: 0.685795 lr: 0.003039\n", + "INFO:root:epoch: 29/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.047982 acc: 0.685156 lr: 0.003034\n", + "INFO:root:epoch: 29/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.045031 acc: 0.684135 lr: 0.003029\n", + "INFO:root:epoch: 29/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 1.062456 acc: 0.680134 lr: 0.003023\n", + "INFO:root:epoch: 29/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.055318 acc: 0.682292 lr: 0.003018\n", + "INFO:root:epoch: 29/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.055131 acc: 0.683398 lr: 0.003013\n", + "INFO:root:epoch: 29/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.054986 acc: 0.682353 lr: 0.003008\n", + "INFO:root:epoch: 29/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.056818 acc: 0.681771 lr: 0.003003\n", + "INFO:root:epoch: 29/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.054681 acc: 0.681414 lr: 0.002997\n", + "INFO:root:epoch: 29/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.049460 acc: 0.682969 lr: 0.002992\n", + "INFO:root:epoch: 29/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.051103 acc: 0.682887 lr: 0.002987\n", + "INFO:root:epoch: 29/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.052097 acc: 0.682386 lr: 0.002982\n", + "INFO:root:epoch: 29/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.051775 acc: 0.682065 lr: 0.002977\n", + "INFO:root:epoch: 29/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.051963 acc: 0.681641 lr: 0.002971\n", + "INFO:root:epoch: 29/100 et: 11s eta: 2s batches: 250/313(79%) samples: 
8000 loss: 1.051166 acc: 0.682250 lr: 0.002966\n", + "INFO:root:epoch: 29/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.053132 acc: 0.681611 lr: 0.002961\n", + "INFO:root:epoch: 29/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.048926 acc: 0.683218 lr: 0.002956\n", + "INFO:root:epoch: 29/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.055068 acc: 0.680246 lr: 0.002951\n", + "INFO:root:epoch: 29/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.052462 acc: 0.681034 lr: 0.002946\n", + "INFO:root:epoch: 29/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.053347 acc: 0.680313 lr: 0.002941\n", + "INFO:root:epoch: 29/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.051773 acc: 0.679335 lr: 0.002936\n", + "INFO:root:epoch: 30/100 starts\n", + "INFO:root:epoch: 30/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.058173 acc: 0.687500 lr: 0.002929\n", + "INFO:root:epoch: 30/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.028388 acc: 0.709375 lr: 0.002924\n", + "INFO:root:epoch: 30/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 1.005509 acc: 0.704167 lr: 0.002919\n", + "INFO:root:epoch: 30/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.011474 acc: 0.703906 lr: 0.002914\n", + "INFO:root:epoch: 30/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.029843 acc: 0.694375 lr: 0.002909\n", + "INFO:root:epoch: 30/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.032557 acc: 0.692188 lr: 0.002904\n", + "INFO:root:epoch: 30/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.029639 acc: 0.687946 lr: 0.002899\n", + "INFO:root:epoch: 30/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.019300 acc: 0.691797 lr: 0.002894\n", + "INFO:root:epoch: 30/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.022530 acc: 0.690278 lr: 0.002889\n", + "INFO:root:epoch: 30/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.038184 acc: 0.687188 lr: 0.002884\n", + "INFO:root:epoch: 30/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.032987 acc: 0.690341 lr: 0.002879\n", + "INFO:root:epoch: 30/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.038354 acc: 0.688021 lr: 0.002874\n", + "INFO:root:epoch: 30/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.044560 acc: 0.686779 lr: 0.002869\n", + "INFO:root:epoch: 30/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.046210 acc: 0.685268 lr: 0.002864\n", + "INFO:root:epoch: 30/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.041039 acc: 0.684792 lr: 0.002859\n", + "INFO:root:epoch: 30/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.040980 acc: 0.683008 lr: 0.002854\n", + "INFO:root:epoch: 30/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.038761 acc: 0.684559 lr: 0.002849\n", + "INFO:root:epoch: 30/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.037539 acc: 0.683333 lr: 0.002844\n", + "INFO:root:epoch: 30/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.029771 acc: 0.684868 lr: 0.002839\n", + "INFO:root:epoch: 30/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.030799 acc: 0.685156 lr: 0.002834\n", + "INFO:root:epoch: 30/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.030693 acc: 0.686012 lr: 0.002829\n", + "INFO:root:epoch: 30/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.032144 acc: 0.685511 lr: 
0.002824\n", + "INFO:root:epoch: 30/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.032008 acc: 0.683967 lr: 0.002819\n", + "INFO:root:epoch: 30/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.028640 acc: 0.684896 lr: 0.002815\n", + "INFO:root:epoch: 30/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.025480 acc: 0.686500 lr: 0.002810\n", + "INFO:root:epoch: 30/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.027657 acc: 0.684976 lr: 0.002805\n", + "INFO:root:epoch: 30/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 1.031818 acc: 0.683565 lr: 0.002800\n", + "INFO:root:epoch: 30/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.026295 acc: 0.684933 lr: 0.002795\n", + "INFO:root:epoch: 30/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.026661 acc: 0.685453 lr: 0.002790\n", + "INFO:root:epoch: 30/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.025083 acc: 0.685625 lr: 0.002785\n", + "INFO:root:epoch: 30/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.023058 acc: 0.686794 lr: 0.002781\n", + "INFO:root:epoch: 31/100 starts\n", + "INFO:root:epoch: 31/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.878643 acc: 0.725000 lr: 0.002774\n", + "INFO:root:epoch: 31/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.947811 acc: 0.707812 lr: 0.002770\n", + "INFO:root:epoch: 31/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.990170 acc: 0.684375 lr: 0.002765\n", + "INFO:root:epoch: 31/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.010024 acc: 0.678906 lr: 0.002760\n", + "INFO:root:epoch: 31/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.987055 acc: 0.692500 lr: 0.002755\n", + "INFO:root:epoch: 31/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.997396 acc: 0.686979 lr: 0.002750\n", + "INFO:root:epoch: 31/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.007594 acc: 0.682143 lr: 0.002746\n", + "INFO:root:epoch: 31/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.005843 acc: 0.683594 lr: 0.002741\n", + "INFO:root:epoch: 31/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.002664 acc: 0.683681 lr: 0.002736\n", + "INFO:root:epoch: 31/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.002633 acc: 0.682812 lr: 0.002731\n", + "INFO:root:epoch: 31/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.006251 acc: 0.684091 lr: 0.002727\n", + "INFO:root:epoch: 31/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.006203 acc: 0.684635 lr: 0.002722\n", + "INFO:root:epoch: 31/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.008185 acc: 0.685096 lr: 0.002717\n", + "INFO:root:epoch: 31/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.006810 acc: 0.686607 lr: 0.002713\n", + "INFO:root:epoch: 31/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.003456 acc: 0.688333 lr: 0.002708\n", + "INFO:root:epoch: 31/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.008638 acc: 0.688477 lr: 0.002703\n", + "INFO:root:epoch: 31/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 1.011245 acc: 0.688787 lr: 0.002699\n", + "INFO:root:epoch: 31/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 1.007533 acc: 0.688889 lr: 0.002694\n", + "INFO:root:epoch: 31/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 1.008936 acc: 0.688322 lr: 0.002689\n", + "INFO:root:epoch: 31/100 
et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 1.008792 acc: 0.687969 lr: 0.002685\n", + "INFO:root:epoch: 31/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 1.008772 acc: 0.688542 lr: 0.002680\n", + "INFO:root:epoch: 31/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 1.004743 acc: 0.689773 lr: 0.002675\n", + "INFO:root:epoch: 31/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 1.006531 acc: 0.689946 lr: 0.002671\n", + "INFO:root:epoch: 31/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 1.007611 acc: 0.690495 lr: 0.002666\n", + "INFO:root:epoch: 31/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 1.009190 acc: 0.689000 lr: 0.002661\n", + "INFO:root:epoch: 31/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 1.009587 acc: 0.688101 lr: 0.002657\n", + "INFO:root:epoch: 31/100 et: 12s eta: 2s batches: 270/313(86%) samples: 8640 loss: 1.005554 acc: 0.689236 lr: 0.002652\n", + "INFO:root:epoch: 31/100 et: 13s eta: 1s batches: 280/313(89%) samples: 8960 loss: 1.007945 acc: 0.687612 lr: 0.002648\n", + "INFO:root:epoch: 31/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 1.008952 acc: 0.688470 lr: 0.002643\n", + "INFO:root:epoch: 31/100 et: 14s eta: 0s batches: 300/313(95%) samples: 9600 loss: 1.002400 acc: 0.690521 lr: 0.002638\n", + "INFO:root:epoch: 31/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 1.000607 acc: 0.691331 lr: 0.002634\n", + "INFO:root:epoch: 32/100 starts\n", + "INFO:root:epoch: 32/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 1.120256 acc: 0.646875 lr: 0.002628\n", + "INFO:root:epoch: 32/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 1.032334 acc: 0.671875 lr: 0.002623\n", + "INFO:root:epoch: 32/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 1.055711 acc: 0.673958 lr: 0.002619\n", + "INFO:root:epoch: 32/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 1.031622 acc: 0.691406 lr: 0.002614\n", + "INFO:root:epoch: 32/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 1.043867 acc: 0.685000 lr: 0.002610\n", + "INFO:root:epoch: 32/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 1.012254 acc: 0.692188 lr: 0.002605\n", + "INFO:root:epoch: 32/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 1.005487 acc: 0.685714 lr: 0.002601\n", + "INFO:root:epoch: 32/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 1.022546 acc: 0.682422 lr: 0.002596\n", + "INFO:root:epoch: 32/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 1.046185 acc: 0.678125 lr: 0.002592\n", + "INFO:root:epoch: 32/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 1.050502 acc: 0.675938 lr: 0.002587\n", + "INFO:root:epoch: 32/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 1.039648 acc: 0.680114 lr: 0.002583\n", + "INFO:root:epoch: 32/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 1.029496 acc: 0.682552 lr: 0.002578\n", + "INFO:root:epoch: 32/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 1.033686 acc: 0.681971 lr: 0.002574\n", + "INFO:root:epoch: 32/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 1.024202 acc: 0.685714 lr: 0.002569\n", + "INFO:root:epoch: 32/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 1.021629 acc: 0.686875 lr: 0.002565\n", + "INFO:root:epoch: 32/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 1.012940 acc: 0.689844 lr: 0.002561\n", + "INFO:root:epoch: 32/100 et: 7s eta: 6s batches: 170/313(54%) 
+    "[... ~600 per-batch INFO log lines omitted (epochs 32-51 of 100, 313 batches/epoch): loss decreases from ~1.00 to ~0.86, accuracy rises from ~0.69 to ~0.74, lr decays from ~0.00256 to ~0.00090 ...]\n",
batches: 230/313(73%) samples: 7360 loss: 0.857090 acc: 0.736957 lr: 0.000903\n", + "INFO:root:epoch: 51/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.855965 acc: 0.737109 lr: 0.000901\n", + "INFO:root:epoch: 51/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.856065 acc: 0.736125 lr: 0.000899\n", + "INFO:root:epoch: 51/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.858335 acc: 0.735577 lr: 0.000898\n", + "INFO:root:epoch: 51/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.863159 acc: 0.734722 lr: 0.000896\n", + "INFO:root:epoch: 51/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.863976 acc: 0.734263 lr: 0.000895\n", + "INFO:root:epoch: 51/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.863092 acc: 0.734698 lr: 0.000893\n", + "INFO:root:epoch: 51/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.863980 acc: 0.734375 lr: 0.000892\n", + "INFO:root:epoch: 51/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.867559 acc: 0.732863 lr: 0.000890\n", + "INFO:root:epoch: 52/100 starts\n", + "INFO:root:epoch: 52/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.743803 acc: 0.771875 lr: 0.000888\n", + "INFO:root:epoch: 52/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.762042 acc: 0.759375 lr: 0.000887\n", + "INFO:root:epoch: 52/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.825565 acc: 0.748958 lr: 0.000885\n", + "INFO:root:epoch: 52/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.841342 acc: 0.739062 lr: 0.000884\n", + "INFO:root:epoch: 52/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.845629 acc: 0.740625 lr: 0.000882\n", + "INFO:root:epoch: 52/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.834545 acc: 0.738542 lr: 0.000881\n", + "INFO:root:epoch: 52/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.843534 acc: 0.736607 lr: 0.000879\n", + "INFO:root:epoch: 52/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.842049 acc: 0.736328 lr: 0.000877\n", + "INFO:root:epoch: 52/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.827880 acc: 0.740625 lr: 0.000876\n", + "INFO:root:epoch: 52/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.820315 acc: 0.742812 lr: 0.000874\n", + "INFO:root:epoch: 52/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.825259 acc: 0.743182 lr: 0.000873\n", + "INFO:root:epoch: 52/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.830174 acc: 0.741146 lr: 0.000871\n", + "INFO:root:epoch: 52/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.833646 acc: 0.739423 lr: 0.000870\n", + "INFO:root:epoch: 52/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.833089 acc: 0.739955 lr: 0.000868\n", + "INFO:root:epoch: 52/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.829194 acc: 0.741042 lr: 0.000867\n", + "INFO:root:epoch: 52/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.834378 acc: 0.740820 lr: 0.000865\n", + "INFO:root:epoch: 52/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.832258 acc: 0.740993 lr: 0.000864\n", + "INFO:root:epoch: 52/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.829481 acc: 0.740625 lr: 0.000862\n", + "INFO:root:epoch: 52/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.831247 acc: 0.740461 lr: 0.000861\n", + "INFO:root:epoch: 52/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 
0.829492 acc: 0.740781 lr: 0.000859\n", + "INFO:root:epoch: 52/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.826394 acc: 0.742262 lr: 0.000858\n", + "INFO:root:epoch: 52/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.829801 acc: 0.741051 lr: 0.000856\n", + "INFO:root:epoch: 52/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.826304 acc: 0.741168 lr: 0.000855\n", + "INFO:root:epoch: 52/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.826644 acc: 0.740625 lr: 0.000853\n", + "INFO:root:epoch: 52/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.828600 acc: 0.740000 lr: 0.000852\n", + "INFO:root:epoch: 52/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.828024 acc: 0.740625 lr: 0.000851\n", + "INFO:root:epoch: 52/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.828098 acc: 0.740394 lr: 0.000849\n", + "INFO:root:epoch: 52/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.832659 acc: 0.738281 lr: 0.000848\n", + "INFO:root:epoch: 52/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.832256 acc: 0.738578 lr: 0.000846\n", + "INFO:root:epoch: 52/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.828255 acc: 0.740208 lr: 0.000845\n", + "INFO:root:epoch: 52/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.828766 acc: 0.740524 lr: 0.000843\n", + "INFO:root:epoch: 53/100 starts\n", + "INFO:root:epoch: 53/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.894693 acc: 0.715625 lr: 0.000841\n", + "INFO:root:epoch: 53/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.920465 acc: 0.707812 lr: 0.000840\n", + "INFO:root:epoch: 53/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.854437 acc: 0.734375 lr: 0.000838\n", + "INFO:root:epoch: 53/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.836487 acc: 0.739063 lr: 0.000837\n", + "INFO:root:epoch: 53/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.837251 acc: 0.740625 lr: 0.000835\n", + "INFO:root:epoch: 53/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.835642 acc: 0.744271 lr: 0.000834\n", + "INFO:root:epoch: 53/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.848960 acc: 0.740179 lr: 0.000833\n", + "INFO:root:epoch: 53/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.836181 acc: 0.744141 lr: 0.000831\n", + "INFO:root:epoch: 53/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.841505 acc: 0.745139 lr: 0.000830\n", + "INFO:root:epoch: 53/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.843772 acc: 0.744688 lr: 0.000828\n", + "INFO:root:epoch: 53/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.843461 acc: 0.743182 lr: 0.000827\n", + "INFO:root:epoch: 53/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.847543 acc: 0.742708 lr: 0.000825\n", + "INFO:root:epoch: 53/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.850061 acc: 0.740385 lr: 0.000824\n", + "INFO:root:epoch: 53/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.852352 acc: 0.736384 lr: 0.000823\n", + "INFO:root:epoch: 53/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.850986 acc: 0.738333 lr: 0.000821\n", + "INFO:root:epoch: 53/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.856137 acc: 0.736133 lr: 0.000820\n", + "INFO:root:epoch: 53/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.855595 acc: 0.736397 lr: 0.000818\n", 
+ "INFO:root:epoch: 53/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.850728 acc: 0.736632 lr: 0.000817\n", + "INFO:root:epoch: 53/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.847100 acc: 0.737500 lr: 0.000815\n", + "INFO:root:epoch: 53/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.846957 acc: 0.736719 lr: 0.000814\n", + "INFO:root:epoch: 53/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.851465 acc: 0.734226 lr: 0.000813\n", + "INFO:root:epoch: 53/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.858108 acc: 0.731250 lr: 0.000811\n", + "INFO:root:epoch: 53/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.851864 acc: 0.733288 lr: 0.000810\n", + "INFO:root:epoch: 53/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.851618 acc: 0.733594 lr: 0.000808\n", + "INFO:root:epoch: 53/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.853848 acc: 0.733375 lr: 0.000807\n", + "INFO:root:epoch: 53/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.857994 acc: 0.731851 lr: 0.000806\n", + "INFO:root:epoch: 53/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.855323 acc: 0.733218 lr: 0.000804\n", + "INFO:root:epoch: 53/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.856908 acc: 0.732812 lr: 0.000803\n", + "INFO:root:epoch: 53/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.855603 acc: 0.732543 lr: 0.000801\n", + "INFO:root:epoch: 53/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.856527 acc: 0.731979 lr: 0.000800\n", + "INFO:root:epoch: 53/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.855646 acc: 0.733367 lr: 0.000799\n", + "INFO:root:epoch: 54/100 starts\n", + "INFO:root:epoch: 54/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.839867 acc: 0.753125 lr: 0.000797\n", + "INFO:root:epoch: 54/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.853545 acc: 0.742188 lr: 0.000795\n", + "INFO:root:epoch: 54/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.918257 acc: 0.725000 lr: 0.000794\n", + "INFO:root:epoch: 54/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.896112 acc: 0.725781 lr: 0.000793\n", + "INFO:root:epoch: 54/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.889089 acc: 0.729375 lr: 0.000791\n", + "INFO:root:epoch: 54/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.864582 acc: 0.732812 lr: 0.000790\n", + "INFO:root:epoch: 54/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.863641 acc: 0.731250 lr: 0.000789\n", + "INFO:root:epoch: 54/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.870458 acc: 0.729687 lr: 0.000787\n", + "INFO:root:epoch: 54/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.856251 acc: 0.734375 lr: 0.000786\n", + "INFO:root:epoch: 54/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.855672 acc: 0.733750 lr: 0.000785\n", + "INFO:root:epoch: 54/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.848062 acc: 0.737216 lr: 0.000783\n", + "INFO:root:epoch: 54/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.847672 acc: 0.737500 lr: 0.000782\n", + "INFO:root:epoch: 54/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.857222 acc: 0.735817 lr: 0.000780\n", + "INFO:root:epoch: 54/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.853165 acc: 0.737054 lr: 0.000779\n", + "INFO:root:epoch: 54/100 et: 6s eta: 
7s batches: 150/313(47%) samples: 4800 loss: 0.851349 acc: 0.737292 lr: 0.000778\n", + "INFO:root:epoch: 54/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.848894 acc: 0.740820 lr: 0.000776\n", + "INFO:root:epoch: 54/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.851145 acc: 0.739522 lr: 0.000775\n", + "INFO:root:epoch: 54/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.848070 acc: 0.740625 lr: 0.000774\n", + "INFO:root:epoch: 54/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.847512 acc: 0.740625 lr: 0.000772\n", + "INFO:root:epoch: 54/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.850677 acc: 0.739375 lr: 0.000771\n", + "INFO:root:epoch: 54/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.847481 acc: 0.739286 lr: 0.000770\n", + "INFO:root:epoch: 54/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.845264 acc: 0.739347 lr: 0.000768\n", + "INFO:root:epoch: 54/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.844385 acc: 0.738043 lr: 0.000767\n", + "INFO:root:epoch: 54/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.847182 acc: 0.736979 lr: 0.000766\n", + "INFO:root:epoch: 54/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.844987 acc: 0.737250 lr: 0.000764\n", + "INFO:root:epoch: 54/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.842700 acc: 0.738582 lr: 0.000763\n", + "INFO:root:epoch: 54/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.841688 acc: 0.738310 lr: 0.000762\n", + "INFO:root:epoch: 54/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.845397 acc: 0.737612 lr: 0.000760\n", + "INFO:root:epoch: 54/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.845304 acc: 0.737716 lr: 0.000759\n", + "INFO:root:epoch: 54/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.847718 acc: 0.738333 lr: 0.000758\n", + "INFO:root:epoch: 54/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.846684 acc: 0.738810 lr: 0.000757\n", + "INFO:root:epoch: 55/100 starts\n", + "INFO:root:epoch: 55/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.825940 acc: 0.775000 lr: 0.000755\n", + "INFO:root:epoch: 55/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.822556 acc: 0.745313 lr: 0.000753\n", + "INFO:root:epoch: 55/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.832248 acc: 0.738542 lr: 0.000752\n", + "INFO:root:epoch: 55/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.839558 acc: 0.739844 lr: 0.000751\n", + "INFO:root:epoch: 55/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.862434 acc: 0.735000 lr: 0.000750\n", + "INFO:root:epoch: 55/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.850984 acc: 0.738021 lr: 0.000748\n", + "INFO:root:epoch: 55/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.866064 acc: 0.730804 lr: 0.000747\n", + "INFO:root:epoch: 55/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.864999 acc: 0.731250 lr: 0.000746\n", + "INFO:root:epoch: 55/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.890002 acc: 0.724306 lr: 0.000744\n", + "INFO:root:epoch: 55/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.880109 acc: 0.726875 lr: 0.000743\n", + "INFO:root:epoch: 55/100 et: 4s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.875342 acc: 0.729261 lr: 0.000742\n", + "INFO:root:epoch: 55/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 
loss: 0.877625 acc: 0.729167 lr: 0.000741\n", + "INFO:root:epoch: 55/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.867685 acc: 0.732692 lr: 0.000739\n", + "INFO:root:epoch: 55/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.870108 acc: 0.731473 lr: 0.000738\n", + "INFO:root:epoch: 55/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.865714 acc: 0.732708 lr: 0.000737\n", + "INFO:root:epoch: 55/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.858313 acc: 0.735352 lr: 0.000735\n", + "INFO:root:epoch: 55/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.852047 acc: 0.736213 lr: 0.000734\n", + "INFO:root:epoch: 55/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.851131 acc: 0.736458 lr: 0.000733\n", + "INFO:root:epoch: 55/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.852184 acc: 0.735526 lr: 0.000732\n", + "INFO:root:epoch: 55/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.853389 acc: 0.735313 lr: 0.000730\n", + "INFO:root:epoch: 55/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.855622 acc: 0.734226 lr: 0.000729\n", + "INFO:root:epoch: 55/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.857671 acc: 0.733523 lr: 0.000728\n", + "INFO:root:epoch: 55/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.856719 acc: 0.734103 lr: 0.000727\n", + "INFO:root:epoch: 55/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.852812 acc: 0.735547 lr: 0.000725\n", + "INFO:root:epoch: 55/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.848211 acc: 0.737875 lr: 0.000724\n", + "INFO:root:epoch: 55/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.849574 acc: 0.738101 lr: 0.000723\n", + "INFO:root:epoch: 55/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.843678 acc: 0.740046 lr: 0.000722\n", + "INFO:root:epoch: 55/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.842221 acc: 0.740067 lr: 0.000720\n", + "INFO:root:epoch: 55/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.841687 acc: 0.740086 lr: 0.000719\n", + "INFO:root:epoch: 55/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.843619 acc: 0.738854 lr: 0.000718\n", + "INFO:root:epoch: 55/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.843388 acc: 0.738710 lr: 0.000717\n", + "INFO:root:epoch: 56/100 starts\n", + "INFO:root:epoch: 56/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.878755 acc: 0.743750 lr: 0.000715\n", + "INFO:root:epoch: 56/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.891985 acc: 0.732812 lr: 0.000714\n", + "INFO:root:epoch: 56/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.860375 acc: 0.737500 lr: 0.000712\n", + "INFO:root:epoch: 56/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.871171 acc: 0.731250 lr: 0.000711\n", + "INFO:root:epoch: 56/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.875896 acc: 0.728125 lr: 0.000710\n", + "INFO:root:epoch: 56/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.884268 acc: 0.722396 lr: 0.000709\n", + "INFO:root:epoch: 56/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.893236 acc: 0.718304 lr: 0.000708\n", + "INFO:root:epoch: 56/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.881897 acc: 0.721094 lr: 0.000706\n", + "INFO:root:epoch: 56/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.868662 acc: 0.729514 lr: 
0.000705\n", + "INFO:root:epoch: 56/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.859089 acc: 0.730000 lr: 0.000704\n", + "INFO:root:epoch: 56/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.851870 acc: 0.731534 lr: 0.000703\n", + "INFO:root:epoch: 56/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.856896 acc: 0.731250 lr: 0.000701\n", + "INFO:root:epoch: 56/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.850384 acc: 0.733894 lr: 0.000700\n", + "INFO:root:epoch: 56/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.849512 acc: 0.734375 lr: 0.000699\n", + "INFO:root:epoch: 56/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.844825 acc: 0.736458 lr: 0.000698\n", + "INFO:root:epoch: 56/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.841548 acc: 0.737891 lr: 0.000697\n", + "INFO:root:epoch: 56/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.841932 acc: 0.736765 lr: 0.000695\n", + "INFO:root:epoch: 56/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.841608 acc: 0.735243 lr: 0.000694\n", + "INFO:root:epoch: 56/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.836072 acc: 0.737336 lr: 0.000693\n", + "INFO:root:epoch: 56/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.837611 acc: 0.737813 lr: 0.000692\n", + "INFO:root:epoch: 56/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.838714 acc: 0.736905 lr: 0.000691\n", + "INFO:root:epoch: 56/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.838372 acc: 0.737500 lr: 0.000689\n", + "INFO:root:epoch: 56/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.836598 acc: 0.737772 lr: 0.000688\n", + "INFO:root:epoch: 56/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.841651 acc: 0.737109 lr: 0.000687\n", + "INFO:root:epoch: 56/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.838858 acc: 0.737125 lr: 0.000686\n", + "INFO:root:epoch: 56/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.833190 acc: 0.739183 lr: 0.000685\n", + "INFO:root:epoch: 56/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.832718 acc: 0.739699 lr: 0.000683\n", + "INFO:root:epoch: 56/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.832299 acc: 0.739844 lr: 0.000682\n", + "INFO:root:epoch: 56/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.830403 acc: 0.740841 lr: 0.000681\n", + "INFO:root:epoch: 56/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.832268 acc: 0.740208 lr: 0.000680\n", + "INFO:root:epoch: 56/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.835169 acc: 0.738710 lr: 0.000679\n", + "INFO:root:epoch: 57/100 starts\n", + "INFO:root:epoch: 57/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.909517 acc: 0.725000 lr: 0.000677\n", + "INFO:root:epoch: 57/100 et: 0s eta: 14s batches: 20/313(6%) samples: 640 loss: 0.867697 acc: 0.728125 lr: 0.000676\n", + "INFO:root:epoch: 57/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.840410 acc: 0.731250 lr: 0.000675\n", + "INFO:root:epoch: 57/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.805389 acc: 0.741406 lr: 0.000674\n", + "INFO:root:epoch: 57/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.814985 acc: 0.741250 lr: 0.000673\n", + "INFO:root:epoch: 57/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.809866 acc: 0.744271 lr: 0.000671\n", + "INFO:root:epoch: 57/100 
et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.817273 acc: 0.741964 lr: 0.000670\n", + "INFO:root:epoch: 57/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.812742 acc: 0.741016 lr: 0.000669\n", + "INFO:root:epoch: 57/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.815324 acc: 0.743056 lr: 0.000668\n", + "INFO:root:epoch: 57/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.821292 acc: 0.738437 lr: 0.000667\n", + "INFO:root:epoch: 57/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.816058 acc: 0.739773 lr: 0.000666\n", + "INFO:root:epoch: 57/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.817590 acc: 0.739583 lr: 0.000664\n", + "INFO:root:epoch: 57/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.818817 acc: 0.738221 lr: 0.000663\n", + "INFO:root:epoch: 57/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.816559 acc: 0.736607 lr: 0.000662\n", + "INFO:root:epoch: 57/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.825244 acc: 0.737083 lr: 0.000661\n", + "INFO:root:epoch: 57/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.823392 acc: 0.738281 lr: 0.000660\n", + "INFO:root:epoch: 57/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.823858 acc: 0.739154 lr: 0.000659\n", + "INFO:root:epoch: 57/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.822548 acc: 0.739757 lr: 0.000658\n", + "INFO:root:epoch: 57/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.820885 acc: 0.741283 lr: 0.000656\n", + "INFO:root:epoch: 57/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.820076 acc: 0.742500 lr: 0.000655\n", + "INFO:root:epoch: 57/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.824961 acc: 0.741220 lr: 0.000654\n", + "INFO:root:epoch: 57/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.825085 acc: 0.741335 lr: 0.000653\n", + "INFO:root:epoch: 57/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.827870 acc: 0.739810 lr: 0.000652\n", + "INFO:root:epoch: 57/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.828434 acc: 0.739844 lr: 0.000651\n", + "INFO:root:epoch: 57/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.828785 acc: 0.739250 lr: 0.000650\n", + "INFO:root:epoch: 57/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.827633 acc: 0.739543 lr: 0.000648\n", + "INFO:root:epoch: 57/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.825659 acc: 0.739699 lr: 0.000647\n", + "INFO:root:epoch: 57/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.823095 acc: 0.739732 lr: 0.000646\n", + "INFO:root:epoch: 57/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.823756 acc: 0.739009 lr: 0.000645\n", + "INFO:root:epoch: 57/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.825304 acc: 0.738854 lr: 0.000644\n", + "INFO:root:epoch: 57/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.827320 acc: 0.738206 lr: 0.000643\n", + "INFO:root:epoch: 58/100 starts\n", + "INFO:root:epoch: 58/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.798118 acc: 0.734375 lr: 0.000641\n", + "INFO:root:epoch: 58/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.830979 acc: 0.731250 lr: 0.000640\n", + "INFO:root:epoch: 58/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.848662 acc: 0.725000 lr: 0.000639\n", + "INFO:root:epoch: 58/100 et: 1s eta: 12s batches: 40/313(12%) 
samples: 1280 loss: 0.838207 acc: 0.728906 lr: 0.000638\n", + "INFO:root:epoch: 58/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.838796 acc: 0.736875 lr: 0.000637\n", + "INFO:root:epoch: 58/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.818739 acc: 0.743229 lr: 0.000636\n", + "INFO:root:epoch: 58/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.809720 acc: 0.745536 lr: 0.000635\n", + "INFO:root:epoch: 58/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.822597 acc: 0.743750 lr: 0.000634\n", + "INFO:root:epoch: 58/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.830851 acc: 0.738889 lr: 0.000633\n", + "INFO:root:epoch: 58/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.834010 acc: 0.738750 lr: 0.000632\n", + "INFO:root:epoch: 58/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.829917 acc: 0.738352 lr: 0.000630\n", + "INFO:root:epoch: 58/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.823273 acc: 0.741667 lr: 0.000629\n", + "INFO:root:epoch: 58/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.812068 acc: 0.746154 lr: 0.000628\n", + "INFO:root:epoch: 58/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.811988 acc: 0.746875 lr: 0.000627\n", + "INFO:root:epoch: 58/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.805599 acc: 0.748542 lr: 0.000626\n", + "INFO:root:epoch: 58/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.806976 acc: 0.748242 lr: 0.000625\n", + "INFO:root:epoch: 58/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.811154 acc: 0.746507 lr: 0.000624\n", + "INFO:root:epoch: 58/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.809796 acc: 0.746181 lr: 0.000623\n", + "INFO:root:epoch: 58/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.809086 acc: 0.748520 lr: 0.000622\n", + "INFO:root:epoch: 58/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.810409 acc: 0.746719 lr: 0.000621\n", + "INFO:root:epoch: 58/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.812442 acc: 0.745833 lr: 0.000620\n", + "INFO:root:epoch: 58/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.811211 acc: 0.746591 lr: 0.000619\n", + "INFO:root:epoch: 58/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.812541 acc: 0.746875 lr: 0.000617\n", + "INFO:root:epoch: 58/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.816855 acc: 0.744531 lr: 0.000616\n", + "INFO:root:epoch: 58/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.819121 acc: 0.744000 lr: 0.000615\n", + "INFO:root:epoch: 58/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.819055 acc: 0.742668 lr: 0.000614\n", + "INFO:root:epoch: 58/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.820233 acc: 0.742940 lr: 0.000613\n", + "INFO:root:epoch: 58/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.818864 acc: 0.743973 lr: 0.000612\n", + "INFO:root:epoch: 58/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.821589 acc: 0.743211 lr: 0.000611\n", + "INFO:root:epoch: 58/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.821042 acc: 0.743229 lr: 0.000610\n", + "INFO:root:epoch: 58/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.823857 acc: 0.743145 lr: 0.000609\n", + "INFO:root:epoch: 59/100 starts\n", + "INFO:root:epoch: 59/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.789549 acc: 
0.765625 lr: 0.000608\n", + "INFO:root:epoch: 59/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.790341 acc: 0.760938 lr: 0.000607\n", + "INFO:root:epoch: 59/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.806979 acc: 0.755208 lr: 0.000605\n", + "INFO:root:epoch: 59/100 et: 1s eta: 13s batches: 40/313(12%) samples: 1280 loss: 0.815923 acc: 0.753906 lr: 0.000604\n", + "INFO:root:epoch: 59/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.822824 acc: 0.753750 lr: 0.000603\n", + "INFO:root:epoch: 59/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.822709 acc: 0.751563 lr: 0.000602\n", + "INFO:root:epoch: 59/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.832924 acc: 0.747321 lr: 0.000601\n", + "INFO:root:epoch: 59/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.847297 acc: 0.740234 lr: 0.000600\n", + "INFO:root:epoch: 59/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.860101 acc: 0.734375 lr: 0.000599\n", + "INFO:root:epoch: 59/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.860811 acc: 0.734062 lr: 0.000598\n", + "INFO:root:epoch: 59/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.854995 acc: 0.737500 lr: 0.000597\n", + "INFO:root:epoch: 59/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.848008 acc: 0.739844 lr: 0.000596\n", + "INFO:root:epoch: 59/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.834319 acc: 0.743029 lr: 0.000595\n", + "INFO:root:epoch: 59/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.833186 acc: 0.741518 lr: 0.000594\n", + "INFO:root:epoch: 59/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.835036 acc: 0.741458 lr: 0.000593\n", + "INFO:root:epoch: 59/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.833203 acc: 0.741602 lr: 0.000592\n", + "INFO:root:epoch: 59/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.833846 acc: 0.740625 lr: 0.000591\n", + "INFO:root:epoch: 59/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.831953 acc: 0.741493 lr: 0.000590\n", + "INFO:root:epoch: 59/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.831190 acc: 0.741941 lr: 0.000589\n", + "INFO:root:epoch: 59/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.839709 acc: 0.739688 lr: 0.000588\n", + "INFO:root:epoch: 59/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.841757 acc: 0.737500 lr: 0.000587\n", + "INFO:root:epoch: 59/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.842198 acc: 0.737642 lr: 0.000586\n", + "INFO:root:epoch: 59/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.841420 acc: 0.736821 lr: 0.000585\n", + "INFO:root:epoch: 59/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.838895 acc: 0.738021 lr: 0.000584\n", + "INFO:root:epoch: 59/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.834055 acc: 0.739500 lr: 0.000583\n", + "INFO:root:epoch: 59/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.834251 acc: 0.739904 lr: 0.000582\n", + "INFO:root:epoch: 59/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.834817 acc: 0.739005 lr: 0.000581\n", + "INFO:root:epoch: 59/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.835571 acc: 0.738616 lr: 0.000580\n", + "INFO:root:epoch: 59/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.833812 acc: 0.740409 lr: 0.000579\n", + "INFO:root:epoch: 59/100 et: 13s eta: 0s batches: 
300/313(95%) samples: 9600 loss: 0.830910 acc: 0.741979 lr: 0.000578\n", + "INFO:root:epoch: 59/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.831648 acc: 0.742036 lr: 0.000577\n", + "INFO:root:epoch: 60/100 starts\n", + "INFO:root:epoch: 60/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.885892 acc: 0.775000 lr: 0.000576\n", + "INFO:root:epoch: 60/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.879384 acc: 0.754688 lr: 0.000575\n", + "INFO:root:epoch: 60/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.839739 acc: 0.762500 lr: 0.000574\n", + "INFO:root:epoch: 60/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.829774 acc: 0.758594 lr: 0.000573\n", + "INFO:root:epoch: 60/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.843364 acc: 0.753125 lr: 0.000572\n", + "INFO:root:epoch: 60/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.826290 acc: 0.756250 lr: 0.000571\n", + "INFO:root:epoch: 60/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.831055 acc: 0.752232 lr: 0.000570\n", + "INFO:root:epoch: 60/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.820983 acc: 0.754297 lr: 0.000569\n", + "INFO:root:epoch: 60/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.812713 acc: 0.754167 lr: 0.000568\n", + "INFO:root:epoch: 60/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.808644 acc: 0.753750 lr: 0.000567\n", + "INFO:root:epoch: 60/100 et: 4s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.812561 acc: 0.750284 lr: 0.000566\n", + "INFO:root:epoch: 60/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.824937 acc: 0.746615 lr: 0.000565\n", + "INFO:root:epoch: 60/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.827743 acc: 0.744471 lr: 0.000564\n", + "INFO:root:epoch: 60/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.827845 acc: 0.742188 lr: 0.000563\n", + "INFO:root:epoch: 60/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.824984 acc: 0.742500 lr: 0.000562\n", + "INFO:root:epoch: 60/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.820623 acc: 0.742773 lr: 0.000561\n", + "INFO:root:epoch: 60/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.824389 acc: 0.740441 lr: 0.000560\n", + "INFO:root:epoch: 60/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.819917 acc: 0.741840 lr: 0.000559\n", + "INFO:root:epoch: 60/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.821452 acc: 0.741612 lr: 0.000558\n", + "INFO:root:epoch: 60/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.821996 acc: 0.740781 lr: 0.000557\n", + "INFO:root:epoch: 60/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.825038 acc: 0.741071 lr: 0.000556\n", + "INFO:root:epoch: 60/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.833056 acc: 0.739489 lr: 0.000555\n", + "INFO:root:epoch: 60/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.834126 acc: 0.740625 lr: 0.000554\n", + "INFO:root:epoch: 60/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.838261 acc: 0.739193 lr: 0.000553\n", + "INFO:root:epoch: 60/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.838391 acc: 0.738250 lr: 0.000552\n", + "INFO:root:epoch: 60/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.837515 acc: 0.738462 lr: 0.000551\n", + "INFO:root:epoch: 60/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.840794 
acc: 0.738079 lr: 0.000550\n", + "INFO:root:epoch: 60/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.841568 acc: 0.738058 lr: 0.000549\n", + "INFO:root:epoch: 60/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.842523 acc: 0.738362 lr: 0.000548\n", + "INFO:root:epoch: 60/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.844872 acc: 0.737187 lr: 0.000547\n", + "INFO:root:epoch: 60/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.842277 acc: 0.736794 lr: 0.000546\n", + "INFO:root:epoch: 61/100 starts\n", + "INFO:root:epoch: 61/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.805544 acc: 0.753125 lr: 0.000545\n", + "INFO:root:epoch: 61/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.833780 acc: 0.742188 lr: 0.000544\n", + "INFO:root:epoch: 61/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.839274 acc: 0.740625 lr: 0.000543\n", + "INFO:root:epoch: 61/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.833520 acc: 0.744531 lr: 0.000542\n", + "INFO:root:epoch: 61/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.835566 acc: 0.743125 lr: 0.000541\n", + "INFO:root:epoch: 61/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.820548 acc: 0.746875 lr: 0.000540\n", + "INFO:root:epoch: 61/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.813290 acc: 0.748214 lr: 0.000539\n", + "INFO:root:epoch: 61/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.806995 acc: 0.748437 lr: 0.000539\n", + "INFO:root:epoch: 61/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.822883 acc: 0.743403 lr: 0.000538\n", + "INFO:root:epoch: 61/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.815852 acc: 0.746250 lr: 0.000537\n", + "INFO:root:epoch: 61/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.810577 acc: 0.747159 lr: 0.000536\n", + "INFO:root:epoch: 61/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.816468 acc: 0.744010 lr: 0.000535\n", + "INFO:root:epoch: 61/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.810262 acc: 0.745913 lr: 0.000534\n", + "INFO:root:epoch: 61/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.818650 acc: 0.743973 lr: 0.000533\n", + "INFO:root:epoch: 61/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.823645 acc: 0.742083 lr: 0.000532\n", + "INFO:root:epoch: 61/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.829649 acc: 0.740039 lr: 0.000531\n", + "INFO:root:epoch: 61/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.826165 acc: 0.740257 lr: 0.000530\n", + "INFO:root:epoch: 61/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.823962 acc: 0.740972 lr: 0.000529\n", + "INFO:root:epoch: 61/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.824045 acc: 0.741447 lr: 0.000528\n", + "INFO:root:epoch: 61/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.822660 acc: 0.741875 lr: 0.000527\n", + "INFO:root:epoch: 61/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.822528 acc: 0.741815 lr: 0.000527\n", + "INFO:root:epoch: 61/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.820247 acc: 0.741477 lr: 0.000526\n", + "INFO:root:epoch: 61/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.824795 acc: 0.739810 lr: 0.000525\n", + "INFO:root:epoch: 61/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.825198 acc: 0.739193 lr: 0.000524\n", + 
"INFO:root:epoch: 61/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.820621 acc: 0.741750 lr: 0.000523\n", + "INFO:root:epoch: 61/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.822385 acc: 0.740986 lr: 0.000522\n", + "INFO:root:epoch: 61/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.825277 acc: 0.739815 lr: 0.000521\n", + "INFO:root:epoch: 61/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.827828 acc: 0.738728 lr: 0.000520\n", + "INFO:root:epoch: 61/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.824658 acc: 0.739547 lr: 0.000519\n", + "INFO:root:epoch: 61/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.828213 acc: 0.737813 lr: 0.000518\n", + "INFO:root:epoch: 61/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.828741 acc: 0.738206 lr: 0.000518\n", + "INFO:root:epoch: 62/100 starts\n", + "INFO:root:epoch: 62/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.844680 acc: 0.765625 lr: 0.000516\n", + "INFO:root:epoch: 62/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.862761 acc: 0.767188 lr: 0.000515\n", + "INFO:root:epoch: 62/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.863700 acc: 0.759375 lr: 0.000515\n", + "INFO:root:epoch: 62/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.832684 acc: 0.765625 lr: 0.000514\n", + "INFO:root:epoch: 62/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.837586 acc: 0.765625 lr: 0.000513\n", + "INFO:root:epoch: 62/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.814643 acc: 0.770313 lr: 0.000512\n", + "INFO:root:epoch: 62/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.811375 acc: 0.763839 lr: 0.000511\n", + "INFO:root:epoch: 62/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.820167 acc: 0.757422 lr: 0.000510\n", + "INFO:root:epoch: 62/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.824017 acc: 0.757986 lr: 0.000509\n", + "INFO:root:epoch: 62/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.825212 acc: 0.756563 lr: 0.000508\n", + "INFO:root:epoch: 62/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.821173 acc: 0.757102 lr: 0.000507\n", + "INFO:root:epoch: 62/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.820457 acc: 0.756250 lr: 0.000507\n", + "INFO:root:epoch: 62/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.819386 acc: 0.756010 lr: 0.000506\n", + "INFO:root:epoch: 62/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.821911 acc: 0.753571 lr: 0.000505\n", + "INFO:root:epoch: 62/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.819128 acc: 0.752708 lr: 0.000504\n", + "INFO:root:epoch: 62/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.816960 acc: 0.753320 lr: 0.000503\n", + "INFO:root:epoch: 62/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.815645 acc: 0.753493 lr: 0.000502\n", + "INFO:root:epoch: 62/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.819335 acc: 0.750868 lr: 0.000501\n", + "INFO:root:epoch: 62/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.815293 acc: 0.750658 lr: 0.000500\n", + "INFO:root:epoch: 62/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.813565 acc: 0.750312 lr: 0.000500\n", + "INFO:root:epoch: 62/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.819647 acc: 0.748065 lr: 0.000499\n", + "INFO:root:epoch: 62/100 et: 10s eta: 4s 
batches: 220/313(70%) samples: 7040 loss: 0.821927 acc: 0.747585 lr: 0.000498\n", + "INFO:root:epoch: 62/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.819023 acc: 0.748505 lr: 0.000497\n", + "INFO:root:epoch: 62/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.820804 acc: 0.747266 lr: 0.000496\n", + "INFO:root:epoch: 62/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.820380 acc: 0.747250 lr: 0.000495\n", + "INFO:root:epoch: 62/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.814690 acc: 0.748558 lr: 0.000494\n", + "INFO:root:epoch: 62/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.814140 acc: 0.750000 lr: 0.000494\n", + "INFO:root:epoch: 62/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.812839 acc: 0.750781 lr: 0.000493\n", + "INFO:root:epoch: 62/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.812621 acc: 0.750216 lr: 0.000492\n", + "INFO:root:epoch: 62/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.813526 acc: 0.748958 lr: 0.000491\n", + "INFO:root:epoch: 62/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.815095 acc: 0.747379 lr: 0.000490\n", + "INFO:root:epoch: 63/100 starts\n", + "INFO:root:epoch: 63/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.796004 acc: 0.759375 lr: 0.000489\n", + "INFO:root:epoch: 63/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.827343 acc: 0.740625 lr: 0.000488\n", + "INFO:root:epoch: 63/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.810678 acc: 0.753125 lr: 0.000487\n", + "INFO:root:epoch: 63/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.817466 acc: 0.749219 lr: 0.000487\n", + "INFO:root:epoch: 63/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.814195 acc: 0.745625 lr: 0.000486\n", + "INFO:root:epoch: 63/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.837556 acc: 0.738021 lr: 0.000485\n", + "INFO:root:epoch: 63/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.830012 acc: 0.738839 lr: 0.000484\n", + "INFO:root:epoch: 63/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.845837 acc: 0.737891 lr: 0.000483\n", + "INFO:root:epoch: 63/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.837978 acc: 0.742014 lr: 0.000482\n", + "INFO:root:epoch: 63/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.839345 acc: 0.739063 lr: 0.000482\n", + "INFO:root:epoch: 63/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.840577 acc: 0.738068 lr: 0.000481\n", + "INFO:root:epoch: 63/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.837437 acc: 0.739323 lr: 0.000480\n", + "INFO:root:epoch: 63/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.833851 acc: 0.740865 lr: 0.000479\n", + "INFO:root:epoch: 63/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.831390 acc: 0.739286 lr: 0.000478\n", + "INFO:root:epoch: 63/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.825265 acc: 0.740417 lr: 0.000477\n", + "INFO:root:epoch: 63/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.823050 acc: 0.742773 lr: 0.000477\n", + "INFO:root:epoch: 63/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.817195 acc: 0.744485 lr: 0.000476\n", + "INFO:root:epoch: 63/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.818153 acc: 0.743750 lr: 0.000475\n", + "INFO:root:epoch: 63/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 
0.819347 acc: 0.743914 lr: 0.000474\n",
+ "[... per-batch training log truncated for readability: epochs 63-81 of 100, each with 313 batches of 32 samples (~10,000 samples, ~14 s per epoch); the running loss drifts from ~0.82 down to ~0.79, accuracy climbs from ~0.742 to ~0.756, and the learning rate decays smoothly from 0.000474 to 0.000176 ...]\n",
+ "INFO:root:epoch: 81/100 et: 
12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.790074 acc: 0.756473 lr: 0.000176\n", + "INFO:root:epoch: 81/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.794264 acc: 0.754526 lr: 0.000176\n", + "INFO:root:epoch: 81/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.789935 acc: 0.756146 lr: 0.000175\n", + "INFO:root:epoch: 81/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.791139 acc: 0.755847 lr: 0.000175\n", + "INFO:root:epoch: 82/100 starts\n", + "INFO:root:epoch: 82/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.628804 acc: 0.803125 lr: 0.000175\n", + "INFO:root:epoch: 82/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.695736 acc: 0.801562 lr: 0.000174\n", + "INFO:root:epoch: 82/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.726282 acc: 0.792708 lr: 0.000174\n", + "INFO:root:epoch: 82/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.762066 acc: 0.775000 lr: 0.000174\n", + "INFO:root:epoch: 82/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.771014 acc: 0.766875 lr: 0.000173\n", + "INFO:root:epoch: 82/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.781545 acc: 0.765625 lr: 0.000173\n", + "INFO:root:epoch: 82/100 et: 3s eta: 10s batches: 70/313(22%) samples: 2240 loss: 0.781009 acc: 0.764286 lr: 0.000173\n", + "INFO:root:epoch: 82/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.782196 acc: 0.762891 lr: 0.000172\n", + "INFO:root:epoch: 82/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.782277 acc: 0.759722 lr: 0.000172\n", + "INFO:root:epoch: 82/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.778186 acc: 0.758750 lr: 0.000172\n", + "INFO:root:epoch: 82/100 et: 4s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.779406 acc: 0.757102 lr: 0.000172\n", + "INFO:root:epoch: 82/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.785728 acc: 0.752604 lr: 0.000171\n", + "INFO:root:epoch: 82/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.792901 acc: 0.752163 lr: 0.000171\n", + "INFO:root:epoch: 82/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.792623 acc: 0.751786 lr: 0.000171\n", + "INFO:root:epoch: 82/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.787286 acc: 0.752708 lr: 0.000170\n", + "INFO:root:epoch: 82/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.788793 acc: 0.753320 lr: 0.000170\n", + "INFO:root:epoch: 82/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.784648 acc: 0.755331 lr: 0.000170\n", + "INFO:root:epoch: 82/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.784689 acc: 0.754688 lr: 0.000169\n", + "INFO:root:epoch: 82/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.783967 acc: 0.755263 lr: 0.000169\n", + "INFO:root:epoch: 82/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.788794 acc: 0.754375 lr: 0.000169\n", + "INFO:root:epoch: 82/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.794243 acc: 0.751786 lr: 0.000169\n", + "INFO:root:epoch: 82/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.796243 acc: 0.751847 lr: 0.000168\n", + "INFO:root:epoch: 82/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.794753 acc: 0.751087 lr: 0.000168\n", + "INFO:root:epoch: 82/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.793552 acc: 0.750911 lr: 0.000168\n", + "INFO:root:epoch: 82/100 et: 11s eta: 2s batches: 250/313(79%) samples: 
8000 loss: 0.792333 acc: 0.751000 lr: 0.000167\n", + "INFO:root:epoch: 82/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.793941 acc: 0.750841 lr: 0.000167\n", + "INFO:root:epoch: 82/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.791701 acc: 0.751273 lr: 0.000167\n", + "INFO:root:epoch: 82/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.796419 acc: 0.749888 lr: 0.000167\n", + "INFO:root:epoch: 82/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.798194 acc: 0.749138 lr: 0.000166\n", + "INFO:root:epoch: 82/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.796819 acc: 0.749479 lr: 0.000166\n", + "INFO:root:epoch: 82/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.797615 acc: 0.748286 lr: 0.000166\n", + "INFO:root:epoch: 83/100 starts\n", + "INFO:root:epoch: 83/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.761741 acc: 0.775000 lr: 0.000165\n", + "INFO:root:epoch: 83/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.751758 acc: 0.771875 lr: 0.000165\n", + "INFO:root:epoch: 83/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.760744 acc: 0.757292 lr: 0.000165\n", + "INFO:root:epoch: 83/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.801879 acc: 0.746875 lr: 0.000164\n", + "INFO:root:epoch: 83/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.812679 acc: 0.746250 lr: 0.000164\n", + "INFO:root:epoch: 83/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.812477 acc: 0.746354 lr: 0.000164\n", + "INFO:root:epoch: 83/100 et: 3s eta: 10s batches: 70/313(22%) samples: 2240 loss: 0.800193 acc: 0.747321 lr: 0.000164\n", + "INFO:root:epoch: 83/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.804929 acc: 0.750000 lr: 0.000163\n", + "INFO:root:epoch: 83/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.809009 acc: 0.746875 lr: 0.000163\n", + "INFO:root:epoch: 83/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.805895 acc: 0.747187 lr: 0.000163\n", + "INFO:root:epoch: 83/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.799392 acc: 0.750000 lr: 0.000162\n", + "INFO:root:epoch: 83/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.793033 acc: 0.749219 lr: 0.000162\n", + "INFO:root:epoch: 83/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.794914 acc: 0.748558 lr: 0.000162\n", + "INFO:root:epoch: 83/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.796939 acc: 0.749330 lr: 0.000162\n", + "INFO:root:epoch: 83/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.802187 acc: 0.746875 lr: 0.000161\n", + "INFO:root:epoch: 83/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.796101 acc: 0.749023 lr: 0.000161\n", + "INFO:root:epoch: 83/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.798109 acc: 0.749816 lr: 0.000161\n", + "INFO:root:epoch: 83/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.795528 acc: 0.749479 lr: 0.000161\n", + "INFO:root:epoch: 83/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.794467 acc: 0.750164 lr: 0.000160\n", + "INFO:root:epoch: 83/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.793782 acc: 0.751094 lr: 0.000160\n", + "INFO:root:epoch: 83/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.793767 acc: 0.750446 lr: 0.000160\n", + "INFO:root:epoch: 83/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.793982 acc: 0.750142 lr: 
0.000159\n", + "INFO:root:epoch: 83/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.792620 acc: 0.750136 lr: 0.000159\n", + "INFO:root:epoch: 83/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.793667 acc: 0.750781 lr: 0.000159\n", + "INFO:root:epoch: 83/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.792085 acc: 0.751000 lr: 0.000159\n", + "INFO:root:epoch: 83/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.791104 acc: 0.752163 lr: 0.000158\n", + "INFO:root:epoch: 83/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.789927 acc: 0.752315 lr: 0.000158\n", + "INFO:root:epoch: 83/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.790392 acc: 0.752121 lr: 0.000158\n", + "INFO:root:epoch: 83/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.795518 acc: 0.750539 lr: 0.000157\n", + "INFO:root:epoch: 83/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.796847 acc: 0.750417 lr: 0.000157\n", + "INFO:root:epoch: 83/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.794402 acc: 0.750806 lr: 0.000157\n", + "INFO:root:epoch: 84/100 starts\n", + "INFO:root:epoch: 84/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.820571 acc: 0.759375 lr: 0.000157\n", + "INFO:root:epoch: 84/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.880304 acc: 0.737500 lr: 0.000156\n", + "INFO:root:epoch: 84/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.824604 acc: 0.750000 lr: 0.000156\n", + "INFO:root:epoch: 84/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.815394 acc: 0.754687 lr: 0.000156\n", + "INFO:root:epoch: 84/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.787376 acc: 0.764375 lr: 0.000155\n", + "INFO:root:epoch: 84/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.791501 acc: 0.762500 lr: 0.000155\n", + "INFO:root:epoch: 84/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.801442 acc: 0.758482 lr: 0.000155\n", + "INFO:root:epoch: 84/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.811390 acc: 0.753516 lr: 0.000155\n", + "INFO:root:epoch: 84/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.810934 acc: 0.751736 lr: 0.000154\n", + "INFO:root:epoch: 84/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.801340 acc: 0.754688 lr: 0.000154\n", + "INFO:root:epoch: 84/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.809829 acc: 0.751989 lr: 0.000154\n", + "INFO:root:epoch: 84/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.806949 acc: 0.753385 lr: 0.000154\n", + "INFO:root:epoch: 84/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.808518 acc: 0.751202 lr: 0.000153\n", + "INFO:root:epoch: 84/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.811593 acc: 0.750893 lr: 0.000153\n", + "INFO:root:epoch: 84/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.807069 acc: 0.752292 lr: 0.000153\n", + "INFO:root:epoch: 84/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.807367 acc: 0.750586 lr: 0.000153\n", + "INFO:root:epoch: 84/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.807674 acc: 0.749816 lr: 0.000152\n", + "INFO:root:epoch: 84/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.810116 acc: 0.749653 lr: 0.000152\n", + "INFO:root:epoch: 84/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.803725 acc: 0.752796 lr: 0.000152\n", + "INFO:root:epoch: 84/100 
et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.802151 acc: 0.752969 lr: 0.000152\n", + "INFO:root:epoch: 84/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.795957 acc: 0.754018 lr: 0.000151\n", + "INFO:root:epoch: 84/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.792063 acc: 0.755682 lr: 0.000151\n", + "INFO:root:epoch: 84/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.788154 acc: 0.756386 lr: 0.000151\n", + "INFO:root:epoch: 84/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.786676 acc: 0.755599 lr: 0.000150\n", + "INFO:root:epoch: 84/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.791370 acc: 0.755500 lr: 0.000150\n", + "INFO:root:epoch: 84/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.795353 acc: 0.752644 lr: 0.000150\n", + "INFO:root:epoch: 84/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.792128 acc: 0.753009 lr: 0.000150\n", + "INFO:root:epoch: 84/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.793074 acc: 0.752567 lr: 0.000149\n", + "INFO:root:epoch: 84/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.790604 acc: 0.752909 lr: 0.000149\n", + "INFO:root:epoch: 84/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.792560 acc: 0.752396 lr: 0.000149\n", + "INFO:root:epoch: 84/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.792698 acc: 0.751915 lr: 0.000149\n", + "INFO:root:epoch: 85/100 starts\n", + "INFO:root:epoch: 85/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.766288 acc: 0.796875 lr: 0.000148\n", + "INFO:root:epoch: 85/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.747180 acc: 0.784375 lr: 0.000148\n", + "INFO:root:epoch: 85/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.769042 acc: 0.769792 lr: 0.000148\n", + "INFO:root:epoch: 85/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.766711 acc: 0.764063 lr: 0.000148\n", + "INFO:root:epoch: 85/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.766502 acc: 0.764375 lr: 0.000147\n", + "INFO:root:epoch: 85/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.761401 acc: 0.764583 lr: 0.000147\n", + "INFO:root:epoch: 85/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.755251 acc: 0.766964 lr: 0.000147\n", + "INFO:root:epoch: 85/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.761742 acc: 0.763672 lr: 0.000147\n", + "INFO:root:epoch: 85/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.764124 acc: 0.761111 lr: 0.000146\n", + "INFO:root:epoch: 85/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.773257 acc: 0.760000 lr: 0.000146\n", + "INFO:root:epoch: 85/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.786567 acc: 0.758239 lr: 0.000146\n", + "INFO:root:epoch: 85/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.796150 acc: 0.754167 lr: 0.000146\n", + "INFO:root:epoch: 85/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.802659 acc: 0.752163 lr: 0.000145\n", + "INFO:root:epoch: 85/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.798765 acc: 0.754018 lr: 0.000145\n", + "INFO:root:epoch: 85/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.793041 acc: 0.754792 lr: 0.000145\n", + "INFO:root:epoch: 85/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.798035 acc: 0.754297 lr: 0.000145\n", + "INFO:root:epoch: 85/100 et: 7s eta: 6s batches: 170/313(54%) 
samples: 5440 loss: 0.799259 acc: 0.755147 lr: 0.000144\n", + "INFO:root:epoch: 85/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.801317 acc: 0.754861 lr: 0.000144\n", + "INFO:root:epoch: 85/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.794634 acc: 0.758388 lr: 0.000144\n", + "INFO:root:epoch: 85/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.793190 acc: 0.758906 lr: 0.000144\n", + "INFO:root:epoch: 85/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.791191 acc: 0.759673 lr: 0.000143\n", + "INFO:root:epoch: 85/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.789623 acc: 0.759943 lr: 0.000143\n", + "INFO:root:epoch: 85/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.789784 acc: 0.759783 lr: 0.000143\n", + "INFO:root:epoch: 85/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.790267 acc: 0.759375 lr: 0.000143\n", + "INFO:root:epoch: 85/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.794921 acc: 0.757875 lr: 0.000142\n", + "INFO:root:epoch: 85/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.795896 acc: 0.757813 lr: 0.000142\n", + "INFO:root:epoch: 85/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.793987 acc: 0.758796 lr: 0.000142\n", + "INFO:root:epoch: 85/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.794202 acc: 0.758482 lr: 0.000142\n", + "INFO:root:epoch: 85/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.795893 acc: 0.757543 lr: 0.000141\n", + "INFO:root:epoch: 85/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.796696 acc: 0.756563 lr: 0.000141\n", + "INFO:root:epoch: 85/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.798361 acc: 0.756048 lr: 0.000141\n", + "INFO:root:epoch: 86/100 starts\n", + "INFO:root:epoch: 86/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.789228 acc: 0.743750 lr: 0.000140\n", + "INFO:root:epoch: 86/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.786835 acc: 0.756250 lr: 0.000140\n", + "INFO:root:epoch: 86/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.767423 acc: 0.758333 lr: 0.000140\n", + "INFO:root:epoch: 86/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.768584 acc: 0.751563 lr: 0.000140\n", + "INFO:root:epoch: 86/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.781230 acc: 0.746875 lr: 0.000140\n", + "INFO:root:epoch: 86/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.784208 acc: 0.745313 lr: 0.000139\n", + "INFO:root:epoch: 86/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.804742 acc: 0.743750 lr: 0.000139\n", + "INFO:root:epoch: 86/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.805145 acc: 0.742187 lr: 0.000139\n", + "INFO:root:epoch: 86/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.799827 acc: 0.743056 lr: 0.000139\n", + "INFO:root:epoch: 86/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.811971 acc: 0.741875 lr: 0.000138\n", + "INFO:root:epoch: 86/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.803488 acc: 0.744318 lr: 0.000138\n", + "INFO:root:epoch: 86/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.795298 acc: 0.747396 lr: 0.000138\n", + "INFO:root:epoch: 86/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.790926 acc: 0.749279 lr: 0.000138\n", + "INFO:root:epoch: 86/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.783147 acc: 
0.749777 lr: 0.000137\n", + "INFO:root:epoch: 86/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.789888 acc: 0.747500 lr: 0.000137\n", + "INFO:root:epoch: 86/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.791250 acc: 0.747070 lr: 0.000137\n", + "INFO:root:epoch: 86/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.788609 acc: 0.747610 lr: 0.000137\n", + "INFO:root:epoch: 86/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.791464 acc: 0.746875 lr: 0.000136\n", + "INFO:root:epoch: 86/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.795199 acc: 0.746217 lr: 0.000136\n", + "INFO:root:epoch: 86/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.796702 acc: 0.745469 lr: 0.000136\n", + "INFO:root:epoch: 86/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.799329 acc: 0.745238 lr: 0.000136\n", + "INFO:root:epoch: 86/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.795992 acc: 0.746875 lr: 0.000135\n", + "INFO:root:epoch: 86/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.793393 acc: 0.749049 lr: 0.000135\n", + "INFO:root:epoch: 86/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.793658 acc: 0.748828 lr: 0.000135\n", + "INFO:root:epoch: 86/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.793849 acc: 0.748000 lr: 0.000135\n", + "INFO:root:epoch: 86/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.792766 acc: 0.747716 lr: 0.000135\n", + "INFO:root:epoch: 86/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.791771 acc: 0.747106 lr: 0.000134\n", + "INFO:root:epoch: 86/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.796318 acc: 0.745759 lr: 0.000134\n", + "INFO:root:epoch: 86/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.798938 acc: 0.745690 lr: 0.000134\n", + "INFO:root:epoch: 86/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.799687 acc: 0.746458 lr: 0.000134\n", + "INFO:root:epoch: 86/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.796616 acc: 0.748488 lr: 0.000133\n", + "INFO:root:epoch: 87/100 starts\n", + "INFO:root:epoch: 87/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.850964 acc: 0.740625 lr: 0.000133\n", + "INFO:root:epoch: 87/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.802344 acc: 0.746875 lr: 0.000133\n", + "INFO:root:epoch: 87/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.815467 acc: 0.739583 lr: 0.000133\n", + "INFO:root:epoch: 87/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.809854 acc: 0.742969 lr: 0.000132\n", + "INFO:root:epoch: 87/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.780924 acc: 0.751875 lr: 0.000132\n", + "INFO:root:epoch: 87/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.791136 acc: 0.755729 lr: 0.000132\n", + "INFO:root:epoch: 87/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.790528 acc: 0.755357 lr: 0.000132\n", + "INFO:root:epoch: 87/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.790394 acc: 0.755469 lr: 0.000131\n", + "INFO:root:epoch: 87/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.788743 acc: 0.752083 lr: 0.000131\n", + "INFO:root:epoch: 87/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.789852 acc: 0.753125 lr: 0.000131\n", + "INFO:root:epoch: 87/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.784777 acc: 0.753977 lr: 0.000131\n", + 
"INFO:root:epoch: 87/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.776484 acc: 0.757031 lr: 0.000131\n", + "INFO:root:epoch: 87/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.775763 acc: 0.756250 lr: 0.000130\n", + "INFO:root:epoch: 87/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.784094 acc: 0.754464 lr: 0.000130\n", + "INFO:root:epoch: 87/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.781750 acc: 0.754792 lr: 0.000130\n", + "INFO:root:epoch: 87/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.783062 acc: 0.754883 lr: 0.000130\n", + "INFO:root:epoch: 87/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.779886 acc: 0.756985 lr: 0.000129\n", + "INFO:root:epoch: 87/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.782445 acc: 0.756944 lr: 0.000129\n", + "INFO:root:epoch: 87/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.790414 acc: 0.756414 lr: 0.000129\n", + "INFO:root:epoch: 87/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.790827 acc: 0.755937 lr: 0.000129\n", + "INFO:root:epoch: 87/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.792944 acc: 0.755357 lr: 0.000129\n", + "INFO:root:epoch: 87/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.793317 acc: 0.754119 lr: 0.000128\n", + "INFO:root:epoch: 87/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.785357 acc: 0.756386 lr: 0.000128\n", + "INFO:root:epoch: 87/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.787292 acc: 0.754948 lr: 0.000128\n", + "INFO:root:epoch: 87/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.785757 acc: 0.755000 lr: 0.000128\n", + "INFO:root:epoch: 87/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.786238 acc: 0.755048 lr: 0.000127\n", + "INFO:root:epoch: 87/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.784898 acc: 0.755671 lr: 0.000127\n", + "INFO:root:epoch: 87/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.783881 acc: 0.755915 lr: 0.000127\n", + "INFO:root:epoch: 87/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.786642 acc: 0.755388 lr: 0.000127\n", + "INFO:root:epoch: 87/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.784731 acc: 0.755937 lr: 0.000127\n", + "INFO:root:epoch: 87/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.781761 acc: 0.756552 lr: 0.000126\n", + "INFO:root:epoch: 88/100 starts\n", + "INFO:root:epoch: 88/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.747271 acc: 0.737500 lr: 0.000126\n", + "INFO:root:epoch: 88/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.740076 acc: 0.756250 lr: 0.000126\n", + "INFO:root:epoch: 88/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.748918 acc: 0.761458 lr: 0.000126\n", + "INFO:root:epoch: 88/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.732172 acc: 0.767969 lr: 0.000125\n", + "INFO:root:epoch: 88/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.747654 acc: 0.769375 lr: 0.000125\n", + "INFO:root:epoch: 88/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.764433 acc: 0.760937 lr: 0.000125\n", + "INFO:root:epoch: 88/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.770631 acc: 0.760268 lr: 0.000125\n", + "INFO:root:epoch: 88/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.765169 acc: 0.762109 lr: 0.000125\n", + "INFO:root:epoch: 88/100 et: 4s eta: 10s 
batches: 90/313(28%) samples: 2880 loss: 0.775203 acc: 0.760417 lr: 0.000124\n", + "INFO:root:epoch: 88/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.774713 acc: 0.759062 lr: 0.000124\n", + "INFO:root:epoch: 88/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.773177 acc: 0.760795 lr: 0.000124\n", + "INFO:root:epoch: 88/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.773849 acc: 0.760156 lr: 0.000124\n", + "INFO:root:epoch: 88/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.770292 acc: 0.761058 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.771836 acc: 0.759375 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.770684 acc: 0.760625 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.766468 acc: 0.762500 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.770944 acc: 0.760662 lr: 0.000123\n", + "INFO:root:epoch: 88/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.769675 acc: 0.760937 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.766954 acc: 0.762171 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.769263 acc: 0.760937 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.777239 acc: 0.757589 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.776881 acc: 0.757812 lr: 0.000122\n", + "INFO:root:epoch: 88/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.777944 acc: 0.756522 lr: 0.000121\n", + "INFO:root:epoch: 88/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.777051 acc: 0.756510 lr: 0.000121\n", + "INFO:root:epoch: 88/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.778403 acc: 0.756375 lr: 0.000121\n", + "INFO:root:epoch: 88/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.778209 acc: 0.756971 lr: 0.000121\n", + "INFO:root:epoch: 88/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.775398 acc: 0.757292 lr: 0.000120\n", + "INFO:root:epoch: 88/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.775242 acc: 0.757143 lr: 0.000120\n", + "INFO:root:epoch: 88/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.775968 acc: 0.757543 lr: 0.000120\n", + "INFO:root:epoch: 88/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.775583 acc: 0.757083 lr: 0.000120\n", + "INFO:root:epoch: 88/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.773207 acc: 0.757258 lr: 0.000120\n", + "INFO:root:epoch: 89/100 starts\n", + "INFO:root:epoch: 89/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.734807 acc: 0.753125 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.776515 acc: 0.748438 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.785510 acc: 0.744792 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.772257 acc: 0.761719 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.781800 acc: 0.763750 lr: 0.000119\n", + "INFO:root:epoch: 89/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 
0.783931 acc: 0.761458 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.777696 acc: 0.765179 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.776828 acc: 0.763672 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.778528 acc: 0.764931 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.782296 acc: 0.764375 lr: 0.000118\n", + "INFO:root:epoch: 89/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.779648 acc: 0.763068 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.781904 acc: 0.761719 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.783871 acc: 0.761538 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.780222 acc: 0.762277 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.790571 acc: 0.760000 lr: 0.000117\n", + "INFO:root:epoch: 89/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.788741 acc: 0.760352 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.786166 acc: 0.759743 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.783236 acc: 0.761806 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.789045 acc: 0.760855 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.788909 acc: 0.761719 lr: 0.000116\n", + "INFO:root:epoch: 89/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.787145 acc: 0.762798 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.786296 acc: 0.762926 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.787380 acc: 0.762500 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.791102 acc: 0.761198 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.787385 acc: 0.762750 lr: 0.000115\n", + "INFO:root:epoch: 89/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.787774 acc: 0.761058 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.787381 acc: 0.760301 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.784104 acc: 0.761272 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.783240 acc: 0.760560 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.784704 acc: 0.760313 lr: 0.000114\n", + "INFO:root:epoch: 89/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.783017 acc: 0.761391 lr: 0.000113\n", + "INFO:root:epoch: 90/100 starts\n", + "INFO:root:epoch: 90/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.997723 acc: 0.671875 lr: 0.000113\n", + "INFO:root:epoch: 90/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.891037 acc: 0.723437 lr: 0.000113\n", + "INFO:root:epoch: 90/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.856110 acc: 0.741667 lr: 0.000113\n", 
+ "INFO:root:epoch: 90/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.829870 acc: 0.750781 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.825318 acc: 0.754375 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.819115 acc: 0.751562 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.821462 acc: 0.746875 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.810009 acc: 0.751563 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.805757 acc: 0.753125 lr: 0.000112\n", + "INFO:root:epoch: 90/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.792173 acc: 0.755937 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.797606 acc: 0.755398 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.788231 acc: 0.758073 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.790362 acc: 0.757692 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.787044 acc: 0.758482 lr: 0.000111\n", + "INFO:root:epoch: 90/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.785833 acc: 0.758333 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.786919 acc: 0.758594 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.795069 acc: 0.755882 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.801482 acc: 0.752431 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.799135 acc: 0.753618 lr: 0.000110\n", + "INFO:root:epoch: 90/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.802517 acc: 0.752344 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.802409 acc: 0.753423 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.799748 acc: 0.754119 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.799636 acc: 0.754348 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.797934 acc: 0.754036 lr: 0.000109\n", + "INFO:root:epoch: 90/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.796901 acc: 0.754625 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.796197 acc: 0.753966 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.795277 acc: 0.754514 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.794153 acc: 0.754911 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.797511 acc: 0.754741 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.798095 acc: 0.755729 lr: 0.000108\n", + "INFO:root:epoch: 90/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.797423 acc: 0.755645 lr: 0.000107\n", + "INFO:root:epoch: 91/100 starts\n", + "INFO:root:epoch: 91/100 et: 0s 
eta: 13s batches: 10/313(3%) samples: 320 loss: 0.880258 acc: 0.734375 lr: 0.000107\n", + "INFO:root:epoch: 91/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.863308 acc: 0.737500 lr: 0.000107\n", + "INFO:root:epoch: 91/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.802331 acc: 0.753125 lr: 0.000107\n", + "INFO:root:epoch: 91/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.790554 acc: 0.753906 lr: 0.000107\n", + "INFO:root:epoch: 91/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.782695 acc: 0.761250 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.794129 acc: 0.754687 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.807174 acc: 0.752679 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.798740 acc: 0.752344 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.797484 acc: 0.755208 lr: 0.000106\n", + "INFO:root:epoch: 91/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.804903 acc: 0.753438 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.810278 acc: 0.753125 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.802937 acc: 0.754688 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.805282 acc: 0.751923 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.802206 acc: 0.753348 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.811627 acc: 0.749583 lr: 0.000105\n", + "INFO:root:epoch: 91/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.806510 acc: 0.751367 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.804635 acc: 0.752390 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.801146 acc: 0.753993 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.805238 acc: 0.753454 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.803522 acc: 0.752500 lr: 0.000104\n", + "INFO:root:epoch: 91/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.798943 acc: 0.753720 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.803901 acc: 0.752131 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.803263 acc: 0.752717 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.800978 acc: 0.753255 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.800123 acc: 0.754125 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.799911 acc: 0.754687 lr: 0.000103\n", + "INFO:root:epoch: 91/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.800873 acc: 0.754745 lr: 0.000102\n", + "INFO:root:epoch: 91/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.797758 acc: 0.755246 lr: 0.000102\n", + "INFO:root:epoch: 91/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.795045 acc: 0.755603 lr: 
0.000102\n", + "INFO:root:epoch: 91/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.794688 acc: 0.756354 lr: 0.000102\n", + "INFO:root:epoch: 91/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.795100 acc: 0.755948 lr: 0.000102\n", + "INFO:root:epoch: 92/100 starts\n", + "INFO:root:epoch: 92/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.690164 acc: 0.806250 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.703724 acc: 0.804688 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.741244 acc: 0.785417 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.774958 acc: 0.772656 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.798213 acc: 0.758750 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.797358 acc: 0.758333 lr: 0.000101\n", + "INFO:root:epoch: 92/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.792055 acc: 0.756696 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.790635 acc: 0.756250 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.781314 acc: 0.757986 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.786877 acc: 0.753437 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.793163 acc: 0.749432 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.791653 acc: 0.748958 lr: 0.000100\n", + "INFO:root:epoch: 92/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.784631 acc: 0.750000 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.792362 acc: 0.748661 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.794085 acc: 0.748125 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.801355 acc: 0.746680 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.794676 acc: 0.748346 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.794705 acc: 0.748785 lr: 0.000099\n", + "INFO:root:epoch: 92/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.794772 acc: 0.748684 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.795025 acc: 0.749063 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.792504 acc: 0.749554 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.789176 acc: 0.750852 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.787420 acc: 0.752717 lr: 0.000098\n", + "INFO:root:epoch: 92/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.790530 acc: 0.751693 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.789574 acc: 0.751500 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.790968 acc: 0.752404 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 
12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.791330 acc: 0.752662 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.788709 acc: 0.752902 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.788136 acc: 0.753556 lr: 0.000097\n", + "INFO:root:epoch: 92/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.783615 acc: 0.754583 lr: 0.000096\n", + "INFO:root:epoch: 92/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.785232 acc: 0.753831 lr: 0.000096\n", + "INFO:root:epoch: 93/100 starts\n", + "INFO:root:epoch: 93/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.731577 acc: 0.753125 lr: 0.000096\n", + "INFO:root:epoch: 93/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.806961 acc: 0.728125 lr: 0.000096\n", + "INFO:root:epoch: 93/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.780273 acc: 0.742708 lr: 0.000096\n", + "INFO:root:epoch: 93/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.794445 acc: 0.740625 lr: 0.000096\n", + "INFO:root:epoch: 93/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.785886 acc: 0.748750 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.808928 acc: 0.743750 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.807731 acc: 0.746429 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.809243 acc: 0.746094 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.801031 acc: 0.748264 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.806838 acc: 0.747187 lr: 0.000095\n", + "INFO:root:epoch: 93/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.808934 acc: 0.746023 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.813034 acc: 0.742187 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.805116 acc: 0.744471 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.810833 acc: 0.743304 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.805423 acc: 0.745000 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.805258 acc: 0.746484 lr: 0.000094\n", + "INFO:root:epoch: 93/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.799348 acc: 0.748529 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.798730 acc: 0.748958 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.799672 acc: 0.749507 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.808083 acc: 0.748437 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.808896 acc: 0.748214 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.801298 acc: 0.750284 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.797310 acc: 0.752446 lr: 0.000093\n", + "INFO:root:epoch: 93/100 et: 11s eta: 3s batches: 240/313(76%) samples: 
7680 loss: 0.798673 acc: 0.751823 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.801596 acc: 0.751750 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.799826 acc: 0.752163 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 12s eta: 2s batches: 270/313(86%) samples: 8640 loss: 0.796523 acc: 0.751968 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 13s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.796352 acc: 0.751228 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.799463 acc: 0.750108 lr: 0.000092\n", + "INFO:root:epoch: 93/100 et: 14s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.800424 acc: 0.749479 lr: 0.000091\n", + "INFO:root:epoch: 93/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.797798 acc: 0.750706 lr: 0.000091\n", + "INFO:root:epoch: 94/100 starts\n", + "INFO:root:epoch: 94/100 et: 0s eta: 14s batches: 10/313(3%) samples: 320 loss: 0.803447 acc: 0.762500 lr: 0.000091\n", + "INFO:root:epoch: 94/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.833519 acc: 0.745312 lr: 0.000091\n", + "INFO:root:epoch: 94/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.840047 acc: 0.743750 lr: 0.000091\n", + "INFO:root:epoch: 94/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.841335 acc: 0.739844 lr: 0.000091\n", + "INFO:root:epoch: 94/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.821914 acc: 0.746250 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.825954 acc: 0.744792 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.844646 acc: 0.739286 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.839375 acc: 0.737109 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.827256 acc: 0.740972 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.824656 acc: 0.745937 lr: 0.000090\n", + "INFO:root:epoch: 94/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.822932 acc: 0.746023 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.821938 acc: 0.747135 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.809640 acc: 0.751683 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.803667 acc: 0.754018 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.806155 acc: 0.752917 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.808421 acc: 0.752930 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.803617 acc: 0.754779 lr: 0.000089\n", + "INFO:root:epoch: 94/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.801626 acc: 0.754514 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.798918 acc: 0.754112 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.797969 acc: 0.755000 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.797693 acc: 0.754762 lr: 
0.000088\n", + "INFO:root:epoch: 94/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.800783 acc: 0.753267 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.798359 acc: 0.754891 lr: 0.000088\n", + "INFO:root:epoch: 94/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.800326 acc: 0.754167 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.789442 acc: 0.758000 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.787871 acc: 0.758413 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.784470 acc: 0.759722 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.783457 acc: 0.759710 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.782562 acc: 0.759698 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.782705 acc: 0.759375 lr: 0.000087\n", + "INFO:root:epoch: 94/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.778999 acc: 0.760282 lr: 0.000086\n", + "INFO:root:epoch: 95/100 starts\n", + "INFO:root:epoch: 95/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.766231 acc: 0.762500 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.751542 acc: 0.765625 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.773591 acc: 0.756250 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.763280 acc: 0.757031 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.771654 acc: 0.757500 lr: 0.000086\n", + "INFO:root:epoch: 95/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.772644 acc: 0.754687 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.768126 acc: 0.758036 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.781850 acc: 0.756641 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.798895 acc: 0.751389 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.796311 acc: 0.752812 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.796165 acc: 0.752841 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.786548 acc: 0.754687 lr: 0.000085\n", + "INFO:root:epoch: 95/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.788553 acc: 0.756250 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.786640 acc: 0.757589 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.790195 acc: 0.756875 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.790926 acc: 0.757227 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.792668 acc: 0.755699 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.796034 acc: 0.754167 lr: 0.000084\n", + "INFO:root:epoch: 95/100 
et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.796901 acc: 0.753289 lr: 0.000084\n", + "INFO:root:epoch: 95/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.793658 acc: 0.754219 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.790697 acc: 0.755357 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.791174 acc: 0.755540 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.790981 acc: 0.755842 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.792551 acc: 0.754948 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.790318 acc: 0.755750 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.788822 acc: 0.756731 lr: 0.000083\n", + "INFO:root:epoch: 95/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.785852 acc: 0.757755 lr: 0.000082\n", + "INFO:root:epoch: 95/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.785526 acc: 0.757701 lr: 0.000082\n", + "INFO:root:epoch: 95/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.788618 acc: 0.757112 lr: 0.000082\n", + "INFO:root:epoch: 95/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.787243 acc: 0.757708 lr: 0.000082\n", + "INFO:root:epoch: 95/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.786788 acc: 0.758165 lr: 0.000082\n", + "INFO:root:epoch: 96/100 starts\n", + "INFO:root:epoch: 96/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.900184 acc: 0.721875 lr: 0.000082\n", + "INFO:root:epoch: 96/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.817050 acc: 0.726562 lr: 0.000082\n", + "INFO:root:epoch: 96/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.787611 acc: 0.742708 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.786342 acc: 0.742187 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.785539 acc: 0.748125 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.809411 acc: 0.743229 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.815168 acc: 0.744643 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.808906 acc: 0.747656 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.799573 acc: 0.748958 lr: 0.000081\n", + "INFO:root:epoch: 96/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.809905 acc: 0.745937 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.815977 acc: 0.744886 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.809168 acc: 0.747656 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.803039 acc: 0.749038 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.803802 acc: 0.750670 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.800406 acc: 0.751458 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 7s eta: 7s batches: 160/313(51%) 
samples: 5120 loss: 0.801153 acc: 0.751367 lr: 0.000080\n", + "INFO:root:epoch: 96/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.795543 acc: 0.753493 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.792251 acc: 0.754340 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.795225 acc: 0.754276 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.794892 acc: 0.755156 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.801911 acc: 0.753571 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.804700 acc: 0.753125 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.806657 acc: 0.752446 lr: 0.000079\n", + "INFO:root:epoch: 96/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.804139 acc: 0.752604 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.802955 acc: 0.753250 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.800339 acc: 0.753726 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.800201 acc: 0.752894 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.800504 acc: 0.751563 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.798371 acc: 0.752155 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.798875 acc: 0.751354 lr: 0.000078\n", + "INFO:root:epoch: 96/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.798440 acc: 0.751915 lr: 0.000078\n", + "INFO:root:epoch: 97/100 starts\n", + "INFO:root:epoch: 97/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.719486 acc: 0.768750 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.778803 acc: 0.753125 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 1s eta: 13s batches: 30/313(9%) samples: 960 loss: 0.755296 acc: 0.759375 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.797087 acc: 0.750781 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.783176 acc: 0.753750 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.789088 acc: 0.750000 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.783504 acc: 0.753125 lr: 0.000077\n", + "INFO:root:epoch: 97/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.795049 acc: 0.750391 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.797903 acc: 0.750347 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.795908 acc: 0.749375 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.791749 acc: 0.748295 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.793694 acc: 0.749219 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 6s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.790152 acc: 
0.750962 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 6s eta: 8s batches: 140/313(44%) samples: 4480 loss: 0.781105 acc: 0.752232 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.780573 acc: 0.753333 lr: 0.000076\n", + "INFO:root:epoch: 97/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.778776 acc: 0.753125 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.777386 acc: 0.753493 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.778101 acc: 0.753646 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.782027 acc: 0.752467 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.775025 acc: 0.755156 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.772303 acc: 0.756101 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.768858 acc: 0.757670 lr: 0.000075\n", + "INFO:root:epoch: 97/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.770851 acc: 0.756522 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.770891 acc: 0.755990 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.773270 acc: 0.755750 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 12s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.771640 acc: 0.756611 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.771903 acc: 0.756019 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.777557 acc: 0.754911 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.780185 acc: 0.753664 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.779205 acc: 0.755417 lr: 0.000074\n", + "INFO:root:epoch: 97/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.780044 acc: 0.756048 lr: 0.000073\n", + "INFO:root:epoch: 98/100 starts\n", + "INFO:root:epoch: 98/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.707994 acc: 0.771875 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.712420 acc: 0.775000 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.744193 acc: 0.769792 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.754451 acc: 0.761719 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 loss: 0.754195 acc: 0.758125 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.764942 acc: 0.757813 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.758779 acc: 0.757589 lr: 0.000073\n", + "INFO:root:epoch: 98/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.766650 acc: 0.755078 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.757546 acc: 0.757639 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.761171 acc: 0.758125 lr: 0.000072\n", + 
"INFO:root:epoch: 98/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.762360 acc: 0.759943 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.770996 acc: 0.758594 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.768122 acc: 0.759615 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.772493 acc: 0.756250 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.770627 acc: 0.757500 lr: 0.000072\n", + "INFO:root:epoch: 98/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.768439 acc: 0.757227 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.781094 acc: 0.754228 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.785662 acc: 0.754340 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.783487 acc: 0.755099 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.780362 acc: 0.755156 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.784881 acc: 0.754613 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.782945 acc: 0.756108 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.784274 acc: 0.756522 lr: 0.000071\n", + "INFO:root:epoch: 98/100 et: 11s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.781904 acc: 0.757292 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.778922 acc: 0.758000 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.776607 acc: 0.758053 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.776517 acc: 0.758102 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.776057 acc: 0.758147 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.775508 acc: 0.757759 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.774023 acc: 0.758854 lr: 0.000070\n", + "INFO:root:epoch: 98/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.771974 acc: 0.759173 lr: 0.000070\n", + "INFO:root:epoch: 99/100 starts\n", + "INFO:root:epoch: 99/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.837768 acc: 0.746875 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.823928 acc: 0.743750 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.777027 acc: 0.758333 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.774453 acc: 0.763281 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 2s eta: 12s batches: 50/313(15%) samples: 1600 loss: 0.784160 acc: 0.758125 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.773204 acc: 0.759896 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.767902 acc: 0.762500 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 3s eta: 10s 
batches: 80/313(25%) samples: 2560 loss: 0.773605 acc: 0.759766 lr: 0.000069\n", + "INFO:root:epoch: 99/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.792748 acc: 0.755903 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.793041 acc: 0.755625 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 5s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.783605 acc: 0.757955 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.784180 acc: 0.757292 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.787015 acc: 0.759135 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.784868 acc: 0.758929 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.783041 acc: 0.760000 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 7s eta: 7s batches: 160/313(51%) samples: 5120 loss: 0.787532 acc: 0.759961 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.780208 acc: 0.763235 lr: 0.000068\n", + "INFO:root:epoch: 99/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.782567 acc: 0.761632 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.782641 acc: 0.760197 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.780030 acc: 0.760781 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.777667 acc: 0.761310 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.778561 acc: 0.760938 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.784298 acc: 0.758967 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.782546 acc: 0.758854 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.782698 acc: 0.758625 lr: 0.000067\n", + "INFO:root:epoch: 99/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.781098 acc: 0.758413 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.781811 acc: 0.758796 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.778903 acc: 0.760156 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.774765 acc: 0.761422 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.775532 acc: 0.760313 lr: 0.000066\n", + "INFO:root:epoch: 99/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.778129 acc: 0.759173 lr: 0.000066\n", + "INFO:root:epoch: 100/100 starts\n", + "INFO:root:epoch: 100/100 et: 0s eta: 13s batches: 10/313(3%) samples: 320 loss: 0.728071 acc: 0.762500 lr: 0.000066\n", + "INFO:root:epoch: 100/100 et: 0s eta: 13s batches: 20/313(6%) samples: 640 loss: 0.806044 acc: 0.764063 lr: 0.000066\n", + "INFO:root:epoch: 100/100 et: 1s eta: 12s batches: 30/313(9%) samples: 960 loss: 0.840349 acc: 0.759375 lr: 0.000066\n", + "INFO:root:epoch: 100/100 et: 1s eta: 12s batches: 40/313(12%) samples: 1280 loss: 0.832785 acc: 0.755469 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 2s eta: 11s batches: 50/313(15%) samples: 1600 
loss: 0.830045 acc: 0.754375 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 2s eta: 11s batches: 60/313(19%) samples: 1920 loss: 0.827730 acc: 0.751042 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 3s eta: 11s batches: 70/313(22%) samples: 2240 loss: 0.814045 acc: 0.758482 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 3s eta: 10s batches: 80/313(25%) samples: 2560 loss: 0.798234 acc: 0.760547 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 4s eta: 10s batches: 90/313(28%) samples: 2880 loss: 0.800798 acc: 0.757986 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 4s eta: 9s batches: 100/313(31%) samples: 3200 loss: 0.792712 acc: 0.757500 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 4s eta: 9s batches: 110/313(35%) samples: 3520 loss: 0.783494 acc: 0.760227 lr: 0.000065\n", + "INFO:root:epoch: 100/100 et: 5s eta: 8s batches: 120/313(38%) samples: 3840 loss: 0.777760 acc: 0.761719 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 5s eta: 8s batches: 130/313(41%) samples: 4160 loss: 0.774788 acc: 0.762981 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 6s eta: 7s batches: 140/313(44%) samples: 4480 loss: 0.766845 acc: 0.765848 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 6s eta: 7s batches: 150/313(47%) samples: 4800 loss: 0.765367 acc: 0.767292 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 7s eta: 6s batches: 160/313(51%) samples: 5120 loss: 0.762996 acc: 0.768945 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 7s eta: 6s batches: 170/313(54%) samples: 5440 loss: 0.766414 acc: 0.768382 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 8s eta: 6s batches: 180/313(57%) samples: 5760 loss: 0.768261 acc: 0.766319 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 8s eta: 5s batches: 190/313(60%) samples: 6080 loss: 0.768353 acc: 0.765954 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 9s eta: 5s batches: 200/313(63%) samples: 6400 loss: 0.769359 acc: 0.765156 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 9s eta: 4s batches: 210/313(67%) samples: 6720 loss: 0.771308 acc: 0.765030 lr: 0.000064\n", + "INFO:root:epoch: 100/100 et: 10s eta: 4s batches: 220/313(70%) samples: 7040 loss: 0.771582 acc: 0.764631 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 10s eta: 3s batches: 230/313(73%) samples: 7360 loss: 0.776534 acc: 0.762500 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 10s eta: 3s batches: 240/313(76%) samples: 7680 loss: 0.782496 acc: 0.759635 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 11s eta: 2s batches: 250/313(79%) samples: 8000 loss: 0.782019 acc: 0.760250 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 11s eta: 2s batches: 260/313(83%) samples: 8320 loss: 0.780009 acc: 0.761058 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 12s eta: 1s batches: 270/313(86%) samples: 8640 loss: 0.776966 acc: 0.760880 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 12s eta: 1s batches: 280/313(89%) samples: 8960 loss: 0.776915 acc: 0.761607 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 13s eta: 1s batches: 290/313(92%) samples: 9280 loss: 0.773110 acc: 0.763901 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 13s eta: 0s batches: 300/313(95%) samples: 9600 loss: 0.776147 acc: 0.762917 lr: 0.000063\n", + "INFO:root:epoch: 100/100 et: 14s eta: 0s batches: 310/313(99%) samples: 9920 loss: 0.777149 acc: 0.762601 lr: 0.000062\n" + ] + } + ], + "source": [ + "logger = logging.getLogger()\n", + "logger.setLevel(0)\n", + "trainer.fit(train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", 
+ "output_type": "stream", + "text": [ + " epoch train_loss train_acc lr val_loss val_acc\n", + "21 1 4.597061 0.0185 0.003120 4.604409 0.03\n", + "22 2 4.576668 0.0248 0.006250 4.550752 0.03\n", + "23 3 4.429581 0.0333 0.009380 4.132248 0.03\n", + "24 4 3.932376 0.0777 0.010000 3.541716 0.11\n", + "25 5 3.252122 0.1659 0.010000 3.084885 0.17\n", + ".. ... ... ... ... ... ...\n", + "116 96 0.799725 0.7515 0.000077 0.671766 0.78\n", + "117 97 0.781655 0.7553 0.000073 0.714923 0.77\n", + "118 98 0.771329 0.7589 0.000070 0.513621 0.84\n", + "119 99 0.778845 0.7587 0.000066 0.385253 0.89\n", + "120 100 0.777107 0.7624 0.000062 0.483041 0.85\n", + "\n", + "[100 rows x 6 columns]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "train_result = pd.read_csv('./tdnn_xvec/train.log')\n", + "train_result = train_result[train_result.index>20]\n", + "print(train_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA68AAALYCAYAAACTyMQkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAACcVUlEQVR4nOzddZiU1RvG8fvdpbthF2nEAgQFlA4BAUVFRAzs7u6un4otJraioihiIIqigIiKhGIhioAg3V275/fHw7DBxszuzLwT3891zTUweXZ3dmfu9zznOZ5zTgAAAAAAxLIUvwcAAAAAAEBhCK8AAAAAgJhHeAUAAAAAxDzCKwAAAAAg5hFeAQAAAAAxj/AKAAAAAIh5hFcAAAAAQMwjvAIAkprneYd5nveq53nzPM/bsvv0p+d5wz3POzzb7Sp7nrfI87xNnuc1yuexnvE8z3med3W2y87cfVlBpzuDGGfD3bf9JCxfOAAAcaaE3wMAAMAPnuelSnpU0uWSdkiaIOkDSZmS9pV0sqTzPM873Tn3hnNuved550v6VNJLnucd4Zxz2R6vp6QLJX0r6fE8nvILSVPzGc7EsHxRAAAkMMIrACBZ/U8WXGdIOsE5tyD7lZ7nVZZ0o6Qqgcucc+M8z3tZ0tmSLpH01O7bVpL0kqStks5yzmXm8XzjnXMPh//LAAAgOVA2DABIOp7n7SvpWkmrJPXNHVwlyTm33jl3k6Thua66WtJiSQ94ntd492WPSaov6Wbn3F8RG3gIPM/r6nne557nrfU8b6vneb96nne953kl87jtIM/zvvE8b6Xneds8z/vP87xPPc/rlet2PT3PG+953lLP87Z7nrfM87yvPc87KXpfGQAgWRFeAQDJ6EzZe+DzzrmVBd3QObc91//XSzpPUnlJr3ied7RsJvYbSU9GZLQh2h0mv5LUXtJ7koZJ8iQ9KGm053lettteIuldSWmS3pEF8S8lNZN0ZLbbHS1pvKTmkj6S9IishLq6pBMi/kUBAJIeZcMAgGTUYff5V0W5s3PuM8/zXpJ0jqTDJG2RlQu7Au7W2/O8Cvlc95xzbllRxpLb7nLn5yVtk3SYc+6P3ZffImmcpKMlnS7ptd13OUfSEkktnXNbcj1W9Wz/PVvSTkmtnHMrCrgdAAARQXgFACSjOrvP/yvGY9wm6SxJpSXd4pybV8jte+0+5WWMpLCEV0nHSaok6clAcJUk59xOz/NulPSjpDOUFV4la1iVkfuBnHOrc120c/epsNsBABB2lA0DAFA0NynrffSY3d2LC3Kdc87L5/RTGMd18O7zSbmvcM5Nl7Qp220kKxVuKOlXz/Pu8TzvCM/zyuXxuO/ISqV/9TzvYc/z+u2e5QUAICoIrwCAZBSY5axblDt7ntdV0qWSvpf0sqx0+KrwDK3YKu0+X57P9cuz3UaShko6X9IGSbfK1ruu8TxvhOd5tQM3cs69I2mgpPmyr3WspFWe533seV7T8H4JAADsjfAKAEhGgf1We4R6R8/zyssC63ZZ2fBVkhZJusfzvGZhG2HRbdh9Xjuf62tnu42cecE5d6ikWpIGyRoznSpr5KRstx3tnOskqZps7exbu8/HeZ5XKqxfBQAAuRBeAQDJ6FVJmZLO9zyvRkE39DyvdK6LHpTUWNLtzrk5zrkNsu7DZWTdh/1+b/1p93mX3Fd4nneIpArZbpODc26lc+4959wxu2/TJa/S4N3bCI11zp0h6UNJTSUdEJbRAwCQD7/fYAEAiLrde7E+LKmmpLGe5zXIfRvP8yp6nnevrKQ2cFl3SRfLyoUfzfZ4n8tmYztIujyyoy/Uh7KZ1XOzzwR7nldC0gO7//t6tsu75n4Az/PKSqosaZd2N3LyPK9z7nW9u4N6IPxvC+PXAADAXryCu/oDAJCYdgexxyRdJuu2+6Wk32Uzsk1knYErSTrNOTdi9zY3v8g6Fbd2zs3J9XiVJf0qK6ltGeg+7HnemZJekfSFssqVc5vjnBtZyHgbytabLpY0IZ+bTXHOveh53smSRsiaM70jaa2koyQdJOkTSccEtvXxPG+dpHWSfpC0UDaD3Fc2m/qUc+6y3bf7affX/q2kBbJ9Y7tLaiXpE+dc/4LGDwBAcRFeAQBJzfO8wyVdKKmzpDRZVdIiSZMlPe+cm7b7ds9IukjS9c65h/J5rL6SPt19327OOZctvBbkQ+fccYWMs6EsvBbkNefcmbtv303WEfkwWSCdJ5txfdQ5t2e7G8/zLpKF1ZaycLpR0p+yvWJHZAu5g2UNmw6VfZ+2SfpHtuXO8865HYWMDQCAYiG8AgAAAABiHmteAQAAAAAxj/AKAAAAAIh5hFcAAAAAQMwjvAIAAAAAYl4JvwcQiho1ariGDRv6OobNmzerfPnyvo4BCOD1iFjC6xGxhNcjYgmvR8SSWH89zpgxY5VzrmZe18VVeG3YsKGmT5/u6xgmTpyobt26+ToGIIDXI2IJr0fEEl6PiCW8HhFLYv316Hnewvyuo2wYAAAAABDzCK8AAAAAgJhHeAUAAAAAxDzCKwAAAAAg
4YAyZWwfw08+KfxD6ahRFrLjZT10+/Y2S1qxYuizZG3aWNOl337LuuyLL2xWPJjwumNHaNtiLF9unV9POim0fUOLo1Qp+x7lDq/z5lkzoQsvtAMbuZ1zjq1tfeedva+bO1faZ5/Q11+H4uijbc1rqNuO3H239QC4/PKiN9SaM0dlVqwI/zZGQ4bYrPZtt2W9Boq6D3OkPPGEzQJfeGFo1SEzZtgSl3CVwvftaweRVqywv1uDB9vBpk8/tYMrsaxfP/vd+eYbv0cSEwivAJBsSpSwhhJ5bZkzdap9cA/1A0PDhsqMtbVCyaJECSsX/PtvK9MrTGCG9thji/e8gb13C+poumOHPd+xx0Z2C5dwGzHCPtSGWuYcKHHMvu513DjbIqiwtXudOtnvXSilw6NG2QxxtPc+7tLFZpjXr8+67P777bWY31rLww6zg2J5lQ5HqtNwdr17288zlNLhrVvtdyo93fZLbdcu54GJYAXW2oZ7RtTzrMS9Vi2btb/9dqlevfA+R3Glp1tX9K++kt54I7j7hKtZU3aBZmk33GCz061a2XtgxYrhe45I6d7dDgCMHev3SGIC4RUAklHfvrbFyuzZWZdt2GAzcqGUDCM29O1rH4zvvrvg/R8lKxk++GCpUaPiPWedOhaaXnkl/y6uEyZYwImXkuGAJk0sTIaqaVOragise83MtOBy5JGF7/dYsaLNBIUSXt9+W2re3E7R1KWLHfgKdJJduFB67TXr2JqWlvd9PM9mX7//3tZBBjhna2XDUR5akEqVrBvzxx8Hf5+337bfp8DBjOXL7QDFs8+GNos4fry21KsXvu6/2VWrZp2HL73USt5j0QUX2Gz91VcHt/drOJs1BRxyiIX8V1+135fPP7ff1XhQvrwFWNa9SiK8AkByymvLnB9+sA9khNf49Mgjtp7wrLOkZ56xhi4ffGBrpX74wWaMZs+2wFGckuHsrrjCPmTm14hn1CgLDeHcOiKWeZ6Fm8DM66xZtua1sJLhgO7drSPqpk2F33bhQquUiPasq2TVGSVKZJUOP/igfe3XX1/w/U47zWY/s79eVq+2QBPpmVfJSof/+MNKnAvjnFUVtGxpHbb79rXfn65dpYsvtu1jVq3K//47d9rv3qWXShMmaE04Gg/l5/DDbWY4VqsbUlJsBnv1atviqzDhbNaUfQznnmtr/cePz795Wqzq188qFIq7/joBEF4BIBmlp9vsW/bw+t139gE0HN0wEX0HHWQlcR9/bN1ezzhDOv54C46HH26zDQcfbLOB4QqvrVvbh/lhw/ZuhLNzp83yHnNM7G0/EUlt21rI2bbNyhKl4Nc6du9u38dg9sYM7PF50klFG2dxlCtnX+fkydJ//1kYPeuswktWa9a018Prr2etz450p+HsAt21gym/nDTJfo6XX561nrh2bZv9euwx+9t58ME5O25v2mR7rA4ZYrN8vXrZvp/9+mmRHwcZYknr1rYv+OOPW9VPQcLZrCm7++6z12w8dsUPlD2zZQ7hFQCSVt++9iE5sG5t6lSpRQubKUN8uu8+a+yxbJk1Upo1yz6sjR0rjRwpvfCCdXxt1Sp8z3nVVbZn7JgxOS//+msrJ463kuHiatPGAujs2RZeDz007wZGeenQwWYmCysdHjfOSsSPOso6DfuhSxcrj77zTlujeOONwd3vnHNsxjJQvhvN8NqkiTU3C6Z0+IknLOScckrOy1NSbKuo77+3v5U9e1rTtKOPtrXNgwbZz33AAPudWLVK+uADbS9qZ+9Ecs899rtx110F32769PA2a0oETZva7wilw4RXAEha2bfMycy0mVdKhuNfuXI2Q9S0qX0A7NzZSs4GD7ayuUGDwvt8Rx9tASr3tjmjRlkXz1jZtiNa2ra18y++sN+pwIxJMMqXt6ZABYXX776zhjMtWkhvvVW8sRZHly4WRF580cqBg11D3bu3VLduVunw3LlWghyJ9aB5Ofpom1XdsCH/28yfb3sTX3BB/nt/tm5tIeu88+yg0G+/WTnxpEl28Ojll61RWaxuweKHxo3te/rSS7bOOS8ZGdYMLJzrXRNFv372t2HzZr9H4ivCKwAkq/btbeZg3DhroLJhA+EVoUtNtdLKb7/NalS0a5ett+3f37bVSSb16ll57BNP2AfxYNe7BnTvbmWTeYWr336z2da6de331s8qiY4drZw2JUW6+ebg75eaKp15pjXMWbzYwmuTJuHb/7Qw/ftbSfsXX+R/m6eftq/toosKfqzy5aXnn7c1u//8Iz36qIX6aH0t8ei22+yAwC235H19oFlTJNcIx6t+/Wyv11CauiUgwisAJKuSJbO2zAmssSO8oijOOsu65Qa2zZk0yZqzJFvJsGShp21ba9RUuXLoa8i7d7fQm3tPxwULbNayTBkLXrVrh23IRVK5ss0qX3CB7YMairPPtmqPV1+NzjY52bVvb8168isd3rTJZpNPOMH2ng1G5crR22c33tWqJV1zjXVInjZt7+sDzZqYed1bly52wCTJS4cJrwCQzPr2tYYrw4fbhwq/1s8hvlWqZGsZ33nHXk+jRtmHrFBKZhNJYNaoV6/QZ+Hat7eusdlnV1assOC6ZYt1So1WiW1hxo61ztahatzYQvrLL9va7GiG1xIl7HX56ad2kCC311+3PgBXXBG9MSWba66x6oQbbth7y6Hp0yPTrCkRlC5ta6zHjg1tq6YEQ3gFgGQWKGmcOdNmXZk9QFFddpmFgWHDpNGjrbw1v/WCiS6w7jXUkmHJvmft22eF1w0bLGwtXmwfWqO9p2uknHOOrS3dti264VWy0uGVK7PK3AMyM6Unn7Sf3+GHR3dMyaRiRSsfnjjRysezmzGDZk0FGTjQGuQVVPae4AivAJDM6ta1fQwlSoZRPI0b2xY8jzxiwSAZS4YDjjxSeuop6dRTi3b/7t2tU/SyZfY9/fln24IlkX5Hjz/eym2l6IfXI4+0cJS7dPiLL6yRUPbtcRAZF1xgTb5uvNEOGkhZzZpY75q/E0+U0tKkhx/2eyS+IbwCQLILzA4l0gdj+OPKK61ZU9my1lwkWZUsaXvtFrVZVffuVhbYpYvNwL76auJ9P8uWzQr30Q6vVatKnTpJn3yS8/InnpDq1LGAgMgqVUq69147MPP223ZZoFkT613zV7q0HVz54gsL+kmI8AoAye6SS6Trrgu9sQyQW+fOFgoGD7Z1ayiaww6z4PvXX9Jjj0lDhvg9osi45x5bH52eHv3n7t/f9uL991/7/59/Wgfniy6yYIXIO+kkKxG+7TZpxw5b7yoRXgtz4YW2DdkjjxTt/i+/rBKbNoV3TFFEeAWAZFe/vjR0KNs7oPg8z9axvfyy3yOJb6VLS3feaXvnXnmlz4OJoGrV/CsvP/poOx871s6HDbPQesEF/ownGaWkSPffb2ufn3/e1rvSrKlwVarYnt0jR0qLFoV23y+/lM45R2n5dduOA4RXAAAQPqmprBcMhxtuoONtJDVrJjVtaute162z0uyTT/Z/C6Jkc+SRViZ/993S5MlS69Y0awrGlVfa0oInnwz+Pjt
2WGO9Jk3038CBERtapBFeAQAAkFw8z2Zfv/rKmmtt3mxrCRFdnic98IC0apWtf6VkODgNGtja7Oeft62dgvHEE7au+MknlRnHpfGEVwAAACSf/v2l7dtt1q9TJ+mQQ/weUXJq1y6rfJzwGrxrrpE2bpReeKHw2/73n3TXXfaaj/Pmb4RXAAAAJJ9OnaRKlaSdOynR9tuDD9re0Ece6fdI4sehh1rJ9eOPW0lwQa691jrBP/54NEYWUYRXAAAAJJ9SpaRjj83aoxj+adzYti6qVcvvkcSXa6+1WdV33sn/NhMnWnOnG2+073OcI7wCAAAgOQW63NJtHfGob1/pwAOlhx+2Bk657dwpXXqp1LChNYFLAIRXAAAAJKeyZW3rESAeeZ7Nvs6ebdvg5PbUU9Jvv1m5cNmyUR9eJBBeAQAAACAenXKKlJZms6/ZLV0q3XGHzc4ec4w/Y4sAwisAAAAAxKPSpW2bp/HjbbuhgBtusG7aTzyRUHtvE14BAAAAIF5dcIFUvrz0yCP2/ylTpDfesJLifff1d2xhRngFAAAAgHhVtap07rnS229LCxZIl1wi1asn3Xyz3yMLO8IrAAAAAMSzK6+0jsO9e1sDp0cftdnYBEN4BQAAAIB41rChNGiQ9NdfUs+e0sCBfo8oItjUCgAAAADi3S23SH/+aVvkJFCTpuwIrwAAAAAQ75o3l2bO9HsUEUXZMADg/+3de7BdZXnH8e+PhGsLknApJRQEtbYwEEcptRUJ4AVhkFDBCrUFSqHKVKu1rZQWq0VRvAwohRZQLLQRYbi0AaejJAGiY8MUSk0lRSo2QWOhgoSr3PP0j7XOsGezk3NqTvZeOef7mTmzzlnrXTvPyTyTnd9+37WWJElS5xleJUmSJEmdZ3iVJEmSJHWe4VWSJEmS1HmGV0mSJElS5xleJUmSJEmdZ3iVJEmSJHWe4VWSJEmS1HmGV0mSJElS5xleJUmSJEmdZ3iVJEmSJHWe4VWSJEmS1HmGV0mSJElS5xleJUmSJEmdZ3iVJEmSJHWe4VWSJEmS1HmGV0mSJElS5xleJUmSJEmdZ3iVJEmSJHWe4VWSJEmS1HmGV0mSJElS5xleJUmSJEmdZ3iVJEmSJHWe4VWSJEmS1HmGV0mSJElS5xleJUmSJEmdZ3iVJEmSJHWe4VWSJEmS1HmpqlHXMGFJHgDuHXEZOwIPjrgGaYz9qC6xH9Ul9qO6xH5Ul3S9H/eoqp0GHdikwmsXJLm9qvYfdR0S2I/qFvtRXWI/qkvsR3XJptyPLhuWJEmSJHWe4VWSJEmS1HmG1/+/S0ZdgNTDflSX2I/qEvtRXWI/qks22X70mldJkiRJUuc58ypJkiRJ6jzDqyRJkiSp8wyvE5DkF5Jck+SRJI8muS7J7qOuS1NbkmOTXJvk3iRPJrk7ySeSbNs3blaSLyR5MMkTSRYn2XdUdWt6SPLVJJXkY3377UcNTZIjknw9yePt+/PtSQ7tOW4/aiiSvC7JjUl+lOSxJHckOblvzFZJPp3kvvZ9fVmSg0ZVszZ9SXZL8tdtL/2kfV9+6YBxE+q9JJslOSPJqiRPJVme5Jih/DITZHgdR5JtgJuAXwJOBH4HeAVwc5KfGWVtmvL+BHge+HPgLcDfAqcBi5JsBpAkwA3t8fcCxwCb0/TnbqMoWlNfkuOBuQP2248amiTvAhYC/wb8BvB24Gpgm/a4/aihSLIfsJimv04F3gbcBlya5LSeoZe2x/8SOBK4D/haklcNtWBNJS8HfhNYA3xjPeMm2nsfBT4CXAAcDtwKXJ3kiEmtegN4w6ZxJHkfcC7wyqq6p923J/Bd4INVde4o69PUlWSnqnqgb98JwOXAG6rqpiTzgX8CDq2qm9sxLwFWAguq6g+HXLamuCSzgLuAPwKuAM6uqjPbY/ajhqKdWbgLOKOqPruOMfajhiLJx2k+cJ5dVY/37F8GUFW/lmQu8C3g5Kr6u/b4TGAFcHdVHTX0wrXJS7JZVa1tvz8F+DywZ1Wt6hkzod5LsjPwA+Ccqvpwz/lLgJ2qar+h/FLjcOZ1fEcBt44FV4CqWgl8E5g/sqo05fUH19Zt7XZOuz0K+J+x/5i15z1CM9tgf2pj+CRwZ1V9ecAx+1HDcjKwFrhoPWPsRw3LFsCzwJN9+x/hhf9rH9WOuWrsYFU9B1wJHJZkyyHUqSlmLLiOY6K9dxhNLy/oO38BsG87eTdyhtfx7QPcOWD/CmDvIdcizWu3d7Xb9fXn7kl+dihVaVpIciBwAvAH6xhiP2pYDgS+AxyX5HtJnktyT5Le3rQfNSyXtdvzk+yaZPskpwJvAM5rj+0DrKyqn/Sdu4ImMLx8KJVqOppo7+0DPA3cM2AcdCT3GF7HN5tmHXm/h4BZQ65F01iSOcBZwOKqur3dvb7+BHtUkyTJFsDFwGeq6u51DLMfNSy70tx/4tPAOcCbgUXABe3lPmA/akiq6k7gYJoZ/R/S9N2FwLur6sp22Hj9OHsjl6npa6K9Nxt4uF58TWmnenTmqAuQNL52hmAh8BzwuyMuR9PTB4GtgbNHXYhE8+H7tsBJVXVdu++m9lrYM5KcP7LKNO0keQVwLc0M1btplg/PBy5K8lRVfWmU9UlTieF1fGsY/Onsuj7FkCZVkq1prtHaC5hXVat7Dq+vP8eOSxskzaPB/gI4Bdiy79qsLZNsDzyG/ajh+THNzOuivv030txd+OexHzU8H6e5pvDIqnq23bckyQ7A55J8mabf9hhw7lg/PjTgmDQZJtp7a4Dtk6Rv9rVTPeqy4fGtoFkD3m9v4D+HXIummSSbA9cA+wNHVNW3+4asrz+/33vXQ2kD7AVsRXPThjU9X9DcYXMNsC/2o4ZnxTjH12I/anj2BZb3BNcx/wrsAOxM0497to9g7LU38Awvvs5QmiwT7b0VwJbAywaMg47kHsPr+K4HXptkr7Ed7bKk17XHpI2ifZbrl4BDgaOr6tYBw64H5iSZ13PedsBbsT81eb4FHDLgC5pAewjNm5/9qGH5x3Z7WN/+twCrq+p+7EcNz/3Aq9p7A/T6VeApmhmrG2ieA/v2sYPt40reAdxYVU8PqVZNPxPtva/SrCB4Z9/5v03zlIGVQ6h1XC4bHt/ngfcAC5OcCRTNA3x/QHPzEmljuZDmH5qzgSeSvLbn2Op2+fD1wDJgQZI/pZkBOwMI8Kkh16spqqoeBm7p358E4N6quqX92X7UsPwzcDNwcZIdgf+m+ffyzbxwXwD7UcNyAXA1cEOSv6G55vUo4HjgvKp6Bvj3JFcBn21XVa0ETgP25MVhQZqwJMe2376m3R6e5AHggapaWlUT6r2q+lGSc2nuG/AYcAdNwD2Upp87IS++oZT6tdd7nQe8ieZNbwnw/t4HAEuTLckqBl+jAPBXVfWRdtxs4DPA0TRLO5cBH6iq5Ru/Sk1nSQo4u6rO7NlnP2oo2lnUTwDH0lzb+h3gnKq6omeM/aihSHI4cDrNUvWtgO8BlwAXV9Xz7Zixm979FrA9sBw4fewDQOmn0b4XD7K0qg5ux0yo95LMoPmQ71RgF+
Bu4KyqumZj1P7TMLxKkiRJkjrPa14lSZIkSZ1neJUkSZIkdZ7hVZIkSZLUeYZXSZIkSVLnGV4lSZIkSZ1neJUkSZIkdZ7hVZKkKSrJqiQLRl2HJEmTwfAqSZIkSeo8w6skSZIkqfMMr5IkTYIkc5Ncn2RNkieTfDPJ63uOX5ZkdZJfT3JbkqfaZb3vHfBaByRZnOTxJE8kWZLkgAHj5iVZlOSRdtzyJL83YNxxSe5qx9ye5MDJ/xuQJGnjMrxKkrSBkrwa+BdgNnAqcAzwY2Bxktf0DN0OuAq4HDgauAU4P8lJPa+1H7AUmAWcBJzQnrc0ydyecfOBJcAWwLuA+cAXgT36yns98MfAh4B3ADOAryTZfgN/bUmShipVNeoaJEnapCVZAuwKzK2qZ9p9M4A7gbur6ugklwEnAsdX1ZU95y4CfhF4aVVVkmuAN7Y/P9yO2Q5YBdxSVW9LEmAl8CBwQFWtXUddq4CXAHtV1Zp23/7AbcA7q+qKSf2LkCRpI3LmVZKkDZBka2AecDWwNsnMJDOBAIuBg3qGPw9c2/cSVwK7A3Panw8CvjIWXAGq6lHg+vbPAXglzQzrF9YVXHssGwuurW+3293H/+0kSeoOw6skSRtmNs1S3A8Bz/Z9vQeYlWTs/XZNVT3bd/7/ttux8DobuG/An3M/zVJigB3a7eoJ1PdQ7w9V9XT77VYTOFeSpM6YOeoCJEnaxD0MrAUuBP5+0ICqWtus9GVWks37AuzPtdsfttuHgF0GvMwuwNgM6oPtds6AcZIkTUmGV0mSNkBVPZHkG8Bc4I5xlvHOoLmZ05U9+44Dvs8L4XUpcESSbavqMYAk2wJvpbnBE8B/0VwDe0qSS8obWEiSpgHDqyRJG+4DwNeBryW5lGbZ747Aq4EZVfVn7bjHgE8l2RH4LnA8zc2ZTuoJoB8FjgSWJPkkUMDpwDbAWQDtjZ3eD1wH3JTkIuAB4JeBnavqwxv595Ukaei85lWSpA1UVXcAv0LzeJzzgRuBzwH70oTaMY/SzLSeCCwEDgHeV1WX97zWfwAHt2MvB/4BeByYV1XLe8YtBN7U/ngpzQ2dfp9mRlaSpCnHR+VIkjQE7aNy3lhVu426FkmSNkXOvEqSJEmSOs/wKkmSJEnqPJcNS5IkSZI6z5lXSZIkSVLnGV4lSZIkSZ1neJUkSZIkdZ7hVZIkSZLUeYZXSZIkSVLn/R8A1H9HsTQoBwAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA74AAALYCAYAAABbpC7yAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAADIbUlEQVR4nOzdd3gU1dcH8O8Noffei0gRCNIEUUgEEQEh2Ht7bWDvvff+UxAVwa7YRYUIqKjgRnpRIXQIBCH0BAKB9Pv+cXbMZrNlZne2Jd/P8+TZZHd29mZ3s5kz59xzldYaRERERERERBVVXKQHQERERERERBRKDHyJiIiIiIioQmPgS0RERERERBUaA18iIiIiIiKq0Bj4EhERERERUYXGwJeIiIiIiIgqNAa+REQUMkqpj5RSWinVIdJjMSilhjjH9KSH265WSq1SSuU6t7nTeb1WSs0P81A9UkrNV0pxLUIiIiILGPgSEZElSqmTnQHtFqXUUefXBqXUVKXUwEiPL1BKqcEAPgJQHcAkAE8BWByBcTzpDLSHhPuxg6GUildK7XaOfUqkx0NEROQqPtIDICKi2KCUqgLgNQC3AygA8BuA7wGUAOgM4FIANyilrtJafxqxgfq3FEA3APvdrh/lvLxaa+0e8HYDcDTUAzPpKgC1Ij0ID0YDaA5AA7hEKXWn1vpYhMdEREQEgIEvERGZ9zwk6F0B4AKt9TbXG5VS9QE8CKBB2Edmgdb6KID1Hm5q6bzc7eE+nraPCK319kiPwYtrISdBJgK4C8AFAKL5BAgREVUiLHUmIiK/lFKdAdwLyZKOcg96AUBrfUhr/RCAqX72VV8p9aBSKtVZGluglNqulJqilGrhYfsGSqnnlFLrnXNvDymlNiql3lNKNXbZroZS6n6l1Gql1GGl1BGlVLpSappS6niX7crM8TV+BnCNc5Otztu1y308zvFVSrVUSk1USm1WSuUppfYppf5QSv2fyzbVlFK3K6XmKqV2On/fXUqpz53Pq+v+5gN4wvnjPGMcSqltrtt4muOrlGqmlHpTKZXh8hgfK6WO87DtNudXHef4M5VS+c75zRe4b++PUqo5gLMAzAfwMoBiSCDsbXu/z5vLthcopX5TSmUrpY457zNFKdXO/ffx8ljlni/XcnKl1LVKqb+c+/7BeXsrpdTTSqmlzrHlOx/3VaVU3UB+J6XUUOdjvuXl/qc6b/f590NERIFhxpeIiMz4P8jJ0ila632+NtRa5/vZVzcATwL4HcC3AI4B6AVgHIAzlVJ9tdbZAKCUUgB+AXCS83ImgKoAjgNwCYAJAA449/spJMu4AMC7kOxje0gJ85cAtngZzzbIfN5znOOYCOCgn98BSqluAOZBynvnOX+XegD6ArgDMl8YABpBSsQdAFIAHALQFcBFAEYqpfpprbc6tzXucxqAj51jg7/xKKWaAVgCoAOAuQA+dz7GlQBGK6UGe8haV4U8pw0BTIeUT18C4Gul1Eit9S/+ngMXV0GOKT7VWu9WSs0FMEIp1VFrne42VrPPG5RSbwC4DcBeAN8AyIK89hcCmAMg2Oz3AwASIa/LzwAOO69PgmStfwOwEFK+fTKAewAkKaUGaa0LLf5O8wFsAnCZUuoerXWe21iuc16+F+TvREREnmit+cUvfvGLX/zy+QU5mNcATrd4v4+c9+vgcl19AI08bHuFc9tHXa470Xnd6x62rw2gpss+SwB872G7agDquvw8xLnPJ/2N1eU2DWC+23UrnNdf6WH71i7fVwfQysM2pwEoAvCe2/VPOvc7xMtzOl/+fXsc++Nu11/rvH6e2/XbnNf/AKCay/XDnNf/ZPF1XguZA13X+fNlzv0842Fbs8/bWOd2KwDUd9uuput7yPn7bLPwfBnPcQ6AHh7u0xRAbQ/XP+q83xUB/k73O7e7zG2bOpCge5WV551f/OIXv/hl/oulzkREZIZRgrwz2B1pKYnO8nDTZ5BA5AwPt5VrkqS1ztWlzZM0AOVluwKt9WH364OhlDoZks37TXto5KW13unyfb7WOtPDNn9AAkZPv6+VsVQHcDEkK/qy280fAkgDMMS1NNjFXVrrApcx/QYgA0B/C49/CiSLP8Plef4BwBEAVyul4ly2Nf28AbjJeXmH1vqQ23bHvLyHrJqqtV7jYRz7tNa5HrZ/23n532tm8Xf6CEAhypeBXwwJft+3NHoiIjKNgS8REYWdUmqYUipFKbVHKVXknINZAikPbemy6VpI4PagUmq2UupmpVSCswT6P1rrHAA/AbhUKeVQSt2tlOqnpBN1KBiBoalyYOdYvlJK7XDOvzXmEPdE2d83EF0B1ACwRLuVz2qtNYA/nD/2crvfQV1aYu1qB6w1KDOCuP+CPi0NxKYDaAtguMu2Vp63/gCOaq3/tDAWq5Z7u0EpdaFS6lel1H6lVLHz9TLK6l1fM9O/k9Z6L+SkwOluc6+vA5APNgMjIgoZzvElIiIzdgM4AUBrABuC2ZFS6mIAX0BKO3+ClKkamdo7IaXBAACtdZFSaihkDu75KF1yaKdS6hWt9USXXV8AKUW9FMD/nNcdUEq9DSm5LYR96jsvy2Vy3SlZH/g3SGD/M4DNAHIhWer/g8xDDkY95+UeL7fvdtvOcMh9Q6cimDwxrpSqhdJss3vg9ymAqyGB8c/O60w/b85tM8yMIwgenzOl1H2Q7PleyFzinQCMkwpPwOU9Cmu/EyDN3y6ENFN73Dk/+BQAX9mUxSYiIg8Y+BIRkRkLIXNjT4c0pQrG45Agop/WerNxpTOLe7/7xlrr/QBuUUrdCqAHpMz0dgATlFI5WusPndvlAngIwENKqU7Osd4C4DFIkPmE+76DcNB52crEtg9B5hkP0lovdL3BeRIgWDnOy+Zebm/utp2dLgRQ1/lV6JaIN5ytlGrkDOoOOq8z87wdhPlseAnkOfakvpfrAXlflKGUioecQMkE0Mv5/jNua47y76ODzkszvxMgJ0G2QMrAnwSbWhERhQVLnYmIyIyPIMHFOKVUE18bOuec+nI8gLWuQa9TH0jTIo+0SNNaTwBwnvPqsV623ay1ngpgqHPcHrcLwjLn5Zkmtj0ewAEPQW9z523uip2XZsu0N0BOJAzw8twnOS//Mbk/K4wy5y8h81PdvxZBsqOXO7ez8rwtA1DLmTH3JxtAM2fQ+h+lVG0AnT3fxasmkOz4Iteg12mQl3EC5n4no/z8PQDtIEtAXQlgKyQgJiKiEGHgS0REfmmtNwF4FdLtdpZSqlx5rlKqrlLqWciyRL5sB9DZuQSPcd96kGWE3PfZwdNjoTSLmefcrqlSqoeH7ZpB/te5Lx0TFK31UgArAQxTSl3hfrtSqrXLj9sBNHKWtBq3VwPwJmRJIXdGuWtbk2PJB/AV5Dm5x20cV0M6Y8/XWge79E8ZStZGTgSw
UWt9qdb6evcvANc7N7/WOVYrz9tk5+VEpVR9t+1qKKUauVy1DPJcXu6yjQLwAqT7txV7Ie+Xvkqp/07EKKVaAnjefWOLv5PhQ0iTqymQ9+iHzoCYiIhChKXORERk1sOQjOxtADYqpX6FNJ8qgWQuh0MyZVf62c+bkCB3pVJqOiQjOApSWuo+T7I3gOlKqSUA1kHmZLaDrLlbAGCSc7vWAP5SSv0FYLVzP82c22nIOrp2uwKyVM6nSqlrACyFlPz2hgRbfZzbvQl5bhYopb6CzKE9AxKo/YPyTaeMpaOedwbzhyCNqN70MZb7IcsjPaeUSoIEYl0BnAtpyHSTj/sG6lpIJ+0PvW2gtV6rlFoKyUb30Vr/BZPPm9Y6RSk1CaXvtx8gJwXaARgJKRH+wflQb0LmzL6nlBoOYB8kKG8Az8+xV1rrEqXUZMg6vn8ppWZB1mIeA2kU1tXD3cy+F4zH2KOUSoFULpTAx3NIRET2YMaXiIhM0VoXa61vhzTi+QLS7OoWAHdAAotvAZystZ7mZ1eTANwKmXM6DsBoAN9DSkXdG1AtB/AK5P9VMoC7IeWmMwD0dykf3gZZm/Wwcz93Oy//BJCktf4mkN/ZF631OsgyNm9DAv97AFwECWxfd9luJqQBVAak2dOFAJY4f4+DHva7FhLE7YcEfc8AuNfPWPYCOBnAWwC6O7cfBGAa5HlaH/Av6oFziaKrIEGbv07ERlBnZH1NPW/ObW+HNCvb4Ly8A8AAAN9A1s41tkuDBMMrIE3OroSclDkVHp5jEx6EzOWNh7zHkyDP7WWeNrbyO7n42Hn5s9Z6RwBjJCIiCxQra4iIiIjCSyn1GICnAZyvtf4u0uMhIqroGPgSERERhZFzGahNzh/ba62LIjkeIqLKgHN8iYiIiMLA2aH6NMic9lYAbmXQS0QUHgx8iYiIiMLjDMjc4b0AXkRp52oiIgoxljoTERERERFRhVapMr5NmjTRHTp0iNjj5+bmonZtq8sJEoUG348UTfh+pGjC9yNFE74fKZrEwvtxxYoV+7XWTd2vr1SBb4cOHbB8+fKIPf78+fMxZMiQiD0+kSu+Hyma8P1I0YTvR4omfD9SNImF96NSKsPT9VzHl4iIiIiIiCo0Br5ERERERERUoTHwJSIiIiIiogqNgS8RERERERFVaAx8iYiIiIiIqEJj4EtEREREREQVGgNfIiIiIiIiqtAY+BIREREREVGFxsCXiIiIiIiIKjQGvkRERERERFShMfAlIiIiIiKiCo2BLxEREREREVVoDHyJiIiIiIioQmPgS0RERERERBUaA18iIiIiIiKq0Bj4EhERERERUYXGwJeIiIiIiIgqNAa+REREREREVKEx8CUiIiIiIqIKjYEvERERERERVWgMfImIiIiIiKhCY+BLREREREREFRoDXyIiIiIiIqrQGPgSERERERFRhcbAl4iIiIiIiCo0Br5ERERERERUoTHwJSIiIiIiogqNgS8REREREVFlpjXw6qvAxo2RHknIMPAlIiIiIiKqzLZsAe67D3jppUiPJGQY+BIREREREVVmDodczpoFlJREdiwhwsCXiIiIiIioMjMC3z17gGXLIjuWEGHgS0REREREVJmlpgJDhgBVqgApKZEeTUgw8CUiIiIiIqqsduwA0tOBs88GBg8GZs6M9IhCgoEvERERERFRZZWaKpdJSUByMrB6NZCREdkxhQADXyIiIqLK5O+/gXPOAfLyIj0SouCVlABXX106R5WsS00F6tYFevWSwBeokOXODHyJiIiIKpPnngNmzADWr4/0SIiCt3Yt8MknFbY8NywcDmDQIJnf26WLfDHwJSIiIqKYtWcP8MMP8v2OHREdCpEtjExvZmZkxxGr9u8H1qyRMmdDcjIwfz5w+HDEhhUKDHyJiIiIKosPPwSKiuR7Br5UETDwDc6ff8plYmLpdcnJQEEB8MsvkRlTiDDwJSIiIqoMSkqAd9+Vrq1VqjDwpdindWljpl27IjuWWJWaClSvDvTvX3rdoEFAw4YVrtyZgS8RERFRZfDrr7Jkyc03A61aMfCl2JeeLpneevWY8Q2UwwEMHCjBryE+HjjrLGDWLKC4OHJjsxkDXyIiIqLKYMoUoEkT4LzzgDZtGPhS7DPKnM89FzhypMLNSQ25w4eBlSvLljkbkpNl/u+SJeEfV4gw8CUiIqLoMGmSlOGS/Xbtkk7O//d/ktlh4BsbLrsMx0+eHOlRRK/UVDmZc/rp8jPLna1ZtEimQLg2tjKMHCmZ3wpU7szAl4iIiKLDV18BCxZwfdlQ+PBDKVm84Qb52Qh8tY7suMi7Y8eA6dPRYOXKSI8kejkccrKsdWv5meXO1jgcMt//lFPK31a/vgTEFWiZKAa+REREFHnHjgFLl8r3O3dGdiwVjdHUauhQWZ8TkMA3Nxc4dCiyYyPvli4FCgpQc9cunqDwZOdOYMsWCc5atZLrGPha43AAffsCdep4vj05WdZJTk8P77hChIEvERERRd6SJUBhoXzPElx7/fILsG0bMH586XVt2sgln+vo5Zy/Gp+bCxw4EOHBRCGjm3NSEtCypXzPUmfz8vLk5IqnMmdDcrJcVpByZwa+REREFHlGkxqAwZjdpkwBmjaVBkAGBr7Rz/VvooJk3GyVmiqZyl69pCy3Zk1mfK1YtgzIz/cd+B5/PNCtGwNfIiIiItukpspBFsBgzE6ZmXLQes01QLVqpdcz8I1uhYXSeGjoUPmZgW95DoesNxsfDygl5c4MfM0zTqwMGuR7u+Rk4I8/KsS0CAa+REQUnN27Of+MglNYCCxcCIwaBTRowGBs/36goMCefb3/ftmmVoaWLSVYqOzPdbT66y+Zg33VVfIzA9+yDhwA0tLKZisZ+FrjcAAJCUDjxr63GzsWKCoCfv45POMKIQa+REQUuB07gHbtKlTXR4qAlSuBo0flILayL7NTUgJ07w688krw+youBt57DzjjDKBTp7K3Va0KtGhRuZ/raGZk40aMQH6jRtLEiUotWCCXroFvy5ac42tWUZGcbPRV5mwYOFCWjKoA5c4MfImIKHArV0q2bt26SI+EYpnRpCYxkYHvnj3Avn3A4sXB7+vnn4Ht24Fx4zzfXtmf62iWmgp07gy0bIm8Vq2Y8XXncMh61P37l15nZHxZgeTf338DR47IZ64/VaoAZ50FzJ4tAXMMY+BLRESBS0uTS5aXUTAcDjnIb9GCwVhGhlwaf1vBmDIFaNYMOPtsz7dX9uc6WpWUSODrDEqOtWzJwNedwwGcfLIEv4ZWraQ8/PDhyI0rVhgVBWYCX0Dm+WZlybzzGMbAl4iIAsfAl4JVUgL8+WdpyV2bNpL1tGuOa6wxAt9t24I7gN+xA/jxR+Daa8s2tXLFwDc6rVkDZGf/9zeR16oV8O+/lfdvwt2RI1Jt5F6myyWNzDOaCbZubW77M8+U6RExXu7MwJeIiAJnBL480KBAuR3ko00bKVWsrO8pI/AFgLVrA9/P++/LSQX3plau2rSRTq3MkEUX1/Vp4cz4ai0nQ0iyjsXF5bOVrVrJJU/
E+mZUFJiZ32uoVw8YMiTm+3kw8CUiosAUFgLr18v3PNCgQLmX3BnL7Pz7b2TGE2kZGdJtGQi83NloanXmmUDHjt63M57rnTsDexwKDYdDXpsOHQAAx4yAjuXOwuGQeaennFL2ega+5qxbJ12xzZY5G5KTgQ0bUDOGq0QY+BIRUWA2bpTgt3VrNhSpDLQGZs0C8vPt3a/bQX6lX192+3ZZYqR2bWD16sD2MWeOPH/emloZKvtzHY20lr+JxMT/ToDkxXrgm5kpyzPZJTUV6NsXqFu37PWhLHXOzQXmzrV/v5FgnGy0kvEFJPAF0HjhQpsHFD4MfImIKDBGNmr4cJl7lp0d2fFQaH38MTBmjFzaRevSkjsjy1nZg7GMDDkJ0KNH4Bnf6dNlbc6xY31vV9mf62iUni6Bm0tQUtCoEVCjRuwuafTYY8CIEfacHM3Pl47nnrKV9eoBtWqFJuP79NNSQRGrr4Gr1FTJjvuqBvGkQwegZ080XLkyJMMKBwa+REQUmLQ0KTcbOlR+ZnlZxZWdDdx/v3z/xx/27XfLFjnIdz2IrVcPqFOn8gZjGRlA+/aS9Q008F2+XDreVq3qezsjk1hZn+to5Ckbp5QEKbGa8d26VZbo2r07+H0tWybBr6dspVKlSxrZKT8f+OAD+d54fWKVh4oCS2bMQNqzz9o/rjBh4EtERIFJS5MlaIyzxgx8K67HHpM5Yb17y0GTXWXt3g7yK2u34YMHgZwcoF07CXyNNX2tyM2VplgnneR/2xo1gKZNK+dzHa0cDsnWd+tW9vrjj4/dwNd4f9mxRJfxmTF4sOfbQxH4fv89sH+/fDYZjcdi1datMqffapmz4bjjoOPj7R1TGDHwJSKiwKSlycE5l5Co2FauBCZPBm6+Gbj+ejmIde08HIzUVKBJk/IH+ZU18DWeVyPjC0jXayv++Ue6tvbrZ277yvpcRytv2Tgj4xtrvRS0tjfwTU2VaQCNG3u+vWVL+/8XTZkCHHeczHGN9YyvW8fwyoaBLxERWXf0qJSpuga+zPhWPCUlEvA2aQI880zpwZJdB38Oh2Ru3A/yK2sw5inwtRosLF8ul1YC38raQTva7Nwpwa2noKRjR1m/1moFQKRlZwPHjsn3wQa+RUXAggW+gzYj42vXCYKNG4H582VZsCFD5P9eLP+vcziAhg2B7t0jPZKIYOBLRETWrVsnBxYJCdJMpEGD2D4YIM8+/BBYsgR45RV5jXv0kIMmO8r9duzwfpDfpo1kbYqKgn+cWLJ9u1y2bw+0aAE0amQ9WFixQu5rzN/1p7KeZIhGvrJxxpSSWCt3Nt5bSgUf+P7zj6w57S/wPXpUpgzYYepUID4euOaa0l4EsVzubFQUxFXOELBy/tZERBQc4wCmZ0+5DMW8KoqsrCzggQckI3vllXJdXJz8bEfG19dBfps2km22oxlOLMnIAKpXB5o1k0AhkAZXK1ZIttds45q2beW1PnrU+niD8fffwGefhfcx7bZzJ/D887Jush0cDmns1qtX+duOP14uYzXw7d9fyvZLSgLfl/GZ4Wv9WTun3uTlAR99BJxzjpxM6t1bXp9YLXfetQvYvLnSljkDDHyJiCgQaWlygG4cjIViXhVF1iOPSLOlt94qG0QlJUn5X7BBaWqq94P8tm3lsrJlIjMypLGVkY0xAl+zZZu5uVKNYaaxlcFY0mjnTmtjDdarr0r5aKzNWXX1zDPydzJ7tj37S00FBg2SDKM7Y53rWA18R46U92cw/QEcDsl8t27tfRuj0sGOE7HffSdN/Yz1sOPj5fWJ1cDXmAZxyimRHUcEMfAlIiLr0tJkjlCVKvIzM74Vy7Jl0tDl1luBE08se5uRLfjzz+Aew+HwfpBfWdeXNZYyMiQkAIcOmQ9K//7bWmMrIHLPdXq6zP3cvz+8j2uXw4dLM9ZTpgS/vwMH5HPVWzauZk35nI21dWR37JATOcOHy8+Blju7rvnti52B75QpEmgPG1Z6XWKi/A5ZWcHvP9yM597oH1AJMfAlIiLrjI7OhlatJOMby9kbEsXF0tCqeXPgqafK396nj8zrDibrceCAlD16O4itzIFvu3alP1ttcGW1sRUQ2cAXsK9DeLh98YU0mzrzTGDOnNL52YEyTiT5KuONxbV8d+yQiiDjBFqgge/69XKSxF/ga1ep8/r18hk3blzZ+bB2nfiLhLQ0+XypVy/SI4kYBr5ERGTNwYNyMOMa+LZsCRQUxOZZcCrr/fclgHr1VaB+/fK3V60KnHpqcIGvv4P8hg0lw1WZAt+8PFm31zXj26OHXJoNFlaskL9Fs42tgNKy0XA+10eOyO8KxG7gO2WK9DiYMkVO+L33XnD7S02V6SP9+3vfJhbX8t2xQ06u1Ksn7+1AA1/j88bXiQEAqFsXqF07+Izv1KnyWXfNNWWv799fXqdYLHdevbpSZ3sBBr5ERGSVsa6oe8YXYLlzrNu/H3joIclqXHaZ9+2SkoBVq+QkSCAcDt8H+UpVvm7Drh2dDY0ayd+WlYyvlWwvINn7Ro3C+1xv3Vr6fSwGvitWyPrW48fL3NuRI+WEUTBdyB0O4OSTgRo1vG/TsaOUveflBf444fbvv6VVBYE0azM4HHJSx+gr4Y1SwU+9ycsDPv4YOPdcaTTnqkYNYMCA2At8Cwsli83Al4iIyAJP84QY+FYMDz0kc0rdG1q5S0yULNeCBYE9jpmDfAa+wmywcOSIHNhaaWxlCPdz7Zq1jMXAd8oUqUi44gr5efx4+eybNSuw/R05IoG0vzLejh3l727btsAeJ9y0Lh/4rlsnQZjV/Tgc8vyY6VYebOD77bdSvTR+vOfbk5Lk9TpyJPDHCLdNm+R5Z+BLRERkwerVUk5mdN4F7F1CgiJj5Uop17zjDv8HRyefLGWAgWQ9Dh8G/vrL/0F+ZQt8jQDQU+C7dq3/JXP+/lsCBKsZXyBygW/LloHPjT18GLjgguDn1lqVkwN8/jlwySWlUwFGj5ZgK9AmV4sWyevrr4w3kCWN/v4buPhiaSQWbjk50snZNfAtLJQgzIqMDHl/+nt+DMGuMjBlCtCpEzBkiOfbk5Lk9Vq0KPDH8OT552X5pFBgYysADHyJiMgqo7GV65l3I/Blxjd2ffYZUK0a8Pjj/retWVPK/Yx1Na0we5Dfpo2UdQaz7mcsyciQJjpGkGBISJCgxbU82JNAGlsZwh34btkicz779Ak847t4MTB9OvDjj/aOzZ/PP5dgzjUbGB8PXHcd8NNPgWVjHQ7pkO9vmZmOHeXSSuD78cfA11/LV7gZ7ynXwBewXu5sdn6vwcj4BtJscc0a6UHg3tTK1SmnyOtlZ7mz1tJX4dVX7dunq7Q0+X26dQvN/mMEA18iIjJP6/IdnQEJhBo2ZOAbq7QGZs4Ehg713NDKk8REWfbo6FFrj2X2IL9NG5kzuXevtf3HqowMOWCvWrXs9T17yqW/YGHFCrm/cRLKijZt5HnOz7d+30Ckp0v2sn37wANfI/gLdM5oILSWbGCvXnLix9X118vJwECaXDkcQN++UknjS7NmMifbSuBrBG
d2LLlklXvge8IJEnwFEvg2bGg+W9mqlZwsOnTI2uMAwLvvygnA//s/79vUrSsnbQI58efNrl1AdrYE3gcO2LdfQ1oa0Lmz7+kllQADXyIiMm/PHvmn7OkAhGv5xq4NG4DNm4HkZPP3SUqSwHTxYmuPlZpq7iC/si1p5L6UkaFbNwmoVq/2ff9AGlsZjOc6XH+/6emSvWzfXuZSBjJXMhKB77JlUjo8fnz5uabt2gGjRkmTKytzWPPzgSVLzGUzlZLnzexavjk5Mt5WraTSwt97yG7ugW+NGhJ8WX3NUlOBwYO9Z2DdBTr15tgxyZCfdx7QtKnvbZOS5LPPrpNFrs9JKJZK8nTCuhJi4EtEROb5micU7LwqipyUFLkcM8b8fU49VQ5ErWQ98vLMH+RXxsDXfX4vIEuzdOzoO1g4fFhOXgTS2AoI73NdXCxl20bgCwSW9XUNfMO1fvjUqZJxvfxyz7ePHw/s3l3692TGsmUSPPmb826wsqTRwoUyVeC116SLerizvjt2lHZZNljt7Lx7N7Bxo/nnBwi82eI330inem9NrVwlJcnrtmyZtcfwxnhO4uPt7xh97Jic2GTgy8CXiIgs8BX4MuMbu1JSpHzTU+DlTf36QO/e1g7SrBzkV6bAt7hYfk9vz7+/YCGYxlZAeJ/rzExZ89sodQYCC3yNrGd2dnhOuB06BHzxBXDppTI/2ZNRo+S5nDrV/H6Nv5/Bg81t37GjBL5mgn2HQwKpMWOkEdinn8r85HDZsQNo3lxKhw0JCRKEmW22ZZxYC0fgO2UK0KULcNpp/rc1Xi+7yp3T0uS5GjTI/sB33Tp5vzDwZeBLREQWpKVJCZj72oaAHGzs2hW+7AvZ48ABWZbISpmzITFRSigLCsxtbxwkmjnIb9JEDpgrQ+C7a5eUjfsKfDdu9F5WGUxjKyC8ga+RrXTN+FrtzKy1BL6BNksKxGefyXx2X9nA+HiZ6/vLL/6bkRkcDvk9Gjc2t33HjjKOPXv8b2tMK6hdW8adkxPeJlc7dnhu1qa1BGNmpKZKlr1PH/OPG0ipc1qaZMjHjTO3ZFLjxkCPHvYFqUYpcmKidL0/fNie/Rr7Bhj4goEvEVH0ysmRZiALF0Z6JKXS0kqb7bhr2VLmtoWiMQeFzpw5Ug4ZSOCblCSZmxUr/G+rtaxz2qOHuYP8uDigdevKEfh6W8rIkJAggfHGjZ5vX7FCnqsWLQJ7/Lp1JYsZ7sC3RQtp5mU145udLZ+PY8fKz6EOfI2mVn36+C8nv+46CZzefdf/fpculcDObLdiwHxn52PHZP9GpnTwYJkvHs5yZ2+BL2D+NXM4ZFqFe9M3X+rWBerUsZbx/fBDOdF29dXm75OUJCcNi4rM38eTkhJpapWQEJqlktLS5Hfr1Mm+fcYoBr5ERNFq40aZt2f3WoGBcv3n7Emg5WUUWSkpEoAEMj/UOGA3U+43fbqcxLnxRvP7ryxr+ZoJfAHvwUIwja0M4Xqut2yRrt7t2snJjbZtrQe+RpnzgAHy3g1106YlS4BVqzw3tXLXpo2s6/vBB76bXH3+uQQ5zZsD99xjfixm1/JdulQqMYzAVynJZi5ZAvzzj/nHC4anwLdTJ5lvbCbwzc6W591KmbPB6tSbBQuk03yTJubvk5gomdlgn89t2ySLn5AQmqWS0tLkpEd8vH37jFEMfImIopVRpmWmpC0ctm+X7qsMfCuOggJZe3T0aPMdU101bSpVCf4O0o4cAe68U+YEM/Atzwj8PHV1BmTeYXy852Dh8GE5SRZoYytDuJ7r9HT5PY0MXiBLGrlmja02SwrE1KlSLnzZZea2Hz9ePrdnzCh/W0kJ8Oij0iDr5JMlQDWCWTPat5cg1l/gm5oq27lOK7jqqvA1uTpyRBpFuQe+8fEShJl5zRYskGy7lYy4wUrgW1go8+StnjyycuLPF+PETUKCZKr79rU/8GWZMwAGvkRE0cv4p717d2THYfA3T4iBb+xJTZWS0UDKnA1JSbL8RnGx922eeQbYuRN46y1rWQcjGKvo88YzMoBGjeSg15Nq1YCuXT0HC3/9FVxjK0M4A1+jXBcILvA97jj5PFqzRgLKUDh4EPjySwl6/S3BZRg5UoJ79wDzyBFpMvXcczIXeO5caxlGQJYEat3a/5JGxtzhhg1Lr2vUCLjoImDatMCWkLJi5065dA98AfMnK1JT5QTJySdbf3wrqwysXSvz562ePGrTRt7LwQapxnPRo4dcJiXJCZG8vOD2C0hTtn//ZeDrxMCXiChaRWvga/xzdmfML+SSRrEjJUUyQGecEfg+kpLk4Mrbgey6dbKcyjXXyFw9K9q0kQPS/fsDH18s2L7df0dtb8FCsI2tDG3ayN+ulTVoA+Ep8DU6PVvZR/PmcqIgIUHms5ptJmXVtGmyfzNL3BiqVJHA9tdfpYMxIMH94MGSBZ4wQbLIrt2OrTA6O3tTWCjTCjyVCI8fL1UCX30V2GOb5b6Gr6uEBAnGDh3yvQ+HQ8rZa9a0/vhGxtfMSTOjR0Egf0OJiRKgB3NyLi0N6NCh9MSKnUslrVkjlwx8ATDwJSKKXkYAGU2Bb7t23pfyqFFDMgrM+MYGrYGZM4Fhw6SMM1BGuZ+nrIfWwC23SIDy0kvW911ZljTytoavq4QECXbcl6NZsUKep+bNgxtDmzbyeoXy8+bwYWDfvvKBr9bWXuMtW0r3YTTbC0W5s9HUql8/60HRtddKAPzeexKEDhggwfmsWcAdd5jrHOyNv7V8//pL3ieeAt9TT5WTl6Eud/YX+AKlQZknublyUieQMmdAAt+8PMnY+7N8ufxfC6T5U1KSnJhbv976fQ3upciDBsmlHeXO7OhcBgNfIqJoZQSQ0TLH18w8Ia7lGzvWrpUD8WDKnAE5GdK+veeDtK++AubNA55/XuYDW1UZAl+tzQe+gLxuruxobAWE57k2gjXXOa3GvGYrSxq5Zo27d5fLUAS+ixbJfq1kew2tW8v6uZMnA0OHSmC1eLGUQQerY0f5nPW2Fq4x59RT0Gg0uVq2TALkUDHeR61bl7/NTGfnJUukW3Igja0Aa0sarVgh82oD6XNgjC/QILWgQIJm1/+tjRvLz3asEZyWJicevfUPqGQY+BIRRSsjgNy3L/jlEoJVWCglq/4CXyvzqiiyUlLkcsyY4PeVlFS+3O/wYeDuu+WActy4wPZbGQLfrCzJbpkNfF2DhZwcexpbAeENfN0zvoD5eb4FBVIma+yjTh2Z6xuKwPfTT6Ua4tJLA7v/zTfLazR4sARy3brZMy7jd/dW3u1wSPbSCP7cXXmlVOiEMuu7Y4fMX65Ro/xt7drJ6+arG7fDIYGo1ekRBrM9JwoLpStzoCePjj9epvkEGvhu2iT/393/t9q1VFJammT4AwnqKyA+C0RE0WrXLimV01qC30javFkOOJnxrThSUiQo9VSKaFViolQmbNpUet1TT8l7+O235X0ciObN5b4VOfD1t5SR4bjjZK6ja4BnZOxiLePrGvi2bSuXZgPfjAxpZOWaNQ5VZ+f584EhQ7w3HfPnzDOBlSulc
3qjRvaNy9daviUlchLKV6a0YUPg4ouBzz6TE1Sh4GkpI4NS/l8zhwPo1QuoXz+wxzcb+K5ZE1hjK4NS8lw7HIHN8/VWipyYKA3I/v47sHEBMp7Vq1nm7CLsga9Sqq1S6lul1CGlVI5S6jullKn8u1KqnVLqY6XUdqXUMaXURqXUs0qpICYnERFFoaIiCSSMMr5Iz/M1O0+oVSsJdkLVYZXssW+flHEGW+ZscC/3S0uTBj7XXx9YR1ZDlSrynqoMga+/UsQqVeTzwDVYsKuxFQA0aADUqhXa53rLFgm6XDsNV68umUmzga+n4DkhQcpFrTTI8mfvXtlnoHNMDX36lC7dZBdfa/muXSvr3/ob9/jxElh98YW9YzP4CnwBec1Wr/YcLBYUSFl4oGXOgPlS52AaWxmSkuT3tdqdHJC/5ypVpGu7KzuWStq7V+YfG/PgKbyBr1KqFoDfAZwA4GoAVwLoDGCev+DVefuvAJIAPAbgLADvAbgHwAchHDYRUfjt2SMHBH36lP4cSWlpUip1wgm+t2vZUoL2AwfCMy4KzOzZ8v6yK/Dt0gVo1qy03PmWWyRT88ILwe+7oq/lazbjC5TPkq1YIRnTZs2CH4dS8lz/+2/w+/LGvaOzwcqSRt4C36KishUHwTICjmCCr1Bp0kSy0J4CX+Pkk79xDxwoAdHUqfaPDzAX+B44IMGZuxUrZP5yMM99nTrSJdlfxnf5cvmssrKWsrtg5vmmpQGdO5cvCW/dWsYUTIMrNrYqJ9wZ3xsAdARwjtb6B631DABjAbQH4K9zwCBIkDxea/2x1nqe1vplABMBnO8MqomIKgbjn3XfvnIZDRnfTp38LyvBtXxjQ0qKvFbG+ytYSkmGwuEAPv9cLl94wfoapZ5U9MB3+3b5uzLzXCUkyN9WVpb8bFdjK0Oon2u7Al8jS2ww0yzJqtRUeV3sfH7topQ8j57W8nU4JGg67jj/+xg3ToJMI+tpl7w8yTT6C3wBz6+ZEewNHhzcOMxMvQmmsZWhRw+pYgg08PUWmBpLJQVaQcXAt5xwB75jASzWWm82rtBabwWwAMDZfu5rLHaW43b9QcjvEURfeCKiKGOUZxkZ32gIfM3882TgG/3y84Gff5amVsEsqeIuKQnYtg247Tagf3/guuvs2W/bthKMBbNOZjQzOjqbeS2MksU1a2QN1E2b7GlsZQhl4FtcLO8PT4Fvu3aSaTZzgG8sZeQaqHTtKuWidga+DgdwyimBr7Ubap6WNNJaAqXERHPvpyuukODe7iZXO3fKZTCB7wknBF/J4C/wLSgIrrGVIS5OgnSrZclHj8r72dv/1qQkyYoHulRSWpqcULOjIqSCiA/z4/UAMMPD9WsAXOjnvr8C2ATgJaXUTQC2AxgA4A4A72itc33dmYgophj/rDt1kpKtSJY6Hzsmza0uucT/toEEvunpqGkcKIXKzJm+y6+VAkaNCn4t1Fjwxx8yt8+uMmeDUe538GBwDa3ctWkjB4gHD5adGxqo1FR5P/tQMz6Iw6P8fFm3dehQc9ubWcrIYBwgr14tgSRgf8Y3M1P2bdfrZ9ixQzroeiopbd9enre9e6VDri+essbVq0u5vV2B76FD0lTo8cft2V8odOwIzJkjwa4R5Kany+tntkS4QQP5XP/8c+DVV72v0W6VrzV8Dc2byxJn7q9ZcbF0M7744uDH0bKl9DLwZs0aCX7tOHmUlCSVNLt3+38PG9atk9fPV+ALyIkAo9+HFcYJaztPcMa4cAe+jQBke7g+C4DP/2Za6zyl1GAA0yGBsuE9ALd6u59SahyAcQDQvHlzzJ8/3+KQ7XPkyJGIPj6RK74fo1uHhQvRPi4OjnXr0L9+fRxetQrrIvR61dm0CSeVlGBNXBz2+RmDKijAaQC2LlyIDJNzpnrffju65Odjvqf1Hm1QZ/NmnHTDDX63yxw9GhvvvTckY4gmnSdPRovq1bGgalWU2PmeKi7GwKZNsX/wYGw+ckQ64tqg6aFD6AFg2fffI9dTttCC+CNHcOq55yLOzxIhXbp2xXyj27BFrX74AV0mTsTyKVNwpEsXv9ufunkz9rdsiY1mni+tMahOHez9+WfkrV6N4wEsyMtDoU3PdaujR9GluBgLv/8eBXaUqbto8Ndf6A3g75wcHHQbb+OcHPQEsOK773DY1wG+1hi8aRN2d+yIzW776N6sGeouXYolNjwXjZYswYla4+969cqNNRI8/b9uVVSELnl5WPjddyho3BgA0GLOHJwAYGmNGjhqctz1+vVD3w8/xLrnn8ceO9YYBtBs7lx0B7Bk504c8zGOXm3aIG7hQvzlsk2dzZtx0qFDWNekCfYE+dx3LC5Gmx074Jg3z2Pw13LWLHQFsKSoyOc4zahbpw76AVj75pvYe8YZpu7T/Kef0A3Aktxcz4+vNU5p3BgHv/kG6/z11/Bw38H//IPdI0eW+1sJVkwfP2qtw/YFoADAix6ufxZAkZ/71gAwD8AGAFdAmlzdCyl9nmzm8fv166cjad68eRF9fCJXfD9Gueuv17plS/l+8GCthwyJ3Fg++URrQOt168xt37ix1jfdZG7bwkKta9TQRdWqaV1UFPgYfZkwQca/dKnW27Z5/hoxQuvOnUPz+NGkpETrdu20Hjs2NPs/elTr4mJ797lwobx+s2cHv68vvpB9/fCD9/fCHXfokrg4rQ8fDuwxLrxQHuPxx/1ve+SIbPvss+b3P3iw1omJWl9yibyWdkpJkfEsWWLvfrXW+r33ZN/p6eVvW7VKbvvqK9/72LtXtpswofxtTz2ltVLynAbrwQe1jo/XOjc3+H3ZwOP/6zlz5LlITS297ppr5PPXyt9gcbH8rzn//KDH+Z8XXpCx+fsbuu02revUkc8lw8SJct+MjODH8dprsq8DBzzfPn681vXrl338QBUXa92smfxdmnXvvVpXry7/B725+GKtW7e2PsZt2+R3f+cda/czIRaOHwEs1x5iwXDP8c2G58yut0ywq+sADAFwltZ6mtbaobV+FdLV+UalVC9bR0pEFEmZmaXNW1q0iOwc37Q0mefWqZO57a2s5bt2LZCXhyoFBZ47lNrB4QA6dJB5p+3be/464wyZL+lv6YtYt3q1NFOyu8zZULNmcE1iPLFzfdmUFJnzNmaM9/fCqFFQJSW+SyS90bq0wU1Kiv/tt2+XS7OlzkBpZ2e7G1sBoV3Ld8sWID6+dN1eV8bv76/BlaeOzoaEBHn+160LbpyAvIYnnSTLO0UrT0saORwy19TK32BcnPw9/PyzlJvbYccOKaP2t/5xQoJMuzD+DgCZitC+vf/lvcwwpt54+1xfsUL+huwoBY6LA0aPlvLzwkJz91m9GujWTf4uvElKkjnT27ZZGw8bW3kU7sB3DWSer7vuANb6uW9PANlaa/cWdkudl92CHBsRUfTIzCz9p928eWTn+Kal+f/n7KplS/MBpGs3UTsb0xiMZi/+5rwZtwezZmIsMIKx0aMjOw4rWrSQg8pgg7GiIjkoHT3a9/zVU0+FjosLrEPrpk3yt9qlC/DXX/7HHGjgm50t
85TtbGwFhDbwTU+X39PT50i9ehIoBRv4AsF/jhw7BixbFp3LGLkyGqIZz0lmppxcCGTcyckSgP7xhz1j87eUkcH9NTNOHAW7drLBOHns6URsQQGwapW9J4+Sk2V++J9/mtveTNNI47mw+nlkPKc9PIVdlVe4A9+ZAAYqpf77xFJKdYAsVTTTz313A2iolHJPOZzsvAxxZxQiojByDXxbtJADXbvOxltltqOzwUrGd/ny0qxKKALfDRuAffv8Hwz26QPUrl05At/+/csuBRPtqlaVv4Fgg7EFC+TvyF+2u25dHO7cObD3gnEfY/3iH3/0vb2VNXwNrn+Ldmd8GzeWRlGhCnx9zdFu165s5s/bPgDPS/Ucf7yMPdjPkSVLJGMX7YFvtWqSPTeWNApm3eFhw2QdWTNVCmaYDXyNoMx4zTZulAZndj33vpotpqXZ19jKMHy4vC5mnsfsbMnk+vvfGuhSSWlp8ho0aGDtfhVcuAPfdwFsAzBDKXW2UmospMvzvwD+66WulGqvlCpSSrm20/sIwGEAs5VSVyulhiql7gPwKoAVkCWRiIhiX2GhBGuupc5AZLK+OTlyMGo18N2929zSJCtWACedhGOtWoUm8DUOFvxlEKpWBU49NbAsX6zYswdYujR0Zc6hZMcyOykpclB65pl+Nz3UsyeweLH1k00Oh3SqPfdcCfL8HQBnZEj22ThAN8M1g2N34KtU6JY08hf4mlnLd8sW+Vz0VIJcpYp0vg32c8ThkOdh0KDg9hMOHTuWngxwOKS0uHdv6/upVUume6Sk2LNsmNnAt359Cd6N1yyY4N0TXxlfo9rIzr+hOnXkJMLMmf6fxzXOPr3+/rfGxZWu52uF1RPWlURYA18tSw6dDmAjgE8BfAZgK4DTtdZHXDZVAKq4jk9rvQ3AQAB/Q5phzQZwA4CpAIZrrQNc3ZmIKMrs2SP/NF0zvkBk5vkGMk+oVSspK92/3/d2hYWyhuJJJyG3Q4fQBb7NmwOdO/vfNjFR5lxl+2s5EaOmT5f3VWUOfIcMAerW9bvpoV69JOhdtszaYzgcctCuFDB2LPDbb0Cuj9UWMzLkd7OyfFKTJvKZ0L69fG+3UAS+hw7JcmK+Or2bCXz9Bc/G/OdgOBzAiSfGRqbMdS1fh0NO3gW6FNfYsfL8B/v8FRTI/zAzgS9Q9jVzOGTNWRPd0E2pXVvK6D1NvVm+XF7jIDvFl5OcLCdo/K29a/zOxtrcviQmWutBUVQkc90Z+JYT7owvtNbbtdbna63raa3raq3PcQa1rtts01orrfWTbtev1VpfpLVuq7WuqbXuorW+V2tdQY9SiKhSMs5Ou87xBSKT8Z00ScoH+/c3fx9fZ9ldORtboV8/5B53nJS52V3ObczvNdO8JClJAsMFFbCA6N13gTvukJLuXjHYCzLYYGzjRvkyGfQfMg5GrWRZ/v1XGtAY1QXJyfJ+/vVX7/exsoavq8svB666yvr9zAhF4Otrbq6hfXsJkA8d8r0fX/vo2VPKRwM9eVVYKE3Nor3M2dCxo5wQ3bFDAqlgxj1mjFwGW+5sfO5bCXzXrZNgzZjfa+e6s96m3tjZ2MqV2ecxLU1OwplZNs1qD4otW+Szh4FvOWEPfImIyA/jrK57qXO4M76//w58+SXw4IOlwbcZvuZVuTJKzU46SQLfoiIJTuySkSFl2mYbpQwYIKWwFancuagIuPNOYNw4KcH7/Xf7D/TCoU0bKbvPyQns/sZBqMnAt7B+fSmbtfJecC/TTEyUUk5fB8AZGYF1r331VeDpp63fz4w2bSR4NDNVwSyzgS/gPeubny8Bnr+ML1BaRmrVypXA0aOxFfgCwCefyGUwTaFatpT5rsEGvsZJEyuBb34+MG+evPZ2P/eeAt/8fPsbWxnatpVyczOBb0KCuc9jqz0o2NHZKwa+RETRxj3j26yZXIYz8C0oAG69VZrIPPCAtfuaDXyXL5cz3p06SeAL2FvubAQtZg+kataUzHZFCXwPHpTsw8SJku398cfYKN/0xDiI3hlgH8uUFMkGWsmuJiVJ9r+42Nz2DoeUVZ54ovxctSowcqQ8756CyMJC+X0CyfiGUps28vfvb6qCFXYEvtu2SUWGr3LpYDs7m+0JEC2M5/Ojj+Sk3YABwe0vOVmaewVTXRRI4AsAb78tl3Y/955WGUhLk78/u7uiG5KTgYULpbzfE62tzcGtWhU45RTz/5vS0iSg7sYFb9wx8CUiijaZmdLQwgh4q1cHGjUKb+A7caKUn73xhgSEVhgZan/zkVasAPr2BeLicLRtW5mbZmfgm5oqgZ6Vs95JSTIuX/MyY8GmTcDAgTLHdOpUYMKEwOf+RYNgltnJzpblRazObU5KAg4flnnoZqSmSkMk16WSkpMliPA0VzgzUwLiaAx8AXvLnbdskY7R9et738Zf4GsmeG7TRk4+BPo5kpoq80utVLhEknESYNMm4OSTpTNzMJKTJSibNSvwfVgNfLt1kyBt5syyJ47sYmR8XZtNLV8ul6HI+ALyPJaUALNne759zx4Jiq3+bzLbgyItDejUKbrXoY4QBr5ERNFm1y4JHl0PoMO5lu+OHcBTT0mzE2O+khXVqknTHV8ZX5fGVgCgq1aVA067M77ugYg/SUlSHrx4sX3jCLfffpOD4P37ZX7pDTdEekTBCyYYmzNHsrZjx1q7n5X1M/ftkznr7tUFo0bJ+89T2WMgSxmFQygCX39zcwHphl29uvcljcwEvkoF3uCqpMTcmt/RpFEjCRYBezKlvXvL6x9MufOOHVLJ4+skh6uaNSVIKykBBg+29nltRqtWUsGQlVV63YoVskSQp2Wx7NCvn/wP9/Y8BlKKbKUHBTs6e8XAl4go2mRmll9ntUWL8GV8775bAoUJEwLfh7+1fNeskXlWrmfc7ejIatizR9bwtXoQe+qpkm2P1XLnt98GRoyQ53/pUuC00yI9InsY5fOBBGMzZ8qJIysN2gAJAI47ztx74c8/5dL9/daokZx8YeDrP/CNi5P5zt4yvlu2SJBkVJR4Y3yOWF2WJy1NpgfESpkzIIG+8bzaEbArJSc7f/lFGg8GwuxSRq6MZnKheO6Nzw7XCqRQNbYyxMVJ1vennyTodhdI4DtggJQ8+/s8ysuTCgAGvh4x8CUiCpX582VupVWZmeXX9QxX4Dt3LvDNN8DDDwd3NtzTvCpXLo2t/tOzpxwg21Fm7C0Q8adePcl6WF0z8eef5YRBJD31FHDLLZJlXLjQ/mU6Iql6dSn9txqMFRbKwefo0XIwalVSkrwX/AVRqalSZuppzmBysjTScQ/ojJ8DaW4VSs2aSVm8XYFvUZH8rr7m5hp8LWlkBM/+gpWEBCkjtfp5abUnQLTo2FHe26eeas/+xo6VBl/z5gV2/0ACXyNIC8Vz777KQH6+lAyHqszZkJwsUyU8BappaVLhYExnMqNmTQl+Z80Cjhzxvt369XLimoGvRwx8iYhC5eWXZY6s1aU1PAW+4Sh1zs+XhlbHHw/cd19w+/KX8V2+XIJM14PhYDuyunI45EChb1/r901KkiV
NPJ2p90RraQD2+uuBZ0mCdfAg8NJLwAUXAD/8UFr+WJG0aSNLBlmRmirL4wS6dnFiopSM+1uT0+GQOdXVqpW/zXjsH38se31Ghhz4Wp1DH2pxcUDr1vYFvv/+K8GvmRMxZgJffwJtcJWaKh15oy0D78+4ccAzz5han9qUoUOlg3Cg5c6BBL6XXgrcdJP1qgwz3Jstrl4d2sZWhmHD5GSYp+cx0FLkW2+Vz6LBg71PCWBHZ58Y+BIRhUJuriwdA0jJrVmFhTJf0FOp85Ejvs/0Buv112U5oTffDL5JSqtWknHx1hHXpbHVf4LtyOrK4ZAumJ4CEX+SkiSANRqg+LN0aWkDJG8HI6E2bRpw7JgsPWX3HLloEcj6sikpki0ePjywxzQyUL7KC3NygL/+8p6t6toV6Ny5/AFwoEsZhYOda/mamZtraN9ePjfcTyBpHdrAV2t5jc2u+R1NRoyQCh271Kghfy8//mi9XLyoSCp9rAa+J5wg0zSqVrV2PzPcM76hbmxlqFULOOMM+bt3fR5LSuTkbiCB6SWXyOuydaucJFi0qPw2aWnyPHbuHPjYKzAGvkREoTB3rmRQAWtr0xrleZ5KnYHQZX23b5eswbnnyhIswWrVSoJeT0uiFBRI6af7GffjjpPsV7CB78GDEogGWjY3eLBcmi13njKl9PtIBL5aS+fmfv1CfzAXSVaDMa3loPP00yWDFYhOneRvz9d7YdEiOZj19X5LTpbS0cOHS6/bvj16s4uRDHyB8pn9vXvlZKKZcmmjhNTK58jmzfLZG0vze0MpOVleA7MdzQ27d8vfgtXAN5Rq1ZJGW8bUmxUrZO59hw6hf+zkZAlS164tvS4jQ05gB5qRHTVKmi/WrQsMGVK6hrMhLU1OJITiJEIFwMCXiCgUUlKk3DQ+3lrG130NX4MR+IZqnu9dd0mg8Prr9uzP/Sy7K0+NrQDJVHbvHnzgu3Ch/C6BBr5Nm8oSG2aaGh06BHz5ZWlG0VuZZigtXizle+PHh/+xw6lNG5k2YHYO+Pr10hAp0DJnQLJ/SUnAH394z345HPJ3PnCg9/2MHSsnfObOlZ+1jo3A12rGz5P0dDkINxMMGRlw9xNIVoJnwHqjPOPERqzN7w2V0aPlvW+13NnqUkbh4jr1JtSNrVwZqyLMnFl6nR2lyN26yXrLgwYBV18tU22M6ip2dPaJgS8Rkd1KSqQBxahRcqBmJeNrnJV2L3U21pUMRcb3p5+A774DHn3UvgNx93lVrjw1tjLY0dnZ4ZAD7ZNPDnwfSUnSIMtbqbbBKDF++mkp245E4DtlClCnjsyTq8iMg+mdO81tbxy0B7Ikl6vERDmg9/baOhxyIO0rqzxokCyfYoxp3z5530Rz4HvsmPX+BJ5s2SLZNTMl+N7W8g0k8F2zRj6LzXA4ZAm2E04wt31F17y5NFKqaIFvXl54Glu5Pm6/fmWfR+P/W48ewe27cWNpqnjjjdJP5Nxz5bMxI4OBrw8MfImI7LZsmQSoycmyNm00Z3zz84HbbpNx3nOPffv1FfguXy6lZ57KFhMSJPg/cCDwx3Y4JKiuVSvwfSQmytzN1au9b6O1BJ39+km2r1Wr8Ae+2dnAV18Bl18uwW9FZnWZnZQUoE8faVgUDCML6KncOS9P5nj7yxTGx8uJsFmz5GRKtC5lZLBzSSOzc3ONx/V0AmnLFrk0W56akCCVAWb/Hh0O+ZuPtfm9oZScLP/LfDUpdBetga+xysDq1TIPOdSNrVwlJ0tVzt698nNamnwmmV3n2JeqVYHJk6Uvx+zZpb8XA1+vGPgSEdktJUWyG6NGSWObTZvMZx4yM+W+TZuWvb5pUzkgtDvwffVVmd82aZI0AbKLkaH2tKSRr1KzYDs7Hz0qgXWwJYtmmhoZJcbjxsnPvjrS+pOVJScffvrJ2v0+/VSCr4pe5gxYC8YOHJCS92DKnA0JCUCDBp7fC0uXSgmzmbmhycmS6V2yJHYCXzs6rKenm5ubC8iBvKcTSOnp0mnabAdsKw2uduyQeZgscy5r7Fi5nDXL/H127JDXqGHD0IwpUEbGd9ky+TmcvRDGjpWTpLNny8+hKEW+5RZgzpzSpnAMfL1i4EtEZLeUFGmQ1KiRBDN5eeaXYdm1S7K77mWBRjBsZ+C7bRvw3HPA+ecDZ55p334B6abctGn5bIHR2MrbgUewnZ2XLJHO2ME2qWnbVrJLvgJf9xLjYALfVavkBMmNN0rwboaRce7fXzKbFV3r1nJpJvCdPVtONtkR+MbFyd+zp/eCwyEncIyGaL6MHCmZ35SU6F3D19C3r3SFfeKJ0iZ9gcjOli8ra0p7+juykjUGSstIzXyOcH6vZwkJ8lpYKXc2ljKKtsx5q1byf2HuXCkRDucJp9695TlJSZExrF8fmsB0+HAJ7KdNq1hruNuMgS8RkZ0yMiSIMQ64u3aVS7PzfDMzy8/vNdi9lu+dd8oBil0Nrdx5Wss3LU2CX2+Bb+vWUgIWaOBrBCKDBgV2f1dJSbI/Tw1+XEuMjfUz27WTAz9/84I9MUo5MzKAF14wd5+FC6VbaGXI9gJSut6okbnANyVF/o4CWcfZk6Qk+Rt2P/HkcAA9e5rLcDVoICdkUlKkeVOdOtGXGTNUqyZVIJs2Af/7X+D7sTo3F7An8K1XT/Zj5nPE4ZC/4V69zO+/MlBK/o/9+qvM9zYjkDV8w8GYevPLL+FrbGVQSvoM/PKLVFAUFMhnRih06iT/k8grBr5ERHYyzo67B75m5/lmZpaf32to0cK+jO+sWcCMGcDjjwc/B9IbY16VK1+NrQA5SAimwVVqqhzANmgQ2P1dJSZKaaqnkxbTppUvMW7fvnQdS6vS0yUbePHF0qhk0yb/95kyRQ7YL7nE+uPFqrZt/Qe+BQVSMj5mTNl1ooNhZAP//LP0uqIiOflgpbogOVkOfufPl/dLtGXGXI0YIdUgzz4beCVDIIGv+wmkY8ekaY/VLJbZzxGHQ06UVdT1r4ORnCzP/2+/mds+WgNf42Ty0aORWfItOVmWMHrzTfmZpcgRw8CXiMhOKSlS3tyli/zcvLkEJ9EU+OblAbffLh1M77or+P154ynju3y5BKW+DmKNA1arS6kUFFgPRHzxNs/XW4mxt460ZqSny/1ff13mWt92m+/fPysL+Ppr4IorAl+jNhaZWV/2jz9kvVw7ypwNfftKxtn1vfDXX9JAyUqJrDGmVauid36vq9dfl+A80M+JQDO+hYWln3Xbtsml2XnChoQEKSstLPS+zf79UjXBMmfPTjtNKhPMlDuXlMgJimgMfF3/p4azsZXh9NPl8+OTT+RkHLuHRwwDXyIiuxw+LJkc1wNupSTra6bUuaBADsS8lTq3aCGlzsGurfnyy3JA+uabUtIYKq1aycGra+mvmTUUExKklNhq5nTlSslO2HUQ27mznLhwD3wXLpSsnXuJcTCB75YtEhy0bClLI/38M/DDD963/+QTmXtZWcqcDWYC35QUoEYNYNgw+x63alXglFPKvheM762caOnUqfSgNxYC37ZtpSrk+++leY5VW7bIXH
9jOoAZ7n9HgQTPgHyOFBRI8z5vjAw+A1/PqleXzP+PP/r/v7N3r1RBRGPg6/o/NRIZ3xo1ZA5uYaF8Bpht0hZFdu0C3n1XCmmmTo3dOcQMfImI7PLLL3Kg5Z5pMrukkZHh8Jbxbd5cgp1DhwIfY3q6zCG9+GJ7AwNPWrWSLMC+ffJzfr7vxlaGQBtcGU1q7Mr4KiX7cl/GxigxvvjistcHm/E1DuxvvVXmgN1xh2QU3RkZ55NPrnzzEtu0kffTqlXAunWev1JSgDPOCG45K0+SkuRxDx6Un1NT5SDW24kqb4zPh1gIfAHJ9nbtKlUIRtdYs6zOzQXK/x0Z898DCXwBaWjk7b3y448S3EUiCxgrkpOlcmflSt/bGQ0cwxz45uQAX3wB/N//ySIFHlfCq1lTKo2aNIlYQ7nCkfJ3v7lGAv78Uyqfo5nWcn73+edLV+sbN06uq1OnKNLDC1h8pAdARFRhpKRIsxr3xkpdu8p/5mPHfJ/p9baGr8F1Ld9A57DecYfMJQ2mYY1ZRkCQmSljT0uTM97+Al/XjqxWuk07HHKSwVhKyQ5JScC338pBePv2pSXG115bft3c2rWlY6jVwPfQITlaMw7s4+OBt96Sx37uOTnycPXnn1LC+cEHgf9eseq44+TSX8D/yCP2P3ZiohwNLlggS5WlpgLnnGN9P+ecA7zyilQUxIJq1eT9eMYZMu7HHjN/3/R0yZRb4SnjW7s20KyZtf2ccIKM/Y47fG83dKi9S7lVNGedJeW5H3zg+7M7jGv4ZmUBM2cC06eXnm+uXx/4+GN5e15yCXDzzTIb5T8dO8r/1gjMq9+1C7jh/TH4HvH4aFUfPOdcMvqEE+QpNb769AntcuybN8u/yaIiKWKJjy/9Mn4uKZHCtZkzS8859e8vU/3HjpXzSX/8sR1AbGZ9GfgSEdmhuFgaRo0aJf89XHXpIgfMmzf77uZolPb6KnUGJPANZI5QSopkOF59tXRpmFAyAvjMTJkj6a+xlaFpUwlerWR8i4slELnwwsDG6o1RApmaKgfkn37qu8S4XTvp2GvF1q1y6TqHMTERuOoqea2uvrq0SRog2d569YCLLrL2OBXBBRdIEORriZ3q1eVg3W4nnyxHhw5H6UmQQEpkTz0VWLTI7ag8yg0bJhUOzz8v88qNExC+FBbK34LVLrPuJ5CMrLHFgKWkWg2k3JuKE+tu9T1cq4F5ZdO0qXzevfOOpPy8nXQKceC7Z4/M/pg+HZg3T4K3du0kwD3/fPmzWrMGePtt+Zj+6CP5V3PLLfLWrfntt1JyHGZLlgDnnQccPNgc815ehlvP7YSB6+Xf4fLl0jds2jTZVik51/vyy8CJJwb/2EVFcp7uxx/l37/ZViPVqsmf/H33SWlzOA4XwoWBLxGRHZYskfm5nhrquC5p5Cvw9ZfxNTKZgSxpdOyYNLTq3l0uw8E18AXkv3zDhuYOmnv2tBb4pqVJ5tTuuXoJCZJKSE2VA3h/Jcbt25tfusrgrZTz5Zel8/Ztt8mcX6UkM/ztt8D111euplaG6tUDy7LaoVYtCVZTU2WNZyDw99vAgbYNK2z+9z85gr7zTnlf+rN9u5yQCmRN0fbtS08gpadLSbkFRUXADTcAH300APHxA/D008D991tv3JyaKlP6W7aUeK51a7mMpj+9bdsk42n0UwyJZ58FvvlGokiHo0y39J075aaR83aga7VqUE2aBPwwBQXycm/YIB+jxteGDaX/9jp3Bu69V4Jd93YRPXsCkycDL70kwe/bbwPXXAPcfTdw7bXH4bTT5NyVa7bT+L5qVfmob9PGvmbwH34oS7O3bi3nuk48sTcAYEwnCSgNu3ZJILx4sYy5d28Z9zPPeD8c8CY7W6bj//ijXB48KIHskCHy8o0YIVnlwkL5OykqKvt9cbEcJliZlh9LGPgSEdkhJUX+e44cWf42o6TR3+nWzEw5Mmva1PPtrhlfq154QY6Q5s2T//Dh0Ly5HJUYmWwzja0MCQnA1KlSd2XmKMTu+b2GKlWAwYPlYO/PP2VeoK8S4/btZU6h1uYzVN6a9zRvLgect90maY4LLpBavsrY1CpaJCZKANi0qRzNGgFwjPn6azmH88gj0nDWlNatgSeflDTQjz+WPXL3xEdTqjVrgPffl6TusGHysVlms3btZEkvrWU/FqY85OcDl10GfPcd8PDDspuHH5Yg4NNPzU2tzsiQX/ObbzzfbgRIrVtLwDl+fHhXqPn3X3kNv/oKWLZMPmpuvVUS8iEplW3USE7EXXst8OmnOHL+1fjuO3k+f/tNXqam2IF6NVrj0IY4dOtmfteHDwMTJ8pHW3q6fOQbmjWT53f0aClyGjlSnmd/H6316kmQd/PN0uT97bflMczM8KlRQ/5lG4szuH41bmzuY72wUILtN9+UGQJffin39aZlS/lzGjMGuOce+difNEnud999Euj7el23b5dzUd9/L/+qiovluTv3XNnn8OEVN5C1TGtdab769eunI2nevHkRfXwiV3w/2qxHD61PP9377a1ba33VVb738X//p3WbNt5vLy7WumpVrR980NrYNm3Sulo1rS+7zNr97NCsmdbjxmmdlydjf+ABj5uVez++957WgNabN5t7nAsv1LptW61LSoIbrycvvSRjOfNMrevX1zo31/u2r70m2+7fb37/N96odePGnm8rLNS6d295/+TkaN21q9annGJp+GSd18/HWbPk9VVK60svDeuY7FBUpPVDD8mvUL26XF59tdb79pncQUGB1t27a33ccVofPep723fekQf491+ttdaHD2v9/vvy9gXk46BdO/ke0LpLF61vv13rOXO0LrzlDq3r1NF650658c03TQ3vyBGthw+Xu0yYINeVlGj98cda162rdb16Wk+b5v3+ublaP/641jVqaF2zptZPPaX1gQPyETpvntz3xRe1vvVWrc89V+v+/bWuVUseb/RorR2O0HwEzZs3T2dmav3GG1oPGlT6nPXtKx9Pt90mb8l27bT+6Sf7H19rrYsKinV2t1P0oepNdcua2RrQumNHrZ94QuuNG7Xe2TlJL4hP1FWrynvM18ek1lofO6b1669r3bRp6cfrY49p/emnWi9ZonV2tr3j37NH66VLtV64UF6n337T+uef5U96xgytp0/XesoUre+9V+vkZPmojY8vfa4BGes552j9v/9pvWyZfDx7epykJNn+nns8b2PGli1aX3SR7KdlS/mXWFQkt5WUaL1mjdbPPqt1v36l4+veXeuHH9Z68WI5XAiVWDh+BLBce4gFIx6MhvOLgS9RKb4fbbRli3ycvv66922GDtV64EDf+xkxQo6kfGnTRgJks0pKtB41So76MjPN388uvXtrPWaMHCUAWn/9tcfNyr0fFy+W7X/4wf9jlJRo3by51pdfHvx4PVm0qPTI4pZbfG87fbpst2KF+f2feabv133hQtnn0KFy+eGH5vdNAfH6+XjwoEQYgNaTJ4d1TME6eFCCM0Dr66+Xnx96SA7umzSRgMNU0DZvnuzkiSd8b3f//bqkWjW9eGGxvv56i
WMBrbt1k8Bh7155vA0btJ44UT6matSQbe6LlxNIM2+YKVfMnu13WNnZWp96qtZxcZ7/RNLT5XZAzlm4BlYlJVp/+aWcOwO0vuQSrbdvN/FcaAmMn3mmNIA75RT52LIr8PjmG6179876723Xs6cEPJs2ld1uwQKtTzhBtrnqKmvn3twdO6b1unUSFE6aJB97rVpp3RsrdRHi9O89btV//un2funYUR8791J99dUyhg4dtE5JKb/vggKtp06Vf2WA1mecIYFuNCoslKD+xx/lnObVV0uwb/w7qFNH/m0/+6wE04sWyXuoRg3fJ1isWLiw9GRRQoLWd98tJ4mMMQwcKCdjNmyw5/HMiIXjRwa+DHyJyuD70UYTJmi/2ckbb9S6YUPfR5Y9e2p99tm+H6tfPzlCNOv77/0H5aF01lkyZiP7k57ucbNy78ecHNn+2Wf9P8bGjbLtO+8EP15P8vMl/QNovWqV722XL5ftvvvO/P6PP17riy/2vc2118p+/WWcyRY+Px9795bXIi0tbOMJ1oYNEhTFx2v91ltlP4ZWrZKDZ0CypaaKLC67TFLG770nR/huX8WfTNO7jz9Fp1frqgHJil57rRzE+/oIPHpUMpbvj5ETSP/DXVoD+unL1+uNG73fb/durXv1kizy9OnetysslCC1ShUJUObP13rlSq0TE+X3791bAphA5OZKYrpDh9IA/4MP5OMjEEeOaH3NNbKvNm1y9eOPS5bPl7w8yZrGx0uxzddfe3++i4vlnO3332v99NMS1CUmSnGJa5bTeP3GjtX622+1LrzxVjm74Hpyr6REqoruu09rLc9rt25y33PO0TojQx7v88+17tSpNGD77bfAnptI27FD6y++0PrmmyUYdX2u2ra1dt7TjJISOQHSsaO8tsOHa/3221IQEQmxcPzIwJeBL1EZfD/aaNgw+S/vy+uvy0eur5rCxo21vukm3/sZPVrrPn3Mjy0xUWq2Aq23CtZ110md1vXX+wz8Pb4fO3SQ1Is/Dzwgz20oT3mPHSsZV3/27bN2oqGwUI5kHnrI93Z790pK6d57ze2XguLz8/GRR+S9GcpaQhvNmSPnS5o0kWStJ0VFEhDXrSvZqhdflMycV5mZskP3CMnt6+eGF+mpU7U+dMjioJ0nkHK79NLFULp2fJ5WSkpQf/+97MdIRobWnTtLcPbzz+Z2v2SJBGBKyVeTJlLmapSSBqOwUAK8Xr3kaWjdWs7J+Xw+3axZI2WrSmn96KNa//rrfEtj+Oef0hLYs8+Wc4OLFsk4brpJyqXr1i19qZSSDGxSkhQUPf20nMNYuFBOKpT52M7Olqh64MDSv4G9e2VHEyf+t1l+vtYvvCDnDGvXLg2ETzxRMsGhKAmPlP37pVz6lVek1DlUioqi47xnLBw/MvBl4EtUBt+PNjl4UAKX++/3vZ0xN/DPPz3fnpcntz/9tO/9GIGkGUVFcjR4++3mtg+Fxx6T7MCJJ8ppai88vh/HjJHT6b6sWydpHivl34HIy5P6P39KSuRI7667zO1361Z53d97z/+2OTmRO4FRyfj8fCws9Hn0uXatBCvr19s/LitKSrR++WX58+vVS95q/uzYofV55+n/Smo/+8zHVN5DhySicn4dXrlRP3PVRt1FbdQnN9qoZ7y6UZccywts8MYJJKW0bttW79olc2+NcuJevaSc+Z9/JMNWv76U+lpx+LDWd9wh55KysgIbpi8lJZK9NubkdukiGVN/Ad+HH8pHSLNmWs+dK9cF8v+6sFACMaN83PiqV0/Oh95yi5QcL1kSQDD18ceys/ffl59XrpSfPaTbt26V+dB9+kiWNEbOF5EPsXD86C3wtalhNxFRJfXzz7IGgKdljFy5LmnkidGp2d/aBS1aAHv3lm196c369cDRo9JJOVJatZKxrlplfRwJCfI7FBR4vl1raWVau7asXxFK1aubWwNSKWkba6xB6o+Pzrfl1K1bfo1oCr/4eFnayIPPP5cVj559FujWTdbvXLLE/K43bQIefFCaRQ8bJl2Fvb39fcnNBa68UpbwOf98WcvTTAPq1q2lgfgPP8hHx+WXS8fZm26S7sFau2xcrx7QuTN0p874+q/O6DK6Mx7/tDPOuKkzftrSGWPv6QxVo7r1wQPSArdmTXnAjh3RogXw1FPSvfb996Vr7TXXyKpi+fnA/PmyjqsVdeoAEyYAr7wiq6zZTSlZOiY1FZg5U942F1wgK1n98Uf57Y8ckSW7r7lGtvn7b+kIHKj4eOkGvHq1dDOeMUOWDD94UDr/vvmmLPk0YIDXt7N3V14p3e4feEDWs/axhm+HDtJhe+VK4JJL7FsqiCgQfPsREQUjJUUO0k45xfd27dvLMkLeljTyt4avoXlzOeo7cMD/2FaskMuTTvK/bai0bFn6fSCBb1GRRAOefPONrKXx3HOydkO0sBL4elvDl2JKfr4snXL55bIG519/yVJB8+dLEDNkiCynUyZwdDp6VJaFOe00WTLl1VdlHc30dOCii2Rln0cekdXIfNm/H/joIwm2mzUDPvtMAvCvvrK+7uzZZ8s5ut9/B8aOlaVmBgyQdVJfe03OvQHA5s2yxMzFF8uf+pIlwFtvAQ0aWHu8cowTSECZv40aNWRFnVWrgF9/lTVSU1PlOY9WSsl50VWrZCW0zEx5P4wZI0EpIMuQ9+8v74MnnpAV0Vw/OoPRqZMsrTN2rAShZldZ80kpeaGzs4FHH/UZ+BJFEwa+RESBKioCZs8GzjpL1nv1JT5ejkC8ZXyNtW7NZHwBc2v5rlghR7xGtjkSXH8fqwG4sTBmWlr52w4fBu66C+jTJ/rWtLWa8a1alQeMNsnMlPVpv/kG+OcfCSpDbds2SX5NniwZtnnzJBB75hnJUL72mpzfOOssuf6zz+Sj46+/ZK3RVq2Aq66Ssb/4oqzROnu23GfOHODkk+X6jh1lPdOUFDn3Bcg5oVdfBZKS5JzYNddIZvb//k+yvI88EnigExcHDB0KfPKJfDxNmSJZ0nvukczw8OHyJ7p4MfDGG8DSpRK82cZD4GtQSjLikyfLyYJYUKWKvD4bN8qSuAsWSMb67LPlpEJ2tgTzTz7p/99JVDjxRKm4eecdSSfHx8ubkCiKsWaKiChQixZJmZe/MmdDly7+M77+TvO7Br49e/redvlyOdKO5FGUEfg2alR6IGtW164y9rQ0SSm5euYZec6mT4++o8T27SX9lpvrP9WWni5pmGj7HWJIcTHwyy/A1Kllg0JD27byp9e1q1x26SJZWDvKW1NSJGjVGvj+e+Ccc8reXqeOnJ+55Rbgiy8k4LniCskO5+RIBvOCC4Drr5fg1TVIjYuTbOrIkRIMv/ce8O67krlr21b2vW6dbNurlyTezj5bzgXZktVzUb8+MG6cfK1dC3z4ofzpnXeelNHalZ0sw/i8OP74EOw8cmrWBO67D7juOjmh8cYbcuJk2rTSj/eY8dRTUlLw889SmsDPMYpyDHyJiAI1c6Zk60aMMLd9166SwikuLn+AkJkpZ8ybNPG9D+PIaM8e39sVFckksRtuMDe2
UGneXI7CTzrJ+tF4jRpA587lM75r1wKvvy5HjgMH2jdWuxgH7Nu3y0RPX9LTgY4dsXmzBBU9esiczMTE8B9Dbt8ucw/POUemE4dbcbFkaOPizJXm7tghpaPvvy9jb9ZMMq5XXSXzYjdulPNMGzfK12efAYcOyX2rVAEGDZJzVmPGyJ+mlbdnUZEEmi+9JIHmt9/6rlavVk3mb155JTBrFvDllzIn9bLLzAXgbdtKjPHoo8CPP0oAXFQkc2/HjrV+TikY3bvLvNhXXgnxA/nI+FYEjRrJiZAnnpA5tnafrAiL+vWl5OCKK1i1QjGBgS8RUSD27JGjz1GjpMmLGV26yBF5Rkb5g7nMTEmb+Ov8YZSS+St1jobGVoAE88nJ5rPi7hISJIA3aC3ps7p1gRdesGWItmvXTi7NBL5btiCna3+cfro0nVm0SJrONG0qAej550u5abVq3ndRXCwZwc2b5S3Uo4e14R47JkHMiy/K961ayXmFCy80fzBeUgJ8/bUk4vfskfFWqyY9wdy/LymRZHhurrxFje/z80v3V6+eHEe3bi2Xrt8XFEjGcdYs2dfw4ZJ1HDu27PPkPu9Ta2DfPsmSzp0r2dr77pOvTp0kAB4zRk46uCoslDJU4ysrSwJeh0Oq7CdMMNf3DJA/72D+HKpWBc49V74qvKFD5UXs3j3SIwkpq/Ovo85ll0mps9UPHqIIYOBLRBSIBx6Qo/aXXzZ/H2Ou7YYN5QPfXbvM1QvWrSu1cv4C32hobGWYMSPw+yYkSE3l0aOSFvnyS+kYNHmyRIfRyMhU+Zvn64ykJs3qiCNKAqlOnYCffpJf+Ysv5NxKgwYSKJ13nrz0mzZJkGt8padLcGY44wyZhzlihO/AVWt5ae66S+apXnihZEsff1wqy999V4Jwf1PEf/1V/hxWrpRpf5dcIsFpfr5cun6fny/Z1pYt5YC/Vi25dP2+uBjYuVO+duyQBP+uXWUbmTdvLo953XXmK2GVkqxws2bSSOrZZ+XcxKxZEgRPnixBbL16QJMmJ6GgQF6i3Nzy+6pVSxoRXXGFucemAJxyikyEpuimlJz1IooBDHyJiKz6809pc/rww9YaRxldWDZulEyxq8xMiXr8UUrKnc0EvrVrx07nF28SEiRCW7dOyp7vuUeC+UiXcPvSqpVEd34C3z2Lt6I5gHUFx2OuozRDecEF8pWXJ5nJ6dOlqv7TT0vvW6uWvF0SEiQz3KmTBIBLlgCTJsnbq3t36eZ6+eXlM5Lr1wN33CFzY3v0kObYp58ut40aJf1qHnlEppHfd598777kycqVsvTO3LkS63/6qSR/QrFcSVGRvOV37pRzIIMHS/YzWO3aSbnwTTdJgPvrrxIIr12bh86d66BhQ5T7atRInutoPe9CRESeMfAlIrKiqEhKbdu1k8DXiqZNJX3nqcFVZqZ0tzGjRQv/c3yXL5fJh7HebMS1s/Pnn0v0M2NGdP9e8fFSk+sj8M3MBF68dgveAPDAlI7o6aEivUaN0rLYwkLpAluligS5LVp4zuYOHSrB7pdfSvnv9dfL2/TWWyW4q1YNePppYOJEOS8yYYI0WnINIqtUkbf4BRdI0Pv88zI/duJEKSfeulXmmn7xhQSBr70m+6ge4JKtZhhPaSinEdauLc2hzj4bmD8/DUOGDAndgxERUdhxOSMiIiveeksWZJwwwfrkLKUkQ+y+pFF+vkwcNNsatXlz3xlfo7FVNJQ5B+v44yWi+uoribxuuMHmNVOCt2aNBJhDh0qzpaNH4XNJoz17ZCmWBlnpAICeZ/tv3lO1qqz9mZgobxNfJczVqknJ8t9/SwazXz8pX27bVoLm//1Pbt+4UbK+3jKnzZvLUjZ//CEV9uecI73ETjgB+OEHCajT06VUOpRBLxERkR2Y8SUiMmvXLokgRo4sv26JWV26yEKf7vsF/K/ha2jRQsqtvVm3TroURbqxlR3i43G4XXfUnTMHuTUa4fVaz6PKCxKI1a0rS7rUrStPnZEcDgetJaj83/9kJY+aNSWwvO466Sz8S4v26HVwPtxjyn37JOjdvh24aUQ6sLhpyFooG2udDhtW2gjbeAsPGGB+P0lJUtY8aZJ8XXONdKI1+3YlIiKKBgx8iahyOnZM0l0XXihtYc24/36ZeDlpUuBrT3TtKpMhXdd4NdbwtRL47t8v9a+e0nXR1NgqSNOmAdicgCvwFx5WL+KtSY3LrdNquPtu6TUWyiro/Hwp8X3tNWD1ankpnn0WuPFGKftNTQXefhv45et26KN3YtTwItx4azzGjJGuzWecIVnSWbOAls+nh22plu7dpVlVoKpWlef37rvtGxMREVE4MfAlosrpq68kEnj/fUmF3Xab72DW4ZAo7LHHzDWh8sZoNrVpU2k3I6sZX2NJo337PN9nxQpJhcZwYyutZXnI++8H7up9DS7sXx8T37kOE5ScezhyBDh8uPTLCEbXrZPv69e39nhFRXIuoqhIvgoLy35fUCDB6qRJUmWekCBL6lx6adky36Qk+Tp0UntUua8E2Wk7cc457dGunTSH2rpV1mEdOhTAdVuicx1iIiKiCoiBLxFVTlOmSGB4wgmS+V2zRtZu8ZRBLSyU7j0dOkgb22C4LmlkBL5GxtfsHN8WLeRy927Pga/R2CoU7XXDoKREMosTJwIXXQS88MlQVK8+FACgIGXFNWuW7ao7eLB0IL71VlkFJSXF3DI3hYXyVnjySeDAAf/bn3mmNPQePtz3eZL6J8qSRn9+noGUg+3x9tuyRu8PP0jWF4WFUu982WX+H5SIiIiCxsCXiCqfVauAxYslRXjHHbJWy4svSrefb78FGjcuu/2kSRIYz5hRfk0Xq4xssWuDq8xMCbjdH9cb18DXXVER8M8/Unsbg/LzpfHS118Dd94pc2jNxu833ijnMi64QOawTp8uDaE80RqYPVvm465fLxnYMWPkZYiPL/1y/bl7d/kyxbmWb/yODJx7JXDuuRLQ//e7/PuvLFhrdhFaIiIiCgoDXyKqfKZOlfrUq66SSOSFF2Qx0+uvl4gpJaU0wsnMlE4+o0fLujLBqlVLlkJyXdIoM1OyvWYjPCPw9bSkUQw3tjp0SALEefOAV16RJXutTqU+/XRg6VJ5qYYPlybc48aV3Wb1atn33LmyNPCMGbJ9oNO2PWrXTi5dOjuXeXnTpaNzuOb4EhERVXaxWQdHRBSo3FxpLnXBBWUzrFdcAcyfL7cPHCjpQEBSgoWFwBtv2BcZdelSNuO7a5f5MmegdI6vp4zv8uVyGWONrfbvr4akJGkO9emn8rQH+nR36iQJ/TPOAMaPB26/XRLhe/bIz717y9M0YYIsDzx2rM1BLyC12M2aeV/Ld8sWuWTgS0REFBbM+BJR5fLVV0BOjkRA7gYOBJYtk0goORm49lrplPTkk/YGKEZnZ60l4srMtNaIqmZNoF49z4HvihWyPE7nzvaNN4S0liD01lv7IjdXGkideWbw+61fXxL3998vvcsWL5aS5mPHJBB+7DHpwhxS7drJPF5P0tNlwV2uCURERBQWzPgShduMGVK
PqXWkR1I5TZkCdOsm3ZA8adtW1sg95xzgvfck4L3/fnvH0KWLBN9798rPmZnWA6AWLbxnfKO8sdWhQ8B33wE33CCx4YABQEFBHObPtyfoNcTHyzTu996Tac9Dh8pU7ddfD0PQC8g8X28Z3/R04LjjQrv2EhEREf2HGV+icJs9WyYx5uRYX3OFgvP33zIBdMIE37WttWsD33wDfPAB0L+/ZFjt5NrZuX59IDvbWqkzIIGv+xxfo7HVzTfbM06blJTIsObMAX76CVi4UPo61asn5ciPPw40abIM/foNCsnjX3cdcOWVkmANq/bt5e/dyOy7Sg/fGr5ERETEwJco/IymNvv3M/ANt6lTgRo1JAryJy5Oml2FglHWvHGjZJgB6xnf5s2lO7WrtWtlkdsoaWxVXAx8/rn0Btu6Va7r2xd44AFg5EipLDdWj5o/vzCkYwl70AtI4HvsmKy33KxZ6fVayxzfU06JwKCIiIgqp+ithSOqqIzA18yioWSfI0eAadOACy8MU52rD+3aSVfpDRtK1/C1o9Q5ShpbaS2Z3b59pXF2w4bARx9JD68VK4DnngMSEz0vmVyhOJc0KlfunJ0t9d5cyoiIiChsGPgShVNRUelBMAPf8PryS+DwYc9NrcKtShVpPhVs4HvokGR4DUZjK2Ot4AhYtgwYNgwYNUrONXz5pVx39dWlqzBVGt4CXy5lREREFHYMfInCaft2qf8EGPiG25QpslbvqadGeiTCWNJo1y75OZA5vkDZeb7Ll0uaNQKNrTZtAi66SBpVpaUBkybJksIXXxzVfbZCi4EvERFR1OAcX6JwMg54AZnjW5Hl5gK1aoVggVQnrSWlWLeu/21XrpSg0M61eIPVtSswc6YERVWrll1T2AzXtXzbt5e1hv/5B7j1VtuHWlQEHDwoFbqevjZuBD77TKq3n3gCuOcecy9LhdegAVCnTvkljYw1fI87LuxDIiIiqqwY+BKFk2vgW5Ezvnv3ykH9tGnAuefav/+sLEkvLloEfPIJcP75vrefMkWaWl1xhf1jCVSXLhJRLlwoZc5WA3Ij42vM8127FsjPt9TY6uBBefocDjmHcPSonK/IzS39/ujRstXUntSuLRXkjz1WGo8T5DX1tKRRero8UXXqRGZcRERElRADX6JwSk+X7F6dOhU78F2yRCKmxYvtD3zXrweSkyWL1qULcMEFwNNPA48+6jl4PHxYWgtffLF0WYoWxpJGy5cH1ozKvdTZZGMrrWXO7TvvyPzbY8dkSnCjRpKgb9lSAtlatcpeNmzo/atGDevDrzS8Bb4scyYiIgorBr5E4bRli2RClarYpc4rVsjlhg327vfnnyWArV4d+P13yW6OGycLwa5dK+vuuq+5+8UXks6MhqZWrowljYqKrDe2AkqXxzEyvitWyMK4XjoFHzkiT8U770jld+3asqrT+PEyLZhCpH17qUxwtWULMHhwZMZDRERUSVXWliNEkWFkeho3rtgZXyP7uHGjPfvTGpg4ETjrLAkkli4FBg2SVOPHHwMvvQR89RWQlFTaJdkwZQrQs6csGhtNGjcundcbSOBrzAs2Al8Pja20Bv76C7jlFnmIceNkKvDbb8vTNGUKg96Qa99eJkIfPiw/FxQA//7LjC8REVGYMfAlCqf0dMnIVeTAV+vSjO/mzZLRDEZBgURsd94JjB0LLFhQ2i0XkOz5/fcDP/wgZdD9+0stLyDB4MqVktaMlqZWroysr9WOzgZjLd/CQmDVqv/KnDMygBdeABISJLB9/33gnHNkOvE//wA33STJYQoD987O27cDJSVcw5eIiCjMGPgShUt2tnQTqugZ38xMCcZ695aAbNu2wPe1fz9w5pnAe+8BDz8MTJ/uvSHQ2LES2VWrJpnfr76SlGbNmsDllwc+hlAy5vkGkvEFJPDdswdYswbIz8ev2f2QmAh06CBPV8OGwOTJwM6d0sTqlFOiM/6v0Nq1k0ujszOXMiIiIooIzvElChdjCZOOHWXt1oo6x9fI9l5+OfD331Lu3KmT9f2sWweMGSNR22efAZdd5v8+PXtKGfR55wGXXALEx8tE1gYNrD9+OBgZ3wADX928OY7+vhgfXrMctwK48f2TUK0b8Nxz8nR16GDbSClQ7hlf188BIiIiChtmfInCxTXT07ixrBFz9GhkxxQKK1bIPNOLLpKfA21wddddwKFDwB9/mAt6DU2bAr/+Clxzjfx8882BPX44nHyyBOdGAGzB5s3Adwuk1LnexuU4Vr0+vllxPNaskWwvg94o0bKlzMc2At/0dGnOFmh5OxEREQWEgS9RuLgHvkDFLHdevhzo3h1o21ZqbQNtcPX338DZZ0twaFX16jKxdf/+wJYKCpfTTwf27bMUpebny+pNCQnAX7tboDaO4sp2f6DmqX3Rp69iKXO0iYuTvwXXwLdjxzJNyIiIiCj0+J+XKFzS02UJmjp1gCZN5LqKVu5sNLbq108mk3btGljGd98+mbuakBD4WJQC6tcP/P7hYqEM+9dfpZr7iSekWdU9L8tavmr9+ugO8Cs717V8uYYvERFRRHCOL1G4bNlSesBbUTO+O3dKwNqvn/zcpQvw22/W97NmjVwGE/hGsWPHZGnX338H0tIk4dujhyTKu3eXRLmr3buBu++WdXg7dZLljM88E8AvzUs3Mp5zij7t2wO//CInhrZsARITIz0iIiKiSoeBL1G4pKcDp54q31fUwNdobGVkH7t2lXbCR45478bsSVqaXFaQwLewUFZY+v13+Vq4UEqWq1QBOncG5s4tO927ZcvSQLh+fVnCOC9PMr0PPijLFwOQrs4GZnyjV7t20tBu1y5Zz5dLGREREYUdA1+icCgslOVMrrxSfq7IgW9cHNCrl/xsNG3auFEWlDUrLQ1o1KhsYBdjDh4Evv5alhd2OIDcXLm+d2/glltkem9ioqynW1Iib481a4C1a0sv339f7jd8OPDWWxIkl2E8Pw0asHw2mrVvL9leh0N+5mtFREQUdgx8icIhI0OiG/dS54o2x3f5cklV1qolPxvr1AYS+CYkxNyis8XFMg/3o48k4M3Lk9Lkq6+WQPe000qnd7uKi5Ny5w4dgNGjS68vKZFzI02aeHkqGjeWtHHfvjH3XFUqxpJG8+bJJQNfIiKisGPgSxQOrh2dAVnepF69ipXxNRpbnXVW6XWdOklAZqXBldYS+F5+uf1jDJF164CPPwY+/RTIzJQ5utdeC/zf/0kFcqAxaVycrM7kVZUqwIgRst4xRS/3wPe44yI3FiIiokqKgS9ROBiBr+vcvsaNK1bgu2MHsHdv2SZLNWvK/EYrSxrt3Cnr90b5/F6tpZT5tdeApUslBh01SubjJifLikphMWtWmB6IAta2rVxu2iQTuI2KCCIiIgobBr5E4ZCeLpFQy5al1zVpUrFKnd0bWxmsLmkUA42tdu0CbrwRmDlTGlD973/AZZfF9JRkCiXjb3/XLpY5ExERRQjX8SUKhy1bpLwxzuVPrqJlfFeskLSn0djK0KWLZHy1NrcfI/Dt0cPe8dlAa2DaNBnaL78Ar7
4KrFolSw0x6CWfjHJnBr5EREQRwcCXKBzS08sf8Fa0wNdobFWzZtnru3aVJVx27za3n9WrgVatpKtzFMnMBM4+Wxpzd+sG/P03cM89EusT+dWunVwy8CUiIooIBr5Eoaa1BL7ua3dWpMDXaGzlOr/X4LqkkRlGR+coobUsRdyjh6y3+9prsiqN0bCayBQj48s1fImIiCKCgS9RqGVlATk55TM9TZrI9QUFkRmXnf79F9i3z3Pga0SIZub5FhfLArZREvju3CmNqq6+Woa0ahVw113M8lIAWOpMREQUUQx8iUJtyxa59FTqDEhgHOu8NbYCpKNtjRrmAt/0dFn8NsKBb04O8OSTwAknAL//DkyYAPzxB9C5c0SHRbFs9Gjg0kuBPn0iPRIiIqJKiV2diULNfQ1fgxH4HjgQ+52RjMZWJ55Y/ra4OIkYzZQ6G42teva0d3wm5eUBkycDzz0nL8uFFwIvvMDqVLJBhw7A559HehRERESVFjO+RKFmJvCNdcuXS5bWvbGVoUsXcxnftDRAKekeFUZFRcCHH8ow774b6NsXWLZM1ull0EtEREQU+xj4EoVaerpkdGvVKnt9kyZyGetr+fpqbGXo2lWeh8JC3/tKS5MTBLVr2ztGL7QGvv9eEtXXXisv06+/ylJFnqq2iYiIiCg2MfAlCrUtWzw3tKkoGd/t2yV49xf4FheXZr+9CWNH5z17gEGDgPPOkwB4+nRgyRJg2LCwPDwRERERhREDX6JQ87SUEVBxAl9fja0MZpY0ys+X28MQ+O7bJwHuP/8A770nSwefd55UWRMRERFRxcPAlyiUCgpkqR9PGd9atWRObKyXOq9YAcTHe25sZTACX1/zfDdulMm2IQ58s7KAM8+URHxKCnDddTJ8IiIiIqq4eLhHFEoZGVJH623tzsaNYz/ju3w50KOHLFnkTaNGMqfZV+BrdHQOYeB76BAwYoQsFTxzJnD66SF7KCIiIiKKIsz4EoWStzV8DbEe+BqNrcx0gura1Xepc1qapF6N7LDNDh8GRo0C/v4b+PZbCYCJiIiIqHJgxpcolIxmTt7WxIn1wHf7dhm/r8ZWhi5dgNmzvd+elibBcbVqACSm3rVLHuLff+XS+DJ+rlIFGDcOuOUWoFkz77vOzQXGjAGWLpUlipKTLf6eRERERBTTGPgShVJ6upQAt2jh+fYmTSQFGauWL5dLsxnfDz+UeuP69cvfnpYG9O8PQILeSy8Fvvqq7CZ16gDt2slXv37Azp3A008DL70EXHmlrMHrvgTwsWPA2WcDf/4JfPaZNLEiIiIiosqFgS9RKKWnS5mzt3bBsZ7xNRpb9ezpf9uuXeVy48b/Atz/5ObKc3XNNQCAH3+UoPfGG4HRo0uD3fr1yz+VGzYAr78OfPyxdGgeNQq45x6Zv1tQAJx/PvD778BHHwGXXBL8r0xEREREsYeBL1EoeVvD19C4MZCdLWvcVqkSvnHZZflyaUblq7GVwXVJI/fAd+1auUxIQF4ecMcdQPfuwBtvAFWr+t5t167AO+8Azzwjl2++CZxxBtCrlzy9v/8OTJ0KXHWV9V+PiIiIiCoGNrciChWtva/ha2jSBCgpAQ4eDNuwbGOlsRUgz0NcnOfOzqtXy2VCAl55Bdi61VzQ66ppU+Cxx6SR9nvvAYWFEvS++SZwww3m90NEREREFQ8zvkShsn8/cOSI/4wvIOXOxvexIiNDFsU109gKAKpXBzp08Bz4pqUBNWtimzoOzz8PXHghMGxYYMOqUUPW5r32WmmO1apVYPshIiIiooqDGV+iUDE6OpsNfGONlcZWBm9LGqWlAd274577qyAuDnj11eCHpxSDXiIiIiISDHyJQsVYw9dXqXMsB74rVkgtspnGVoYuXSTwLSkpe31aGnY27onvvgMeeUQaWRERERER2YWBL1GoGBnfDh28b9OkiVzu3x/y4djOaGxVvbr5+3TtChw9CmRmll534ACwaxem/ZWATp2kIzMRERERkZ0Y+BKFSnq61NrWrOl9m1jN+FptbGUwOju7zvNdswYA8Pu+BEycaC2OJiIiIiIyg4EvUagYa/j6Uq+erIMba4Hvtm2yDJPZxlYG17V8nQ7+mQYAaHlGAs46y6bxERERERG5YOBLFCpbtvie3wtIB6bGjWOv1Pm33+RywABr92vdGqhVq0zGd/lHachGAzw2mZ2oiIiIiCg0GPgShUJeHrBzp/+MLyCBb6xlfKdOBbp3B3r3tnY/pUobXAFwOIBqm9KQ0zYBx3dS9o+TiIiIiAgMfIlCIyND5sFWxMD3r7+AZcswIW88tmUEEKx27Qps2ICiIuC2WzVOjEtDqxEJ9o+TiIiIiMiJgS9RKBgdnf2VOgOxF/hOnYqCKjXwVPqVuOkmie8t6dIF2LYNUyflY9/qXWhQko2qvRn4EhEREVHoMPAlCgVjDV8zGd8mTWJnju+RI9CffYZvq1yMqk0b4qefgG++sbiPrl2BkhJ89NgWXNVHGlshgYEvEREREYUOA1+iUEhPlyZOzZr539bI+FpOnUbAF19AHT6MSQXj8eGH0tT5jjuAQ4fM76Kooyxp1LlkA+4/yxn49ugRgsESEREREQkGvkShYCxlpEzMgW3cGCgsBI4cse/x58wpu1auXaZMwba6PZHRYiBGjgSmTAH27gUeftj8Ll77UQLfe8duRKPMNKBFC8l6ExERERGFCANfolBITzc3vxeQwBewr9w5Nxc491zgwQft2Z9hxQpgxQq8njsOV1ypUKWKZHxvuw2YPBlYvNj/LhYtAh5+qT4O1miOPrU2AGlpLHMmIiIiopBj4EtkN61LM75mGNlOuxpczZ0L5OcDqalASYk9+wSAKVNQWLUmPi65AldfXXr1M88ArVoB48dL4tqbnBzg8suBtm2BOn27AuvXA2vWMPAlIiIiopBj4Etkt717JetqNvA1Mr52Bb4zZ5bub906e/Z5+DDw+eeYVfcSdOrXoMyU3Lp1gTffBFatAiZM8L6L226TVZ6mTQPie3QFli4Fjh5l4EtEREREIcfAl8huxlJGkQh8S0qAWbOA/v3l59TU4PcJAJ9/DuTm4oWs8WWyvYZzzgHOPht44glg27byt3/5JfDJJ8CjjwKDBkGWNCoulhsZ+BIRERFRiDHwJbKblTV8gdJSZzvm+C5dKhnnO+6Q+mOHI/h9ag1MmYKdTXvhr/gBuPRSz5tNmgTExQG33FK2QXVGBnDjjcDAgcBjjzmv7Nq1dAN2dCYiIiKiEGPgS2S3LVukm3P79ua2b9hQtrcj45uSAlSpAowaBSQlSeAb7DJJy5cDf/2FSXnjMHqM8tqAuW1bme87ezYwfbpcV1wMXHmlJKI/+wyIj3du3EU6O+O444A6dYIbHxERERGRHwx8ieyWng60bg3UqGFu+ypVgAYN7At8Bw0CGjUCEhOBnTs91x5bMWUKiqrXwuTDl+Oqq3xvetttQJ8+wO23y9q+L74o1dZvveVW+d2xo/zeLHMmIiIiojBg4Etkt61bJZNpRePGwZc6Z2QAq1cDycnyc1KSXAZT7pyTA
[... base64 PNG image data truncated: matplotlib figure output ...]",
+       "text/plain": [
+        "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA9IAAALYCAYAAACQf8oMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABsx0lEQVR4nO3dd5hcZdnH8e+dntASehUSQJoBlIA0IYCR3gWkSC8qiICKoiC8gIoiTZCOgBRBkN4hEFQM0kEiXTpIDYFdQurz/vHMsJvNJtlNdudM+X6u61xn98xzZu4JR+GXp0VKCUmSJEmS1DE9ii5AkiRJkqRaYpCWJEmSJKkTDNKSJEmSJHWCQVqSJEmSpE4wSEuSJEmS1AkGaUmSJEmSOsEgLUlSA4uIvSMiRcTeRdciSVKtMEhLktQBEbFMKXDeUnQt9Swijiv9Obc+miLi8Yj4WUT064LPSBExqgvKlSQ1qF5FFyBJkgp1PfAg8HbRhbRxDfAfIIDFgO2AXwIbASOKK0uSJIO0JEkNLaU0DhhXdB3t+EtK6dryLxHxU+Ap4OsRsXFK6d7iSpMkNTqHdkuS1A0iYtmIuDgi3oiIiaXz2RGxUDtt942ImyLi1YiYEBHvR8SNETGsnbafz2mOiG0j4p+loc9PlF4fVXq9d2mY9Mul93w+Ir43s/drda08jP2SiFguIq6PiLER0RwR90TEajP4zruUhmB/FhFvRsRpEdG/K4ZSp5Q+BG4s/bpGm89dIyL+EBFjIuLjUp2PRcT3IiJatRseEan064Ztho+3/v49IuKAiPhX6c+2qfTnvMOcfAdJUv2wR1qSpC4WEesAdwD9gJuAl4EVge8Am0bEmqVgWPYH4HHgLuADYBlg21Lb4SmlB9v5mF2Ar5fe/+9M/+/0PwNrAbcDU4CdgT9ExKSU0gUd/CrLkId9jwH+CCxbquu+iFgppfROq+98IHBeqf6LgM+A7YHlO/hZnTGpze8HAFsBfwNuBeYBvkH+c/0icFip3SvA/wHHAq8Cl7R6jycASsH7z+Q/r/8Al5Ze3xL4a0QcllI6owu/iySpBhmkJUnqQhHRB7iKHF5XTyk90+q1nYC/AMcDh7S6beWU0stt3mcl4CHgRHJgbusbwCYppVEzKGVJ4EsppY9L73cG8DTwQ6CjQXpD4Kcppd+0qusE4GhgH+Ck0rVBwKnAR8BXUkqvla7/Ahjdwc+aqYiYnxziAR5o8/KvgO+llKa2at8LuAX4fkScllJ6NaX0CnBcRBwLvJJSOq6djzqQHKLPBg5NKU0pvd+RwL3AbyPimpTSW13xvSRJtcmh3ZIkda2tgC8AJ7UO0QAppWuAR8m9ya2vTxOiS9eeAe4DvlYK521dP5MQDXBUOUSX3u85cgBdISLm6eB3eRk4uc21i0rnNVtd2xaYCzi/HKJLn9lMXiBsduxcGpr+fxFxPvAssARwVkrp4dYNU0qvtQ7RpWuTgfPJ/62zUSc+92DynPHDyyG69H7NwAlAH8Ah3pLU4OyRliSpa321dF4lIo5r5/X+wIIRsWBK6X2AiFgO+Bk58C1ODmutLcD0q2o/Mos6Hm3n2hul80Dgk1ncD/BE24Da5j3KynOm2+t9/mcHPqc9O7Vz7cyU0qFtL0ZEX+BQ8l9QrADM3abJYh35wIgYAHwJeB34Wavp1WXl+e0rduT9JEn1yyAtSVLXmr903nMW7eYC3o+I5clDuOcB7iFvR9UETCVv+bQa0Led+99p59rnWvdGtzK5dO45i9rKpnuPlNLkUsBs/R7lHu732nmPdzv4WW3tlFK6NiJ6AysDZ5KHaT+dUjq/TdvrgC3IvdZXluqYTJ7jvRft//m1ZxB5u60vkOdRz8hcHf0SkqT6ZJCWJKlrlcPn5imlOzrQ/jBy7+7uKaUrW78QEV+lpbe3rTSD60Uo925PtyI5sPCcvHFKaRLwZERsDTwHnB4Rt6aU3gSIiDXJIfoOYMs286R3IQfpjir/s/tXSmntOalbklTfnCMtSVLXeqh07mgQW7Z0vqn1xYjoB3ylq4rqZk+Wzu1953W64gNK+10fSx4a37q3uPznd2s7w9DXm8HbTaWdXvmU0ifkXu2VOzGPXJLUgAzSkiR1rRvIc2x/HBHTBcvSvspfbXWpvDjXeq3aBPBr5rA3t4JuAj4FDoyIJcsXS3OOf96Fn3Mx+c9274hYpnRtuj+/0mevTV6Buz0fklc1b8+Z5KHq55b+MmMaEbFKRNTKPxdJUjdxaLckSZ2zWkRcMoPX/pFSurC0zdXtwD8j4i7yfsS9yHN2NyQvyrVZ6Z7zyFtJXRcRV5NXjP4aMBgYBQzvlm/RhVJKH0bEj8hbRj0REVfRso/0M8Cq5F7gOf2ciRFxEnl/6GOA/YB/kRde+1ZELAo8DAwBtiEH/B3beat7yauC30Dev3sKcFNK6SngHGBdYHfyiun3Av8jL1g2FPgyuZd9dud+S5LqgEFakqTOWZKZz7u9MKX0r4hYHTgS2Jy8Gven5BWv/1Q6AEgpPRoRm5G3VtoJmEgO0LvStb253SqldE5EfAT8FNif3Ot7NXl/6dfo2CrhHXEReYXzPSPiVymllyJiK+A35L21v0oO73sDb9F+kP5B6bwxsDV5hN4bwFMppQTsERG3l77HtsAA8uJuzwDfA/7dRd9FklSjIv/7QpIkqetFxCbk1chPTikdWXQ9kiR1BedIS5KkORYR85f2c259bT7gl6Vfb6x8VZIkdQ+HdkuSpK6wMXmBrjvJw6QXATYFFgWuSCk9UGRxkiR1JYO0JEnqCv8mL+K1IbAgeXGx58hzl88ssC5Jkrqcc6QlSZIkSeoEe6TnwIILLpiWWWaZwj6/ubmZueaaq7DPl1rzeVQ18XlUNfF5VDXxeVQ1qYXn8dFHH30/pbRQ2+sG6TmwzDLL8MgjjxT2+aNGjWL48OGFfb7Ums+jqonPo6qJz6Oqic+jqkktPI8R8Wp71121W5IkSZKkTjBIS5IkSZLUCQZpSZIkSZI6wSAtSZIkSVInGKQlSZIkSeoEg7QkSZIkSZ1gkJYkSZIkqRMM0pIkSZIkdYJBWpIkSZKkTjBIS5IkSZLUCQZpSZIkSZI6wSAtSZIkSVInGKQlSZIkSeoEg7QkSZIkSZ1gkJYkSZIkqRMM0pIkSZIkdYJBWpIkSZKkTjBIS5IkSZLUCQZpSZIkSZI6wSAtSZIkSVInGKQlSZIkSeoEg7QkSZIkSZ1gkJYkSZIkqRMqHqQjYqmIuDYixkXExxFxXUR8oYP39ouIkyPi7YgYHxGjI2KDdtodERE3l9qliDhuJu+5XUQ8HhGfRcSrEXF0RPScg68oSZIkSapjFQ3SETEAuBdYEdgL+DawPHBfRMzVgbe4CDgA+AWwFfA2cGdErN6m3QHAwsANs6hnU+CvwMPA5sAZwNHA
rzr0hSRJkiRJDadXhT/vAGAIsEJK6UWAiHgKeAE4CDh1RjdGxGrAbsC+KaWLS9fuB8YAxwPbtGq+SkppakT0Ar4zk3pOAv6RUjqw9Pt9ETE3cHREnJZS+t/sfElJkiRJUv2q9NDubYAHyyEaIKX0MvAAsG0H7p0EXN3q3snAVcCmEdG31fWpsyokIpYCVgcub/PSZUBvcg+1JEmSJEnTqHSP9CrAje1cHwPs1IF7X04pfdrOvX2A5Uo/d6YWgKdbX0wpvRwRnwIrd+K9GtLf/gaXXDJn79GvH5x4Isw/f5eUJEmSJEndrtJBen5gbDvXPwQGzcG95dc7WwszeM+xM3q/iDgQOBBgkUUWYdSoUZ382K7T1NRU6Of/+Mer8uSTAxk0aOJsv8e77/Zj/PiX2WuvV7uwMhWh6OdRas3nUdXE51HVxOdR1aSWn8dKB+mal1I6HzgfYNiwYWn48OGF1TJq1CiK+vypU+H552HffeHcc/vN9vtsvjncccdgzjtvMH36dGGBqrgin0epLZ9HVROfR1UTn0dVk1p+His9R3os7fc8z6i3uaP3QkvPdGdqYQbvOWg23q+hPPMMfPwxrLPOnL3P978P//sfXHdd19QlSZIkSd2t0kF6DC1zk1tbGfhPB+4dXNpCq+29E4EXp79llu9H23oiYhlgQAfqaWijR+fznAbpzTaDZZeFs86a85okSZIkqRIqHaRvAtaOiCHlC6Xgul7ptZm5mbya9ueLkpW2t9oFuCulNKEzhaSUXgOeBHZv89Ie5NXBb+/M+zWa0aPzAmHLLz9n79OjBxx8MDzwADz+eNfUJkmSJEndqdJB+gLgFeDGiNg2IrYhr+L9OnBeuVFELB0RkyPiF+VrKaXHyVtfnR4R+0fEJuStrwYDx7b+kIgYFhHfBHYoXVo5Ir5ZOlr3aP8M2DAizouI4RFxOHA0cIZ7SM/c6NGw9toQMefvtc8+MGAAnHnmnL+XJEmSJHW3igbplFIzsDHwPHm/5iuAl4GNU0pNrZoG0LOd+vYBLgZOBG4FlgI2Syk91qbdIcA1tOw5vVPp92uAhVvVcxvwTWBt4E7gcOBXwE/n5HvWu48+ynOk53RYd9nAgfDtb8OVV8IHH3TNe0qSJElSd6n4qt2lIdU7zqLNK+Qw3fb6eOCI0jGz+/cG9u5gPdcBLnXVCf/6Vz53VZAGOOQQOO88uPBC+MlPuu59JUmSJKmrVXpot+rA6NF5SPeaa3bde37pSzB8OJx9NkyZ0nXvK0mSJEldzSCtThs9Ogffeeft2vf9/vfhtdfg5pu79n0lSZIkqSsZpNUpU6fmod1dOay7bJttYKmlXHRMkiRJUnUzSKtTnn0Wxo3rniDdqxd897tw773wH3fxliRJklSlDNLqlNGj83nttbvn/Q84APr2hbPO6p73lyRJkqQ5ZZBWp4weDYMGwRe/2D3vv+CCsOuu8Kc/5Z5vSZIkSao2Bml1yoMP5t7oHt345BxyCDQ3wyWXdN9nSJIkSdLsMkirw8aNy3OXu2tYd9kaa+Q52GedlRc3kyRJkqRqYpBWh/3rX5BS9yw01tYhh8CLL8Jdd3X/Z0mSJElSZxik1WGjR0MEfPWr3f9Z3/wmLLqoW2FJkiRJqj4GaXXYgw/CKqvAvPN2/2f16QMHHQS33557piVJkiSpWhik1SFTp7YsNFYpBx2Ue8D/9KfKfaYkSZIkzYpBWh3y3HPw0UeVmR9dtthisNZacPfdlftMSZIkSZoVg7Q65MEH87mSQRpgxAh46KEc4iVJkiSpGhik1SGjR8PAgbDCCpX93BEj8rDy++6r7OdKkiRJ0owYpNUho0fn1bp7VPiJWXttmHtuh3dLkiRJqh4Gac3SuHEwZkzlh3UD9O4NG25okJYkSZJUPQzSmqWHH4aUignSkId3v/givPJKMZ8vSZIkSa0ZpDVLo0fnbai++tViPn/EiHy+555iPl+SJEmSWjNIa5ZGj4aVVoL55ivm81daCRZf3OHdkiRJkqqDQVozNXVq3vqqqGHdkHvDR4yAkSNzPZIkSZJUJIO0ZuqFF2Ds2GKDNOQg/cEH8PjjxdYhSZIkSQZpzdTo0fm89trF1vH1r+ezw7slSZIkFc0grZkaPTrPjV5ppWLrWGQRGDrUIC1JkiSpeAZpzdTo0Xm17h5V8KSMGAH/+Ad8+mnRlUiSJElqZFUQj1StPvkEnn66+PnRZSNGwMSJOUxLkiRJUlEM0pqhhx6ClIqfH122wQbQp4/DuyVJkiQVyyCtGSovNPbVrxZbR9mAAbDeegZpSZIkScUySGuGHnwwLzI2aFDRlbQYMQKefBLeeafoSiRJkiQ1KoO0Zuill2CVVYquYlojRuTzyJHF1iFJkiSpcRmkNUNNTTD33EVXMa0vfzn3kDu8W5IkSVJRDNKaoebm6gvSPXvCJpvkIJ1S0dVIkiRJakQGac1QczPMNVfRVUxvxAh480147rmiK5EkSZLUiAzSatekSXnP5moN0uDwbkmSJEnFMEirXc3N+VyNQXrwYFh2WYO0JEmSpGIYpNWucpCutjnSZSNGwKhRuedckiRJkirJIK12VXOPNOQg/ckn8K9/FV2JJEmSpEZjkFa7qj1Ib7QR9Ojh8G5JkiRJlWeQVruamvK5WoP0oEEwbJhBWpIkSVLlGaTVrmqfIw15ePdDD8G4cUVXIkmSJKmRGKTVrmof2g05SE+ZkhcdkyRJkqRKMUirXbUQpNdZJ9d3111FVyJJkiSpkRik1a5aCNJ9+sDw4XDnnUVXIkmSJKmRGKTVrvJiY9U8Rxpg883hpZfghReKrkSSJElSozBIq13lHukBA4qtY1Y23zyf77ij2DokSZIkNQ6DtNrV3Az9++e9mqvZkCGw/PJw++1FVyJJkiSpUVR5TFJRmpure350a5tvDvfdB+PHF12JJEmSpEZgkFa7mppqK0h/9hn87W9FVyJJkiSpERik1a7m5upfaKxsww2hXz+Hd0uSJEmqDIO02lVLQ7v798/bYBmkJUmSJFWCQVrtqqUgDXl49/PPw3//W3QlkiRJkuqdQVrtqqU50gCbbZbPboMlSZIkqbsZpNWuWpojDXkLrCFDHN4tSZIkqfsZpNWuWhvaHZGHd997L0yYUHQ1kiRJkuqZQVrtqrUgDXl496efwt//XnQlkiRJkuqZQVrTSan25kgDbLQR9Onj8G5JkiRJ3csgrelMnAhTptTWHGnIwX/DDQ3SkiRJkrqXQVrTaW7O51rrkYY8vPuZZ+DVV4uuRJIkSVK9MkhrOrUcpDffPJ/dBkuSJElSdzFIazpNTflci0F6xRVh6aUd3i1JkiSp+xikNZ1yj3StzZGGvA3WZpvByJF5rrckSZIkdTWDtKZTy0O7IQ/vbmqCBx4ouhJJkiRJ9cggrenUepDeeGPo3dvh3ZIkSZK6h0Fa06n1ID3PPLD++i44JkmSJKl7GKQ1nVpebKxs883h3/+GN94ouhJJkiRJ9cYgrenU8mJjZW6DJUmSJKm7GKQ1nVof2g2wyiqwxBIGaUmSJEldzyCt6TQ3522
k+vcvupLZF5F7pe++GyZNKroaSZIkSfXEIK3pNDXBgAE5jNayzTeHjz+G0aOLrkSSJElSPTFIazrNzbU9P7psk02gVy+Hd0uSJEnqWgZpTae5ubbnR5fNNx+suy7cdlvRlUiSJEmqJwZpTadegjTAFlvAk0/C668XXYkkSZKkemGQ1nSamuonSG+9dT7femuxdUiSJEmqHwZpTade5kgDrLQSDBkCN99cdCWSJEmS6oVBWtOpp6HdEblXeuTIlv2xJUmSJGlOGKQ1nXoK0gBbbQUTJuQwLUmSJElzyiCt6dTTHGmADTaAeed1eLckSZKkrmGQ1nTqaY40QJ8+sOmmcMstMHVq0dVIkiRJqnUGaU0jpfob2g15nvT//gePPlp0JZIkSZJqnUFa0/jssxym6y1Ib7459OiRe6UlSZIkaU4YpDWNpqZ8rrcgveCCsM46zpOWJEmSNOcM0ppGeYuoegvSkId3P/44vPFG0ZVIkiRJqmUGaU2jHKTrabGxsq23zmeHd0uSJEmaEwZpTaOee6RXWgkGDzZIS5IkSZozBmlNo56DdETulR45Ej79tOhqJEmSJNUqg7SmUa+LjZVtvXVemfyee4quRJIkSVKtMkhrGvU8Rxpggw1gnnlcvVuSJEnS7DNIaxr1PLQboE8f2HRTuPVWmDq16GokSZIk1SKDtKZR70Ea8vDut9+Gxx4ruhJJkiRJtcggrWnU+xxpgC22gB49HN4tSZIkafYYpDWN5mbo2RP69i26ku6z4IKwzjoGaUmSJEmzxyCtaTQ3597oiKIr6V5bbw2PPw5vvll0JZIkSZJqjUFa0ygH6Xq31Vb5fMstxdYhSZIkqfYYpDWNpqbGCNIrrwyDBzu8W5IkSVLnGaQ1jebm+t1DurWIPLx75Ej49NOiq5EkSZJUSwzSmkajDO2GHKQ/+yyHaUmSJEnqKIO0ptFIQXqDDWCeeRzeLUmSJKlzDNKaRqPMkQbo0wc23TQvODZ1atHVSJIkSaoVBmlNo1HmSJdtsw28/TY88kjRlUiSJEmqFQZpTaORhnYDbLkl9OwJ119fdCWSJEmSaoVBWtNotCA9//wwfLhBWpIkSVLHGaT1ualT81ZQjRSkAbbfHp57Dp55puhKJEmSJNUCg7Q+V95PudGC9Hbb5bO90pIkSZI6wiCtzzU353MjLTYGsMQSsNZaBmlJkiRJHWOQ1ufKQbrReqQhD+9+5BF4/fWiK5EkSZJU7QzS+lyjB2mAG24otAxJkiRJNaDiQToiloqIayNiXER8HBHXRcQXOnhvv4g4OSLejojxETE6IjZop12PiDgqIl6JiM8i4smI2LGddgMi4v8i4vnS+70eEX+KiGW64KvWnKamfG7EIL3CCrDSSg7vliRJkjRrFQ3SETEAuBdYEdgL+DawPHBfRHQkvl0EHAD8AtgKeBu4MyJWb9PuBOA44Cxgc+BB4JqI2KJNuwuBHwMXAFsARwMbACMjosFmCjfuHOmy7beHv/0NPvig6EokSZIkVbNK90gfAAwBtksp3ZBSuhHYBlgaOGhmN0bEasBuwOEppQtSSiOBnYHXgONbtVsY+BFwUkrpdyml+1JKBwH3ASe1ajegdP/pKaWTS+0uBb5TqnG9LvvWNaKRh3ZDDtJTpsDNNxddiSRJkqRqVukgvQ3wYErpxfKFlNLLwAPAth24dxJwdat7JwNXAZtGRN/S5U2BPsDlbe6/HBgaEYNLv/csHR+3afdR6dxw88cbPUivsQYstZTDuyVJkiTNXKXD4irA0+1cHwOs3IF7X04pfdrOvX2A5Vq1mwC82E47yp+TUvoEuAw4NCI2ioi5I2IV4GTgSWDkrL9OfWnkOdIAEXlP6bvuavlLBUmSJElqq1eFP29+YGw71z8EBs3BveXXy+ePUkppFu0A9gF+T563XfYvYERKaWJ7RUTEgcCBAIsssgijRo2aRdndp6mpqUs//8knlwSW44kn/sGLL07usvetJYMHD+Szz1bnd797mg03fL/ocmpKVz+P0pzweVQ18XlUNfF5VDWp5eex0kG62pwI7EGeU/0w8AXgWOD2iNgwpTRdv2RK6XzgfIBhw4al4cOHV67aNkaNGkVXfv4//pHPm266Pr17d9nb1pT114df/hJeeOFLHHts0dXUlq5+HqU54fOoauLzqGri86hqUsvPY6WD9Fja73meUW9z23uXnsG90NLjPBYYGBHRpld6mnalYdw/BfZPKV1UbhQR/wKeB/YHzphFTXWluRl696ZhQzRAr16w9dZ5nvTEidCnT9EVSZIkSao2lZ4jPYY8h7mtlYH/dODewaXVttveO5GWOdFjgL7Asu20o9XnDC2dH27dKKX0AnnBsZVmUU/daWpq3PnRrW2/PYwbBzU6ykSSJElSN6t0kL4JWDsihpQvRMQy5K2mbprFvTcDvYGdWt3bC9gFuCulNKF0+Q7y6t67t7l/D+Dp0irhAP8rnddq3SgivggMBN7s0DeqI83NjbuHdGsjRuS/UHD1bkmSJEntqfTQ7guAQ4AbI+JoIAEnAK8D55UbRcTSwEvA8Sml4wFSSo9HxNXA6RHRG3gZ+C4wmFahOaX0bkScChwVEZ8Aj5HD9sbkLbTK/k5enfuUiBgEPEKeI300MA64tOu/fnVrbrZHGqB/f9hsM7jxRvjDH6BHw22EJkmSJGlmKhoRSot3bUyeg3wZcAU5EG+cUmpq1TTIezy3rW8f4GLyImG3AksBm6WUHmvT7uelNj8A7iT3eO+cUrqlVS1TgE2AC8mrcN9Wuucx4Ksppdfm9PvWGoN0i+23h7ffhn/9q+hKJEmSJFWbiq/aXQqoO86izSvkMN32+njgiNIxs/unkEPxibNo9wHww9LR8Jwj3WLLLfPCY9dfD+usU3Q1kiRJkqqJg1b1OXukWwwcCBtvnIP0dDuSS5IkSWpoBml9zsXGprX99vDiizBmTNGVSJIkSaomBml9zh7paW27LUS4erckSZKkaRmk9TnnSE9rscVg7bUN0pIkSZKmZZDW5+yRnt7228Pjj8MrrxRdiSRJkqRqYZAWAFOmwIQJzpFua4cd8vmvfy22DkmSJEnVwyAtIPdGgz3SbS27LHzlK/CXvxRdiSRJkqRqYZAWYJCemZ12gocecni3JEmSpMwgLSAvNAYG6fbstFM+X3ttsXVIkiRJqg4GaQEtPdLOkZ5eeXj3NdcUXYkkSZKkamCQFuDQ7lnZeWeHd0uSJEnKDNICDNKz4vBuSZIkSWUGaQHOkZ6VIUNgjTVcvVuSJEmSQVolzpGetZ12gocfdni3JEmS1OgM0gIc2t0R5eHdLjomSZIkNTaDtACDdEeUh3cbpCVJkqTGZpAW4Bzpjtp55zy8++WXi65EkiRJUlEM0gJyj3TfvtCzZ9GVVDdX75YkSZJkkBaQg7QLjc3a4MEwbJird0uSJEmNzCAtIAdph3V3zE47wSOPOLxbkiRJalQGaQF5jrRBumNcvVuSJElqbAZpAfZId8bgwbDmmgZpSZIkqVEZpAU4R7qzysO7//vfoiuRJEmSVGkGaQH2SH
eWq3dLkiRJjcsgLcA50p21zDJ5eLerd0uSJEmNxyAtwB7p2bHzzvDoow7vliRJkhqNQVqAc6Rnxze/mc8uOiZJkiQ1FoO0AHukZ8cyy8BaaxmkJUmSpEZjkBYTJ8KkSQbp2bHTTg7vliRJkhqNQVo0N+ezQbrzyqt3X311sXVIkiRJqhyDtD4P0s6R7ryll4Z114U//7noSiRJkiRVikFa9kjPod12g3//Ox+SJEmS6p9BWgbpObTTTtCzJ1x5ZdGVSJIkSaoEg7Roaspng/TsWXhhGDEiD++eOrXoaiRJkiR1N4O0nCPdBXbfHV59FUaPLroSSZIkSd3NIC2HdneBbbeF/v3hiiuKrkSSJElSdzNIyyDdBeaZB7bZBv7yl7wntyRJkqT6ZZCWc6S7yO67wwcfwN13F12JJEmSpO5kkJY90l1k001h0CBX75YkSZLqnUFanwfpAQOKraPW9emTt8K64YaWP1NJkiRJ9ccgLZqbc4ju4dMwx3bfPf953nRT0ZVIkiRJ6i5GJ9HU5LDurrL++rDkkg7vliRJkuqZQVo0Nxuku0qPHrDrrnDHHfD++0VXI0mSJKk7GKRFczPMPXfRVdSP3XeHyZPh2muLrkSSJElSdzBIyx7pLrbqqrDyyg7vliRJkuqVQVrOke5iEbDbbvD3v8OrrxZdjSRJkqSuZpCWPdLdYNdd8/mqq4qtQ5IkSVLXM0jLOdLdYMgQWGcdh3dLkiRJ9cggLXuku8luu8FTT8HTTxddiSRJkqSuZJCWQbqb7Lwz9Oxpr7QkSZJUbwzSDS4lFxvrLgsvDCNG5CCdUtHVSJIkSeoqBukGN2ECTJ3qHOnusttueeXuf/6z6EokSZIkdRWDdINrbs5ne6S7x3bbQf/+cMUVRVciSZIkqasYpBucQbp7zTMPbLtt3gZrwoSiq5EkSZLUFQzSDa6pKZ8N0t1n771h7Fi45ZaiK5EkSZLUFQzSDc4e6e739a/D4ovDJZcUXYkkSZKkrmCQbnDlIO1iY92nZ0/YYw+4/XZ4552iq5EkSZI0pwzSDc4e6crYay+YMsU9pSVJkqR6YJBucM6RroyVV4Y113R4tyRJklQPDNINzh7pytlrL3jqKXjiiaIrkSRJkjQnDNINzjnSlfOtb0GfPnDppUVXIkmSJGlOGKQbnD3SlbPAArD11nDFFTBpUtHVSJIkSZpdBukG19QEEdCvX9GVNIa99oL33ssreEuSJEmqTQbpBtfcnHujI4qupDFsthksvLDDuyVJkqRaZpBucM3Nzo+upN69Yffd4eab4YMPiq5GkiRJ0uwwSDe4co+0KmevvfIc6T//uehKJEmSJM0Og3SDa2oySFfaaqvB6qs7vFuSJEmqVQbpBmePdDH22gseeQTGjCm6EkmSJEmdZZBucM6RLsZuu0GvXvZKS5IkSbXIIN3g7JEuxsILwxZbwOWXw+TJRVcjSZIkqTMM0g3OIF2cvfaCt9+Gu+8uuhJJkiRJnWGQbnAuNlacLbeE+ed3eLckSZJUawzSDc4e6eL07Qu77go33AAffVR0NZIkSZI6yiDdwFKCTz91sbEi7b03TJgAV19ddCWSJEmSOsog3cDGj89h2h7p4qyxBqy8MlxySdGVSJIkSeoog3QDa2rKZ4N0cSJgn33gwQfdU1qSJEmqFQbpBtbcnM8G6WLttRf07g0XXlh0JZIkSZI6wiDdwMpB2jnSxVpoIdhuO/jTn/J8aUmSJEnVzSDdwOyRrh777w8ffgjXX190JZIkSZJmxSDdwJwjXT2+/nVYZhm44IKiK5EkSZI0KwbpBmaPdPXo0QP22w/uvRdeeqnoaiRJkiTNjEG6gTlHurrsvXcO1BddVHQlkiRJkmbGIN3A7JGuLksuCVtsARdfDJMnF12NJEmSpBkxSDcw50hXnwMOgP/9D269tehKJEmSJM2IQbqB2SNdfbbYAhZbzEXHJEmSpGpmkG5gzc3Qqxf06VN0JSrr1Qv22Qduvx3eeKPoaiRJkiS1xyDdwJqbc290RNGVqLX99oOpU/NcaUmSJEnVxyDdwJqaHNZdjYYMgU02yat3T51adDWSJEmS2jJIN7Byj7SqzwEHwKuvwj33FF2JJEmSpLYM0g2sudk9pKvVdtvBAgu46JgkSZJUjQzSDcwe6erVty/suSfceCO8+27R1UiSJElqzSDdwAzS1W3//WHSJPjTn4quRJIkSVJrBukG5mJj1W3llWHddeHCCyGloquRJEmSVGaQbmD2SFe/Aw6A556Df/yj6EokSZIklRmkG5iLjVW/nXaCeed10TFJkiSpmhikG5g90tVvrrlgt93gmmtg7Niiq5EkSZIEBumGNXUqjB9vkK4F3/kOfPYZXHJJ0ZVIkiRJAoN0w2puzmeDdPVbbbW86Ng55+S/AJEkSZJULIN0gyoHaedI14aDD4YXXoCRI4uuRJIkSZJBukHZI11bdtwRFloIzj676EokSZIkGaQbVFNTPhuka0PfvrD//nDTTfDaa0VXI0mSJDU2g3SDske69hx0EKQE559fdCWSJElSYzNINyjnSNeepZeGrbbKe0pPnFh0NZIkSVLjMkg3KHuka9PBB8O778J11xVdiSRJktS4DNINyjnStWnECFh2WfjDH4quRJIkSWpcBukGZY90berRA777XfjHP+Cpp4quRpIkSWpMBukG5Rzp2rXPPtCvH5xzTtGVSJIkSY2p4kE6IpaKiGsjYlxEfBwR10XEFzp4b7+IODki3o6I8RExOiI2aKddj4g4KiJeiYjPIuLJiNhxBu85KCJOj4jXImJCRLwREZfM4desevZI167554ddd4XLLoOPPy66GkmSJKnxVDRIR8QA4F5gRWAv4NvA8sB9EdGRSHcRcADwC2Ar4G3gzohYvU27E4DjgLOAzYEHgWsiYos29QwC/gF8HTgaGAH8CPik89+utjQ3Q58+0KtX0ZVodnzve/mf4Z/+VHQlkiRJUuOpdIw6ABgCrJBSehEgIp4CXgAOAk6d0Y0RsRqwG7BvSuni0rX7gTHA8cA2pWsLk8PwSSml35Vuvy8ilgNOAm5r9ba/BuYGhqaUWvftXTWH37PqNTXZG13Lhg2DNdeEs8/OK3lHFF2RJEmS1DgqPbR7G+DBcogGSCm9DDwAbNuBeycBV7e6dzI59G4aEX1LlzcF+gCXt7n/cmBoRAwGKPWA7wlc2CZEN4TmZoN0rfve9+CZZ+D++4uuRJIkSWoslQ7SqwBPt3N9DLByB+59OaX0aTv39gGWa9VuAvBiO+1o9TlrAP2Bd0pztsdHRFNE3FAO2/WsudmFxmrdLrvk+dJnn110JZIkSVJjqfTQ7vmBse1c/xAYNAf3ll8vnz9KKaVZtFu8dP4dcDu5x3sh8nDvURHxpZTSdHOlI+JA4ECARRZZhFGjRs2i7O7T1NQ025//2mtDmTq1D6NGPdq1RamiRowYwrXXLsm11z7IggtOLLSWOXkepa7m86hq4vOoauLzqGpSy89jIy81Ve6N/y/wrXLwjoiXyIuT7QFMt8FQSul84HyAYcOGpeHDh
1ek2PaMGjWK2f38vn1h0UWZ7ftVHZZaCq6+GsaMWZdjjy22ljl5HqWu5vOoauLzqGri86hqUsvPY6WHdo+l/Z7nGfU2d/ReaOlxHgsMjJhu+aW27T4onUe27r1OKf0L+Bj48izqqWnOka4Pyy4Lm20G558PkyYVXY0kSZLUGCodpMeQ5zC3tTLwnw7cO7i0hVbbeyfSMid6DNAXWLaddrT6nDHM3NRZvF7TnCNdP773PXjrLbj++qIrkSRJkhpDpYP0TcDaETGkfCEilgHWK702MzcDvYGdWt3bC9gFuCulNKF0+Q7y6t67t7l/D+Dp0irhpJTeAB4BRrTuvY6IdYB5gYc7++VqiT3S9WOLLXLP9BlnFF2JJEmS1BgqHaQvAF4BboyIbSNiG+BG4HXgvHKjiFg6IiZHxC/K11JKj5O3vjo9IvaPiE3IW18NBo5t1e5d8n7UR0XEERExPCLOATYGjmpTz0/JPdXXRsTmEbEn8BfgWeDKLv7uVcV9pOtHz55w6KHwz3/CQw8VXY0kSZJU/yoapFNKzeRA+zxwGXAF8DKwcUqpqVXTAHq2U98+wMXAicCtwFLAZimlx9q0+3mpzQ+AO8k93junlG5pU89IYGvgC8D1wGnAfcDwlNL4OfqyVc4e6fqyzz4w77xw+ulFVyJJkiTVv4qv2p1Seg3YcRZtXiGH6bbXxwNHlI6Z3T+FHKRP7EA9t5O3v2oYkyfDxInOka4n88wD++0HZ54Jv/0tLLlk0RVJkiRJ9avSQ7tVBZqb89ke6fry/e/D1Klw9tlFVyJJkiTVN4N0A2oqDaI3SNeXwYNhu+3gvPPg00+LrkaSJEmqXwbpBmSPdP067DD48EO47LKiK5EkSZLql0G6AZWDtHOk68/668NXvpIXHZta1zuhS5IkScUxSDcge6TrV0TulX72Wbj77qKrkSRJkuqTQboBGaTr2y67wKKLuhWWJEmS1F0M0g3IxcbqW58+cPDBcMcd8MwzRVcjSZIk1R+DdAOyR7r+HXQQ9O0LZ5xRdCWSJElS/TFINyAXG6t/Cy0Ee+wBf/oTfPBB0dVIkiRJ9cUg3YDskW4MP/gBjB8PF1xQdCWSJElSfTFIN6DyHOkBA4qtQ91r6FDYZBM46yyYNKnoaiRJkqT6YZBuQM3N0K8f9OxZdCXqbocdBm++CddeW3QlkiRJUv0wSDeg5mbnRzeKLbaA5ZeH006DlIquRpIkSaoPBukG1Nzs/OhG0aNHniv98MMwenTR1UiSJEn1wSDdgJqaDNKNZK+9YOBAOOWUoiuRJEmS6oNBugHZI91Y5p4bvvc9uP56eO65oquRJEmSap9BugE5R7rxHHoo9Oljr7QkSZLUFQzSDcge6cazyCKw995w6aXw9ttFVyNJkiTVNoN0A3KOdGP60Y/yftK//33RlUiSJEm1zSDdgOyRbkzLLQc77gjnnAMff1x0NZIkSVLtMkg3IOdIN64jj4Rx4+D884uuRJIkSapdBukGk5I90o1szTVho43gtNNg4sSiq5EkSZJqk0G6wUycCJMnG6Qb2ZFHwltvwRVXFF2JJEmSVJsM0g2muTmfDdKNa9NNYdVV4eSTYerUoquRJEmSao9BusEYpBWRe6WfeQZuuaXoaiRJkqTaY5BuMOUg7WJjjW3nnWHppeG3vy26EkmSJKn2GKQbjD3SAujdG444Ah54IB+SJEmSOs4g3WCamvLZIK399oP557dXWpIkSeosg3SDsUdaZXPNBYccAjfdlOdLS5IkSeoYg3SDcY60WjvkEOjfP6/gLUmSJKljDNINxh5ptbbQQrDvvnD55fDmm0VXI0mSJNUGg3SDcY602vrhD2HKFDjttKIrkSRJkmqDQbrB2COttgYPhm99C849F95/v+hqJEmSpOpnkG4wzc0QkefFSmU//zl8+imcfnrRlUiSJEnVzyDdYJqbYcAA6OE/ebWy8sqw445w5pnw0UdFVyNJkiRVN+NUg2lqcli32nf00fDxx/D73xddiSRJklTdDNINprnZIK32rbYabLNNHt798cdFVyNJkiRVL4N0g2ludg9pzdgxx8DYsXD22UVXIkmSJFUvg3SDsUdaMzNsGGy2GZxySssK75IkSZKmZZBuMM6R1qwcc0zeBuu884quRJIkSapOBukGY4+0ZmXddWHjjeHkk2H8+KKrkSRJkqqPQbrBGKTVEcccA//7H1x0UdGVSJIkSdXHIN1gXGxMHbHhhrD++vCb38CECUVXI0mSJFUXg3SDsUdaHRGRe6XfeAMuvbToaiRJkqTqYpBuICm52Jg6bsQIWGst+PWvYdKkoquRJEmSqodBuoF89lkO0wZpdUS5V/qVV+Dyy4uuRpIkSaoeBukGUt4X2DnS6qgtt4Qvfxl+9SuYPLnoaiRJkqTq0OEgHRE9I2K1iFioOwtS9ykHaXuk1VHlXukXX4Srry66GkmSJKk6dKZHOgGPAF/uplrUzZqa8tkgrc7Ydlv40pfgxBNhypSiq5EkSZKK1+EgnVKaCrwOGMNqlD3Smh09esCxx8Kzz8Kf/1x0NZIkSVLxOjtH+jzgsIjo0x3FqHs5R1qza4cdYPXV4bjjXMFbkiRJ6tXJ9vMAywL/jYg7gLfJQ77LUkrp2K4qTl3LHmnNrh494IQTYOut877S++9fdEWSJElScTobpH/W6ud923k9AQbpKuUcac2JLbeEr34Vjj8evv1t6Nu36IokSZKkYnRqaHdKqccsjp7dVajmnD3SmhMRecGx11+HCy4ouhpJkiSpOO4j3UCcI605tckmsOGG8MtfwqefFl2NJEmSVIxZBumImBoRUzp4TK5E0Zo99khrTkXkudL/+x+cfXbR1UiSJEnF6Mgc6eOZdkEx1aimprxolHNbNSe+9jXYdFM46SQ46CCYZ56iK5IkSZIqa5ZBOqV0XAXqUAU0N+fe6IiiK1GtO+EEWGstOOMMOProoquRJEmSKss50g2kHKSlObXmmrDttvC738HYsUVXI0mSJFWWQbqBNDe70Ji6zvHHw7hxcMopRVciSZIkVZZBuoHYI62utOqqsPPOcPrp8N57RVcjSZIkVY5BuoE0NRmk1bWOOw7Gj4ff/KboSiRJkqTKMUg3EHuk1dVWWgn22AP+8Ad4//0+RZcjSZIkVYRBuoE4R1rd4Re/gMmT4Yorli66FEmSJKkiDNINxB5pdYdll4V994VbblmM//636GokSZKk7meQbiDOkVZ3+cUvoGfP5J7SkiRJaggG6QZij7S6yxJLwI47vsGf/wyPPlp0NZIkSVL3Mkg3iKlT4dNPnSOt7rPrrq+xwAJw5JGQUtHVSJIkSd3HIN0gxo/PZ3uk1V3mnnsKxxwD994Ld95ZdDWSJElS9zFIN4impnw2SKs7fec7MHgw/OQnMGVK0dVIkiRJ3cMg3SCam/PZIK3u1Lcv/OpX8NRTcMUVRVcjSZIkdQ+DdIMoB2nnSKu77bwzrLEGHH00fPZZ0dVIkiRJXc8g3SDskVal9OgBJ58Mr78OZ55ZdDWSJElS1zNINwjnSKuSNtoINt88D/P+
8MOiq5EkSZK6lkG6QdgjrUo76SQYNy6HaUmSJKmeGKQbhEFalbbqqrDXXnl496uvFl2NJEmS1HUM0g3CxcZUhOOPz3Omjzmm6EokSZKkrmOQbhD2SKsISy0Fhx4Kl18OTzxRdDWSJElS1zBINwgXG1NRjjoKBg2Cn/yk6EokSZKkrmGQbhDNzdCrF/TpU3QlajQDB8LPfw533QV33ll0NZIkSdKcM0g3iOZm50erOAcfDMsuC0ccAZMnF12NJEmSNGcM0g2iudlh3SpO377wu9/Bf/4D555bdDWSJEnSnDFIN4imJoO0irXttrDxxnDssfDhh0VXI0mSJM0+g3SDsEdaRYuA00+Hjz6C444ruBhJkiRpDhikG4RzpFUNhg6Fgw6Cs8+GMWOKrkaSJEmaPQbpBmGPtKrF8cfDPPPkhcdSKroaSZIkqfMM0g3COdKqFgsumOdJ33UX3Hpr0dVIkiRJnWeQbhD2SKuaHHwwrLBC7pWeOLHoaiRJkqTOMUg3COdIq5r07g2nngovvABnnVV0NZIkSVLnGKQbhD3SqjZbbAGbbZbnTL/3XtHVSJIkSR1nkG4AU6bAZ58ZpFV9Tj01z98/5piiK5EkSZI6ziDdAJqb89kgrWqz0kp5vvQFF8CTTxZdjSRJktQxBukGYJBWNTvuOBg0CA47zO2wJEmSVBsM0g2gHKRdbEzVaNCgPE961Ci47rqiq5EkSZJmzSDdAOyRVrU78EAYOhQOP7zleZUkSZKqlUG6ATQ15bNBWtWqVy84+2x4/XU44YSiq5EkSZJmziDdAOyRVi1Yf33Ye2845RR45pmiq5EkSZJmzCDdAJwjrVrxm9/k5/Tgg114TJIkSdXLIN0A7JFWrVh4Yfj1r+G+++DPfy66GkmSJKl9BukG4Bxp1ZIDDoBhw+CHP4Rx44quRpIkSZqeQboB2COtWtKzJ5xzDrzzDhx7bNHVSJIkSdMzSDcAg7RqzbBh8J3vwJlnwhNPFF2NJEmSNC2DdANoboa+ffMWQ1Kt+OUvYYEF4Hvfg6lTi65GkiRJamGQbgBNTfZGq/YMGgS//S2MHg2XXFJ0NZIkSVILg3QDaG42SKs27bln3l/6yCPhgw+KrkaSJEnKDNINoLnZPaRVm3r0gLPPho8+gp/9rOhqJEmSpKziQToiloqIayNiXER8HBHXRcQXOnhvv4g4OSLejojxETE6IjZop12PiDgqIl6JiM8i4smI2HEW771uREyNiBQRdTWb2B5p1bKhQ+HQQ+GCC+Chh4quRpIkSapwkI6IAcC9wIrAXsC3geWB+yKiI1HvIuAA4BfAVsDbwJ0RsXqbdicAxwFnAZsDDwLXRMQWM6irN3Ae8E7nvlFtcI60at3//R8sthgceCBMmlR0NZIkSWp0le6RPgAYAmyXUrohpXQjsA2wNHDQzG6MiNWA3YDDU0oXpJRGAjsDrwHHt2q3MPAj4KSU0u9SSvellA4C7gNOmsHb/xgI4I9z9O2qlD3SqnXzzANnnQVPPgmnnlp0NZIkSWp0lQ7S2wAPppReLF9IKb0MPABs24F7JwFXt7p3MnAVsGlE9C1d3hToA1ze5v7LgaERMbj1xYhYFjga+F7p/euOc6RVD7bfHnbYAY47Dl54oehqJEmS1MgqHaRXAZ5u5/oYYOUO3PtySunTdu7tAyzXqt0E4MV22tHO55wLXJNS+tssPr9m2SOtenHmmXlP9AMPhJSKrkaSJEmNqtKLas0PjG3n+ofAoDm4t/x6+fxRStP9Z3bbdkTEHsAawO6z+OzPRcSBwIEAiyyyCKNGjerorV2uqampQ58/btx6fPTRO4wa1fbvFqSu09HncU7tv/9inHLKChx55LNsueX/uv3zVJsq9TxKHeHzqGri86hqUsvPY12tTt0ZETE/cCrws5TSux29L6V0PnA+wLBhw9Lw4cO7p8AOGDVqFB35/M8+gy9+cUmGD1+y+4tSw+ro8zinNtgAHnkELrxwRY44YkUWW6zbP1I1qFLPo9QRPo+qJj6Pqia1/DxWemj3WNrveZ5Rb3NH74WWHuexwMCIiFm0O5G86vdfImJgRAwE+pVem6+Dq4hXvUmT8uHQbtWLHj3g/PNh/Pi8LZYkSZJUaZUO0mPIc5jbWhn4TwfuHVzaQqvtvRNpmRM9BugLLNtOO1p9zsrAqsAH5PA9FvhJ6bX3gStmUU9NaG7OZxcbUz354hfh2GPh2mvhhhuKrkaSJEmNptJB+iZg7YgYUr4QEcsA65Vem5mbgd7ATq3u7QXsAtyVUppQunwHefXttvOe9wCeLq0SDnAYsFGb49LSa18nr+Rd88pB2h5p1Zsf/QhWXRUOPhjGjSu6GkmSJDWSSs+RvgA4BLgxIo4GEnAC8DpwXrlRRCwNvAQcn1I6HiCl9HhEXA2cHhG9gZeB7wKDaRWaU0rvRsSpwFER8QnwGDlsb0zeQqvc7om2xUXE8NKP95e21qp5TU35bJBWvendGy68ENZeG376UzjnnKIrkiRJUqOoaI90SqmZHGifBy4jD59+Gdg4pdTUqmkAPdupbx/gYvL85luBpYDNUkqPtWn381KbHwB3knu8d04p3dKlX6gG2COterbmmnDYYXDuufD3vxddjSRJkhpFxVftTim9Buw4izavkMN02+vjgSNKx8zun0IO0id2srbjgOM6c0+1c4606t3xx8N118EBB8ATT0C/frO8RZIkSZojlZ4jrQqzR1r1bq654Lzz4Lnn4MRO/dWZJEmSNHsM0nXOOdJqBN/4Buy5J5x0Ejz6aNHVSJIkqd4ZpOucPdJqFKefDossAnvtBRMmzLK5JEmSNNsM0nXOOdJqFIMG5VW8x4zJe0xLkiRJ3cUgXefskVYj2Xxz2H9/OPlkePDBoquRJElSvTJI17nyHOkBA4qtQ6qUU06BJZfMQ7w//bToaiRJklSPDNJ1rrkZ+veHHv6TVoOYd164+GJ4/nn4+c+LrkaSJEn1yHhV55qbnR+txrPxxnDwwXDGGfC3vxVdjSRJkuqNQbrONTc7P1qN6Te/gSFDYO+9W6Y4SJIkSV3BIF3nDNJqVHPNBZdcAq+8AkceWXQ1kiRJqicG6TrX1GSQVuNaf304/HA45xy4556iq5EkSVK9MEjXOXuk1ehOPBFWXBH23RfGjSu6GkmSJNUDg3Sdc7ExNbr+/eHSS+HNN3PvtCRJkjSnDNJ1zh5pCdZaC37607wt1vXXF12NJEmSap1Bus45R1rKjj0W1lgD9t8/905LkiRJs8sgXefskZayPn3gyivhs89gr71g6tSiK5IkSVKtMkjXsZScIy219sUvwhlnwMiRcOqpRVcjSZKkWmWQrmMTJ8KUKfZIS63ttx/ssAP87Gfw2GNFVyNJkqRaZJCuY01N+WyQllpEwAUXwMILw2675VEbkiRJUmcYpOtYOSAYpKVpzT8/XHYZPP88HHFE0dVIkiSp1hik61g5SDtHWpreRhvBj38M558PN9xQdDWSJEmqJQbpOmaPtDRzJ5zQsiXWW28VXY0kSZJqhUG6jjlHWpq5Pn3giitg/Hj
Yc0+3xJIkSVLHGKTrmD3S0qytsIJbYkmSJKlzDNJ1zDnSUse03hLr0UeLrkaSJEnVziBdx+yRljqmvCXWoovCTjvBRx8VXZEkSZKqmUG6jhmkpY6bf364+mp4/XXYd19IqeiKJEmSVK0M0nXMxcakzllnHTjpJLj+evj974uuRpIkSdXKIF3HmpvzkNX+/YuuRKodRxwB22yT95h+6KGiq5EkSVI1MkjXsebm3BsdUXQlUu2IgEsugcUXh513hg8/LLoiSZIkVRuDdB0rB2lJnTNoEPzlL/DWW7DPPs6XliRJ0rQM0nWsqckgLc2utdaCk0+Gm25yf2lJkiRNyyBdx+yRlubMoYfm/aV/+lMYPbroaiRJklQtDNJ1rLkZ5p676Cqk2hUBF10ESy0Fu+wCH3xQdEWSJEmqBgbpOmaPtDTnBg6Ea66Bd96BPfeEqVOLrkiSJElFM0jXMedIS11jjTXyPOnbbsv7TEuSJKmxGaTrmD3SUtf53vdg113h6KPhjjuKrkaSJElFMkjXMedIS10nAi68EIYOzYH6pZeKrkiSJElFMUjXMXukpa41YABcf30O1dtvn/83JkmSpMZjkK5TKRmkpe4wZAhcdRWMGQP77Zf/tyZJkqTGYpCuU+PH5//AN0hLXe8b34Bf/hKuvhpOOaXoaiRJklRpBuk6VR5y6hxpqXv85CfwzW/m8z33FF2NJEmSKskgXafKQdoeaal7RMAf/wgrrgjf+ha88krRFUmSJKlSDNJ1yiAtdb955oEbboDJk2GHHfKUCkmSJNU/g3SdamrKZ4O01L2WXx6uuAKeeAIOPNDFxyRJkhqBQbpOLbUUnHYafOlLRVci1b8tt4TjjoPLL4czzii6GkmSJHW3XkUXoO6x+OJw2GFFVyE1jqOPhscfhx/+EFZYATbfvOiKJEmS1F3skZakLtCjB1x2GQwdCrvskveZliRJUn0ySEtSF5l7brj55rw2wdZbw3vvFV2RJEmSuoNBWpK60FJLwY03wttvw/bbw4QJRVckSZKkrmaQlqQuttZacOml8MADruQtSZJUj1xsTJK6wc47w7PPwrHHwkorwU9/WnRFkiRJ6ioGaUnqJscck8P0UUfllby3377oiiRJktQVHNotSd0kAi66CL76Vdhjj7w9liRJkmqfQVqSulH//nDDDbDAAnkl77feKroiSZIkzSmDtCR1s0UXhVtugXHjYJttoKmp6IokSZI0JwzSklQBq64KV12Vh3fvvDNMmlR0RZIkSZpdBmlJqpAtt4Rzz4Xbb4fvfMdtsSRJkmqVq3ZLUgUdcAC8/jqccAIstRQcd1zRFUmSJKmzDNKSVGH/93/wxhv5vOSSsP/+RVckSZKkzjBIS1KFRcB558Hbb+ch3osvDltsUXRVkiRJ6ijnSEtSAXr3hmuugdVWg512gocfLroiSZIkdZRBWpIKMvfccOutsPDCeSGyl14quiJJkiR1hEFakgq06KJwxx0wdSpsthm8917RFUmSJGlWDNKSVLAVVoCbb84LkG21FTQ1FV2RJEmSZsYgLUlVYJ114Kqr4JFHYLvtYMKEoiuSJEnSjBikJalKbLst/PGPMHIk7LorTJ5cdEWSJElqj0FakqrIXnvBGWfA9dfn/aWnTi26IkmSJLXlPtKSVGUOPRQ++giOPRbmmw9OPz3vPS1JkqTqYJCWpCp0zDE5TJ92GgwaBMcdV3RFkiRJKjNIS1IVioBTTslh+v/+DwYOhMMOK7goSZIkAQZpSapaEXD++fDxx3D44XmY9z77FF2VJEmSDNKSVMV69YIrroBPPsmLj807L+y4Y9FVSZIkNTZX7ZakKte3L1x3Hay9dt4W67bbiq5IkiSpsRmkJakGzDUX3HorDB0KO+wAd95ZdEWSJEmNyyAtSTVi4EC4+25YaSXYdlu4666iK5IkSWpMBmlJqiHzzw/33AMrrpjD9D33FF2RJElS4zFIS1KNWWCBHKCXXx622QbuvbfoiiRJkhqLQVqSatCCC8LIkTBkCGy1Fdx/f9EVSZIkNQ6DtCTVqIUWymF6mWVgiy3g738vuiJJkqTGYJCWpBq2yCJ5aPcXvgCbbw4PPFB0RZIkSfXPIC1JNW7RRXOYXmIJ2Gwzw7QkSVJ3M0hLUh1YbDG47z5YfHH4xjdcgEySJKk7GaQlqU4svnhedGzIkDxn+rbbiq5IkiSpPhmkJamOLLoojBoFq6wC220Hf/1r0RVJkiTVH4O0JNWZBRbIq3kPGwa77AKXX150RZIkSfXFIC1JdWjgQLjrLthgA9hzTzj//KIrkiRJqh8GaUmqU3PPDbfemrfFOuggOOOMoiuSJEmqDwZpSapj/fvD9dfDDjvAYYfBr35VdEWSJEm1zyAtSXWuTx+4+mrYfXf4+c/hqKMgpaKrkiRJql29ii5AktT9evWCSy/Nw71POgnefRfOOy9flyRJUuf4n1CS1CB69oRzzoGFF4YTToD33oOrroIBA4quTJIkqbY4tFuSGkgEHH88nHUW3HILfOMbMHZs0VVJkiTVFoO0JDWggw/O86Yffhi+9jV4882iK5IkSaodBmlJalA77QS33w6vvQbrrgvPPlt0RZIkSbXBIC1JDWzjjWHUKPjsM1h/ffjXv4quSJIkqfoZpCWpwX3lK/DAAzDffDlY33FH0RVJkiRVN4O0JInllsth+otfhK22ggsuKLoiSZKk6mWQliQBsOii8Le/5ZW8DzwQfvITmDq16KokSZKqj0FakvS5eeaBm26C734Xfvtb2HlnGD++6KokSZKqS6+iC5AkVZdeveAPf4Dll4cf/hDeeANuvBEWWaToyiRJkqqDPdKSpOlEwOGHw3XXwVNPwdprw3/+U3RVkiRJ1cEgLUmaoe22g/vvz8O7110XRo4suiJJkqTiGaQlSTO15pp5f+kll4TNNoM//rHoiiRJkoplkJYkzdLSS+ftsTbaCPbbD444AiZPLroqSZKkYhikJUkdMt98cOut8P3vw2mnwRZbwNixRVclSZJUeRUP0hGxVERcGxHjIuLjiLguIr7QwXv7RcTJEfF2RIyPiNERsUE77XpExFER8UpEfBYRT0bEjm3aLBYRv46IRyLio4h4LyJGtvd+kqSsd2/4/e/hwgth1ChYay0XIZMkSY2nokE6IgYA9wIrAnsB3waWB+6LiLk68BYXAQcAvwC2At4G7oyI1du0OwE4DjgL2Bx4ELgmIrZo1WYNYBfgRmAnYG/gM2BURGzV+W8nSY1jv/1ykP7kE/jqV/Pe05IkSY2i0vtIHwAMAVZIKb0IEBFPAS8ABwGnzujGiFgN2A3YN6V0cena/cAY4Hhgm9K1hYEfASellH5Xuv2+iFgOOAm4rXTtH8AXU0qTW33GnaX3OxK4pSu+sCTVq3XXhUceySt7b7cdnHAC/OxneessSZKkelbpod3bAA+WQzRASull4AFg2w7cOwm4utW9k4GrgE0jom/p8qZAH+DyNvdfDgyNiMGlez9qHaJbvd8TwBKd+1qS1JiWXBL+/nfYdVc4+mjYZRdobi66KkmSpO5V6S
C9CvB0O9fHACt34N6XU0qftnNvH2C5Vu0mAC+2046ZfU5E9AHWAZ6ZRS2SpJL+/eHyy+G3v4Vrr4X11oP//rfoqiRJkrpPpYP0/EB7a7x+CAyag3vLr5fPH6WU0izatec4YEngN7OoRZLUSgT8+Mdw223w6quwxhpwixNkJElSnar0HOmqFRG7AT8FTkgp/X0m7Q4EDgRYZJFFGDVqVGUKbEdTU1Ohny+15vMogH794A9/6Mdxx63C1lvPw+67v8o++7xMz56VrcPnUdXE51HVxOdR1aSWn8dKB+mxtN/zPKPe5rb3Lj2De6Glx3ksMDAiok2vdNt2n4uIrYFLgItSSsfOrIiU0vnA+QDDhg1Lw4cPn0XZ3WfUqFEU+flSaz6Pam2HHfJ+0xdeuDRvv700f/4zLLxw5T7f51HVxOdR1cTnUdWklp/HSg/tHkOew9zWysCsdiIdAwwubaHV9t6JtMyJHgP0BZZtpx1tPyciNgGuAa4nrxwuSZpD/frBBRfAH/8I//wnfPnL8MADRVclSZLUNSodpG8C1o6IIeULEbEMsF7ptZm5GehN3vO5fG8v8l7Qd6WUJpQu30Fe3Xv3NvfvATxdWiW8fP865H2kRwJ7pJSmzsZ3kiTNwD77wOjReUGy4cPh9NNhuhUsJEmSakylh3ZfABwC3BgRRwMJOAF4HTiv3CgilgZeAo5PKR0PkFJ6PCKuBk6PiN7Ay8B3gcG0Cs0ppXcj4lTgqIj4BHiMHLY3prTXdOkzVgRuBd4HTgbWiFabn6aUHuzyby9JDWj11fN+03vvDYcfnnuoL7wQ5p236MokSZJmT0WDdEqpOSI2Bk4DLgOC3Bt8WEqpqVXTAHoyfY/5PsAvgROBgcCTwGYppcfatPs50AT8AFgUeA7YOaXUeg3ZtcnztQcB97VTbrRzTZI0GwYOhOuvh5NPhqOOgsceg6uugmHDiq5MkiSp8yo9tJuU0msppR1TSvOmlOZJKW2XUnqlTZtXUkqRUjquzfXxKaUjUkqLppT6pZS+mlIa1c5nTEkpnZhSWjql1DeltGpK6do2bS4pfUa7Rzd8dUlqaBFw5JFw//0wcSKsuy6ceipMdVKNJEmqMRUP0pKkxrb++vDEE7DVVvDDH+bzu+8WXZUkSVLHGaQlSRU3//zw17/C2WfDvffCaqvByJFFVyVJktQxBmlJUiEi4LvfhYcegkGDYMQI+NnPYNKkoiuTJEmaOYO0JKlQq64KDz8M++0Hv/41bLABvPzyrO+TJEkqikFaklS4ueaCCy7IK3n/5z95qPfFF7vntCRJqk4GaUlS1dhlF3jqKfjKV2DffWH77V2ITJIkVR+DtCSpqiy9dF6A7He/g9tvh6FD4aabiq5KkiSphUFaklR1evTIW2M9+igsthhsu22eQ/3JJ0VXJkmSZJCWJFWxL30pr+p91FFwySV57vTf/150VZIkqdEZpCVJVa1PH/jVr+Bvf8tbZm24Ifz4xzB+fNGVSZKkRmWQliTVhPXWgyeegP33z/OnV18d/vGPoquSJEmNyCAtSaoZ88wD558Pd90FEybkPacPPRSamoquTJIkNRKDtCSp5owYAU8/DYccAmeemVf2Hjmy6KokSVKjMEhLkmrS3HPD73+f50737g1f/zoceCCMG1d0ZZIkqd4ZpCVJNe1rX4Mnn4Qjj4SLLoJVVoHRo+cvuixJklTHDNKSpJrXvz/85jfw4IMwcCD87GersvPO8NZbRVcmSZLqkUFaklQ31lwTHnsM9tvvv9x0E6y4Ipx1FkyZUnRlkiSpnhikJUl1pU8f2GOP13j6aVhnHfj+92HttXPAliRJ6goGaUlSXVpuObjjDvjzn+H113Nv9WGHwSefFF2ZJEmqdQZpSVLdioBvfQuefRYOOiiv8r3SSnDddZBS0dVJkqRaZZCWJNW9gQPh7LPhn/+EBRaAHXeEzTfPAVuSJKmzDNKSpIax9trw6KNw2mkwejQMHQo//jF8/HHRlUmSpFpikJYkNZRevfJc6RdegL32glNOgRVWgD/9CaZOLbo6SZJUCwzSkqSGtPDCcOGF8K9/wdJL51C9/vq5x1qSJGlmDNKSpIa25pp57vTFF8NLL+XfDzwQ3n236MokSVK1MkhLkhpejx6w997w/PNw+OE5VC+3HPz61zB+fNHVSZKkamOQliSpZL758pzpp5+GjTaCn/0MVlwRrrjC+dOSJKmFQVqSpDZWWAFuvBHuuw8WXBD22AO++lX429+KrkySJFUDg7QkSTMwfDg8/DBcdhn873+w4Yaw/fZ5CLgkSWpcBmlJkmaiR4/cI/388/DLX8I998Aqq8DBB8PbbxddnSRJKoJBWpKkDujfP8+ZfvFF2H9/OP98WHbZfG3s2KKrkyRJlWSQliSpExZZBM45B555BrbbLq/sPWQInHQSfPpp0dVJkqRKMEhLkjQbllsOrrwSnngC1lsPjjoq91CffTZMnFh0dZIkqTsZpCVJmgOrrQa33AL/+Acsv3yeO73SSnmBssmTi65OkiR1B4O0JEldYL314P774bbbYN55Yc8986Jkl19uoJYkqd4YpCVJ6iIRsPnm8OijcN11eYGyb3/bQC1JUr0xSEuS1MV69Mj7TT/2mIFakqR6ZJCWJKmbzCxQO4dakqTaZZCWJKmbtReo99wzL0529tkwfnzRFUqSpM4wSEuSVCGtA/VNN8Fii+VVvgcPht/8Bj7+uOgKJUlSRxikJUmqsB49YOut4YEHYNQoWH11+OlP4QtfgJ//HN59t+gKJUnSzBikJUkqSARsuCHccQc88giMGAG//jUsswwceii8/HLRFUqSpPYYpCVJqgJrrAHXXAPPPAO77grnngvLLQc77QQPPlh0dZIkqTWDtCRJVWSFFeCii3Jv9JFHwj33wDrrwLrrwl//ClOmFF2hJEkySEuSVIWWWCIP8379dTjzTHjnHfjmN/NK37//PTQ1FV2hJEmNyyAtSVIVm3tuOOQQeP753CO92GLwgx/AUkvBj34E//1v0RVKktR4DNKSJNWAnj1hhx3ySt+jR+eFyU4/Pc+j3mYbuPtuSKnoKiVJagwGaUmSaszaa8Nf/gKvvpq3y3rwQfjGN2ClleCss+CTT4quUJKk+maQliSpRi2xBJxwQp5H/ac/wbzzwve/n68feig8+2zRFUqSVJ8M0pIk1bi+feHb34aHHoJ//Qu23TZvn7XSSnmf6iuvhAkTiq5SkqT6YZCWJKmOrLUWXHYZvPEGnHRSPu++e+6l/tGP8qJlkiRpzhikJUmqQwsvDD/5CbzwAtx5Z+6ZPv30vE/1xhvD1VfDxIlFVylJUm0ySEuSVMd69MgLkf31r3ku9Ykn5i2zvvWt3Et9+OHw738XXaUkSbXFIC1JUoNYbLG8yvdLL8Htt8Pw4fCHP8Cqq8KwYXD22TB2bNFVSpJU/QzSkiQ1mJ49YbPN4Jpr4K238pDvSZPg4INz2N5117wv9ZQpRVcqSVJ1MkhLktTAFlwQfvADeOIJePRROOCAPKf6G9+AwYPhqKNgzJiiq5QkqboYpCVJE
hHwla/AmWfmXuqrroKhQ+Hkk+FLX4IvfxlOPRXefrvoSiVJKp5BWpIkTaNfP9hlF7j11hyqzzgDeveGH/4Qllwy91Zfdhk0NRVdqSRJxTBIS5KkGVp4YTj0UHjoIXj2WfjZz/KWWnvuCYssklf/vv56+OyzoiuVJKlyDNKSJKlDVlgBTjghb5/1j3/kMD1yJOywQw7ce+4Jt93m/tSSpPpnkJYkSZ0SAeutB+eck+dM33UX7LQT3HwzbLllXvn7gANyyJ48uehqJUnqegZpSZI023r1ghEj4KKL4J13cpjefPO8WNnXv55D9f77532r7amWJNULg7QkSeoSffrAVlvB5ZfDu+/CX/+aFyb7y19giy3y8O9vfzvPqR4/vuhqJUmafQZpSZLU5fr3z3Onr7gC3nsPbrkl/37bbfm84IJ5OPiVV8JHHxVdrSRJndOr6AIkSVJ969s3z53eckuYNAnuvz/3Vl9/PVx7bR4evsEGsO22sM02sMwyRVcsSdLM2SMtSZIqpnfvPHf6nHPyHtX//Cf86Ed50bIf/AAGD4bVVoNjjoFHHoGUiq5YkqTpGaQlSVIhevSAddaBX/8a/vMfeP55+N3vYOBA+NWvYM01YYkl8mJl110Hn3xSdMWSJGUGaUmSVBWWXx5++MM89Pudd+CSS2D99eGaa2DHHWGBBWCTTeCUU+DZZ+2tliQVxyAtSZKqzoILwl575RW/338fRo2Cww/PAftHP4KVVoJll4VDDslbbtlbLUmqJIO0JEmqar17w4Ybwm9+A08/Da+8kudYr7IKXHxxXqBsgQVg+PA8JPyRR2Dq1KKrliTVM4O0JEmqKUsvDd/5Tu6J/vBDGDkSjjgCPv4Yfv7zPLd64YVh113z8PDXXy+6YklSvXH7K0mSVLP69oWNN87HSSflod/33AN33gl33QVXXZXbLb98nl+98caw0UZ56LgkSbPLIC1JkurGIovA7rvnIyX4979zj/W998IVV8C55+Z2q6/eEqy/9jWYZ55Cy5Yk1RiDtCRJqksRsOqq+Tj8cJg0Kc+fHjkyH2eemVcA79kTvvKVPA97ww3zSuEDBxZdvSSpmhmkJUlSQ+jdO+9bvc46cPTRMH48PPBA3m7r/vvh97/P+1j36JF7rFsH6wUWKLp6SVI1MUhLkqSG1L8/fP3r+YAcrB98sCVYn302nHZafm3llXOgLh/LLJN7vCVJjckgLUmSRA7WG22UD4DPPoOHHoJ//CP3XF99NZx/fn5t8cVhvfVyqF5vvTx8vHfv4mqXJFWWQVqSJKkd/frBBhvkA/Le1GPG5GBdPq65Jr/Wvz8MG9YydHzttWHRRYurXZLUvQzSkiRJHdCjBwwdmo/vfjdfe/11+Oc/YfToPCz8tNPgt7/Nry2zTA7VCyywBP365XnX/foVVb0kqSsZpCVJkmbTUkvBLrvkA/Jw8Mcey8F69Og81/qtt5bnrLOgV688BHyttVqOFVfMq4ZLkmqLQVqSJKmL9OsH666bj7JrrhlNr17r8NBD8PDDcOWVLftZzz03rLHGtMfyy+feb0lS9TJIS5IkdaOFFprA8OGw/fb596lT4fnnc6guh+uzz8692ZDD9Ze/PG24/uIX7bmWpGpikJYkSaqgHj3ykO4VV4RvfztfmzQJnnkGHn205Tj33JZwPWBAHha++uotx9Ch+bokqfIM0pIkSQXr3TsH5VVXhX32ydcmT24J108+CU88AVdd1TIsvEcPWGGFHKpXWy3fO3QoLLGEe1xLUnczSEuSJFWhXr1aVgkvSwlefTWH6vLxwAPw5z+3tBk0KN9TDuZDh8KXvpSHjEuSuoZBWpIkqUZE5G21llkGttuu5fpHH8HTT8NTT+Xj3/+GSy6BpqaWNksvDauskkP1KqvkY6WVHB4uSbPDIC1JklTjBg6E9dfPR9nUqbn3+qmncsgeMyYf99wDEyfmNhEweHBLqG59zDtvIV9FkmqCQVqSJKkO9eiRQ/LgwbDtti3XJ0+GF19sCdZPPw3/+Q/ccUde9Kxs8cWnDdYrrJAP52BLkkFakiSpofTq1bJq+I47tlyfPBn++9+8wFnro+0Q8bnmyntdl4P1Civk7bmWXx7mm6/iX0eSCmGQliRJEr165UD8xS9O24OdErz1Fjz3XMtR3gf7mmvyEPKyhRbKgXr55WG55ab92aHikuqJQVqSJEkzFJGHcy+xBGy88bSvTZiQh4k/91w+v/BCPu6+Gy69dNq2Cy0Eyy7b/rHIIg4Xl1RbDNKSJEmaLX37tqwA3lZzM7z0Uku4fumlfPz973Dllbmnu2zAABgypGVO9+DBeWXy8s/2ZkuqNgZpSZIkdbm55mrZy7qtiRPhlVdysP7vf1vOL78Mo0bBJ59M237++XOgXnrpfCyzzLQ/DxzY7V9HkqZhkJYkSVJF9enTMh+7rZTgww9zqG57/Oc/cPvtMH78tPfMO28O1V/4Aiy11PTnJZbInylJXcUgLUmSpKoRAQsskI9hw6Z/PSV4//28R/Yrr+Rz+Xj9dXjwQfjgg+nfc9FFc6hecsn2D8O2pM4wSEuSJKlmROSFyxZaqP2gDXl+9htvwGuv5eP11/P5jTfyll733AMffzz9fQstlAP14ovP+LzggnmPbkmNzSAtSZKkujLXXC17XM/Ixx/Dm2/mcP3GGzlsv/VWvvbWW/Doo/Duu9MuigZ5m7BFFoHFFmv/WHTRfCyySF6MTVJ9MkhLkiSp4cw7bz5WWmnGbSZNgv/9L4frN9+Et9+e9nj11TyU/L332r9/0KCWUF0O2AsvnH8vn8s/9+vXPd9TUvcwSEuSJEnt6N07z6teaqmZt5s0Cd55J4frd97J4bvt8cgj+fXm5vbfY955c6BeeOE8xLx8bvvzQgvl4eX2dkvFMkhLkiRJc6B375ZFy2aluTkPGX/33Ry6257fey9vB/bgg3lRtSlT2n+feebJgXrBBVvCdfkoL9bW9jB8S13HIC1JkiRVyFxz5T2xBw+eddupU2Hs2Byu33svh+3332853nsvn995B8aMyb9/+umM32/uuWGuudZm8cXz3tzzz58Ddvnn8jFoUD7KP/fvnxd5k9TCIC1JkiRVoR49WnqTV1yxY/eMH5+3/2rveP99GDPmI/r2XZQPP4Snnsp7dn/44Yx7viFvC1YO1+Vj4MCWc/ko/z7ffC3n+eZzWzHVJ4O0JEmSVCf695/5MPNRo55l+PBFp7mWEnzySQ7UH3yQe8HbHh9+2PLzO+/As8/CRx/lY+rUWdfUXsCed95pz22vzTtvHsI+77y5J99ecVUTg7QkSZLUwCJagusyy3Tu3nIIL4fqsWNh3Lj887hx0/5cPo8dC6+8krcgGzdu5sPRy3r0aAnV88zTcrT9vfUx99wt57Y/9+tnMNecMUhLkiRJmi2tQ/gXvjB77zF5ckuoLp8/+ST/PKPjk0/y8fbbLT9/8kl+r47o0aMlVLc+5pqr5dz25/aOAQOm/9mh7I3BIC1JkiSpML16tSx0NidSggkTWkJ1U1M+yj+3Pjc3t7ze1NTy+3vvwcsv59/L
[base64-encoded PNG data elided: rendered training-curve figures (CXE loss, classification accuracy, learning rate) produced by the plotting cell below]", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "logger.setLevel(3)\n", + "matplotlib.rcParams.update({'font.size': 16})\n", + "plt.figure(figsize=(16,12))\n", + "plt.plot(train_result['epoch'], train_result['train_loss'], color='b', label='train')\n", + "plt.plot(train_result['epoch'], train_result['val_loss'], color='r', label='va')\n", + "plt.title('CXE Loss')\n", + "plt.ylabel('CXE')\n", + "plt.xlabel('epoch')\n", + "plt.grid(True)\n", + "plt.show()\n", + "\n", + "plt.figure(figsize=(16,12))\n", + "plt.plot(train_result['epoch'], train_result['train_acc'], color='b', label='train')\n", + "plt.plot(train_result['epoch'], train_result['val_acc'], color='r', label='va')\n", + "plt.title('Classification Accuracy')\n", + "plt.ylabel('Accuracy (%)')\n", + "plt.xlabel('epoch')\n", + "plt.grid(True)\n", + "plt.show()\n", + "\n", + "plt.figure(figsize=(16,12))\n", + "plt.plot(train_result['epoch'], train_result['lr'], color='b')\n", + "plt.title('Learning Rate')\n", + "plt.ylabel('lr')\n", + "plt.xlabel('epoch')\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = TDNNXVec(16, 2, 32, 16, 100)\n", + "state_dict=torch.load(\"./tdnn_xvec/model_ep0099.pth\")\n", + "model.load_state_dict(state_dict['model_state_dict'])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the Trials dataset with different speakers than the train data.\n", + "#trial_data = IVDataset(num_spks=20, seed=4321)\n", + "trial_loader = DataLoader(train_data, batch_size=100, shuffle=True)\n", + "# sample enrollment data and compute x-vectors\n", + "x_e, y_e = next(iter(trial_loader))\n", + "z_e = model(x_e, infer=True).detach().cpu().numpy()\n", + "# sample test data and compute x-vectors\n", + "x_t, y_t = next(iter(trial_loader))\n", + "z_t = model(x_t, infer=True).detach().cpu().numpy()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.12987226757966816\n" + ] + } + ], + "source": [ + "from hyperion.utils.math import cosine_scoring\n", + "from hyperion.np.metrics import compute_eer\n", + "scores = cosine_scoring(z_e, z_t)\n", + "key = (y_e[:, None] - y_t[None,:])==0\n", + "tar_scores = scores[key==1]\n", + "non_scores = scores[key==0]\n", + "eer = compute_eer(tar_scores, non_scores)\n", + "print(eer)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "488a239b304e646027d6710c3377746db4487e56624448f35f81edd765904a6d" + }, + "kernelspec": { + "display_name": "Python 3.8.12 ('py38_pt101_cu112')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1d41410f40ee45eb8e75cf84269c1993f815b46b Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 14 Jun 2022 08:48:01 -0400 Subject: [PATCH 018/154] added default config to voxceleb/v2 --- egs/voxceleb/v2/run_030_extract_xvectors.sh | 77 ++++++++-------- 
egs/voxceleb/v2/run_040_eval_be.sh | 97 +++++++-------------- 2 files changed, 72 insertions(+), 102 deletions(-) diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh index 77c46672..da3ffde8 100755 --- a/egs/voxceleb/v2/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -7,64 +7,67 @@ . ./path.sh set -e -stage=1 +stage=2 config_file=default_config.sh use_gpu=false -nnet_stage=1 +nnet_stage=3 xvec_chunk_length=120 #seconds . parse_options.sh || exit 1; . $config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" - xvec_cmd="$cuda_eval_cmd --mem 6G" + xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 6G" else - xvec_cmd="$train_cmd --mem 12G" + xvec_cmd="$train_cmd --mem 12G" fi - -if [ $nnet_stage -eq 2 ];then +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then nnet=$nnet_s2 - nnet_name=$nnet_name_s2 + nnet_name=$nnet_s2_name elif [ $nnet_stage -eq 3 ];then nnet=$nnet_s3 - nnet_name=$nnet_name_s3 + nnet_name=$nnet_s3_name fi xvector_dir=exp/xvectors/$nnet_name if [ $stage -le 1 ]; then - # Extract xvectors for training LDA/PLDA - for name in voxceleb2cat_train - do - if [ $plda_num_augs -eq 0 ]; then - steps_xvec/extract_wav2vec2xvectors.sh --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ - $nnet data/${name} \ - $xvector_dir/${name} - else - steps_xvec/extract_wav2vec2xvectors.sh --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ - --aug-config $plda_aug_config --num-augs $plda_num_augs \ - $nnet data/${name} \ - $xvector_dir/${name}_augx${plda_num_augs} \ - data/${name}_augx${plda_num_augs} - fi - done + # Extract xvectors for training LDA/PLDA + for name in voxceleb2cat_train + do + if [ $plda_num_augs -eq 0 ]; then + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + $nnet data/${name} \ + $xvector_dir/${name} + else + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ + --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --aug-config $plda_aug_config --num-augs $plda_num_augs \ + $nnet data/${name} \ + $xvector_dir/${name}_augx${plda_num_augs} \ + data/${name}_augx${plda_num_augs} + fi + done fi - if [ $stage -le 2 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_wav2vec2xvectors.sh \ - --cmd "$xvec_cmd" --nj $nj ${xvec_args} \ - $nnet data/$name \ - $xvector_dir/$name - done + # Extracts x-vectors for evaluation + for name in voxceleb1_test + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj $nj ${xvec_args} \ + $nnet data/$name \ + $xvector_dir/$name + done fi exit diff --git a/egs/voxceleb/v2/run_040_eval_be.sh b/egs/voxceleb/v2/run_040_eval_be.sh index d9c03bba..ac561344 100755 --- a/egs/voxceleb/v2/run_040_eval_be.sh +++ b/egs/voxceleb/v2/run_040_eval_be.sh @@ -7,27 +7,29 @@ . 
./path.sh set -e -stage=1 +# By default we evaluate the nnet after finetuning stage 3 and only with cosine scoring +stage=3 config_file=default_config.sh -nnet_stage=1 +nnet_stage=3 . parse_options.sh || exit 1; . $config_file . datapath.sh -if [ $nnet_stage -eq 2 ];then +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then nnet=$nnet_s2 - nnet_name=$nnet_name_s2 + nnet_name=$nnet_s2_name elif [ $nnet_stage -eq 3 ];then nnet=$nnet_s3 - nnet_name=$nnet_name_s3 + nnet_name=$nnet_s3_name fi - plda_label=${plda_type}y${plda_y_dim}_v1 be_name=lda${lda_dim}_${plda_label}_${plda_data} - xvector_dir=exp/xvectors/$nnet_name be_dir=exp/be/$nnet_name/$be_name score_dir=exp/scores/$nnet_name/${be_name} @@ -35,46 +37,44 @@ score_plda_dir=$score_dir/plda score_cosine_dir=exp/scores/$nnet_name/cosine if [ $stage -le 1 ]; then - echo "Train PLDA on Voxceleb2" - steps_be/train_be_v1.sh --cmd "$train_cmd" \ - --lda_dim $lda_dim \ - --plda_type $plda_type \ - --y_dim $plda_y_dim --z_dim $plda_z_dim \ - $xvector_dir/$plda_data/xvector.scp \ - data/$plda_data \ - $be_dir & - + steps_be/train_be_v1.sh \ + --cmd "$train_cmd" \ + --lda_dim $lda_dim \ + --plda_type $plda_type \ + --y_dim $plda_y_dim --z_dim $plda_z_dim \ + $xvector_dir/$plda_data/xvector.scp \ + data/$plda_data \ + $be_dir & wait - fi if [ $stage -le 2 ];then echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" - steps_be/eval_be_v1.sh --cmd "$train_cmd" --plda_type $plda_type \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $be_dir/lda_lnorm.h5 \ - $be_dir/plda.h5 \ - $score_plda_dir/voxceleb1_scores - + steps_be/eval_be_v1.sh \ + --cmd "$train_cmd" --plda_type $plda_type \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $be_dir/lda_lnorm.h5 \ + $be_dir/plda.h5 \ + $score_plda_dir/voxceleb1_scores + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir - + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + for f in $(ls $score_plda_dir/*_results); do - echo $f - cat $f - echo "" + echo $f + cat $f + echo "" done - + fi - score_plda_dir=$score_cosine_dir if [ $stage -le 3 ];then @@ -98,39 +98,6 @@ if [ $stage -le 3 ];then fi -be_dir=exp/be/$nnet_name/cw -score_plda_dir=$score_dir/cw_cosine - -if [ $stage -le 4 ]; then - echo "Train centering+whitening on Voxceleb2" - steps_be/train_be_v2.sh --cmd "$train_cmd" \ - $xvector_dir/$plda_data/xvector.scp \ - data/$plda_data \ - $be_dir -fi - - -if [ $stage -le 5 ];then - - echo "Eval Voxceleb 1 with CentWhiten + Cosine scoring" - steps_be/eval_be_v2.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $be_dir/cw.h5 \ - $score_plda_dir/voxceleb1_scores - - $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi exit From cf433a73403a7319c0999752b9d3de22344b897f Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 4 Jul 2022 12:12:30 -0400 Subject: [PATCH 019/154] added recipes with wavlmlarge --- egs/voxceleb/v2/cmd.sh | 2 +- ...ain_hubertbase_ecapatdnn512x2_default.yaml | 6 - ...v2vec2base960h_ecapatdnn512x2_default.yaml | 6 - 
...n_wav2vec2base_ecapatdnn512x2_default.yaml | 6 - ...baseplus6l_ecapatdnn512x3_stage1_v1.0.yaml | 43 +++ ...s6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml | 43 +++ ...baseplus9l_ecapatdnn512x3_stage1_v1.0.yaml | 45 +++ ...s9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml | 45 +++ ...lus_linfus_ecapatdnn512x3_stage1_v1.0.yaml | 45 +++ ...lmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml | 45 +++ ...lmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml | 50 +++ ...vlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml | 45 +++ ...wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml | 45 +++ ...wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml | 49 +++ ...wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml | 50 +++ .../v2/conf/trainer_phase1_adam_default.yaml | 20 -- .../v2/conf/trainer_phase1_sgd_default.yaml | 18 - .../v2/conf/trainer_phase2_sgd_default.yaml | 18 - .../v2/conf/trainer_phase3_sgd_default.yaml | 18 - .../conf/wavlmbaseplus6l_ecapatdnn512x3.yaml | 41 +++ ...wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml | 41 +++ .../conf/wavlmbaseplus9l_ecapatdnn512x3.yaml | 41 +++ ...wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml | 41 +++ .../wavlmbaseplus_linfus_ecapatdnn512x3.yaml | 40 +++ .../v2/conf/wavlmlarge12l_ecapatdnn512x3.yaml | 41 +++ .../v2/conf/wavlmlarge6l_ecapatdnn512x3.yaml | 41 +++ ...x2.yaml => wavlmlarge_ecapatdnn512x3.yaml} | 17 +- ...dnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh | 48 --- ...tdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh | 48 --- ...dnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh | 51 --- ...dnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh | 53 --- ...dnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh | 49 --- ...fig_wavlmbaseplus6l_ecapatdnn512x3_v1.0.sh | 49 +++ ...lmbaseplus6l_linfus_ecapatdnn512x3_v1.0.sh | 49 +++ ...fig_wavlmbaseplus9l_ecapatdnn512x3_v1.0.sh | 49 +++ ...lmbaseplus9l_linfus_ecapatdnn512x3_v1.0.sh | 49 +++ ...ig_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh | 52 --- ...nfig_wavlmbaseplus_ecapatdnn512x3_v1.10.sh | 52 --- ...onfig_wavlmbaseplus_ecapatdnn512x3_v1.9.sh | 36 -- ...avlmbaseplus_linfus_ecapatdnn512x3_v1.0.sh | 49 +++ ...onfig_wavlmlarge12l_ecapatdnn512x3_v1.0.sh | 49 +++ ...config_wavlmlarge6l_ecapatdnn512x3_v1.0.sh | 49 +++ .../config_wavlmlarge_ecapatdnn512x3_v1.0.sh | 49 +++ egs/voxceleb/v2/run_030_extract_xvectors.sh | 3 +- .../xvectors/extract_wav2vec2xvectors.sh | 7 +- hyperion/bin/extract_wav2vec2xvectors.py | 20 +- hyperion/bin/finetune_wav2vec2xvector.py | 20 +- hyperion/bin/finetune_xvector_from_wav.py | 2 - hyperion/torch/data/weighted_seq_sampler.py | 164 ++++++--- hyperion/torch/layers/attention.py | 1 + hyperion/torch/layers/margin_losses.py | 83 ++++- .../hf_hubert2resnet1d_xvector.py | 11 +- .../hf_wav2vec2resnet1d_xvector.py | 10 + .../models/wav2xvectors/hf_wav2xvector.py | 34 +- .../wav2xvectors/hf_wavlm2resnet1d_xvector.py | 10 + .../torch/models/wav2xvectors/wav2xvector.py | 3 + .../models/xvectors/efficient_net_xvector.py | 33 +- .../torch/models/xvectors/resnet1d_xvector.py | 54 ++- .../torch/models/xvectors/resnet_xvector.py | 23 +- .../torch/models/xvectors/spinenet_xvector.py | 22 +- .../torch/models/xvectors/tdnn_xvector.py | 23 +- .../models/xvectors/transformer_xvector_v1.py | 48 +++ hyperion/torch/models/xvectors/xvector.py | 55 ++- hyperion/torch/narchs/classif_head.py | 9 + hyperion/torch/narchs/efficient_net.py | 63 +++- hyperion/torch/narchs/resnet1d_encoder.py | 70 +++- hyperion/torch/narchs/resnet2d_encoder.py | 17 + hyperion/torch/narchs/resnet_factory.py | 39 ++- hyperion/torch/narchs/spinenet_factory.py | 39 ++- hyperion/torch/narchs/tdnn_factory.py | 39 ++- 
.../torch/narchs/transformer_encoder_v1.py | 28 +- hyperion/torch/torch_model.py | 10 + hyperion/torch/tpm/hf/hf_hubert.py | 121 ++++++- hyperion/torch/tpm/hf/hf_wav2vec2.py | 123 ++++++- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 319 +++++++++++++++++- hyperion/torch/tpm/hf/hf_wavlm.py | 121 ++++++- hyperion/torch/utils/eval_utils.py | 171 ---------- 77 files changed, 2716 insertions(+), 762 deletions(-) delete mode 100644 egs/voxceleb/v2/conf/train_hubertbase_ecapatdnn512x2_default.yaml delete mode 100644 egs/voxceleb/v2/conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml delete mode 100644 egs/voxceleb/v2/conf/train_wav2vec2base_ecapatdnn512x2_default.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml delete mode 100644 egs/voxceleb/v2/conf/trainer_phase1_adam_default.yaml delete mode 100644 egs/voxceleb/v2/conf/trainer_phase1_sgd_default.yaml delete mode 100644 egs/voxceleb/v2/conf/trainer_phase2_sgd_default.yaml delete mode 100644 egs/voxceleb/v2/conf/trainer_phase3_sgd_default.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus6l_ecapatdnn512x3.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus_linfus_ecapatdnn512x3.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmlarge6l_ecapatdnn512x3.yaml rename egs/voxceleb/v2/conf/{wavlmbase_ecapatdnn512x2.yaml => wavlmlarge_ecapatdnn512x3.yaml} (75%) delete mode 100644 egs/voxceleb/v2/global_conf/config_hubertbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh delete mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base960h_ecapatdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh delete mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh delete mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh delete mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_ecapatdnn512x3_v1.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_linfus_ecapatdnn512x3_v1.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v1.0.sh create mode 100644 
egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_linfus_ecapatdnn512x3_v1.0.sh delete mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh delete mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh delete mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.9.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_linfus_ecapatdnn512x3_v1.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v1.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge6l_ecapatdnn512x3_v1.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v1.0.sh diff --git a/egs/voxceleb/v2/cmd.sh b/egs/voxceleb/v2/cmd.sh index 00f8d40a..71f3bae0 100755 --- a/egs/voxceleb/v2/cmd.sh +++ b/egs/voxceleb/v2/cmd.sh @@ -11,12 +11,12 @@ # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. if [ "$(hostname -d)" == "cm.gemini" ];then - #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" diff --git a/egs/voxceleb/v2/conf/train_hubertbase_ecapatdnn512x2_default.yaml b/egs/voxceleb/v2/conf/train_hubertbase_ecapatdnn512x2_default.yaml deleted file mode 100644 index 6cec83c8..00000000 --- a/egs/voxceleb/v2/conf/train_hubertbase_ecapatdnn512x2_default.yaml +++ /dev/null @@ -1,6 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: hubertbase_ecapatdnn512x2.yaml -trainer: trainer_swa_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml b/egs/voxceleb/v2/conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml deleted file mode 100644 index a7fc925e..00000000 --- a/egs/voxceleb/v2/conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml +++ /dev/null @@ -1,6 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: wav2vec2base960h_ecapatdnn512x2.yaml -trainer: trainer_swa_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2base_ecapatdnn512x2_default.yaml b/egs/voxceleb/v2/conf/train_wav2vec2base_ecapatdnn512x2_default.yaml deleted file mode 100644 index 90f35805..00000000 --- a/egs/voxceleb/v2/conf/train_wav2vec2base_ecapatdnn512x2_default.yaml +++ /dev/null @@ -1,6 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: wav2vec2base_ecapatdnn512x2.yaml -trainer: trainer_swa_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..570aad6a --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,43 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + 
wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmbaseplus6l_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..9838b855 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,43 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..1028f79a --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..2c2e5b64 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: 
+ dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..eb32ce0c --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmbaseplus_linfus_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..895bcb2b --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..1721e337 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,50 @@ +data: + train: + dataset: + max_chunk_length: 6.0 + min_chunk_length: 6.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: 
conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..181d8fd7 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmlarge6l_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..1af241ea --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml new file mode 100644 index 00000000..1298a056 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml @@ -0,0 +1,49 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + 
data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..2867cfef --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,50 @@ +data: + train: + dataset: + max_chunk_length: 6.0 + min_chunk_length: 6.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 16 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/trainer_phase1_adam_default.yaml b/egs/voxceleb/v2/conf/trainer_phase1_adam_default.yaml deleted file mode 100644 index 03c5cc84..00000000 --- a/egs/voxceleb/v2/conf/trainer_phase1_adam_default.yaml +++ /dev/null @@ -1,20 +0,0 @@ -optim: - opt_type: adam - lr: 0.05 - amsgrad: true - beta1: 0.9 - beta2: 0.95 - weight_decay: 4e-4 -lrsched: - lrsch_type: exp_lr - decay_steps: 8000 - hold_steps: 40000 - min_lr: 1.0e-05 - decay_rate: 0.5 - warmup_steps: 1000 - update_lr_on_opt_step: true -use_amp: true -log_interval: 1000 -epochs: 30 -eff_batch_size: 1024 -train_mode: hf-feats-frozen-nograd diff --git a/egs/voxceleb/v2/conf/trainer_phase1_sgd_default.yaml b/egs/voxceleb/v2/conf/trainer_phase1_sgd_default.yaml deleted file mode 100644 index 7fc848a0..00000000 --- a/egs/voxceleb/v2/conf/trainer_phase1_sgd_default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -optim: - opt_type: sgd - lr: 0.45 - momentum: 0.9 - weight_decay: 4e-4 -lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 2100 - hold_steps: 1000 - min_lr: 4e-4 - warmup_steps: 1000 - update_lr_on_opt_step: true -use_amp: true -log_interval: 1000 -epochs: 30 -eff_batch_size: 1024 -train_mode: hf-feats-frozen-nograd diff --git a/egs/voxceleb/v2/conf/trainer_phase2_sgd_default.yaml b/egs/voxceleb/v2/conf/trainer_phase2_sgd_default.yaml deleted file mode 100644 index ae708b62..00000000 --- a/egs/voxceleb/v2/conf/trainer_phase2_sgd_default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -optim: - opt_type: sgd - lr: 5.5e-3 - momentum: 0.9 - weight_decay: 1e-4 -lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 5000 - hold_steps: 6000 - min_lr: 4.4e-3 - warmup_steps: 6000 - update_lr_on_opt_step: true -use_amp: true -log_interval: 1000 -epochs: 7 -eff_batch_size: 512 -train_mode: full diff --git a/egs/voxceleb/v2/conf/trainer_phase3_sgd_default.yaml 
b/egs/voxceleb/v2/conf/trainer_phase3_sgd_default.yaml deleted file mode 100644 index 2529e25a..00000000 --- a/egs/voxceleb/v2/conf/trainer_phase3_sgd_default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -optim: - opt_type: sgd - lr: 2.3e-4 - momentum: 0.9 - weight_decay: 1e-4 -lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 5000 - hold_steps: 6000 - min_lr: 2e-4 - warmup_steps: 6000 - update_lr_on_opt_step: true -use_amp: true -log_interval: 1000 -epochs: 7 -eff_batch_size: 192 -train_mode: full diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus6l_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus6l_ecapatdnn512x3.yaml new file mode 100644 index 00000000..dbe4ff65 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus6l_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 6 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml new file mode 100644 index 00000000..99a3778b --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 6 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: linear +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3.yaml new file mode 100644 index 00000000..ddbf3ca4 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml 
b/egs/voxceleb/v2/conf/wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml new file mode 100644 index 00000000..90b0fbef --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: linear +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_linfus_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_linfus_ecapatdnn512x3.yaml new file mode 100644 index 00000000..6f1e9f56 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_linfus_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: linear +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3.yaml new file mode 100644 index 00000000..0de43fd4 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge6l_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wavlmlarge6l_ecapatdnn512x3.yaml new file mode 100644 index 00000000..062137f3 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge6l_ecapatdnn512x3.yaml @@ -0,0 +1,41 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 6 +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + 
multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbase_ecapatdnn512x2.yaml b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3.yaml similarity index 75% rename from egs/voxceleb/v2/conf/wavlmbase_ecapatdnn512x2.yaml rename to egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3.yaml index b5d14412..f36ac70c 100644 --- a/egs/voxceleb/v2/conf/wavlmbase_ecapatdnn512x2.yaml +++ b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3.yaml @@ -1,8 +1,8 @@ hf_feats: - pretrained_model_path: microsoft/wavlm-base + pretrained_model_path: microsoft/wavlm-large xvector: resnet_enc: - in_feats: 80 + in_feats: 1024 in_conv_channels: 512 in_kernel_size: 5 in_stride: 1 @@ -10,6 +10,7 @@ xvector: resb_repeats: - 1 - 1 + - 1 resb_channels: - 512 resb_kernel_sizes: @@ -17,6 +18,7 @@ xvector: resb_dilations: - 2 - 3 + - 4 resb_strides: - 1 res2net_width_factor: 1 @@ -28,10 +30,11 @@ xvector: pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 - embed_dim: 256 - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 20.0 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 dropout_rate: 0.0 -feat_fusion_start: 2 feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/global_conf/config_hubertbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_hubertbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh deleted file mode 100644 index 9ea07c9c..00000000 --- a/egs/voxceleb/v2/global_conf/config_hubertbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh +++ /dev/null @@ -1,48 +0,0 @@ -# Hubert base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=hubertbase - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_hubert2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - -lr=0.002 -xvec_train_base_cfg=conf/train_hubertbase_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base960h_ecapatdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base960h_ecapatdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index b6cbdf30..00000000 --- a/egs/voxceleb/v2/global_conf/config_wav2vec2base960h_ecapatdnn512x2_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,48 +0,0 @@ -# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wav2vec2base960h - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector 
cfg - -nnet_type=hf_wav2vec2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - -lr=0.002 -xvec_train_base_cfg=conf/train_wav2vec2base960h_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" - -nnet_name=${hf_model_name}_ecapatdnn512x3_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh deleted file mode 100644 index b40ff3d1..00000000 --- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v3.sh +++ /dev/null @@ -1,51 +0,0 @@ -# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wav2vec2base - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wav2vec2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - - -lr=0.001 -#lr=0.005 -xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v3 #v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0060.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh deleted file mode 100644 index 1509e46e..00000000 --- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v4.sh +++ /dev/null @@ -1,53 +0,0 @@ -# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wav2vec2base - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wav2vec2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - - -lr=0.001 -#lr=0.005 -xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 
--trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v4 #v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/model_ep0030.pth -nnet=$nnet_dir/model_ep0020.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh b/egs/voxceleb/v2/global_conf/config_wavlmbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh deleted file mode 100644 index ba4272a2..00000000 --- a/egs/voxceleb/v2/global_conf/config_wavlmbase_ecapatdnn512x2_arcs30m0.3_adam_lr0.002_amp.v1.sh +++ /dev/null @@ -1,49 +0,0 @@ -# WavLM base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wavlmbase - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wavlm2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - -lr=0.002 -lr=0.001 -xvec_train_base_cfg=conf/train_wavlmbase_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..d02c11f7 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus6l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end 
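+# Note: the back-end variables defined below are consumed by run_040_eval_be.sh;
+# as a rough sketch of how they are used there, PLDA training is launched as:
+#   steps_be/train_be_v1.sh --cmd "$train_cmd" \
+#     --lda_dim $lda_dim --plda_type $plda_type \
+#     --y_dim $plda_y_dim --z_dim $plda_z_dim \
+#     $xvector_dir/$plda_data/xvector.scp data/$plda_data $be_dir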
+plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_linfus_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_linfus_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..639225c3 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus6l_linfus_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus6l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_linfus_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..58bded52 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_linfus_ecapatdnn512x3_v1.0.sh 
b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_linfus_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..4553f40b --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_linfus_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_linfus_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh deleted file mode 100644 index b580508a..00000000 --- a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.4.sh +++ /dev/null @@ -1,52 +0,0 @@ -# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 - -# hugging face model -hf_model_name=wavlmbaseplus - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wavlm2resnet1d - -xvec_train_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml -xvec_train_args="--model.xvector.margin-warmup-epochs 5 --trainer.lrsched.decay-steps 4200 --trainer.lrsched.warmup-steps 1500 --trainer.lrsched.hold-steps 1500 --trainer.epochs 60 --model.feat-fusion-method weighted-avg --model.feat-fusion-start 2 --model.xvector.intertop-margin 0.1" - -nnet_name=${hf_model_name}_ecapatdnn512x3_v1.10 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth - -xvec_train_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml -xvec_train_s2_args="--trainer.epochs 20" -nnet_name_s2=${nnet_name}.s2 -nnet_s2_dir=exp/xvector_nnets/$nnet_name_s2 -nnet_s2=$nnet_s2_dir/model_ep0007.pth -nnet_s2=$nnet_s2_dir/model_ep0020.pth - -xvec_train_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml -xvec_train_s3_args="--trainer.epochs 10 --data.train.dataset.min-chunk-length 6 --data.train.dataset.max-chunk-length 6 --model.xvector.intertop-margin 0.1" -nnet_name_s3=${nnet_name}.s3.4 -nnet_s3_dir=exp/xvector_nnets/$nnet_name_s3 -nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0006.pth -#nnet_s3=$nnet_s3_dir/model_ep0010.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 
-plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh deleted file mode 100644 index b84c1f15..00000000 --- a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.10.sh +++ /dev/null @@ -1,52 +0,0 @@ -# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 - -# hugging face model -hf_model_name=wavlmbaseplus - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wavlm2resnet1d - -xvec_train_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml -xvec_train_args="--model.xvector.margin-warmup-epochs 5 --trainer.lrsched.decay-steps 4200 --trainer.lrsched.warmup-steps 1500 --trainer.lrsched.hold-steps 1500 --trainer.epochs 60 --model.feat-fusion-method weighted-avg --model.feat-fusion-start 2 --model.xvector.intertop-margin 0.1" - -nnet_name=${hf_model_name}_ecapatdnn512x3_v1.10 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth - -xvec_train_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml -xvec_train_s2_args="--trainer.epochs 20" -nnet_name_s2=${nnet_name}.s2 -nnet_s2_dir=exp/xvector_nnets/$nnet_name_s2 -nnet_s2=$nnet_s2_dir/model_ep0007.pth -nnet_s2=$nnet_s2_dir/model_ep0020.pth - -xvec_train_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml -xvec_train_s3_args="--trainer.epochs 10 --data.train.dataset.min-chunk-length 6 --data.train.dataset.max-chunk-length 6" -nnet_name_s3=${nnet_name}.s3 -nnet_s3_dir=exp/xvector_nnets/$nnet_name_s3 -nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0006.pth -nnet_s3=$nnet_s3_dir/model_ep0010.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.9.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.9.sh deleted file mode 100644 index dccd01e1..00000000 --- a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.9.sh +++ /dev/null @@ -1,36 +0,0 @@ -# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 - -# hugging face model -hf_model_name=wavlmbaseplus - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wavlm2resnet1d - -xvec_train_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml -xvec_train_args="--model.xvector.margin-warmup-epochs 5 --trainer.lrsched.decay-steps 3150 --trainer.lrsched.warmup-steps 1500 --trainer.lrsched.hold-steps 1500 --trainer.epochs 45 --model.feat-fusion-method weighted-avg --model.feat-fusion-start 2 --model.xvector.intertop-margin 0.1" - -nnet_name=${hf_model_name}_ecapatdnn512x3_v1.9 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0045.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_linfus_ecapatdnn512x3_v1.0.sh 
b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_linfus_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..c75280f0 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_linfus_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_linfus_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..11425baa --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge6l_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge6l_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..e3c9466b --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge6l_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge6l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training
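+# (editor's note) stage naming convention inferred from the variables below:
+# each training stage N has a YAML config (nnet_sN_base_cfg), extra args
+# (nnet_sN_args), an experiment name (nnet_sN_name=$nnet_name.sN) and a
+# checkpoint path (nnet_sN).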
+nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..8e870abe --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh index da3ffde8..67122f85 100755 --- a/egs/voxceleb/v2/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -11,12 +11,13 @@ stage=2 config_file=default_config.sh use_gpu=false nnet_stage=3 +hf_chunk_length=120 #seconds xvec_chunk_length=120 #seconds . parse_options.sh || exit 1; .
$config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" xvec_cmd="$cuda_eval_cmd --mem 6G" else xvec_cmd="$train_cmd --mem 12G" diff --git a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh index 56ed6b56..6c6f0fdf 100755 --- a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh +++ b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh @@ -4,7 +4,8 @@ nj=30 cmd="run.pl" -chunk_length=0 # The chunk size over which the embedding is extracted. +hf_chunk_length=0 # The chunk size over which the hf-encoder forward pass is computed. +xvec_chunk_length=0 # The chunk size over which the x-vector embedding is extracted. use_gpu=false write_utt2speech_dur=true # If true writes utt2speech_dur. stage=0 @@ -87,7 +88,7 @@ if [ $stage -le 0 ];then ${args} $write_speech_dur_opt \ --part-idx JOB --num-parts $nj \ --input $data_dir/wav.scp \ - --model-path $nnet_file --chunk-length $chunk_length \ + --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -109,7 +110,7 @@ if [ $stage -le 1 ];then ${args} $write_speech_dur_opt \ --part-idx $i --num-parts $nj \ --input $data_dir/wav.scp \ - --model-path $nnet_file --chunk-length $chunk_length \ + --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 5675ace1..cfa28f0a 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -92,7 +92,8 @@ def extract_xvectors( scp_sep, vad_path_prefix, model_path, - chunk_length, + hf_chunk_length, + xvec_chunk_length, embed_layer, random_utt_length, min_utt_length, @@ -188,7 +189,8 @@ def extract_xvectors( y = ( model.extract_embed( x, - chunk_length=fs * chunk_length, + hf_chunk_length=hf_chunk_length, + xvec_chunk_length=xvec_chunk_length, embed_layer=embed_layer, ) .cpu() @@ -259,11 +261,21 @@ def extract_xvectors( parser.add_argument("--model-path", required=True) parser.add_argument( - "--chunk-length", + "--hf-chunk-length", type=int, default=0, help=( - "number of frames used in each forward pass " + "max. chunk length used in each forward pass " + "of the hf encoder, " + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--xvec-chunk-length", + type=int, + default=0, + help=( + "max.
chunk length used in each forward pass " "of the x-vector encoder, " "if 0 the full utterance is used" ), diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index fda819ad..f2c740da 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -71,13 +71,25 @@ def init_data(partition, rank, num_gpus, **kwargs): return data_loader +# def init_model(num_classes, in_model_file, rank, **kwargs): +# xvec_args = kwargs["model"]["xvector"] +# if rank == 0: +# logging.info("xvector network ft args={}".format(xvec_args)) +# xvec_args["num_classes"] = num_classes +# model = TML.load(in_model_file) +# model.rebuild_output_layer(**xvec_args) +# if rank == 0: +# logging.info("model={}".format(model)) +# return model + + def init_model(num_classes, in_model_file, rank, **kwargs): - xvec_args = kwargs["model"]["xvector"] + model_args = kwargs["model"] if rank == 0: - logging.info("xvector network ft args={}".format(xvec_args)) - xvec_args["num_classes"] = num_classes + logging.info("xvector network ft args={}".format(model_args)) + model_args["xvector"]["num_classes"] = num_classes model = TML.load(in_model_file) - model.rebuild_output_layer(**xvec_args) + model.change_config(**model_args) if rank == 0: logging.info("model={}".format(model)) return model diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 5ddc4d82..b5a7f63b 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -77,8 +77,6 @@ def init_xvector(num_classes, in_model_path, rank, **kwargs): xvec_args["num_classes"] = num_classes model = TML.load(in_model_path) model.rebuild_output_layer(**xvec_args) - # if train_mode == "ft-embed-affine": - # model.freeze_preembed_layers() if rank == 0: logging.info("x-vector-model={}".format(model)) return model diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py index 34c3fcbc..e679251b 100644 --- a/hyperion/torch/data/weighted_seq_sampler.py +++ b/hyperion/torch/data/weighted_seq_sampler.py @@ -15,14 +15,37 @@ class ClassWeightedSeqSampler(Sampler): + """Samples utterances as follows: + 1. It samples a class with a given probability. + 2. It samples a random utterance from the class. + + Attributes: + dataset: dataset containing audio or feature sequences. + batch_size: batch size per gpu for the largest chunk-size. + num_egs_per_utt_epoch: number of samples per utterance per epoch. + num_egs_per_class: number of samples per class in each batch. + num_egs_per_utt: number of samples per utterance in each batch. + var_batch_size: whether to use variable batch size when using + variable utterance length. + num_hard_prototypes: number of hard prototype classes per random class + in a batch. + num_egs_per_hard_prototype: number of utterances for each hard + prototype in a batch. + iters_per_epoch: deprecated; if not None, it overrides "num_egs_per_utt_epoch".
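+
+        Example (editor's sketch based on the attributes above): with
+        var_batch_size=False, batch_size=128, num_egs_per_class=2 and
+        num_egs_per_utt=1, each batch draws ceil(128 / 2 / 1) = 64 distinct
+        classes, with 2 example chunks per class and 1 chunk per utterance.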
+ """ + def __init__( self, dataset, batch_size=1, - iters_per_epoch="auto", + num_egs_per_utt_epoch="auto", num_egs_per_class=1, num_egs_per_utt=1, var_batch_size=False, + num_hard_prototypes=0, + num_egs_per_hard_prototype=1, + affinity_matrix=None, + iters_per_epoch=None, ): super().__init__(None) @@ -34,72 +57,101 @@ def __init__( rank = 0 world_size = 1 + if iters_per_epoch is not None: + num_egs_per_utt_epoch = iters_per_epoch + self.dataset = dataset self.batch_size = batch_size self.num_egs_per_class = num_egs_per_class self.num_egs_per_utt = num_egs_per_utt self.var_batch_size = var_batch_size + self.num_hard_prototypes = num_hard_prototypes + self.num_egs_per_hard_prototype = num_egs_per_hard_prototype self.batch = 0 self.rank = rank self.world_size = world_size - if rank > 0: # this will make sure that each process produces different data # when using ddp dummy = torch.rand(1000 * rank) del dummy - if iters_per_epoch == "auto": - self._compute_iters_auto() - else: - self.iters_per_epoch = iters_per_epoch - - if var_batch_size: - avg_batch_size = self._compute_avg_batch_size() - else: - avg_batch_size = self.batch_size - - self._len = int( - math.ceil( - self.iters_per_epoch * dataset.num_seqs / avg_batch_size / world_size - ) - ) - print( - "num_batches", - self.iters_per_epoch, - dataset.num_seqs, - avg_batch_size, - world_size, + self.has_short_seqs = self.dataset.short_seq_exist + self.set_num_egs_per_utt_epoch(num_egs_per_utt_epoch) + self._compute_avg_batch_size() + self._compute_len(world_size) + self._compute_num_classes_per_batch() + self.set_hard_prototypes(affinity_matrix) + logging.info( + "batches/epoch=%d classes/batch=%d avg-batch-size/gpu=%d samples/(utt*epoch)=%d", self._len, - flush=True, + self._num_classes_per_batch, + self.avg_batch_size, + self.num_egs_per_utt_epoch, ) - self.avg_batch_size = avg_batch_size - logging.info("num batches per epoch: %d", self._len) - - self._num_classes_per_batch = int( - math.ceil(avg_batch_size / num_egs_per_class / num_egs_per_utt) - ) - logging.info("num classes per batch: %d", self._num_classes_per_batch) - - # self.weights = torch.as_tensor(dataset.class_weights, dtype=torch.double) def _compute_avg_batch_size(self): + if not self.var_batch_size: + self.avg_batch_size = self.batch_size + return + dataset = self.dataset avg_chunk_length = int( (dataset.max_chunk_length + dataset.min_chunk_length) / 2 ) batch_mult = dataset.max_chunk_length / avg_chunk_length - return int(self.batch_size * batch_mult) + self.avg_batch_size = int(self.batch_size * batch_mult) - def _compute_iters_auto(self): + def set_num_egs_per_utt_epoch(self, num_egs_per_utt_epoch): + if num_egs_per_utt_epoch == "auto": + self._compute_num_egs_per_utt_epoch_auto() + else: + self.num_egs_per_utt_epoch = num_egs_per_utt_epoch + + def _compute_num_egs_per_utt_epoch_auto(self): dataset = self.dataset avg_seq_length = np.mean(dataset.seq_lengths) avg_chunk_length = int( (dataset.max_chunk_length + dataset.min_chunk_length) / 2 ) - self.iters_per_epoch = math.ceil(avg_seq_length / avg_chunk_length) - logging.debug("num iters per epoch: %d" % self.iters_per_epoch) + self.num_egs_per_utt_epoch = math.ceil(avg_seq_length / avg_chunk_length) + logging.debug("num iters per epoch: %d", self.num_egs_per_utt_epoch) + + def _compute_len(self, world_size): + self._len = int( + math.ceil( + self.num_egs_per_utt_epoch + * self.dataset.num_seqs + / self.avg_batch_size + / world_size + ) + ) + + def _compute_num_classes_per_batch(self): + self._num_classes_per_batch = int( + 
math.ceil( + self.avg_batch_size / self.num_egs_per_class / self.num_egs_per_utt + ) + ) + + def _get_class_weights(self, chunk_length): + if not self.has_short_seqs: + return self.dataset.class_weights + + # set weight to 0 for classes whose utterances are all shorter than the chunk length + zero_idx = self.dataset.class2max_length < chunk_length + if not np.any(zero_idx): + return self.dataset.class_weights + + class_weights = self.dataset.class_weights.clone() + class_weights[zero_idx] = 0 + # renormalize weights + class_weights /= class_weights.sum() + return class_weights + + def _get_seq_weights(self, chunk_length): + pass def __len__(self): return self._len @@ -108,9 +160,21 @@ def __iter__(self): self.batch = 0 return self + def hard_prototype_mining(self): + return self.num_hard_prototypes > 0 + + def set_hard_prototypes(self, affinity_matrix): + if affinity_matrix is None: + self.hard_prototypes = None + return + + # a class should not be proposed as its own hard prototype + affinity_matrix.fill_diagonal_(-1.0) + self.hard_prototypes = torch.topk( + affinity_matrix, self.num_hard_prototypes, dim=-1 + ).indices + def _get_utt_idx_basic(self, batch_mult=1): dataset = self.dataset - num_classes_per_batch = batch_mult * self._num_classes_per_batch if dataset.class_weights is None: @@ -212,7 +276,6 @@ def __next__(self): logging.info("batch 0 uttidx=%s", str(utt_idx[:10])) self.batch += 1 - index = [(i, chunk_length) for i in utt_idx] return index @@ -226,8 +289,11 @@ def filter_args(**kwargs): "batch_size", "var_batch_size", "iters_per_epoch", + "num_egs_per_utt_epoch", "num_egs_per_class", "num_egs_per_utt", + "num_hard_prototypes", + "num_egs_per_hard_prototype", ) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -255,6 +321,13 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--iters-per-epoch", + default=None, + type=lambda x: x if (x == "auto" or x is None) else float(x), + help=("deprecated, number of times we sample an utterance in each epoch, use --num-egs-per-utt-epoch"), + ) + + parser.add_argument( + "--num-egs-per-utt-epoch", default="auto", type=lambda x: x if x == "auto" else float(x), help=("number of times we sample an utterance in each epoch"), @@ -272,9 +345,20 @@ default=1, help=("number of samples per utterance in batch"), ) + parser.add_argument( + "--num-hard-prototypes", + type=int, + default=0, + help=("number of hard prototype classes per batch"), + ) + parser.add_argument( + "--num-egs-per-hard-prototype", + type=int, + default=1, + help=("number of samples per hard prototype class in the batch"), + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='weighted seq sampler options') add_argparse_args = add_class_args diff --git a/hyperion/torch/layers/attention.py b/hyperion/torch/layers/attention.py index 7b4f5c06..8ab75111 100644 --- a/hyperion/torch/layers/attention.py +++ b/hyperion/torch/layers/attention.py @@ -468,6 +468,7 @@ def _compute_softmax( else: # case when mask is 1d vector per batch element, # meaning that time1 and time2 are the same, so mask is symmetric + pad2 = 0 # fix this mask = nn.functional.pad(mask, (0, pad2)) mask = mask.squeeze(1).eq(0) # (batch, 1, time) diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index d7a086d1..6443ea02 100644 --- a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -19,6 +19,11 @@ def _l2_norm(x, axis=-1): return y +def _cosine_affinity(kernel): + kernel_norm = _l2_norm(kernel, axis=0) +
return torch.mm(kernel_norm.transpose(0, 1), kernel_norm) + + class ArcLossOutput(nn.Module): """Additive angular margin softmax (ArcFace) output layer. @@ -63,7 +68,9 @@ def __init__( self._compute_aux() + # each column is the prototype vector of a class self.kernel = nn.Parameter(torch.Tensor(in_feats, num_classes)) + # we normalize prototypes to have l2 norm = 1 self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5) def __repr__(self): @@ -99,9 +106,6 @@ def update_margin(self, epoch): Args: epoch: value of current epoch. """ - # if self.margin_warmup_epochs == 0: - # return - if epoch < self.margin_warmup_epochs: self.cur_margin = self.margin * epoch / self.margin_warmup_epochs self.cur_intertop_margin = ( @@ -145,11 +149,7 @@ def forward(self, x, y=None): cos_theta_m = cos_theta * self.cos_m - sin_theta * self.sin_m idx_ = torch.arange(0, batch_size, dtype=torch.long) - # if torch.distributed.get_rank() == 0: - # print("o1", output[idx_, y]) output[idx_, y] = cos_theta_m[idx_, y] - # if torch.distributed.get_rank() == 0: - # print("o2", output[idx_, y]) if self.cur_intertop_margin > 0: # implementation of intertop-K # set positive scores to -inf so they don't appear in the top k @@ -167,17 +167,16 @@ def forward(self, x, y=None): + sin_theta[idx_, topk_idx] * self.intertop_sin_m ) # take the maximum for the cases where m' is larger than theta to get cos(max(0, theta-m')) - # if torch.distributed.get_rank() == 0: - # print("o3", output[idx_, topk_idx]) output[idx_, topk_idx] = torch.maximum( output[idx_, topk_idx], cos_theta_m ) - # if torch.distributed.get_rank() == 0: - # print("o4", output[idx_, topk_idx], flush=True) output *= s # scale up in order to make softmax work return output + def compute_prototype_affinity(self): + return _cosine_affinity(self.kernel) + class CosLossOutput(nn.Module): """Additive margin softmax (CosFace) output layer. @@ -314,6 +313,9 @@ def forward(self, x, y=None): output *= s # scale up in order to make softmax work return output + def compute_prototype_affinity(self): + return _cosine_affinity(self.kernel) + class SubCenterArcLossOutput(ArcLossOutput): """Sub-Center Additive angular margin softmax (ArcFace) output layer. @@ -352,6 +354,11 @@ def __init__( ) self.num_classes = num_classes self.num_subcenters = num_subcenters + # this buffer counts how many times each subcenter is used during training, + # i.e., which subcenter corresponds to the clean label. + self.register_buffer( + "subcenter_counts", torch.zeros(num_classes, num_subcenters) + ) def __str__(self): s = "%s(in_feats=%d, num_classes=%d, num_subcenters=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" % ( self.__class__.__name__, @@ -367,6 +374,12 @@ def __str__(self): ) return s + def _update_counts(self, y, proto_idx): + self.subcenter_counts[y, proto_idx] += 1 + # we keep counts relative to avoid any risk of overflow + min_counts, _ = torch.min(self.subcenter_counts, dim=1, keepdim=True) + self.subcenter_counts -= min_counts + def forward(self, x, y=None): """Computes penalized logits.
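(Editor's note) a minimal, self-contained sketch of the subcenter_counts bookkeeping added in this hunk; the tensor shapes follow the registered buffer, while the batch values are made up for illustration:

    import torch

    num_classes, num_subcenters = 4, 2
    subcenter_counts = torch.zeros(num_classes, num_subcenters)

    # one training step: each sample's class label and its winning subcenter
    y = torch.tensor([0, 1, 2, 3])
    proto_idx = torch.tensor([1, 0, 1, 1])
    subcenter_counts[y, proto_idx] += 1

    # keep counts relative so the buffer never grows without bound
    subcenter_counts -= subcenter_counts.min(dim=1, keepdim=True).values

    # the most-used subcenter per class is treated as the clean-label prototype
    main_idx = subcenter_counts.argmax(dim=-1)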
@@ -385,17 +398,17 @@ kernel_norm = _l2_norm(self.kernel, axis=0) # cos(theta+m) cos_theta = torch.mm(x, kernel_norm).float() - cos_theta = torch.max( + cos_theta, proto_idx = torch.max( cos_theta.view(-1, self.num_classes, self.num_subcenters), dim=-1 - )[0] - + ) cos_theta = cos_theta.clamp(-1, 1) # for numerical stability - # print(cos_theta) + output = ( cos_theta * 1.0 ) # a little bit hacky way to prevent in_place operation on cos_theta if y is not None and self.training: + self._update_counts(y, proto_idx) cos_theta_2 = torch.pow(cos_theta, 2) sin_theta_2 = (1 + 1e-10) - cos_theta_2 sin_theta = torch.sqrt(sin_theta_2) @@ -426,3 +439,43 @@ def forward(self, x, y=None): output *= s # scale up in order to make softmax work return output + + def get_main_prototype_kernel(self): + _, idx2 = torch.max( + self.subcenter_counts, dim=-1 + ) # get indices for the main prototype + idx1 = torch.arange(self.num_classes) + kernel = self.kernel.view(-1, self.num_classes, self.num_subcenters)[:, idx1, idx2] + return kernel + + def compute_prototype_affinity(self): + kernel = self.get_main_prototype_kernel() + return _cosine_affinity(kernel) + + def to_arc_loss(self): + loss = ArcLossOutput( + in_feats=self.in_feats, + num_classes=self.num_classes, + cos_scale=self.cos_scale, + margin=self.margin, + margin_warmup_epochs=self.margin_warmup_epochs, + intertop_k=self.intertop_k, + intertop_margin=self.intertop_margin, + ) + kernel = self.get_main_prototype_kernel() + loss.kernel.data = kernel + return loss + + def to_cos_loss(self): + loss = CosLossOutput( + in_feats=self.in_feats, + num_classes=self.num_classes, + cos_scale=self.cos_scale, + margin=self.margin, + margin_warmup_epochs=self.margin_warmup_epochs, + intertop_k=self.intertop_k, + intertop_margin=self.intertop_margin, + ) + kernel = self.get_main_prototype_kernel() + loss.kernel.data = kernel + return loss diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py index bf0552dc..bd5c3f1b 100644 --- a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py @@ -52,7 +52,6 @@ def __init__( @staticmethod def filter_args(**kwargs): - base_args = HFWav2XVector.filter_args(**kwargs) child_args = HFHubert.filter_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args @@ -73,12 +72,22 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFHubert.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + @staticmethod def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") + HFHubert.add_finetune_args(parser, prefix="hf_feats") ResNet1dXVector.add_finetune_args(parser, prefix="xvector") if prefix is not None: diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py index 3cabb1d5..a5166d4d 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py @@ -77,12 +77,22 @@ def
add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + @staticmethod def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") ResNet1dXVector.add_finetune_args(parser, prefix="xvector") if prefix is not None: diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index d75a257b..3fed7143 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -83,6 +83,9 @@ def _fuse_hid_feats(self, hid_feats): return feats + def compute_prototype_affinity(self): + return self.xvector.compute_prototype_affinity() + def update_loss_margin(self, epoch): """Updates the value of the margin in AAM/AM-softmax losses given the epoch number @@ -114,14 +117,22 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def forward_feats(self, x, x_lengths, return_feat_layers=None): + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): return_hid_states = ( False if return_feat_layers is None and self.feat_fusion_method == "last" else True ) with self._hf_context: - hf_output = self.hf_feats(x, x_lengths, return_hid_states=return_hid_states) + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) feat_lengths = hf_output["hidden_states_lengths"] if return_hid_states: hid_feats = hf_output["hidden_states"] @@ -203,7 +214,8 @@ def extract_embed( x, x_lengths=None, vad_samples=None, - chunk_length=0, + hf_chunk_length=0, + xvec_chunk_length=0, embed_layer=None, detach_chunks=False, ): @@ -211,8 +223,15 @@ def extract_embed( if vad_samples is not None: x, x_lengths = remove_silence(x, x_lengths) - feats, _, feat_lengths = self.forward_feats(x, x_lengths) - xvec_chunk_length = int(chunk_length * feats.size(-1) // x.size(-1)) + feats, _, feat_lengths = self.forward_feats( + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) return self.xvector.extract_embed( feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks ) @@ -329,6 +348,11 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + def change_config(self, hf_feats, xvector): + logging.info("changing hf wav2xvector config") + self.hf_feats.change_config(**hf_feats) + self.xvector.change_config(**xvector) + @staticmethod def add_class_args(parser, prefix=None, skip=set()): diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py index efac4e50..2f4b66ce 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py @@ -73,12 +73,22 @@ def add_class_args(parser, prefix=None): if prefix is not 
None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWavLM.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + @staticmethod def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") + HFWavLM.add_finetune_args(parser, prefix="hf_feats") ResNet1dXVector.add_finetune_args(parser, prefix="xvector") if prefix is not None: diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 83c95222..c7a77f3e 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -57,6 +57,9 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) + def compute_prototype_affinity(self): + return self.xvector.compute_prototype_affinity() + def forward( self, x, diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py b/hyperion/torch/models/xvectors/efficient_net_xvector.py index 21eb9dbe..df5965cd 100644 --- a/hyperion/torch/models/xvectors/efficient_net_xvector.py +++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py @@ -204,6 +204,17 @@ def get_config(self): config.update(base_config) return config + def change_config( + self, override_dropouts=False, dropout_rate=0, drop_connect_rate=0, **kwargs + ): + xvec_args = XVector.filter_finetune_args(**kwargs) + xvec_args["override_dropouts"] = False + super().change_config(**xvec_args) + + if override_dropouts: + self.encoder_net.change_dropouts(dropout_rate, drop_connect_rate) + self.classif_net.change_dropouts(dropout_rate) + @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): @@ -215,6 +226,7 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + @staticmethod def filter_args(**kwargs): base_args = XVector.filter_args(**kwargs) @@ -236,6 +248,25 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = EN.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + EN.add_finetune_args(parser) + XVector.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index e4495182..5957c9f5 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -81,6 +81,38 @@ def get_config(self): config.update(base_config) return config + def change_config( + self, + resnet_enc, + override_dropouts=False, + dropout_rate=0, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0, + num_subcenters=2, + ): + super().change_config( + False, + dropout_rate, + num_classes, + loss_type, + cos_scale, 
+ margin, + margin_warmup_epochs, + intertop_k, + intertop_margin, + num_subcenters, + ) + if override_dropouts: + logging.info("changing x-vector head dropouts") + self.classif_net.change_dropouts(dropout_rate) + + self.encoder_net.change_config(**resnet_enc) + @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): @@ -118,6 +150,26 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = Encoder.filter_finetune_args(**kwargs["resnet_enc"]) + base_args["resnet_enc"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + Encoder.add_finetune_args( + parser, prefix="resnet_enc", skip=set(["head_channels"]) + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index 99385cae..fe88ff57 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -191,6 +191,7 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + @staticmethod def filter_args(**kwargs): base_args = XVector.filter_args(**kwargs) @@ -210,6 +211,26 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + + base_args = XVector.filter_finetune_args(**kwargs) + child_args = RNF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + RNF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index 676952da..1e616570 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -219,8 +219,8 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + @staticmethod def filter_args(**kwargs): - base_args = XVector.filter_args(**kwargs) child_args = SNF.filter_args(**kwargs) @@ -240,3 +240,23 @@ def add_class_args(parser, prefix=None): outer_parser.add_argument("--" + prefix,
action=ActionParser(parser=parser)) add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = SNF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + SNF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index 7816c7ea..f28dc9b3 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -153,8 +153,8 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + @staticmethod def filter_args(**kwargs): - base_args = XVector.filter_args(**kwargs) child_args = TF.filter_args(**kwargs) @@ -172,6 +172,25 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = TF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + TF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index 742fadc8..b3428783 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -365,3 +365,51 @@ def add_class_args(parser, prefix=None): # help='xvector options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + """Filters arguments corresponding to TransformerXVector + from args dictionary + + Args: + kwargs: args dictionary + + Returns: + args dictionary + """ + base_args = XVector.filter_finetune_args(**kwargs) + + valid_args = ( + "pos_dropout_rate", + "att_dropout_rate", + ) + + child_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + base_args.update(child_args) + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + """Adds TransformerXVector config parameters for finetuning to argparser + + Args: + parser: argparse object + prefix: prefix string to add to the argument names + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index c35e6a4a..197ef5a9 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -4,7 +4,7 @@ """ import logging from enum import Enum -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo from typing import Optional import torch @@ -531,6 +531,36 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model + def change_config( + self, + override_dropouts=False, + dropout_rate=0, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0,
+ num_subcenters=2, + ): + logging.info("changing x-vector config") + self.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + + if override_dropouts: + logging.info("overriding x-vector dropouts") + self.encoder_net.change_dropouts(dropout_rate) + self.classif_net.change_dropouts(dropout_rate) + def rebuild_output_layer( self, num_classes=None, @@ -547,6 +577,7 @@ def rebuild_output_layer( ): # if we change the number of classes or the loss-type # we need to reinitiate the last layer + logging.info("rebuilding output layer") self.classif_net.rebuild_output_layer( num_classes, loss_type, cos_scale, margin, margin_warmup_epochs ) @@ -602,6 +633,9 @@ def _train(self, train_mode: str): else: raise ValueError(f"invalid train_mode={train_mode}") + def compute_prototype_affinity(self): + return self.classif_net.compute_prototype_affinity() + @staticmethod def valid_train_modes(): return ["full", "frozen", "ft-embed-affine"] @@ -850,9 +884,26 @@ def add_finetune_args(parser, prefix=None): help="number of subcenters in subcenter losses", ) + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='xvector finetune opts') add_argparse_args = add_class_args add_argparse_finetune_args = add_finetune_args diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 6a886e44..adfeceb3 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -5,6 +5,7 @@ from jsonargparse import ArgumentParser, ActionParser +import torch import torch.nn as nn from torch.nn import Linear @@ -310,6 +311,14 @@ def extract_embed(self, x, embed_layer=0): y = self.fc_blocks[l](x) return y + def compute_prototype_affinity(self): + if self.loss_type != "softmax": + return self.output.compute_prototype_affinity() + + kernel = self.output.weight # (num_classes, feat_dim) + kernel = kernel / torch.linalg.norm(kernel, 2, dim=1, keepdim=True) + return torch.mm(kernel, kernel.transpose(0, 1)) + def get_config(self): hid_act = AF.get_config(self.fc_blocks[0].activation) diff --git a/hyperion/torch/narchs/efficient_net.py b/hyperion/torch/narchs/efficient_net.py index 8a71d6f4..273fa183 100644 --- a/hyperion/torch/narchs/efficient_net.py +++ b/hyperion/torch/narchs/efficient_net.py @@ -4,7 +4,7 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import torch import torch.nn as nn @@ -395,6 +395,17 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + def change_dropouts(self, dropout_rate, drop_connect_rate): + super().change_dropouts(dropout_rate) + from ..layers import DropConnect2d + + for module in self.modules(): + if isinstance(module, DropConnect2d): + module.p *= drop_connect_rate / self.drop_connect_rate + + self.drop_connect_rate = drop_connect_rate + self.dropout_rate = dropout_rate + 
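+    # (editor's sketch) the rescaling above is proportional: if the pretrained
+    # model was built with drop_connect_rate=0.2 and a given block uses p=0.15,
+    # calling change_dropouts(dropout_rate, drop_connect_rate=0.1) halves that
+    # block's probability to p = 0.15 * (0.1 / 0.2) = 0.075.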
@staticmethod def filter_args(**kwargs): @@ -424,7 +435,6 @@ ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args @staticmethod @@ -590,6 +600,53 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='efficientnet options') add_argparse_args = add_class_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + except: + pass + + parser.add_argument( + "--drop-connect-rate", + default=0.2, + type=float, + help="layer drop probability", + ) + + try: + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "out_units", + "override_dropouts", + "drop_connect_rate", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args diff --git a/hyperion/torch/narchs/resnet1d_encoder.py index 2044f528..794f8144 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -3,8 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import math +import logging import numpy as np @@ -518,6 +519,22 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + def change_config(self, override_dropouts, dropout_rate, drop_connect_rate): + if override_dropouts: + logging.info("changing resnet1d dropouts") + self.change_dropouts(dropout_rate, drop_connect_rate) + + def change_dropouts(self, dropout_rate, drop_connect_rate): + super().change_dropouts(dropout_rate) + from ..layers import DropConnect1d + + for module in self.modules(): + if isinstance(module, DropConnect1d): + module.p *= drop_connect_rate / self.drop_connect_rate + + self.drop_connect_rate = drop_connect_rate + self.dropout_rate = dropout_rate + @staticmethod def filter_args(**kwargs): if "wo_norm" in kwargs: @@ -791,6 +808,55 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='ResNet1d encoder options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "override_dropouts", + "drop_connect_rate", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set([])): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model."
+ ), + ) + except: + pass + + try: + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) + except: + pass + + try: + parser.add_argument( + "--drop-connect-rate", + default=0, + type=float, + help="layer drop probability", + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/resnet2d_encoder.py index a72cabac..b27e883d 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -5,6 +5,7 @@ import math from jsonargparse import ArgumentParser, ActionParser +import logging import torch import torch.nn as nn @@ -352,6 +353,22 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + def change_config(self, override_dropouts, dropout_rate, drop_connect_rate): + if override_dropouts: + logging.info("changing resnet2d dropouts") + self.change_dropouts(dropout_rate, drop_connect_rate) + + def change_dropouts(self, dropout_rate, drop_connect_rate): + super().change_dropouts(dropout_rate) + from ..layers import DropConnect2d + + for module in self.modules(): + if isinstance(module, DropConnect2d): + module.p *= drop_connect_rate / self.drop_connect_rate + + self.drop_connect_rate = drop_connect_rate + self.dropout_rate = dropout_rate + @staticmethod def filter_args(**kwargs): diff --git a/hyperion/torch/narchs/resnet_factory.py index bd58cd2b..645b7f2b 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo from .resnet import * @@ -308,6 +308,41 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='ResNet options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + valid_args = ( + "override_dropouts", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model."
+ ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/spinenet_factory.py b/hyperion/torch/narchs/spinenet_factory.py index 02e36244..9e94a1be 100644 --- a/hyperion/torch/narchs/spinenet_factory.py +++ b/hyperion/torch/narchs/spinenet_factory.py @@ -2,7 +2,7 @@ Copyright 2020 Magdalena Rybicka Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo from .spinenet import * @@ -266,3 +266,40 @@ def add_class_args(parser, prefix=None): outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "override_dropouts", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/tdnn_factory.py b/hyperion/torch/narchs/tdnn_factory.py index 584e9243..6a9e6010 100644 --- a/hyperion/torch/narchs/tdnn_factory.py +++ b/hyperion/torch/narchs/tdnn_factory.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo from .tdnn import TDNNV1 from .etdnn import ETDNNV1 @@ -264,3 +264,40 @@ def add_class_args(parser, prefix=None): # help='TDNN options') add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + + valid_args = ( + "override_dropouts", + "dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." 
+ ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index f1a5b26c..d2949c12 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -125,9 +125,6 @@ def __init__( if self.norm_before: self.norm = nn.LayerNorm(d_model) - # def _make_in_layer(self, in_layer_type, in_feats, d_model, - # dropout_rate, pos_dropout_rate, - # padding_idx, time_dim): def _make_in_layer(self): in_feats = self.in_feats @@ -240,6 +237,31 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) + def change_dropouts(self, pos_dropout_rate, att_dropout_rate, ff_dropout_rate): + + assert pos_dropout_rate == 0 or self.pos_dropout_rate > 0 + assert att_dropout_rate == 0 or self.att_dropout_rate > 0 + assert ff_dropout_rate == 0 or self.ff_dropout_rate > 0 + + for module in self.modules(): + if isinstance(module, PosEncoder): + for layer in module.modules(): + if isinstance(layer, nn.Dropout): + layer.p = pos_dropout_rate + + elif isinstance(module, EBlock): + for layer in module.modules(): + if isinstance(layer, nn.Dropout): + layer.p = ff_dropout_rate + + for layer in module.self_attn.modules(): + if isinstance(layer, nn.Dropout): + layer.p = att_dropout_rate + + self.pos_dropout_rate = pos_dropout_rate + self.att_dropout_rate = att_dropout_rate + self.ff_dropout_rate = ff_dropout_rate + def in_context(self): return (self.att_context, self.att_context) diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index af3a305c..1d01e02b 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -51,6 +51,16 @@ def unfreeze(self): for param in self.parameters(): param.requires_grad = True + def change_dropouts(self, dropout_rate): + """Changes all dropout rates of the model.""" + for module in self.modules(): + if isinstance(module, nn.modules.dropout._DropoutNd): + module.p = dropout_rate + + if hasattr(self, "dropout_rate"): + assert dropout_rate == 0 or self.dropout_rate > 0 + self.dropout_rate = dropout_rate + @property def train_mode(self): return self._train_mode diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index 82ce70bd..ba331573 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -131,6 +131,10 @@ class HFHubert(HFWav2VecBase): and uses the ones passed as arguments. override_spec_augment (`bool` defaults to False): if True, it ingnores the spec. augment. configuration in the pretrained model and uses the ones passed in the arguments. + left_encoder_context (`int`): past context frames used by the transformer encoder when the signal is evaluated + chunk by chunk, if it is too long to fit in GPU. + right_encoder_context: (`int`): future context frames used by the transformer encoder. + sample_frequency: (`int`) waveform sample frequency used to train the model. 
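+      Note: when the signal is evaluated chunk by chunk, these encoder contexts are
+      added on top of the receptive field of the convolutional feature encoder
+      (see `feature_encoder_context` in `HFWav2VecBase`).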
""" def __init__( @@ -175,6 +179,9 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + left_encoder_context: int = 16, + right_encoder_context: int = 16, + sample_frequency: int = 16000, ): super().__init__( @@ -189,6 +196,9 @@ def __init__( ignore_pretrained=ignore_pretrained, override_dropouts=override_dropouts, override_spec_augment=override_spec_augment, + left_encoder_context=left_encoder_context, + right_encoder_context=right_encoder_context, + sample_frequency=sample_frequency, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -216,7 +226,9 @@ def __init__( ) ddp_wait_for_all_procs() self.hf_model.config.layerdrop = 0.0 - self.change_hyperparams( + self.change_config( + override_dropouts=self.override_dropouts, + override_spec_augment=self.override_spec_augment, hidden_dropout=hidden_dropout, activation_dropout=activation_dropout, attention_dropout=attention_dropout, @@ -571,3 +583,110 @@ def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + args_base = HFWav2VecBase.filter_args(**kwargs) + valid_args = ( + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_finetune_args(parser) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length `mask_feature_length` 
generated along the feature axis" + ), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index e91fe8c4..579574a1 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -144,6 +144,10 @@ class HFWav2Vec2(HFWav2VecBase): and uses the ones passed as arguments. override_spec_augment (`bool` defaults to False): if True, it ingnores the spec. augment. configuration in the pretrained model and uses the ones passed in the arguments. + left_encoder_context (`int`): past context frames used by the transformer encoder when the signal is evaluated + chunk by chunk, if it is too long to fit in GPU. + right_encoder_context: (`int`): future context frames used by the transformer encoder. + sample_frequency: (`int`) waveform sample frequency used to train the model. """ def __init__( @@ -193,6 +197,9 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + left_encoder_context: int = 16, + right_encoder_context: int = 16, + sample_frequency: int = 16000, ): super().__init__( @@ -207,6 +214,9 @@ def __init__( ignore_pretrained=ignore_pretrained, override_dropouts=override_dropouts, override_spec_augment=override_spec_augment, + left_encoder_context=left_encoder_context, + right_encoder_context=right_encoder_context, + sample_frequency=sample_frequency, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -234,7 +244,9 @@ def __init__( ) ddp_wait_for_all_procs() self.hf_model.config.layerdrop = 0.0 - self.change_hyperparams( + self.change_config( + override_dropouts=self.override_dropouts, + override_spec_augment=self.override_spec_augment, hidden_dropout=hidden_dropout, activation_dropout=activation_dropout, attention_dropout=attention_dropout, @@ -316,7 +328,7 @@ def change_dropouts( self.hf_model.feature_projection.dropout.p = feat_proj_dropout for module in self.hf_model.encoder.modules(): if isinstance(module, nn.Dropout): - t.p = hidden_dropout + module.p = hidden_dropout for module in self.hf_model.encoder.modules(): if isinstance(module, t.Wav2Vec2Attention): @@ -667,6 +679,113 @@ def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + @staticmethod + def filter_finetune_args(**kwargs): + args_base = HFWav2VecBase.filter_args(**kwargs) + valid_args = ( + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_finetune_args(parser) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", 
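+            # defaults below are assumed to mirror the corresponding fields of the
+            # HuggingFace Wav2Vec2 config; check the transformers docs for exact values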
+            default=0.1,
+            type=float,
+            help=("the dropout ratio for the attention probabilities"),
+        )
+        parser.add_argument(
+            "--apply-spec-augment",
+            default=True,
+            action=ActionYesNo,
+            help=(
+                "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder"
+            ),
+        )
+        parser.add_argument(
+            "--mask-time-prob",
+            default=0.05,
+            type=float,
+            help=(
+                "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked"
+            ),
+        )
+        parser.add_argument(
+            "--mask-time-length",
+            default=10,
+            type=int,
+            help=("length of vector span along the time axis"),
+        )
+        parser.add_argument(
+            "--mask-time-min-masks",
+            default=2,
+            type=int,
+            help=(
+                "the minimum number of masks of length `mask_time_length` generated along the time axis"
+            ),
+        )
+        parser.add_argument(
+            "--mask-feature-prob",
+            default=0.0,
+            type=float,
+            help=(
+                "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked"
+            ),
+        )
+        parser.add_argument(
+            "--mask-feature-length",
+            default=10,
+            type=int,
+            help=("length of vector span along the feature axis"),
+        )
+        parser.add_argument(
+            "--mask-feature-min-masks",
+            default=0,
+            type=int,
+            help=(
+                "the minimum number of masks of length `mask_feature_length` generated along the feature axis"
+            ),
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    """
+    Things I think I don't need:
+      feat_quantizer_dropout (`float`, defaults to 0.0): the dropout probability
+      for quantized feature encoder states.
diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py
index 87f19835..1dceed1c 100644
--- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py
+++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py
@@ -49,6 +49,10 @@ class HFWav2VecBase(TorchModel):
         and uses the ones passed as arguments.
       override_spec_augment (`bool` defaults to False): if True, it ignores the spec. augment.
         configuration in the pretrained model and uses the ones passed in the arguments.
+      left_encoder_context (`int`): past context frames used by the transformer encoder when the signal is evaluated
+        chunk by chunk, if it is too long to fit in GPU.
+      right_encoder_context (`int`): future context frames used by the transformer encoder.
+      sample_frequency (`int`): waveform sample frequency used to train the model.
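+
+      A minimal usage sketch (the subclass and checkpoint name are illustrative,
+      not prescribed by this base class):
+
+      >>> model = HFWav2Vec2(pretrained_model_path="facebook/wav2vec2-base")
+      >>> out = model(x, x_lengths, chunk_length=120.0)  # chunked forward
+      >>> h = out["last_hidden_state"]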
""" def __init__( @@ -64,6 +69,9 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + left_encoder_context: int = 16, + right_encoder_context: int = 16, + sample_frequency: int = 16000, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -75,6 +83,8 @@ def __init__( self.ignore_pretrained = ignore_pretrained self.override_dropouts = override_dropouts self.override_spec_augment = override_spec_augment + self.right_encoder_context = right_encoder_context + self.left_encoder_context = left_encoder_context if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -130,9 +140,14 @@ def __init__( ddp_wait_for_all_procs() normalize_input = feature_extractor.do_normalize use_input_attention_mask = feature_extractor.return_attention_mask + sample_frequency = feature_extractor.sampling_rate self.normalize_input = normalize_input self.use_input_attention_mask = use_input_attention_mask + self.sample_frequency = sample_frequency + + self._feature_encoder_context = None + self._frame_shift = None def __deepcopy__(self, memo): """Reimplementation of deepcopy for Hugging Face models. @@ -149,11 +164,65 @@ def __deepcopy__(self, memo): new_obj.to(device) return new_obj - def change_hyperparams(self, **kwargs): - if self.override_spec_augment: + @property + def feature_encoder_context(self): + if self._feature_encoder_context is not None: + return self._feature_encoder_context + + total_context = 0 + total_stride = 1 + for kernel, stride in zip( + self.hf_model.config.conv_kernel, self.hf_model.config.conv_stride + ): + total_context += total_stride * (kernel - 1) / 2 + total_stride *= stride + + self._feature_encoder_context = (int(total_context + 0.5), int(total_context)) + return self._feature_encoder_context + + @property + def frame_shift(self): + if self._frame_shift is not None: + return self._frame_shift + + total_stride = 1 + for stride in self.hf_model.config.conv_stride: + total_stride *= stride + + self._frame_shift = total_stride + return total_stride + + @property + def context(self): + left, right = self.feature_encoder_context + left += self.left_encoder_context + right += self.right_encoder_context + return left, right + + def max_out_length(self, max_in_length): + return self.hf_model._get_feat_extract_output_lengths(max_in_length).item() + # left_context, right_context = self.feature_encoder_context + # max_in_length = max_in_length - left_context - right_context + # return max_in_length // self.frame_shift + + def out_lengths(self, in_lengths): + return self.hf_model._get_feat_extract_output_lengths(in_lengths) + # left_context, right_context = self.feature_encoder_context + # in_lengths = in_lengths - left_context - right_context + # return torch.div(in_lengths, self.frame_shift, rounding_mode="floor") + + def out_shape(self, in_shape): + out_length = self.max_out_length(in_shape[1]) + C = self.hf_model.config.hidden_size + return (in_shape[0], out_length, C) + + def change_config(self, override_dropouts, override_spec_augment, **kwargs): + if override_spec_augment: + logging.info("overriding speech augment") self.change_spec_augment(**kwargs) - if self.override_dropouts: + if override_dropouts: + logging.info("overriding hf model dropouts") self.change_dropouts(**kwargs) def change_spec_augment( @@ -217,6 +286,51 @@ def forward( x_lengths: Optional[torch.LongTensor] = None, return_attentions: bool = False, return_hid_states: bool = False, + chunk_length: float = 
0,
+        detach_chunks: bool = True,
+    ):
+        r"""Forward function for wav2vec style models; long utterances that do
+        not fit in GPU memory are evaluated chunk by chunk.
+
+        Args:
+          x: input audio of shape = (batch, sequence_length).
+          x_lengths: lengths of the audio waveforms in samples with shape = (batch,).
+          return_attentions: whether or not to return the attentions tensors of
+            all attention layers.
+          return_hid_states: whether or not to return the hidden states of all layers.
+          chunk_length: chunk length in seconds.
+          detach_chunks: whether to detach the output of each chunk from the
+            autograd graph to save memory.
+
+        Returns:
+          Dictionary with:
+            last_hidden_state: sequence of hidden-states at the output of the last
+              layer of the model (torch.FloatTensor of shape
+              (batch_size, sequence_length, hidden_size)).
+            extract_features: sequence of extracted feature vectors of the last
+              convolutional layer of the model (torch.FloatTensor of shape
+              (batch_size, sequence_length, conv_dim[-1])).
+            hidden_states: hidden-states of the model at the output of each layer
+              plus the initial embedding outputs (tuple(torch.FloatTensor)).
+            attentions: attentions weights after the attention softmax, used to
+              compute the weighted average in the self-attention heads
+              (tuple(torch.FloatTensor)).
+        """
+        if chunk_length == 0 or x.size(1) < chunk_length * self.sample_frequency:
+            return self.forward_impl(x, x_lengths, return_attentions, return_hid_states)
+        else:
+            return self.forward_long_impl(
+                x,
+                x_lengths,
+                return_attentions,
+                return_hid_states,
+                chunk_length,
+                detach_chunks,
+            )
+
+    def forward_impl(
+        self,
+        x: torch.Tensor,
+        x_lengths: Optional[torch.LongTensor] = None,
+        return_attentions: bool = False,
+        return_hid_states: bool = False,
     ):
         r"""Forward function for wav2vec style models.
 
         Args:
@@ -259,6 +373,143 @@ def forward(
 
         return output
 
+    def forward_long_impl(
+        self,
+        x: torch.Tensor,
+        x_lengths: Optional[torch.LongTensor] = None,
+        return_attentions: bool = False,
+        return_hid_states: bool = False,
+        chunk_length: float = 120.0,
+        detach_chunks: bool = True,
+    ):
+        r"""Forward function for long utterances that do not fit in GPU memory.
+
+        Args:
+          x: input audio of shape = (batch, sequence_length).
+          x_lengths: lengths of the audio waveforms in samples with shape = (batch,).
+          return_attentions: whether or not to return the attentions tensors of
+            all attention layers.
+          return_hid_states: whether or not to return the hidden states of all layers.
+          chunk_length: chunk length in seconds.
+          detach_chunks: whether to detach the output of each chunk from the
+            autograd graph to save memory.
+
+        Returns:
+          Dictionary with:
+            last_hidden_state: sequence of hidden-states at the output of the last
+              layer of the model (torch.FloatTensor of shape
+              (batch_size, sequence_length, hidden_size)).
+            extract_features: sequence of extracted feature vectors of the last
+              convolutional layer of the model (torch.FloatTensor of shape
+              (batch_size, sequence_length, conv_dim[-1])).
+            hidden_states: hidden-states of the model at the output of each layer
+              plus the initial embedding outputs (tuple(torch.FloatTensor)).
+            attentions: attentions weights after the attention softmax, used to
+              compute the weighted average in the self-attention heads
+              (tuple(torch.FloatTensor)).
+        """
+        # output0 = self.forward_impl(x, x_lengths)
+        # mol0 = output0.last_hidden_state.size(1)
+        logging.debug("evaluating long utterance chunk by chunk")
+        max_in_length = x.size(-1)
+        x, x_mask = self._preprocess(x, x_lengths)
+        # we transform the chunk length from seconds to samples,
+        # making sure that the chunk_length corresponds to an integer number of output samples.
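+        # illustrative arithmetic (assuming sample_frequency=16000 and
+        # frame_shift=320, the product of the conv strides of typical
+        # wav2vec2-style feature encoders): chunk_length=120.0 s gives
+        # 120 * 16000 // 320 = 6000 output frames per chunk, and the chunk is
+        # rounded to 6000 * 320 = 1,920,000 samples.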
+ chunk_frames = int(chunk_length * self.sample_frequency) // self.frame_shift + chunk_length = chunk_frames * self.frame_shift + num_chunks = (x.size(1) + chunk_length - 1) // chunk_length + left_context, right_context = self.context + max_out_length = self.max_out_length(x.size(1)) + start = 0 + outputs = [] + for i in range(num_chunks): + if i < num_chunks - 1: + start_i = max(start - left_context, 0) + else: + # last chunk has special treatment, we forward pass + # a chunk with chunk_length size ending at the end. + # but we will just use the output frames that don't overlap + # with the second last chunk. + start_i = max(x.size(1) - chunk_length - left_context, 0) + + stop_i = min(start + chunk_length + right_context, x.size(1)) + x_i = x[:, start_i:stop_i] + x_mask_i = None if x_mask is None else x_mask[start_i:stop_i] + output_i = self.hf_model( + x_i, + x_mask_i, + output_attentions=return_attentions, + output_hidden_states=return_hid_states, + ) + + if i < num_chunks - 1: + start_out_i = max( + output_i.last_hidden_state.size(1) + - chunk_frames + - self.right_encoder_context, + 0, + ) + stop_out_i = start_out_i + chunk_frames + else: + # we just use the frames that do not overlap + # with the second last chunk + remaining_frames = max_out_length - i * chunk_frames + start_out_i = -remaining_frames + stop_out_i = output_i.last_hidden_state.size(1) + + output_i.last_hidden_state = output_i.last_hidden_state[ + :, start_out_i:stop_out_i + ] + if detach_chunks: + output_i.last_hidden_state.detach_() + + if return_hid_states: + output_i.hidden_states = [ + h[:, start_out_i:stop_out_i] for h in output_i.hidden_states + ] + if detach_chunks: + output_i.hidden_states = [ + h.detach() for h in output_i.hidden_states + ] + + outputs.append(output_i) + start += chunk_length + + # concatenate outputs from different chunks + output = outputs[0] + output.last_hidden_state = torch.cat( + [o.last_hidden_state for o in outputs], dim=1 + ) + if return_hid_states: + hidden_states = [] + for j in range(len(outputs[0].hidden_states)): + hidden_states_j = torch.cat( + [o.hidden_states[j] for o in outputs], dim=1 + ) + hidden_states.append(hidden_states_j) + output.hidden_states = hidden_states + + if return_attentions: + attentions = [] + for j in range(len(outputs[0].attentions)): + attentions_j = [o.attentions[j] for o in outputs] + attentions.append(attentions_j) + output.attentions = attentions + + feat_lengths = ( + None + if x_lengths is None + else scale_seq_lengths(x_lengths, max_out_length, max_in_length) + ) + output["hidden_states_lengths"] = feat_lengths + # print( + # "lens", + # mol0, + # max_out_length, + # output.last_hidden_state.size(1), + # output.hidden_states[0].size(1), + # flush=True, + # ) + return output + def get_config(self): """Returns the configuration arguments for the object in a dictionary.""" @@ -274,6 +525,9 @@ def get_config(self): "ignore_pretrained": self.ignore_pretrained, "override_dropouts": self.override_dropouts, "override_spec_augment": self.override_spec_augment, + "left_encoder_context": self.left_encoder_context, + "right_encoder_context": self.right_encoder_context, + "sample_frequency": self.sample_frequency, } base_config = super().get_config() @@ -298,6 +552,9 @@ def filter_args(**kwargs): "ignore_pretrained", "override_dropouts", "override_spec_augment", + "left_encoder_context", + "right_encoder_context", + "sample_frequency", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @@ -383,5 +640,61 @@ def add_class_args(parser, 
prefix=None, skip=set()): "arguments instead of the defaults in the pretrained model." ), ) + parser.add_argument( + "--left-encoder-context", + default=16, + type=int, + help=( + "past context frames used by the transformer encoder " + "when the signal is evaluated chunk by chunk." + ), + ) + parser.add_argument( + "--right-encoder-context", + default=16, + type=int, + help=( + "future context frames used by the transformer encoder " + "when the signal is evaluated chunk by chunk." + ), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + valid_args = ( + "override_dropouts", + "override_spec_augment", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + parser.add_argument( + "--override-spec-augment", + default=False, + action=ActionYesNo, + help=( + "whether to use the spec augment config. passed in the " + "arguments instead of the defaults in the pretrained model." + ), + ) + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index c75cb6e8..15b8248d 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -144,6 +144,10 @@ class HFWavLM(HFWav2VecBase): and uses the ones passed as arguments. override_spec_augment (`bool` defaults to False): if True, it ingnores the spec. augment. configuration in the pretrained model and uses the ones passed in the arguments. + left_encoder_context (`int`): past context frames used by the transformer encoder when the signal is evaluated + chunk by chunk, if it is too long to fit in GPU. + right_encoder_context: (`int`): future context frames used by the transformer encoder. + sample_frequency: (`int`) waveform sample frequency used to train the model. 
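+      Note: when a pretrained checkpoint is loaded, `sample_frequency` is read from
+      the feature extractor configuration, which overrides the value passed here.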
""" def __init__( @@ -193,6 +197,9 @@ def __init__( ignore_pretrained: bool = False, override_dropouts: bool = False, override_spec_augment: bool = False, + left_encoder_context: int = 16, + right_encoder_context: int = 16, + sample_frequency: int = 16000, ): super().__init__( @@ -207,6 +214,9 @@ def __init__( ignore_pretrained=ignore_pretrained, override_dropouts=override_dropouts, override_spec_augment=override_spec_augment, + left_encoder_context=left_encoder_context, + right_encoder_context=right_encoder_context, + sample_frequency=sample_frequency, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -234,7 +244,9 @@ def __init__( ) ddp_wait_for_all_procs() self.hf_model.config.layerdrop = 0.0 - self.change_hyperparams( + self.change_config( + override_dropouts=self.override_dropouts, + override_spec_augment=self.override_spec_augment, hidden_dropout=hidden_dropout, activation_dropout=activation_dropout, attention_dropout=attention_dropout, @@ -640,3 +652,110 @@ def add_class_args(parser, prefix=None, skip=set()): ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + args_base = HFWav2VecBase.filter_args(**kwargs) + valid_args = ( + "hidden_dropout", + "activation_dropout", + "attention_dropout", + "feat_proj_dropout", + "apply_spec_augment", + "mask_time_prob", + "mask_time_length", + "mask_time_min_masks", + "mask_feature_prob", + "mask_feature_length", + "mask_feature_min_masks", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(args_base) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2VecBase.add_finetune_args(parser) + parser.add_argument( + "--hidden-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "fully connected layers in the embeddings, encoder, and pooler" + ), + ) + parser.add_argument( + "--activation-dropout", + default=0.1, + type=float, + help=( + "the dropout probability for all " + "intermediate layer in feedforward transformer layers" + ), + ) + parser.add_argument( + "--attention-dropout", + default=0.1, + type=float, + help=("the dropout ratio for the attention probabilities"), + ) + parser.add_argument( + "--apply-spec-augment", + default=True, + action=ActionYesNo, + help=( + "whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder" + ), + ) + parser.add_argument( + "--mask-time-prob", + default=0.05, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the time axis which will be masked" + ), + ) + parser.add_argument( + "--mask-time-length", + default=10, + type=int, + help=("length of vector span along the time axis"), + ) + parser.add_argument( + "--mask-time-min-masks", + default=2, + type=int, + help=( + "the minimum number of masks of length `mask_time_length` generated along the time axis" + ), + ) + parser.add_argument( + "--mask-feature-prob", + default=0.0, + type=float, + help=( + "percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked" + ), + ) + parser.add_argument( + "--mask-feature-length", + default=10, + type=int, + help=(" length of vector span along the feature axis"), + ) + parser.add_argument( + "--mask-feature-min-masks", + default=0, + type=int, + help=( + "The minimum number of masks of length 
`mask_feature_length` generated along the feature axis" + ), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/utils/eval_utils.py b/hyperion/torch/utils/eval_utils.py index e8fa9c86..f1ae1edb 100644 --- a/hyperion/torch/utils/eval_utils.py +++ b/hyperion/torch/utils/eval_utils.py @@ -8,12 +8,8 @@ def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1): - # model_device = next(nnet.parameters()).device - # print(device, model_device, x.device) - # assume time is the last dimension device = None if nnet.device == x.device else nnet.device - T = x.shape[time_dim] if T <= chunk_length or chunk_length == 0: if device is not None: @@ -183,170 +179,3 @@ def eval_nnet_overlap_add( y = y.transpose(0, time_dim) / count return y - - -# """ -# Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -# """ - -# import math -# import torch - -# def eval_nnet_by_chunks(x, nnet, chunk_length=0, device=None, time_dim=-1): -# # model_device = next(nnet.parameters()).device -# # print(device, model_device, x.device) -# #assume time is the last dimension -# T = x.shape[time_dim] -# if T <= chunk_length or chunk_length == 0: -# if device is not None: -# x = x.to(device) -# return nnet(x) #.detach() - -# try: -# left_context, right_context = nnet.in_context() -# except: -# left_context = right_context = 0 - -# in_shape = x.shape -# chunk_shift_in = chunk_length - left_context - right_context - -# try: -# out_shape = nnet.out_shape(in_shape) -# T_out = out_shape[time_dim] -# r = float(T_out)/T -# except: -# out_shape = None - - -# num_chunks = int(math.ceil((T-chunk_length)/chunk_shift_in+1)) -# #move time dimension to dim 0 -# x = x.transpose(0, time_dim) -# y = None -# tbeg_in = 0 -# tbeg_out = 0 -# for i in range(num_chunks): -# tend_in = min(tbeg_in + chunk_length, x.shape[0]) -# #get slice and move back time dimension to last dim -# x_i = x[tbeg_in:tend_in].transpose(0, time_dim) -# if device is not None: -# x_i = x_i.to(device) - -# y_i = nnet(x_i).detach() -# chunk_length_out = y_i.shape[time_dim] -# if out_shape is None: -# # infer chunk_shift in the output -# r = float(chunk_length_out)/chunk_length - -# # infer total output length -# T_out = int(r * T) -# out_shape = list(y_i.shape) -# out_shape[time_dim] = T_out - -# if y is None: -# right_context_out = int(math.floor(r*right_context)) -# left_context_out = int(math.floor(r*left_context)) -# chunk_shift_out = chunk_length_out - right_context_out - left_context_out -# # create output tensor -# y = torch.zeros(out_shape) -# #move time dimension to dim 0 -# y = y.transpose(0, time_dim) - -# y_i = y_i.transpose(0, time_dim) - -# if i == 0: -# tend_out = min(tbeg_out + chunk_length_out, T_out) -# y[tbeg_out:tend_out] = y_i -# tbeg_out =+ (chunk_length_out - right_context_out) -# else: -# tend_out = min(int(round(tbeg_out)) + chunk_length_out - left_context_out, T_out) -# dt = tend_out - tbeg_out -# if dt > 0: -# #print('eu', tbeg_out, tend_out, left_context_out,left_context_out+dt, T_out, chunk_length, chunk_length_out, tbeg_in, tend_in) -# y[tbeg_out:tend_out] = y_i[left_context_out:left_context_out+dt] -# tbeg_out += chunk_shift_out - -# tbeg_in += chunk_shift_in - -# # put time dimension back in his place -# y = y.transpose(0, time_dim) - -# return y - - -# def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, device=None, time_dim=-1): - -# #assume time is 
the last dimension -# T = x.shape[time_dim] -# if T <= chunk_length or chunk_length == 0: -# if device is not None: -# x = x.to(device) -# return nnet(x).detach() - -# if chunk_overlap is None: -# #infer chunk overlap from network input context -# try: -# left_context, right_context = nnet.in_context() -# except: -# left_context = right_context = 0 - -# chunk_overlap = left_context + right_context - - -# in_shape = x.shape -# chunk_shift_in = chunk_length - chunk_overlap - -# try: -# out_shape = nnet.out_shape(in_shape) -# T_out = out_shape[time_dim] -# r = float(T_out)/T -# except: -# out_shape = None - - -# num_chunks = int(math.ceil((T-chunk_length)/chunk_shift_in+1)) -# #move time dimension to dim 0 -# x = x.transpose(0, time_dim) -# y = None -# N = None -# tbeg_in = 0 -# tbeg_out = 0 -# for i in range(num_chunks): -# tend_in = min(tbeg_in + chunk_length, x.shape[0]) -# #get slice and move back time dimension to last dim -# x_i = x[tbeg_in:tend_in].transpose(0, time_dim) -# if device is not None: -# x_i = x_i.to(device) - -# y_i = nnet(x_i).detach() -# chunk_length_out = y_i.shape[time_dim] -# if out_shape is None: -# # infer chunk_shift in the output -# r = float(chunk_length_out)/chunk_length - -# # infer total output length -# T_out = int(r * T) -# out_shape = list(y_i.shape) -# out_shape[time_dim] = T_out - -# if y is None: -# chunk_shift_out = r*chunk_shift_in -# # create output tensor -# y = torch.zeros(out_shape) -# #move time dimension to dim 0 -# y = y.transpose(0, time_dim) -# count = torch.zeros(T_out) - -# y_i = y_i.transpose(0, time_dim) - -# tend_out = min(int(round(tbeg_out)) + chunk_length_out, T_out) -# dt = tend_out - tbeg_out -# y[tbeg_out:tend_out] += y_i[:dt] -# count[tbeg_out:tend_out] += 1 -# tbeg_out += chunk_shift_out -# tbeg_in += chunk_shift_in - -# # put time dimension back in his place and normalize -# y = y.transpose(0, time_dim)/count - -# return y From 4825d393cf1af165699eac7e9ac54cf224bb6daa Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 8 Jul 2022 17:43:09 -0400 Subject: [PATCH 020/154] added configs for w2v2 xlsr --- ...c2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml | 45 +++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml | 49 ++++++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml | 50 +++++++++++++++++++ ...vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml | 45 +++++++++++++++++ ...vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml | 49 ++++++++++++++++++ ...vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml | 50 +++++++++++++++++++ .../conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml | 40 +++++++++++++++ .../conf/wav2vec2xlsr53_ecapatdnn512x3.yaml | 40 +++++++++++++++ ...fig_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh | 49 ++++++++++++++++++ ...onfig_wav2vec2xlr53_ecapatdnn512x3_v1.0.sh | 49 ++++++++++++++++++ 10 files changed, 466 insertions(+) create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2xlsr53_ecapatdnn512x3.yaml create 
mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2xlr53_ecapatdnn512x3_v1.0.sh diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..e1d1b1ea --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml new file mode 100644 index 00000000..1298a056 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml @@ -0,0 +1,49 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..2867cfef --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,50 @@ +data: + train: + dataset: + max_chunk_length: 6.0 + min_chunk_length: 6.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 16 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + 
warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml new file mode 100644 index 00000000..247f8a7c --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: wav2vec2xlsr53_ecapatdnn512x3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml new file mode 100644 index 00000000..1298a056 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml @@ -0,0 +1,49 @@ +data: + train: + dataset: + max_chunk_length: 3.0 + min_chunk_length: 3.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5.5e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 4.4e-3 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 20 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml new file mode 100644 index 00000000..2867cfef --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml @@ -0,0 +1,50 @@ +data: + train: + dataset: + max_chunk_length: 6.0 + min_chunk_length: 6.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 16 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + max_chunk_length: 4.0 + min_chunk_length: 4.0 + aug_cfg: conf/reverb_noise_aug.yaml + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2.3e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 2e-4 + warmup_steps: 6000 + 
update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 2 + eff_batch_size: 192 + train_mode: full + + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml new file mode 100644 index 00000000..1cc7df4c --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr53_ecapatdnn512x3.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr53_ecapatdnn512x3.yaml new file mode 100644 index 00000000..1975bada --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr53_ecapatdnn512x3.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-large-xlsr-53 +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..9225389a --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + 
plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlr53_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr53_ecapatdnn512x3_v1.0.sh new file mode 100644 index 00000000..014a5d03 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr53_ecapatdnn512x3_v1.0.sh @@ -0,0 +1,49 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr53 + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + From 9a3a16837c077aecc83c50ca8a9e974265e74855 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 22 Jul 2022 17:53:20 -0400 Subject: [PATCH 021/154] added hard negative mining --- ...c2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml | 3 +- ...fig_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh | 1 + hyperion/bin/finetune_wav2vec2xvector.py | 17 +++++++++++ hyperion/torch/data/weighted_seq_sampler.py | 30 ++++++++++++------- 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml index 2867cfef..fb264a53 100644 --- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml @@ -8,6 +8,7 @@ data: sampler: batch_size: 16 iters_per_epoch: 6 + num_hard_prototypes: 8 data_loader: num_workers: 8 val: @@ -43,7 +44,7 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 2 + epochs: 5 eff_batch_size: 192 train_mode: full diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh index 9225389a..8144f6eb 100644 --- a/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlr300m_ecapatdnn512x3_v1.0.sh @@ -33,6 +33,7 @@ nnet_s3_args="" nnet_s3_name=${nnet_name}.s3 nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index f2c740da..25722b35 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ 
b/hyperion/bin/finetune_wav2vec2xvector.py @@ -95,6 +95,22 @@ def init_model(num_classes, in_model_file, rank, **kwargs): return model +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + if not train_loader.batch_sampler.hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + def train_model(gpu_id, args): config_logger(args.verbose) @@ -112,6 +128,7 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) model = init_model(train_loader.dataset.num_classes, **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py index e679251b..c50d577d 100644 --- a/hyperion/torch/data/weighted_seq_sampler.py +++ b/hyperion/torch/data/weighted_seq_sampler.py @@ -43,7 +43,6 @@ def __init__( num_egs_per_utt=1, var_batch_size=False, num_hard_prototypes=0, - num_egs_per_hard_prototype=1, affinity_matrix=None, iters_per_epoch=None, ): @@ -66,7 +65,6 @@ def __init__( self.num_egs_per_utt = num_egs_per_utt self.var_batch_size = var_batch_size self.num_hard_prototypes = num_hard_prototypes - self.num_egs_per_hard_prototype = num_egs_per_hard_prototype self.batch = 0 self.rank = rank @@ -160,6 +158,7 @@ def __iter__(self): self.batch = 0 return self + @property def hard_prototype_mining(self): return self.num_hard_prototypes > 0 @@ -168,14 +167,22 @@ def set_hard_prototypes(self, affinity_matrix): self.hard_prototypes = None return - affinity_matrix[np.diag(affinity_matrix.shape[0])] = -1.0 + # affinity_matrix[np.diag(affinity_matrix.shape[0])] = -1.0 + # hard prototypes for a class are itself and k-1 closest to it. 
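+        # torch.topk keeps, for each class, the num_hard_prototypes classes with
+        # the highest affinity; with the diagonal left unmasked, each class is
+        # included as its own prototype.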
self.hard_prototypes = torch.topk( affinity_matrix, self.num_hard_prototypes, dim=-1 ).indices + def get_hard_prototypes(self, class_idx): + return self.hard_prototypes[class_idx].flatten() + def _get_utt_idx_basic(self, batch_mult=1): dataset = self.dataset num_classes_per_batch = batch_mult * self._num_classes_per_batch + if self.hard_prototype_mining: + num_classes_per_batch = int( + math.ceil(num_classes_per_batch / self.num_hard_prototypes) + ) if dataset.class_weights is None: class_idx = torch.randint( @@ -188,6 +195,9 @@ def _get_utt_idx_basic(self, batch_mult=1): replacement=True, ) + if self.hard_prototype_mining: + class_idx = self.get_hard_prototypes(class_idx) + if self.num_egs_per_class > 1: class_idx = class_idx.repeat(self.num_egs_per_class) @@ -206,6 +216,10 @@ def _get_utt_idx_seq_st_max_length(self, chunk_length, batch_mult=1): dataset = self.dataset num_classes_per_batch = batch_mult * self._num_classes_per_batch + if self.hard_prototype_mining: + num_classes_per_batch = int( + math.ceil(num_classes_per_batch / self.num_hard_prototypes) + ) # first we sample the batch classes class_weights = dataset.class_weights.clone() @@ -219,6 +233,9 @@ def _get_utt_idx_seq_st_max_length(self, chunk_length, batch_mult=1): class_weights, num_samples=num_classes_per_batch, replacement=True ) + if self.hard_prototype_mining: + class_idx = self.get_hard_prototypes(class_idx) + utt_idx = torch.zeros( (len(class_idx) * self.num_egs_per_class,), dtype=torch.long ) @@ -293,7 +310,6 @@ def filter_args(**kwargs): "num_egs_per_class", "num_egs_per_utt", "num_hard_prototypes", - "num_egs_per_hard_prototype", ) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -351,12 +367,6 @@ def add_class_args(parser, prefix=None): default=0, help=("number of hard prototype classes per batch"), ) - parser.add_argument( - "--num-egs-per-hard-prototype", - type=int, - default=1, - help=("number of samples per hard prototype class in the batch"), - ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) From 45b5dcd309beb525712969023c26d69e40d6bdca Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 28 Jul 2022 17:55:29 -0400 Subject: [PATCH 022/154] refactorized segment samplers --- hyperion/torch/data/bucketing_seg_sampler.py | 123 ++++++ .../data/class_weighted_seg_chunk_sampler.py | 408 ++++++++++++++++++ hyperion/torch/data/hyp_sampler.py | 43 ++ hyperion/torch/data/seg_chunk_sampler.py | 148 +++++++ hyperion/torch/data/seg_sampler.py | 189 ++++++++ hyperion/torch/data/seg_sampler_factory.py | 225 ++++++++++ hyperion/torch/optim/factory.py | 2 + hyperion/utils/class_info.py | 28 ++ hyperion/utils/feature_set.py | 74 ++++ hyperion/utils/info_table.py | 383 ++++++++++++++++ hyperion/utils/recording_set.py | 61 +++ hyperion/utils/segment_set.py | 11 + 12 files changed, 1695 insertions(+) create mode 100644 hyperion/torch/data/bucketing_seg_sampler.py create mode 100644 hyperion/torch/data/class_weighted_seg_chunk_sampler.py create mode 100644 hyperion/torch/data/hyp_sampler.py create mode 100644 hyperion/torch/data/seg_chunk_sampler.py create mode 100644 hyperion/torch/data/seg_sampler.py create mode 100644 hyperion/torch/data/seg_sampler_factory.py create mode 100644 hyperion/utils/class_info.py create mode 100644 hyperion/utils/feature_set.py create mode 100644 hyperion/utils/info_table.py create mode 100644 hyperion/utils/recording_set.py create mode 100644 hyperion/utils/segment_set.py diff --git 
a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py
new file mode 100644
index 00000000..85b6772e
--- /dev/null
+++ b/hyperion/torch/data/bucketing_seg_sampler.py
@@ -0,0 +1,123 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import math
+from jsonargparse import ArgumentParser, ActionParser
+import logging
+
+import numpy as np
+
+import torch
+from .hyp_sampler import HypSampler
+from .seg_sampler import SegSampler
+import torch.distributed as dist
+
+
+class BucketingSegSampler(HypSampler):
+    def __init__(
+        self,
+        seg_set,
+        base_sampler=SegSampler,
+        num_buckets=10,
+        length_column="duration",
+        seed=1234,
+        **base_kwargs
+    ):
+        super().__init__(shuffle=False, seed=seed)
+        self.seg_set = seg_set
+        self.base_sampler = base_sampler
+        self.base_kwargs = base_kwargs
+        self.base_kwargs["seed"] = seed
+        self.num_buckets = num_buckets
+        self.length_column = length_column
+        self._create_bucket_samplers()
+        self._compute_len()
+        self.depleted_buckets = torch.zeros((num_buckets,), dtype=torch.bool)
+
+    def create_buckets(self, seg_ids, seg_lengths):
+        # sort the segments by length and cut the cumulative duration into
+        # num_buckets buckets of roughly equal total duration
+        seg_ids = np.asarray(seg_ids)
+        seg_lengths = np.asarray(seg_lengths)
+        sort_idx = np.argsort(seg_lengths)
+        sort_ids = seg_ids[sort_idx]
+        sort_lengths = seg_lengths[sort_idx]
+        cum_lengths = np.cumsum(sort_lengths)
+        bucket_length = cum_lengths[-1] / self.num_buckets
+        buckets = []
+        for i in range(self.num_buckets):
+            # take the segments whose remaining cumulative length falls
+            # inside the current bucket
+            bucket_idx = (cum_lengths <= bucket_length) & (cum_lengths > 0)
+            bucket_i = sort_ids[bucket_idx]
+            buckets.append(bucket_i)
+            cum_lengths -= bucket_length
+
+        return buckets
+
+    def _create_bucket_samplers(self):
+        buckets = self.create_buckets(
+            self.seg_set["id"], self.seg_set[self.length_column]
+        )
+        bucket_samplers = []
+        for i in range(self.num_buckets):
+            # InfoTable.filter selects the rows belonging to the bucket
+            seg_set_i = self.seg_set.filter(items=buckets[i])
+            sampler_i = self.base_sampler(seg_set_i, **self.base_kwargs)
+            bucket_samplers.append(sampler_i)
+
+        self.bucket_samplers = bucket_samplers
+
+    def _compute_len(self):
+        self._len = 0
+        for i in range(self.num_buckets):
+            self._len += len(self.bucket_samplers[i])
+
+    def __len__(self):
+        return self._len
+
+    def set_epoch(self, epoch):
+        for i in range(self.num_buckets):
+            self.bucket_samplers[i].set_epoch(epoch)
+
+    def __iter__(self):
+        super().__iter__()
+        self.depleted_buckets[:] = False
+        for i in range(self.num_buckets):
+            self.bucket_samplers[i].__iter__()
+
+        return self
+
+    def all_buckets_depleted(self):
+        return torch.all(self.depleted_buckets).item()
+
+    def __next__(self):
+
+        if self.batch == self._len or self.all_buckets_depleted():
+            raise StopIteration
+
+        while True:
+            bucket_idx = torch.randint(
+                low=0, high=self.num_buckets, size=(1,), generator=self.rng
+            ).item()
+            if self.depleted_buckets[bucket_idx]:
+                continue
+
+            bucket = self.bucket_samplers[bucket_idx]
+            try:
+                batch = next(bucket)
+                break
+            except StopIteration:
+                self.depleted_buckets[bucket_idx] = True
+                if self.all_buckets_depleted():
+                    raise StopIteration()
+
+        if self.batch == 0:
+            logging.info("batch 0 chunks=%s", str(batch[:10]))
+
+        self.batch += 1
+        return batch
+
+    @staticmethod
+    def filter_args(**kwargs):
+
+        valid_args = (
+            "num_buckets",
+            "length_column",
+            "shuffle",
+            "seed",
+        )
+
+        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
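For context, a minimal sketch of how this sampler is meant to be driven. The table
contents are made up, and the imports assume the module paths introduced in this
patch series:

import pandas as pd
from hyperion.utils.segment_set import SegmentSet
from hyperion.torch.data.bucketing_seg_sampler import BucketingSegSampler

# toy segment table; "id" and "duration" are the columns the sampler expects
seg_set = SegmentSet(
    pd.DataFrame(
        {
            "id": [f"seg{i}" for i in range(100)],
            "duration": [1.0 + 0.1 * i for i in range(100)],
        }
    )
)
sampler = BucketingSegSampler(seg_set, num_buckets=4, min_batch_size=8)
sampler.set_epoch(0)
for batch in sampler:
    # each batch comes from a single duration bucket, so its segments
    # have similar lengths and need little padding
    pass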
+""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import math +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import logging + +import numpy as np +import pandas as pd + +import torch +from .hyp_sampler import HypSampler + + +class ClassWeightedRandomSegChunkSampler(HypSampler): + def __init__( + self, + seg_set, + class_info, + min_chunk_length, + max_chunk_length=None, + min_batch_size=1, + max_batch_size=None, + max_batch_length=None, + num_chunks_per_seg_epoch="auto", + num_segs_per_class=1, + num_chunks_per_seg=1, + num_hard_prototypes=0, + affinity_matrix=None, + class_column="class_id", + length_column="duration", + iters_per_epoch=None, + batch_size=None, + seed=1234, + ): + super().__init__(seed=seed) + self.class_column = class_column + self.length_column = length_column + self.seg_set = seg_set + self.class_info = class_info + self.min_chunk_length = min_chunk_length + self.max_chunk_length = ( + min_chunk_length if max_chunk_length is None else max_chunk_length + ) + + # computing min-batch-size + if batch_size is not None: + min_batch_size = batch_size + + min_batch_size = max(num_segs_per_class * num_chunks_per_seg, min_batch_size) + + # computing max-batch-size + if max_batch_length is None: + max_batch_size_0 = int(min_batch_size * max_chunk_length / min_chunk_length) + else: + max_batch_size_0 = int(max_batch_length / max_chunk_length) + + max_batch_size = ( + max_batch_size_0 + if max_batch_size is None + else min(max_batch_size_0, max_batch_size) + ) + + self.min_batch_size = min_batch_size + self.max_batch_size = max_batch_size + self.avg_batch_size = (min_batch_size + max_batch_size) / 2 + self.var_batch_size = self.min_batch_size != self.max_batch_size + + self.num_segs_per_class = num_segs_per_class + self.num_chunks_per_seg = num_chunks_per_seg + + self.num_hard_prototypes = num_hard_prototypes + self.batch = 0 + + # compute the number of batches / epoch + # legacy config parameter + num_chunks_per_seg_epoch = ( + iters_per_epoch if iters_per_epoch is not None else num_chunks_per_seg_epoch + ) + self._set_num_chunks_per_seg_epoch(num_chunks_per_seg_epoch) + self._compute_len() + + self._gather_class_info() + + self.set_hard_prototypes(affinity_matrix) + + logging.info( + "batches/epoch=%d min-batch-size=%d, max-batch-size=%d avg-batch-size/gpu=%.2f avg-classes/batch=%.2f samples/(seg*epoch)=%d", + self._len, + self.min_batch_size, + self.max_batch_size, + self.avg_batch_size, + self.avg_batch_size / num_segs_per_class / num_chunks_per_seg, + self.num_chunks_per_seg_epoch, + ) + + def _set_seed(self): + self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.rank) + + def _set_num_chunks_per_seg_epoch(self, num_chunks_per_seg_epoch): + if num_chunks_per_seg_epoch == "auto": + self._compute_num_chunks_per_seg_epoch_auto() + else: + self.num_egs_per_utt_epoch = num_chunks_per_seg_epoch + + def _compute_num_chunks_per_seg_epoch_auto(self): + seg_set = self.seg_set + avg_seg_length = np.mean(seg_set[self.length_column]) + avg_chunk_length = (self.max_chunk_length + self.min_chunk_length) / 2 + self.num_chunks_per_seg_epoch = math.ceil(avg_seg_length / avg_chunk_length) + logging.debug( + "num egs per segment and epoch: %d", self.num_chunks_per_seg_epoch + ) + + def _compute_len(self): + self._len = int( + math.ceil( + self.num_chunks_per_seg_epoch + * len(self.seg_set) + / self.avg_batch_size + / self.world_size + ) + ) + + def __len__(self): + return 
+
+    def _gather_class_info(self):
+        # we get some extra info that we need for the classes.
+
+        # we need the maximum/minimum segment duration for each class.
+        max_dur = np.zeros(len(self.class_info))
+        min_dur = np.zeros(len(self.class_info))
+        for i, c in enumerate(self.class_info["id"]):
+            seg_idx = self.seg_set[self.class_column] == c
+            durs_i = self.seg_set.loc[seg_idx, self.length_column]
+            max_dur[i] = durs_i.max()
+            min_dur[i] = durs_i.min()
+
+        self.class_info.df["max_seg_duration"] = max_dur
+        self.class_info.df["min_seg_duration"] = min_dur
+
+        # classes whose longest segment is still shorter than min_chunk_length
+        # can never be sampled, so they get zero weight
+        zero_weight = self.class_info["max_seg_duration"] < self.min_chunk_length
+        if np.any(zero_weight):
+            self.class_info.loc[zero_weight, "weights"] = 0
+            self.class_info.df["weights"] /= self.class_info.df["weights"].sum()
+
+        self.var_weights = np.any(
+            self.seg_set[self.length_column] < self.max_chunk_length
+        )
+
+        self.map_idx_to_ids = self.class_info[["class_idx", "id"]]
+        self.map_idx_to_ids.set_index("class_idx", inplace=True)
+
+    @property
+    def hard_prototype_mining(self):
+        return self.num_hard_prototypes > 1
+
+    def set_hard_prototypes(self, affinity_matrix):
+        if affinity_matrix is None:
+            self.hard_prototypes = None
+            return
+
+        # hard prototypes for a class are itself and the k-1 classes
+        # closest to it in the affinity matrix.
+        self.hard_prototypes = torch.topk(
+            affinity_matrix, self.num_hard_prototypes, dim=-1
+        ).indices
+
+    def get_hard_prototypes(self, class_idx):
+        return self.hard_prototypes[class_idx].flatten()
+
+    def _sample_chunk_length(self):
+        if self.var_batch_size:
+            return (
+                torch.rand(size=(1,), generator=self.rng).item()
+                * (self.max_chunk_length - self.min_chunk_length)
+                + self.min_chunk_length
+            )
+
+        return self.min_chunk_length
+
+    def _compute_batch_size(self, chunk_length):
+        return int(self.min_batch_size * self.max_chunk_length / chunk_length)
+
+    def _compute_num_classes_per_batch(self, batch_size):
+        num_classes = batch_size / self.num_segs_per_class / self.num_chunks_per_seg
+        if self.hard_prototype_mining:
+            num_classes /= self.num_hard_prototypes
+        return int(math.ceil(num_classes))
+
+    def _get_class_weights(self, chunk_length):
+        if not self.var_weights:
+            return self.class_info["weights"].values
+
+        # get classes where all segments are shorter than
+        # chunk length and put weight to 0
+        zero_idx = self.class_info["max_seg_duration"] < chunk_length
+        if not np.any(zero_idx):
+            return self.class_info["weights"].values
+
+        class_weights = self.class_info["weights"].values.copy()
+        class_weights[zero_idx] = 0.0
+        # renormalize weights
+        class_weights /= class_weights.sum()
+        return class_weights
+
+    def _sample_classes(self, num_classes, chunk_length):
+        weights = self._get_class_weights(chunk_length)
+        row_idx = torch.multinomial(
+            torch.from_numpy(weights),
+            num_samples=num_classes,
+            replacement=True,
+            generator=self.rng,
+        )
+
+        class_ids = self.class_info.iloc[row_idx.numpy()].id.values
+        if self.hard_prototype_mining:
+            # map class ids to class indexes
+            class_idx = self.class_info.loc[class_ids, "class_idx"].values
+            class_idx = self.get_hard_prototypes(torch.from_numpy(class_idx))
+            # map back to class ids
+            class_ids = self.map_idx_to_ids.loc[class_idx.numpy(), "id"].values
+
+        return class_ids
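+
+    # Example of the reweighting done in _get_class_weights (hypothetical
+    # values): with weights [0.5, 0.3, 0.2] and the last class having
+    # max_seg_duration below the sampled chunk length, the weights become
+    # [0.5, 0.3, 0.0] and are renormalized to [0.625, 0.375, 0.0] before
+    # torch.multinomial is called.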
+
+    def _sample_segs(self, class_ids, chunk_length):
+
+        seg_ids = []
+        for c in class_ids:
+            # for each class we sample segments longer than chunk length
+            # get segments belonging to c
+            seg_mask = (self.seg_set[self.class_column] == c) & (
+                self.seg_set[self.length_column] > chunk_length
+            )
+            seg_ids_c = self.seg_set.loc[seg_mask, "id"].values
+            # sample num_segs_per_class random segments
+            sel_seg_idx_c = torch.randint(
+                low=0,
+                high=len(seg_ids_c),
+                size=(self.num_segs_per_class,),
+                generator=self.rng,
+            )
+            sel_seg_ids_c = seg_ids_c[sel_seg_idx_c.numpy()]
+            seg_ids.extend(sel_seg_ids_c)
+
+        return seg_ids
+
+    def _sample_chunks(self, seg_ids, chunk_length):
+        chunks = []
+        scale = self.seg_set.loc[seg_ids, self.length_column].values - chunk_length
+        for i in range(self.num_chunks_per_seg):
+            # random chunk start inside each selected segment
+            start = (
+                scale * torch.rand(size=(len(seg_ids),), generator=self.rng).numpy()
+            )
+            chunks_i = [(id, s, chunk_length) for id, s in zip(seg_ids, start)]
+            chunks.extend(chunks_i)
+
+        return chunks
+
+    def __next__(self):
+
+        if self.batch == self._len:
+            raise StopIteration
+
+        chunk_length = self._sample_chunk_length()
+        batch_size = self._compute_batch_size(chunk_length)
+        num_classes = self._compute_num_classes_per_batch(batch_size)
+        class_ids = self._sample_classes(num_classes, chunk_length)
+        seg_ids = self._sample_segs(class_ids, chunk_length)
+        chunks = self._sample_chunks(seg_ids, chunk_length)
+        if self.batch == 0:
+            logging.info("batch 0 uttidx=%s", str(chunks[:10]))
+
+        self.batch += 1
+        return chunks
+
+    @staticmethod
+    def filter_args(**kwargs):
+
+        valid_args = (
+            "min_chunk_length",
+            "max_chunk_length",
+            "min_batch_size",
+            "max_batch_size",
+            "max_batch_length",
+            "num_chunks_per_seg_epoch",
+            "num_segs_per_class",
+            "num_chunks_per_seg",
+            "num_hard_prototypes",
+            "class_column",
+            "length_column",
+            "iters_per_epoch",
+            "batch_size",
+            "shuffle",
+            "seed",
+        )
+
+        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--min-chunk-length",
+            type=float,
+            default=4.0,
+            help=("minimum length of the segment chunks"),
+        )
+        parser.add_argument(
+            "--max-chunk-length",
+            type=float,
+            default=None,
+            help=("maximum length of segment chunks"),
+        )
+
+        parser.add_argument(
+            "--min-batch-size",
+            type=int,
+            default=1,
+            help=("minimum batch size per gpu"),
+        )
+        parser.add_argument(
+            "--max-batch-size",
+            type=int,
+            default=None,
+            help=(
+                "maximum batch size per gpu, if None, estimated from max_batch_length"
+            ),
+        )
+
+        parser.add_argument(
+            "--batch-size",
+            default=128,
+            type=int,
+            help=("deprecated, use min-batch-size instead"),
+        )
+
+        parser.add_argument(
+            "--max-batch-length",
+            type=float,
+            default=None,
+            help=(
+                "maximum accumulated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths"
+            ),
+        )
+
+        parser.add_argument(
+            "--iters-per-epoch",
+            default=None,
+            type=lambda x: x if (x == "auto" or x is None) else float(x),
+            help=("deprecated, use --num-chunks-per-seg-epoch instead"),
+        )
+
+        parser.add_argument(
+            "--num-chunks-per-seg-epoch",
+            default="auto",
+            type=lambda x: x if x == "auto" else float(x),
+            help=("number of times we sample a segment in each epoch"),
+        )
+
+        parser.add_argument(
+            "--num-segs-per-class",
+            type=int,
+            default=1,
+            help=("number of segments per class in batch"),
+        )
+        parser.add_argument(
+            "--num-chunks-per-seg",
+            type=int,
+            default=1,
+            help=("number of chunks per segment in batch"),
+        )
+        parser.add_argument(
+            "--num-hard-prototypes",
+            type=int,
+            default=0,
+            help=("number of hard prototype classes per batch"),
+        )
+
+        parser.add_argument(
+            "--shuffle",
+            action=ActionYesNo,
+            help="shuffles the segments or chunks at the beginning of the epoch",
+        )
+
+        parser.add_argument(
+            "--seed",
+            type=int,
+            default=1234,
+            help=("seed for sampler random number generator"),
+        )
+
+        parser.add_argument(
+            "--length-column",
+            default="duration",
+            help="which column in the segment table indicates the duration of the segment",
+        )
+        parser.add_argument(
+            "--class-column",
+            default="class_id",
+            help="which column in the segment table indicates the class of the segment",
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
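A minimal usage sketch of this sampler. The segment and class tables below are made
up for illustration, and the constructors come from the hyperion.utils classes added
later in this patch:

import pandas as pd
from hyperion.utils.segment_set import SegmentSet
from hyperion.utils.class_info import ClassInfo
from hyperion.torch.data.class_weighted_seg_chunk_sampler import (
    ClassWeightedRandomSegChunkSampler,
)

seg_set = SegmentSet(
    pd.DataFrame(
        {
            "id": ["s1", "s2", "s3", "s4"],
            "class_id": ["spk1", "spk1", "spk2", "spk2"],
            "duration": [10.0, 8.0, 12.0, 6.0],
        }
    )
)
class_info = ClassInfo(pd.DataFrame({"id": ["spk1", "spk2"]}))
sampler = ClassWeightedRandomSegChunkSampler(
    seg_set, class_info, min_chunk_length=2.0, max_chunk_length=4.0, min_batch_size=4
)
sampler.set_epoch(0)
batch = next(iter(sampler))  # list of (seg_id, start, chunk_length) tuples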
diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py
new file mode 100644
index 00000000..18ae4b5d
--- /dev/null
+++ b/hyperion/torch/data/hyp_sampler.py
@@ -0,0 +1,43 @@
+import math
+from jsonargparse import ArgumentParser, ActionParser
+import logging
+
+import numpy as np
+
+import torch
+from torch.utils.data import Sampler
+import torch.distributed as dist
+
+
+class HypSampler(Sampler):
+    def __init__(self, shuffle=False, seed=1234):
+        super().__init__(None)
+        self.epoch = 0
+        self.batch = 0
+        self.shuffle = shuffle
+        self.seed = seed
+
+        if dist.is_available() and dist.is_initialized():
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+        else:
+            rank = 0
+            world_size = 1
+
+        self.rank = rank
+        self.world_size = world_size
+        self.rng = torch.Generator()
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+    def _set_seed(self):
+        if self.shuffle:
+            self.rng.manual_seed(self.seed + 10 * self.epoch)
+        else:
+            self.rng.manual_seed(self.seed)
+
+    def __iter__(self):
+        self.batch = 0
+        self._set_seed()
+        return self
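The contract for HypSampler subclasses is small: implement __len__ and __next__, and
call super().__iter__() so the per-epoch RNG seeding kicks in. A toy subclass for
illustration only (not part of the patch):

import torch
from hyperion.torch.data.hyp_sampler import HypSampler

class RandomIndexSampler(HypSampler):
    """Toy sampler that yields fixed-size batches of indices 0..n-1."""

    def __init__(self, n, batch_size=8, shuffle=True, seed=1234):
        super().__init__(shuffle=shuffle, seed=seed)
        self.n = n
        self.batch_size = batch_size
        self._len = n // batch_size

    def __len__(self):
        return self._len

    def __next__(self):
        if self.batch == self._len:
            raise StopIteration
        # self.rng is reseeded per epoch by HypSampler.__iter__
        idx = torch.randperm(self.n, generator=self.rng)[: self.batch_size]
        self.batch += 1
        return idx.tolist()

sampler = RandomIndexSampler(100, batch_size=10)
sampler.set_epoch(0)
batches = list(sampler)  # 10 batches of 10 indices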
diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py
new file mode 100644
index 00000000..a971f8ce
--- /dev/null
+++ b/hyperion/torch/data/seg_chunk_sampler.py
@@ -0,0 +1,148 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import math
+from jsonargparse import ArgumentParser, ActionParser
+import logging
+
+import numpy as np
+import pandas as pd
+
+import torch
+from .hyp_sampler import HypSampler
+from .seg_sampler import SegSampler
+import torch.distributed as dist
+
+
+class SegChunkSampler(HypSampler):
+    def __init__(
+        self,
+        seg_set,
+        min_chunk_length,
+        max_chunk_length=None,
+        base_sampler=SegSampler,
+        length_column="duration",
+        shuffle=False,
+        seed=1234,
+        **base_kwargs
+    ):
+
+        super().__init__(shuffle=shuffle, seed=seed)
+        self.seg_set = seg_set
+        self.min_chunk_length = min_chunk_length
+        self.max_chunk_length = (
+            min_chunk_length if max_chunk_length is None else max_chunk_length
+        )
+        self.avg_chunk_length = (self.max_chunk_length + self.min_chunk_length) / 2
+        self.chunk_set = None
+        self.length_column = length_column
+        self.chunk_sampler = base_sampler
+        self.base_kwargs = base_kwargs
+        self.base_kwargs["seed"] = seed
+        self.base_kwargs["shuffle"] = shuffle
+        if "subbase_sampler" in base_kwargs:
+            base_kwargs["base_sampler"] = base_kwargs.pop("subbase_sampler")
+
+        self.__iter__()
+
+    def __len__(self):
+        return len(self._seg_sampler)
+
+    @property
+    def duration_is_random(self):
+        return self.min_chunk_length != self.max_chunk_length
+
+    def get_random_duration(self):
+        if self.duration_is_random:
+            return (
+                torch.rand(size=(1,), generator=self.rng).item()
+                * (self.max_chunk_length - self.min_chunk_length)
+                + self.min_chunk_length
+            )
+        else:
+            return self.min_chunk_length
+
+    def _create_chunks(self):
+
+        chunks = []
+        for id, dur in zip(self.seg_set["id"], self.seg_set[self.length_column]):
+            if dur < self.min_chunk_length:
+                # discard too short sequences
+                continue
+
+            # making it this way, we get the same number of chunks in all epochs
+            num_chunks = math.ceil(dur / self.avg_chunk_length)
+            start = 0
+            for i in range(num_chunks - 1):
+                chunk_dur = self.get_random_duration()
+                chunk = (id, start, chunk_dur)
+                chunks.append(chunk)
+                start += chunk_dur
+
+            # special treatment for last chunk we get from the recording
+            remainder = dur - start
+            if remainder > self.max_chunk_length:
+                # here we discard part of the end
+                chunk = (id, start, self.max_chunk_length)
+            elif remainder < self.min_chunk_length:
+                # here we overlap with second last chunk
+                chunk = (id, dur - self.min_chunk_length, self.min_chunk_length)
+            else:
+                # here the last chunk is what it is left
+                chunk = (id, start, remainder)
+
+            chunks.append(chunk)
+
+        self.chunk_set = pd.DataFrame(
+            chunks, columns=["id", "chunk_start", self.length_column]
+        )
+
+    def __iter__(self):
+        super().__iter__()
+        self._create_chunks()
+        self._seg_sampler = self.chunk_sampler(self.chunk_set, **self.base_kwargs)
+        self._seg_sampler.set_epoch(self.epoch)
+        self._seg_sampler.__iter__()
+
+        return self
+
+    def __next__(self):
+
+        return next(self._seg_sampler)
+
+    @staticmethod
+    def filter_args(**kwargs):
+
+        valid_args = (
+            "min_chunk_length",
+            "max_chunk_length",
+            "length_column",
+            "shuffle",
+            "seed",
+        )
+
+        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py
new file mode 100644
index 00000000..6802cc8e
--- /dev/null
+++ b/hyperion/torch/data/seg_sampler.py
@@ -0,0 +1,189 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import math
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+import logging
+
+import numpy as np
+
+import torch
+from .hyp_sampler import HypSampler
+
+
+class SegSampler(HypSampler):
+    def __init__(
+        self,
+        seg_set,
+        min_batch_size=1,
+        max_batch_size=None,
+        max_batch_length=None,
+        length_column="duration",
+        shuffle=False,
+        drop_last=False,
+        seed=1234,
+    ):
+        super().__init__(shuffle=shuffle, seed=seed)
+        self.seg_set = seg_set
+        self.min_batch_size = min_batch_size
+        self.max_batch_size = max_batch_size
+        self.max_batch_length = max_batch_length
+        self.var_batch_size = max_batch_length is not None
+        self.length_column = length_column
+        if self.var_batch_size:
+            avg_batch_size = max_batch_length / np.mean(
+                self.seg_set[self.length_column]
+            )
+        else:
+            avg_batch_size = min_batch_size
+
+        num_batches = len(self.seg_set) / avg_batch_size / self.world_size
+        if drop_last:
+            self._len = int(num_batches)
+        else:
+            self._len = int(math.ceil(num_batches))
+
+        self._permutation = None
+
+    def __len__(self):
+        return self._len
+
+    def _shuffle_segs(self):
+        self._permutation = torch.randperm(len(self.seg_set), generator=self.rng)
+
+    def __iter__(self):
+        super().__iter__()
+        if self.shuffle:
+            self._shuffle_segs()
+
+        self.start = self.rank
+        return self
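+
+    # Numeric illustration of the variable-size batching in __next__ below
+    # (made-up durations): with max_batch_length=30, segments of 4s and 5s
+    # are accepted (running max 5 x 2 = 10 <= 30), but adding a 12s segment
+    # would give 12 x 3 = 36 > 30, so the batch closes with two segments.
+    # Longer segments therefore produce proportionally smaller batches.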
+
+    def __next__(self):
+
+        if self.batch == self._len:
+            raise StopIteration
+
+        if self.var_batch_size:
+            idxs = []
+            max_length = 0
+            batch_size = 0
+            while True:
+                if self.shuffle:
+                    idx = self._permutation[self.start].item()
+                else:
+                    idx = self.start
+
+                max_length = max(
+                    max_length, self.seg_set.iloc[idx][self.length_column]
+                )
+                if max_length * (batch_size + 1) > self.max_batch_length:
+                    break
+
+                idxs.append(idx)
+                self.start = (self.start + self.world_size) % len(self.seg_set)
+                batch_size += 1
+                if (
+                    self.max_batch_size is not None
+                    and batch_size >= self.max_batch_size
+                ):
+                    break
+
+            assert len(idxs) >= self.min_batch_size
+            idx = idxs
+        else:
+            stop = min(self.start + self.min_batch_size, len(self.seg_set))
+            if self.shuffle:
+                idx = self._permutation[self.start : stop].numpy()
+            else:
+                idx = slice(self.start, stop)
+
+            self.start = stop
+
+        # use iloc here since chunk tables can repeat the same segment id
+        rows = self.seg_set.iloc[idx]
+        seg_ids = rows.id.values
+
+        if self.batch == 0:
+            logging.info("batch 0 chunks=%s", str(seg_ids[:10]))
+
+        self.batch += 1
+        if "chunk_start" in self.seg_set:
+            return [
+                (id, s, d)
+                for id, s, d in zip(
+                    seg_ids,
+                    rows["chunk_start"].values,
+                    rows[self.length_column].values,
+                )
+            ]
+
+        return seg_ids
+
+    @staticmethod
+    def filter_args(**kwargs):
+
+        valid_args = (
+            "min_batch_size",
+            "max_batch_size",
+            "max_batch_length",
+            "length_column",
+            "shuffle",
+            "drop_last",
+            "seed",
+        )
+
+        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--min-batch-size",
+            type=int,
+            default=1,
+            help=("minimum batch size per gpu"),
+        )
+        parser.add_argument(
+            "--max-batch-size",
+            type=int,
+            default=None,
+            help=(
+                "maximum batch size per gpu, if None, estimated from max_batch_length"
+            ),
+        )
+
+        parser.add_argument(
+            "--max-batch-length",
+            type=float,
+            default=None,
+            help=(
+                "maximum accumulated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths"
+            ),
+        )
+
+        parser.add_argument(
+            "--drop-last",
+            action=ActionYesNo,
+            help="drops the last batch of the epoch",
+        )
+
+        parser.add_argument(
+            "--shuffle",
+            action=ActionYesNo,
+            help="shuffles the segments or chunks at the beginning of the epoch",
+        )
+
+        parser.add_argument(
+            "--seed",
+            type=int,
+            default=1234,
+            help=("seed for sampler random number generator"),
+        )
+
+        parser.add_argument(
+            "--length-column",
+            default="duration",
+            help="which column in the segment table indicates the duration of the segment",
+        )
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
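A quick sketch of SegSampler on its own, with a toy table (values illustrative only):

import pandas as pd
from hyperion.utils.segment_set import SegmentSet
from hyperion.torch.data.seg_sampler import SegSampler

seg_set = SegmentSet(
    pd.DataFrame({"id": [f"s{i}" for i in range(10)], "duration": [3.0] * 10})
)
sampler = SegSampler(seg_set, min_batch_size=4, shuffle=True)
sampler.set_epoch(0)
for seg_ids in sampler:
    # without a "chunk_start" column, batches are arrays of segment ids;
    # with it, batches are (id, chunk_start, duration) tuples
    print(seg_ids)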
diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py
new file mode 100644
index 00000000..e3ba84f8
--- /dev/null
+++ b/hyperion/torch/data/seg_sampler_factory.py
@@ -0,0 +1,225 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from typing import Union, Optional
+import logging
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+
+from .audio_dataset import AudioDataset
+from .feat_seq_dataset import FeatSeqDataset
+
+from .seg_sampler import SegSampler
+from .class_weighted_seg_chunk_sampler import ClassWeightedRandomSegChunkSampler
+from .seg_chunk_sampler import SegChunkSampler
+from .bucketing_seg_sampler import BucketingSegSampler
+
+sampler_dict = {
+    "class_weighted_random_seg_chunk_sampler": ClassWeightedRandomSegChunkSampler,
+    "seg_sampler": SegSampler,
+    "seg_chunk_sampler": SegChunkSampler,
+    "bucketing_seg_sampler": BucketingSegSampler,
+}
+
+
+class SegSamplerFactory(object):
+    """Factory class to create different types of samplers for
+    sequential data like audio or acoustic features.
+    """
+
+    @staticmethod
+    def create(
+        dataset: Union[AudioDataset, FeatSeqDataset],
+        sampler_type: str = "class_weighted_random_seg_chunk_sampler",
+        base_sampler_type: str = "seg_sampler",
+        subbase_sampler_type: str = "seg_sampler",
+        **kwargs,
+    ):
+        """Function that creates a sequence sampler based on a dataset, sampler_type and sampler arguments.
+
+        Args:
+          dataset: sequence dataset object containing the data info of class AudioDataset or FeatSeqDataset.
+          sampler_type: string indicating the sampler type.
+        """
+
+        sampler_class = sampler_dict[sampler_type]
+        sampler_kwargs = sampler_class.filter_args(**kwargs)
+
+        if sampler_type in ["bucketing_seg_sampler", "seg_chunk_sampler"]:
+            base_sampler_class = sampler_dict[base_sampler_type]
+            base_sampler_kwargs = base_sampler_class.filter_args(**kwargs)
+            sampler_kwargs.update(base_sampler_kwargs)
+            sampler_kwargs["base_sampler"] = base_sampler_class
+            if base_sampler_type == "bucketing_seg_sampler":
+                base_sampler_class = sampler_dict[subbase_sampler_type]
+                base_sampler_kwargs = base_sampler_class.filter_args(**kwargs)
+                sampler_kwargs.update(base_sampler_kwargs)
+
+        if sampler_type in ["class_weighted_random_seg_chunk_sampler"]:
+            sampler_kwargs["class_info"] = dataset.class_info
+
+        logging.info(f"sampler-args={sampler_kwargs}")
+
+        return sampler_class(dataset.seg_set, **sampler_kwargs)
+
+    @staticmethod
+    def filter_args(**kwargs):
+
+        valid_args = (
+            "sampler_type",
+            "num_buckets",
+            "min_chunk_length",
+            "max_chunk_length",
+            "min_batch_size",
+            "max_batch_size",
+            "max_batch_length",
+            "num_chunks_per_seg_epoch",
+            "num_segs_per_class",
+            "num_chunks_per_seg",
+            "num_hard_prototypes",
+            "class_column",
+            "length_column",
+            "iters_per_epoch",
+            "batch_size",
+            "shuffle",
+            "drop_last",
+            "seed",
+        )
+
+        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--sampler-type",
+            choices=sampler_dict.keys(),
+            default="class_weighted_random_seg_chunk_sampler",
+            help="batch sampler type",
+        )
+
+        parser.add_argument(
+            "--base-sampler-type",
+            choices=["seg_sampler", "bucketing_seg_sampler"],
+            default="seg_sampler",
+            help="base sampler used for seg_chunk_sampler or bucketing_seg_sampler",
+        )
+
+        parser.add_argument(
+            "--min-chunk-length",
+            type=float,
+            default=4.0,
+            help=("minimum length of the segment chunks"),
+        )
+        parser.add_argument(
+            "--max-chunk-length",
+            type=float,
+            default=None,
+            help=("maximum length of segment chunks"),
+        )
+
+        parser.add_argument(
+            "--min-batch-size",
+            type=int,
+            default=1,
+            help=("minimum batch size per gpu"),
+        )
+        parser.add_argument(
+            "--max-batch-size",
+            type=int,
+            default=None,
+            help=(
+                "maximum batch size per gpu, if None, estimated from max_batch_length"
+            ),
+        )
+
+        parser.add_argument(
+            "--batch-size",
+            default=128,
+            type=int,
+            help=("deprecated, use min-batch-size instead"),
+        )
+
+        parser.add_argument(
+            "--max-batch-length",
+            type=float,
+            default=None,
+            help=(
+                "maximum accumulated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths"
+            ),
+        )
+
+        parser.add_argument(
+            "--iters-per-epoch",
+            default=None,
+            type=lambda x: x if (x == "auto" or x is None) else float(x),
+            help=("deprecated, use --num-chunks-per-seg-epoch instead"),
+        )
+
+        parser.add_argument(
+            "--num-chunks-per-seg-epoch",
+            default="auto",
+            type=lambda x: x if x == "auto" else float(x),
+            help=("number of times we sample a segment in each epoch"),
+        )
+
+        parser.add_argument(
+            "--num-segs-per-class",
+            type=int,
+            default=1,
+            help=("number of segments per class in batch"),
+        )
+        parser.add_argument(
+            "--num-chunks-per-seg",
+            type=int,
+            default=1,
+            help=("number of chunks per segment in batch"),
+        )
+        parser.add_argument(
+            "--num-hard-prototypes",
+            type=int,
+            default=0,
+            help=("number of hard prototype classes per batch"),
+        )
+
+        parser.add_argument(
+            "--drop-last",
+            action=ActionYesNo,
+            help="drops the last batch of the epoch",
+        )
+
+        parser.add_argument(
+            "--shuffle",
+            action=ActionYesNo,
+            help="shuffles the segments or chunks at the beginning of the epoch",
+        )
+        parser.add_argument(
+            "--seed",
+            type=int,
+            default=1234,
+            help=("seed for sampler random number generator"),
+        )
+
+        parser.add_argument(
+            "--length-column",
+            default="duration",
+            help="which column in the segment table indicates the duration of the segment",
+        )
+        parser.add_argument(
+            "--class-column",
+            default="class_id",
+            help="which column in the segment table indicates the class of the segment",
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/optim/factory.py b/hyperion/torch/optim/factory.py
index 4fa7b186..ab350098 100644
--- a/hyperion/torch/optim/factory.py
+++ b/hyperion/torch/optim/factory.py
@@ -13,6 +13,8 @@


 class OptimizerFactory(object):
+    """Factory class to create different types of optimizers."""
+
     @staticmethod
     def create(
         params,
diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py
new file mode 100644
index 00000000..2aed18c1
--- /dev/null
+++ b/hyperion/utils/class_info.py
@@ -0,0 +1,28 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from .info_table import InfoTable
+
+
+class ClassInfo(InfoTable):
+    def __init__(self, df):
+        super().__init__(df)
+        if "class_idx" not in self.df:
+            self.add_class_idx()
+
+        if "weights" not in self.df:
+            self.add_equal_weights()
+        else:
+            # renormalize given weights so they sum to 1
+            self.df["weights"] /= self.df["weights"].sum()
+
+    def add_class_idx(self):
+        self.df["class_idx"] = [i for i in range(len(self.df))]
+
+    def add_equal_weights(self):
+        self.df["weights"] = 1 / len(self.df)
+
+    def weights(self, id):
+        return self.df.loc[id, "weights"]
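For illustration, how ClassInfo fills in defaults (table values are hypothetical):

import pandas as pd
from hyperion.utils.class_info import ClassInfo

ci = ClassInfo(pd.DataFrame({"id": ["spk1", "spk2", "spk3"]}))
print(ci.df["class_idx"].tolist())  # [0, 1, 2]
print(ci.df["weights"].tolist())    # ~[0.333, 0.333, 0.333], added as equal weights

# if weights are provided, they are renormalized to sum to 1
ci2 = ClassInfo(pd.DataFrame({"id": ["a", "b"], "weights": [2.0, 6.0]}))
print(ci2.df["weights"].tolist())   # [0.25, 0.75]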
diff --git a/hyperion/utils/feature_set.py b/hyperion/utils/feature_set.py
new file mode 100644
index 00000000..456cf99b
--- /dev/null
+++ b/hyperion/utils/feature_set.py
@@ -0,0 +1,74 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from .info_table import InfoTable
+
+
+class FeatureSet(InfoTable):
+    def __init__(self, df):
+        super().__init__(df)
+        assert "storage_path" in df
+
+    def save(self, file_path, sep=None):
+        """Saves info table to file
+
+        Args:
+          file_path: File to write the list.
+          sep: Separator between the key and file_path in the text file.
+        """
+        file_path = Path(file_path)
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        ext = file_path.suffix
+        if ext == "":
+            # if no extension we save as kaldi feats.scp file
+            from .scp_list import SCPList
+
+            offset = self.df["storage_byte"] if "storage_byte" in self.df else None
+            range = None
+            if "start" in self.df and "num_frames" in self.df:
+                range = [
+                    np.array([s, n], dtype=np.int64)
+                    for s, n in zip(self.df["start"], self.df["num_frames"])
+                ]
+            scp = SCPList(self.df["id"], self.df["storage_path"], offset, range)
+            scp.save(file_path)
+            return
+
+        super().save(file_path, sep)
+
+    @classmethod
+    def load(cls, file_path, sep=None):
+        """Loads feature list from text file.
+
+        Args:
+          file_path: File to read the list.
+          sep: Separator between the key and file_path in the text file.
+        Returns:
+          FeatureSet object
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if ext == "":
+            # if no extension we load as kaldi feats.scp file
+            from .scp_list import SCPList
+
+            scp = SCPList.load(file_path)
+            df_dict = {"id": scp.key, "storage_path": scp.file_path}
+            df = pd.DataFrame(df_dict)
+            if scp.offset is not None:
+                df["storage_byte"] = scp.offset
+
+            if scp.range is not None:
+                df["start"] = [r[0] for r in scp.range]
+                df["num_frames"] = [r[1] for r in scp.range]
+
+            return cls(df)
+
+        return super().load(file_path, sep)
diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py
new file mode 100644
index 00000000..247001c0
--- /dev/null
+++ b/hyperion/utils/info_table.py
@@ -0,0 +1,383 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from pathlib import Path
+import logging
+from collections import OrderedDict
+from copy import deepcopy
+
+import numpy as np
+import pandas as pd
+
+from .list_utils import split_list, split_list_group_by_key
+
+
+class InfoTable(object):
+    """This is a base class to store information about recordings, segments,
+    features, etc.
+
+    Attributes:
+      df: pandas dataframe.
+    """
+
+    def __init__(self, df):
+        self.df = df
+        assert "id" in df
+        self.df.set_index("id", drop=False, inplace=True)
+
+    def copy(self):
+        """Makes a copy of the object."""
+        return deepcopy(self)
+
+    def clone(self):
+        """Makes a copy of the object."""
+        return deepcopy(self)
+
+    def __len__(self):
+        return len(self.df)
+
+    @property
+    def iat(self):
+        return self.df.iat
+
+    @property
+    def at(self):
+        return self.df.at
+
+    @property
+    def iloc(self):
+        return self.df.iloc
+
+    @property
+    def loc(self):
+        return self.df.loc
+
+    @property
+    def __getitem__(self):
+        return self.df.__getitem__
+
+    @property
+    def __contains__(self):
+        return self.df.__contains__
+
+    def save(self, file_path, sep=None):
+        """Saves info table to file
+
+        Args:
+          file_path: File to write the list.
+          sep: Separator between the key and file_path in the text file.
+        """
+        file_path = Path(file_path)
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        ext = file_path.suffix
+        if ext == "":
+            # if no extension we save as kaldi utt2spk file
+            self.df.to_csv(file_path, sep=" ", header=False, index=False)
+            return
+
+        if sep is None:
+            sep = "\t" if ".tsv" in ext else ","
+
+        self.df.to_csv(file_path, sep=sep, index=False)
+
+    @classmethod
+    def load(cls, file_path, sep=None):
+        """Loads info table from text file.
+
+        Args:
+          file_path: File to read the list.
+          sep: Separator between the key and file_path in the text file.
+        Returns:
+          InfoTable object
+        """
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if ext == "":
+            # if no extension we load as kaldi utt2spk file
+            df = pd.read_csv(
+                file_path,
+                sep=" ",
+                header=None,
+                names=["id", "class_id"],
+                dtype={"id": str, "class_id": str},
+            )
+            return cls(df)
+
+        if sep is None:
+            sep = "\t" if ".tsv" in ext else ","
+
+        df = pd.read_csv(file_path, sep=sep)
+        return cls(df)
+
+    def sort(self, column="id", ascending=True):
+        """Sorts the table by column"""
+        self.df.sort_values(by=column, inplace=True, ascending=ascending)
+
+    def split(self, idx, num_parts, group_by=None):
+        """Splits the table into num_parts and returns part idx.
+
+        Args:
+          idx: Part to return from 1 to num_parts.
+          num_parts: Number of parts to split the list.
+          group_by: All the lines with the same value in column
+            group_by go to the same part.
+
+        Returns:
+          Sub InfoTable object
+        """
+        if group_by is None:
+            _, idx1 = split_list(self.df["id"], idx, num_parts)
+        else:
+            _, idx1 = split_list_group_by_key(
+                self.df[group_by], idx, num_parts
+            )
+
+        df = self.df.iloc[idx1]
+        return self.__class__(df)
+
+    @classmethod
+    def merge(cls, tables):
+        """Merges several tables.
+
+        Args:
+          tables: List of InfoTables.
+
+        Returns:
+          InfoTable object concatenating the tables.
+        """
+        df_list = [table.df for table in tables]
+        df = pd.concat(df_list)
+        return cls(df)
+
+    def filter(self, items=None, iindex=None, columns=None, by="id", keep=True):
+        assert (
+            items is None or iindex is None
+        ), "items and iindex cannot be not None at the same time"
+        df = self.df
+
+        if not keep:
+            # invert the selection
+            if items is not None:
+                items = np.setdiff1d(df[by], items)
+            elif iindex is not None:
+                iindex = np.setdiff1d(np.arange(len(df)), iindex)
+
+            if columns is not None:
+                columns = np.setdiff1d(df.columns, columns)
+
+        if items is not None:
+            if by != "id":
+                missing = [v for v in items if v not in df[by].values]
+                if missing:
+                    raise Exception(f"{missing} not found in table")
+                items = [True if v in items else False for v in df[by]]
+
+            if columns is None:
+                df = df.loc[items]
+            else:
+                df = df.loc[items, columns]
+        else:
+            if iindex is not None:
+                df = self.df.iloc[iindex]
+
+            if columns is not None:
+                df = df[columns]
+
+        return self.__class__(df)
+
+    def __eq__(self, other):
+        """Equal operator"""
+        if self.df.shape[0] == 0 and other.df.shape[0] == 0:
+            return True
+        eq = self.df.equals(other.df)
+        return eq
+
+    def __ne__(self, other):
+        """Non-equal operator"""
+        return not self.__eq__(other)
+
+    def __cmp__(self, other):
+        """Comparison operator"""
+        if self.__eq__(other):
+            return 0
+        return 1
+
+    # def __len__(self):
+    #     """Returns the number of elements in the list."""
+    #     return len(self.df)
+
+    # def _create_dict(self):
+    #     """Creates dictionary that returns the position of
+    #     a segment in the list.
+ # """ + # self.key_to_index = OrderedDict( + # (k, i) for i, k in enumerate(self.utt_info.index) + # ) + + # def get_index(self, key): + # """Returns the position of key in the list.""" + # if self.key_to_index is None: + # self._create_dict() + # return self.key_to_index[key] + + # def __contains__(self, id): + # """Returns True if the list contains the key""" + # return id in self.df.index + + # def __getitem__(self, id): + # """It allows to acces the data in the list by key or index like in + # a ditionary, e.g.: + # If input is a string key: + # utt2spk = Utt2Info(info) + # spk_id = utt2spk['data1'] + # If input is an index: + # key, spk_id = utt2spk[0] + + # Args: + # key: String key or integer index. + # Returns: + # If key is a string: + # info corresponding to key + # If key is the index in the key list: + # key, info given index + # """ + # if isinstance(id, str): + # row = np.array(self.utt_info.loc[key])[1:] + # if len(row) == 1: + # return row[0] + # else: + # return row + # else: + # row = np.array(self.utt_info.iloc[key]) + # if len(row) == 2: + # return row[0], row[1] + # else: + # return row[0], row[1:] + + # def sort(self, field=0): + # """Sorts the list by key""" + # if field == 0: + # self.utt_info.sort_index(ascending=True, inplace=True) + # else: + # idx = np.argsort(self.utt_info[field]) + # self.utt_info = self.utt_info.iloc[idx] + # self.key_to_index = None + + # @classmethod + # def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + # """Loads utt2info list from text file. + + # Args: + # file_path: File to read the list. + # sep: Separator between the key and file_path in the text file. + # dtype: Dictionary with the dtypes of each column. + # Returns: + # Utt2Info object + # """ + # df = pd.read_csv(file_path, sep=sep, header=None, dtype=dtype) + # df = df.rename(index=str, columns={0: "key"}) + # return cls(df) + + # def split(self, idx, num_parts, group_by_field=0): + # """Splits SCPList into num_parts and return part idx. + + # Args: + # idx: Part to return from 1 to num_parts. + # num_parts: Number of parts to split the list. + # group_by_field: All the lines with the same value in column + # groub_by_field go to the same part + + # Returns: + # Sub Utt2Info object + # """ + # if group_by_field == 0: + # key, idx1 = split_list(self.utt_info["key"], idx, num_parts) + # else: + # key, idx1 = split_list_group_by_key( + # self.utt_info[group_by_field], idx, num_parts + # ) + + # utt_info = self.utt_info.iloc[idx1] + # return Utt2Info(utt_info) + + + + # def filter(self, filter_key, keep=True): + # """Removes elements from Utt2Info object by key + + # Args: + # filter_key: List with the keys of the elements to keep or remove. + # keep: If True, we keep the elements in filter_key; + # if False, we remove the elements in filter_key; + + # Returns: + # Utt2Info object. + # """ + # if not keep: + # filter_key = np.setdiff1d(self.utt_info["key"], filter_key) + # utt_info = self.utt_info.loc[filter_key] + # return Utt2Info(utt_info) + + # def filter_info(self, filter_key, field=1, keep=True): + # """Removes elements of Utt2Info by info value + + # Args: + # filter_key: List with the file_path of the elements to keep or remove. + # field: Field number corresponding to the info to filter + # keep: If True, we keep the elements in filter_key; + # if False, we remove the elements in filter_key; + + # Returns: + # Utt2Info object. 
+ # """ + # if not keep: + # filter_key = np.setdiff1d(self.utt_info[field], filter_key) + # f, _ = ismember(filter_key, self.utt_info[field]) + # if not np.all(f): + # for k in filter_key[f == False]: + # logging.error("info %s not found in field %d" % (k, field)) + # raise Exception("not all keys were found in field %d" % (field)) + + # f, _ = ismember(self.utt_info[field], filter_key) + # utt_info = self.utt_info.iloc[f] + # return Utt2Info(utt_info) + + # def filter_index(self, index, keep=True): + # """Removes elements of Utt2Info by index + + # Args: + # filter_key: List with the index of the elements to keep or remove. + # keep: If True, we keep the elements in filter_key; + # if False, we remove the elements in filter_key; + + # Returns: + # Utt2Info object. + # """ + + # if not keep: + # index = np.setdiff1d(np.arange(len(self.key), dtype=np.int64), index) + + # utt_info = self.utt_info.iloc[index] + # return Utt2Info(utt_info) + + def shuffle(self, seed=1024, rng=None): + """Shuffles the elements of the list. + + Args: + seed: Seed for random number generator. + rng: numpy random number generator object. + + Returns: + Index used to shuffle the list. + """ + if rng is None: + rng = np.random.RandomState(seed=seed) + index = np.arange(len(self.df)) + rng.shuffle(index) + self.df = self.df.iloc[index] + return index + + \ No newline at end of file diff --git a/hyperion/utils/recording_set.py b/hyperion/utils/recording_set.py new file mode 100644 index 00000000..ad6f65f6 --- /dev/null +++ b/hyperion/utils/recording_set.py @@ -0,0 +1,61 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from pathlib import Path + +import numpy as np +import pandas as pd + +from .info_table import InfoTable + + +class RecordingSet(InfoTable): + def __init__(self, df): + super().__init__(df) + assert "storage_path" in df + + def save(self, file_path, sep=None): + """Saves info table to file + + Args: + file_path: File to write the list. + sep: Separator between the key and file_path in the text file. + """ + file_path = Path(file_path) + file_path.parent.mkdir(parents=True, exist_ok=True) + ext = file_path.suffix + if ext == "": + # if no extension we save as kaldi feats.scp file + from .scp_list import SCPList + + scp = SCPList(self.df["id"], self.df["storage_path"]) + scp.save(file_path) + return + + super().save(file_path, sep) + + @classmethod + def load(cls, file_path, sep=None): + """Loads utt2info list from text file. + + Args: + file_path: File to read the list. + sep: Separator between the key and file_path in the text file. 
+ Returns: + RecordingSet object + """ + file_path = Path(file_path) + ext = file_path.suffix + if ext == "": + # if no extension we load as kaldi feats.scp file + from .scp_list import SCPList + + scp = SCPList.load(file_path) + df_dict = {"id": scp.key, "storage_path": scp.file_path} + df = pd.DataFrame(df_dict) + + return cls(df) + + return super().load(file_path, sep) diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py new file mode 100644 index 00000000..4332dea3 --- /dev/null +++ b/hyperion/utils/segment_set.py @@ -0,0 +1,11 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .info_table import InfoTable + + +class SegmentSet(InfoTable): + def __init__(self, df): + super().__init__(df) From 67c1bb8864439eb43a1c1764cc49cb0086ee7e17 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 30 Aug 2022 04:20:36 -0400 Subject: [PATCH 023/154] new seq sampler works --- apps.txt | 83 +---- .../v1.1/conf/train_data_default.yaml | 15 +- egs/voxceleb/v1.1/conf/val_data_default.yaml | 15 +- ...statsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh | 2 +- egs/voxceleb/v1.1/run_011_train_xvector.sh | 6 +- hyp_utils/conda_env.sh | 1 + hyperion/bin/extract_xvectors_from_wav.py | 2 +- hyperion/bin/train_xvector_from_wav.py | 79 +---- hyperion/helpers/trial_data_reader.py | 17 +- hyperion/io/audio_reader.py | 2 +- .../classifiers/binary_logistic_regression.py | 2 +- hyperion/np/score_norm/adapt_s_norm.py | 273 +++++++++++++-- hyperion/np/score_norm/score_norm.py | 7 +- hyperion/np/score_norm/t_norm.py | 18 +- hyperion/np/score_norm/z_norm.py | 18 +- hyperion/torch/data/__init__.py | 3 +- hyperion/torch/data/audio_dataset.py | 326 +++++++++++++++++- .../data/class_weighted_seg_chunk_sampler.py | 117 +++++-- hyperion/torch/data/seg_chunk_sampler.py | 50 ++- hyperion/torch/data/seg_sampler.py | 54 +-- hyperion/torch/data/seg_sampler_factory.py | 20 +- hyperion/torch/lr_schedulers/factory.py | 4 +- hyperion/torch/lr_schedulers/triangular_lr.py | 4 +- hyperion/torch/trainers/torch_trainer.py | 15 +- hyperion/utils/class_info.py | 51 ++- hyperion/utils/feature_set.py | 4 +- hyperion/utils/info_table.py | 17 +- hyperion/utils/recording_set.py | 4 +- hyperion/utils/segment_set.py | 6 + hyperion/utils/trial_ndx.py | 2 +- hyperion/utils/trial_scores.py | 2 +- 31 files changed, 904 insertions(+), 315 deletions(-) diff --git a/apps.txt b/apps.txt index 4bf4a173..837c064b 100644 --- a/apps.txt +++ b/apps.txt @@ -1,69 +1,14 @@ -apply-mvn-select-frames.py -compute-energy-vad.py -compute-mfcc-feats.py -copy-feats.py -eval-cos-1vs1.py -eval-linear-gbe-up.py -eval-linear-gbe.py -eval-linear-svmc.py -eval-logistic-regression.py -eval-plda-1vs1.py -eval-plda-nvs1.py -make-babble-noise-audio-files.py -merge-h5-files.py -pack-audio-files.py -pack-wav-rirs.py -plot-vector-hist.py -plot-vector-tsne.py -preprocess-audio-files.py -rttm-to-bin-vad.py -segments-to-bin-vad.py -torch-adv-finetune-xvec-from-wav.py -torch-adv-finetune-xvec.py -torch-compute-mfcc-feats.py -torch-eval-vae.py -torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py -torch-eval-xvec-cosine-scoring-from-adv-test-wav.py -torch-eval-xvec-cosine-scoring-from-art-test-wav.py -torch-eval-xvec-cosine-scoring-from-test-wav.py -torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py -torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py -torch-eval-xvec-logits-from-wav.py -torch-extract-xvectors-from-wav-with-rttm.py 
-torch-extract-xvectors-from-wav.py -torch-extract-xvectors-slidwin-from-wav.py -torch-extract-xvectors-slidwin.py -torch-extract-xvectors-vae-preproc.py -torch-extract-xvectors.py -torch-finetune-xvec-dfr-from-wav.py -torch-finetune-xvec-dfr.py -torch-finetune-xvec-from-wav.py -torch-finetune-xvec.py -torch-generate-adv-attacks-xvector-classif.py -torch-generate-adv-attacks-xvector-verif.py -torch-train-dvae.py -torch-train-efficientnet-xvec-from-wav.py -torch-train-efficientnet-xvec.py -torch-train-resnet-xvec-from-wav.py -torch-train-resnet-xvec.py -torch-train-spinenet-xvec-from-wav.py -torch-train-tdnn-xvec-from-wav.py -torch-train-tdnn-xvec.py -torch-train-transformer-xvec-v1-from-wav.py -torch-train-transformer-xvec-v1.py -torch-train-vae.py -torch-train-vq-dvae.py -torch-train-vq-vae.py -torch-train-xvec-from-wav.py -train-cw-up.py -train-cw.py -train-gaussianizer.py -train-lda.py -train-linear-gbe-up.py -train-linear-gbe.py -train-linear-svmc.py -train-logistic-regression.py -train-mvn.py -train-nda.py -train-pca.py -train-plda.py +compute_energy_vad.py +extract_wav2vec2xvectors.py +extract_xvectors_from_wav.py +finetune_wav2vec2xvector.py +finetune_xvector_dfr_from_feats.py +finetune_xvector_dfr_from_wav.py +finetune_xvector_from_feats.py +finetune_xvector_from_wav.py +make_babble_noise_audio_files.py +pack_wav_rirs.py +preprocess_audio_files.py +train_wav2vec2xvector.py +train_xvector_from_feats.py +train_xvector_from_wav.py diff --git a/egs/voxceleb/v1.1/conf/train_data_default.yaml b/egs/voxceleb/v1.1/conf/train_data_default.yaml index 451ffa35..acd088e6 100644 --- a/egs/voxceleb/v1.1/conf/train_data_default.yaml +++ b/egs/voxceleb/v1.1/conf/train_data_default.yaml @@ -1,10 +1,17 @@ dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id sampler: + sampler_type: class_weighted_random_seg_chunk_sampler batch_size: 32 - iters_per_epoch: 6 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id data_loader: num_workers: 8 \ No newline at end of file diff --git a/egs/voxceleb/v1.1/conf/val_data_default.yaml b/egs/voxceleb/v1.1/conf/val_data_default.yaml index 451ffa35..acd088e6 100644 --- a/egs/voxceleb/v1.1/conf/val_data_default.yaml +++ b/egs/voxceleb/v1.1/conf/val_data_default.yaml @@ -1,10 +1,17 @@ dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id sampler: + sampler_type: class_weighted_random_seg_chunk_sampler batch_size: 32 - iters_per_epoch: 6 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id data_loader: num_workers: 8 \ No newline at end of file diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh index 3cd4b108..ecd076c8 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -25,7 +25,7 @@ margin=0.3 nnet_num_epochs=70 xvec_train_base_cfg=conf/train_ecapatdnn_xvec_default.yaml 
-xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu" +xvec_train_args="--data.train.sampler.min-batch-size $batch_size_1gpu --data.val.sampler.min-batch-size $batch_size_1gpu" nnet_name=${feat_type}_ecapatdnn512x3_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index 17d50722..883c729b 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -47,11 +47,11 @@ if [ $stage -le 1 ]; then train_xvector_from_wav.py $nnet_type --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ - --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ - --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ --data.val.dataset.audio-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ - --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir $args \ --num-gpus $ngpu \ diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 283a7a49..0a8f7a41 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -68,6 +68,7 @@ if [ $num_gpus -gt 0 ];then echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters if [ $num_gpus -gt 1 ];then + # export CUDA_LAUNCH_BLOCKING=1 [[ $(type -P "$torchrun") ]] && command="torchrun" \ || command="python -m torch.distributed.run" command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 48c23687..e9746897 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -208,7 +208,7 @@ def extract_xvectors( writer.write([key], [y]) if write_num_frames_spec is not None: keys.append(key) - info.append(str(x.shape[1])) + info.append(str(x.shape[-1])) t8 = time.time() read_time = t2 - t1 diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index 5eb871db..0e074977 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -22,7 +22,9 @@ from hyperion.torch.utils import ddp from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.data import AudioDataset as AD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler + +# from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.models import ResNetXVector as RXVec @@ -46,19 +48,21 @@ def init_data(partition, rank, num_gpus, **kwargs): kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) - sampler_args = Sampler.filter_args(**kwargs["sampler"]) + sampler_args = kwargs["sampler"] if rank == 0: logging.info("{} audio dataset args={}".format(partition, ad_args)) logging.info("{} sampler args={}".format(partition, sampler_args)) logging.info("init %s dataset", partition) - 
ad_args["is_val"] = partition == "val" + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val dataset = AD(**ad_args) if rank == 0: logging.info("init %s samplers", partition) - sampler = Sampler(dataset, **sampler_args) + sampler = SegSamplerFactory.create(dataset, **sampler_args) if rank == 0: logging.info("init %s dataloader", partition) @@ -72,49 +76,6 @@ def init_data(partition, rank, num_gpus, **kwargs): return data_loader -# def init_data( -# audio_path, -# train_list, -# val_list, -# train_aug_cfg, -# val_aug_cfg, -# num_workers, -# num_gpus, -# rank, -# **kwargs -# ): - -# ad_args = AD.filter_args(**kwargs) -# sampler_args = Sampler.filter_args(**kwargs) -# if rank == 0: -# logging.info("audio dataset args={}".format(ad_args)) -# logging.info("sampler args={}".format(sampler_args)) -# logging.info("init datasets") - -# train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) -# val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) - -# if rank == 0: -# logging.info("init samplers") -# train_sampler = Sampler(train_data, **sampler_args) -# val_sampler = Sampler(val_data, **sampler_args) - -# num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) -# largs = ( -# {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} -# ) - -# train_loader = torch.utils.data.DataLoader( -# train_data, batch_sampler=train_sampler, **largs -# ) - -# test_loader = torch.utils.data.DataLoader( -# val_data, batch_sampler=val_sampler, **largs -# ) - -# return train_loader, test_loader - - def init_feats(rank, **kwargs): feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: @@ -154,7 +115,7 @@ def train_xvec(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) feat_extractor = init_feats(**kwargs) - model = init_xvector(train_loader.dataset.num_classes, **kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -180,14 +141,9 @@ def make_parser(xvec_class): parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") - # parser.add_argument("--audio-path", required=True) - # parser.add_argument("--train-list", required=True) - # parser.add_argument("--val-list", required=True) AD.add_class_args(train_parser, prefix="dataset", skip={}) - Sampler.add_class_args(train_parser, prefix="sampler") - # parser.add_argument("--train-aug-cfg", default=None) - # parser.add_argument("--val-aug-cfg", default=None) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", type=int, @@ -197,7 +153,7 @@ def make_parser(xvec_class): val_parser = ArgumentParser(prog="") AD.add_class_args(val_parser, prefix="dataset", skip={}) - Sampler.add_class_args(val_parser, prefix="sampler") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", type=int, @@ -209,14 +165,11 @@ def make_parser(xvec_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) parser.link_arguments( - "data.train.dataset.class_file", "data.val.dataset.class_file" + "data.train.dataset.class_files", "data.val.dataset.class_files" ) parser.link_arguments( "data.train.data_loader.num_workers", 
"data.val.data_loader.num_workers" ) - parser.link_arguments( - "data.train.sampler.batch_size", "data.val.sampler.batch_size" - ) AF.add_class_args(parser, prefix="feats") xvec_class.add_class_args(parser, prefix="model") @@ -225,12 +178,6 @@ def make_parser(xvec_class): ) ddp.add_ddp_args(parser) parser.add_argument("--seed", type=int, default=1123581321, help="random seed") - # parser.add_argument( - # "--resume", - # action="store_true", - # default=False, - # help="resume training from checkpoint", - # ) parser.add_argument( "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int ) @@ -241,11 +188,9 @@ def make_parser(xvec_class): if __name__ == "__main__": parser = ArgumentParser(description="Train XVector from audio files") - parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - for k, v in xvec_dict.items(): parser_k = make_parser(v) subcommands.add_subcommand(k, parser_k) diff --git a/hyperion/helpers/trial_data_reader.py b/hyperion/helpers/trial_data_reader.py index e6da5b7d..219ee6ce 100644 --- a/hyperion/helpers/trial_data_reader.py +++ b/hyperion/helpers/trial_data_reader.py @@ -12,7 +12,7 @@ from ..io import RandomAccessDataReaderFactory as DRF from ..utils.utt2info import Utt2Info -from ..utils import TrialNdx, TrialKey +from ..utils import TrialNdx, TrialKey # , SparseTrialNdx, SparseTrialKey from ..np.transforms import TransformList @@ -34,6 +34,7 @@ def __init__( num_seg_parts=1, eval_set="enroll-test", tlist_sep=" ", + sparse=False, ): self.r = DRF.create(v_file) @@ -45,10 +46,16 @@ def __init__( test = Utt2Info.load(test_file, sep=tlist_sep) ndx = None if ndx_file is not None: - try: - ndx = TrialNdx.load(ndx_file) - except: - ndx = TrialKey.load(ndx_file).to_ndx() + if sparse: + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + else: + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() ndx, enroll = TrialNdx.parse_eval_set(ndx, enroll, test, eval_set) if num_model_parts > 1 or num_seg_parts > 1: diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index c6bdeab8..043ae778 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -184,7 +184,7 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): if s_beg >= num_samples_i: raise Exception( "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" - % (key, tbeg, sbeg, file_id, num_samples_i) + % (file_id, t_beg, s_beg, file_id, num_samples_i) ) s_end = int(t_end * fs_i) diff --git a/hyperion/np/classifiers/binary_logistic_regression.py b/hyperion/np/classifiers/binary_logistic_regression.py index c144105f..82a84529 100644 --- a/hyperion/np/classifiers/binary_logistic_regression.py +++ b/hyperion/np/classifiers/binary_logistic_regression.py @@ -91,7 +91,7 @@ def __init__( verbose=verbose, warm_start=warm_start, multi_class="ovr", - lr_seed=1024, + lr_seed=lr_seed, **kwargs ) diff --git a/hyperion/np/score_norm/adapt_s_norm.py b/hyperion/np/score_norm/adapt_s_norm.py index b213d653..46d1fc14 100644 --- a/hyperion/np/score_norm/adapt_s_norm.py +++ b/hyperion/np/score_norm/adapt_s_norm.py @@ -21,10 +21,17 @@ class AdaptSNorm(ScoreNorm): std_floor: floor for standard deviations. 
""" - def __init__(self, nbest=100, nbest_discard=0, **kwargs): + def __init__( + self, + nbest=100, + nbest_discard=0, + nbest_sel_method="highest-other-side", + **kwargs, + ): super().__init__(*kwargs) self.nbest = nbest self.nbest_discard = nbest_discard + self.nbest_sel_method = nbest_sel_method def predict( self, @@ -33,6 +40,7 @@ def predict( scores_enr_coh, mask_coh_test=None, mask_enr_coh=None, + return_stats=False, ): """Normalizes the scores. @@ -59,50 +67,267 @@ def predict( if mask_enr_coh is not None: scores_enr_coh[mask_enr_coh == False] = 0 + if self.nbest_sel_method == "highest-other-side": + return self._norm_highest_other_side( + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + nbest, + ) + elif self.nbest_sel_method == "highest-same-side": + return self._norm_highest_same_side( + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + nbest, + ) + # best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[ + # self.nbest_discard : self.nbest_discard + nbest + # ] + # elif self.nbest_sel_method == "highest-same-side": + # best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[ + # :, self.nbest_discard : self.nbest_discard + nbest + # ].T + else: + raise Exception(f"invalid cohort selection method {self.nbest_sel_method}") + + # scores_z_norm = np.zeros_like(scores) + # for i in range(scores.shape[1]): + # best_idx_i = best_idx[:, i] + + # best_scores_i = scores_enr_coh[:, best_idx_i] + # mu_z = np.mean(best_scores_i, axis=1, keepdims=True) + + # if mask_enr_coh is None: + # s_z = np.std(best_scores_i, axis=1, keepdims=True) + # else: + # norm = np.mean(mask_enr_coh[:, best_idx_i], axis=1, keepdims=True) + # mu_z /= norm + # s_z = np.sqrt( + # np.mean(best_scores_i ** 2, axis=1, keepdims=True) / norm + # - mu_z ** 2 + # ) + + # s_z = np.clip(s_z, a_min=1e-5, a_max=None) + # if not self.norm_var: + # s_z = 1.0 + + # scores_z_norm[:, i] = (scores[:, i] - mu_z.T) / s_z.T + + # if self.nbest_sel_method == "highest-other-side": + # best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[ + # :, self.nbest_discard : self.nbest_discard + nbest + # ] + # elif self.nbest_sel_method == "highest-same-side": + # best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[ + # self.nbest_discard : self.nbest_discard + nbest + # ].T + # else: + # raise Exception(f"invalid cohort selection method {self.nbest_sel_method}") + + # scores_t_norm = np.zeros_like(scores) + # for i in range(scores.shape[0]): + # best_idx_i = best_idx[i] + # best_scores_i = scores_coh_test[best_idx_i, :] + # mu_t = np.mean(best_scores_i, axis=0, keepdims=True) + + # if mask_coh_test is None: + # s_t = np.std(best_scores_i[best_idx_i, :], axis=0, keepdims=True) + # else: + # norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=True) + # mu_t /= norm + # s_t = np.sqrt( + # np.mean(best_scores_i[best_idx_i, :] ** 2, axis=0, keepdims=True) + # / norm + # - mu_z ** 2 + # ) + + # s_t = np.clip(s_t, a_min=1e-5, a_max=None) + # if not self.norm_var: + # s_t = 1.0 + + # scores_t_norm[i, :] = (scores[i, :] - mu_t) / s_t + + # scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2) + + # if return_stats: + # return scores_norm, mu_z, s_z, mu_t, s_t + # else: + # return scores_norm + + def _norm_highest_other_side( + self, + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + nbest, + ): + + if return_stats: + mu_z = np.zeros_like(scores) + mu_t = np.zeros_like(scores) + if 
self.norm_var:
+                s_z = np.zeros_like(scores)
+                s_t = np.zeros_like(scores)
+            else:
+                s_z = s_t = 1.0
+
+        scores_z_norm = np.zeros_like(scores)
         best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[
             self.nbest_discard : self.nbest_discard + nbest
         ]
-        scores_z_norm = np.zeros_like(scores)
         for i in range(scores.shape[1]):
             best_idx_i = best_idx[:, i]
-            mu_z = np.mean(scores_enr_coh[:, best_idx_i], axis=1, keepdims=True)
+            best_scores_i = scores_enr_coh[:, best_idx_i]
+            mu_z_i = np.mean(best_scores_i, axis=1, keepdims=False)
 
             if mask_enr_coh is None:
-                s_z = np.std(scores_enr_coh[:, best_idx_i], axis=1, keepdims=True)
+                s_z_i = np.std(best_scores_i, axis=1, keepdims=False)
             else:
-                norm = np.mean(mask_enr_coh[:, best_idx_i], axis=1, keepdims=True)
-                mu_z /= norm
-                s_z = np.sqrt(
-                    np.mean(scores_enr_coh[:, best_idx_i] ** 2, axis=1, keepdims=True)
-                    / norm
-                    - mu_z ** 2
+                norm = np.mean(mask_enr_coh[:, best_idx_i], axis=1, keepdims=False)
+                mu_z_i /= norm
+                s_z_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=1, keepdims=False) / norm
+                    - mu_z_i ** 2
                 )
 
-            s_z = np.clip(s_z, a_min=1e-5, a_max=None)
-            scores_z_norm[:, i] = (scores[:, i] - mu_z.T) / s_z.T
+            s_z_i = np.clip(s_z_i, a_min=1e-5, a_max=None)
+            if not self.norm_var:
+                s_z_i = 1.0
+
+            scores_z_norm[:, i] = (scores[:, i] - mu_z_i) / s_z_i
+            if return_stats:
+                mu_z[:, i] = mu_z_i
+                if self.norm_var:
+                    s_z[:, i] = s_z_i
+
+        scores_t_norm = np.zeros_like(scores)
         best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[
             :, self.nbest_discard : self.nbest_discard + nbest
         ]
-        scores_t_norm = np.zeros_like(scores)
         for i in range(scores.shape[0]):
             best_idx_i = best_idx[i]
+            best_scores_i = scores_coh_test[best_idx_i, :]
+            mu_t_i = np.mean(best_scores_i, axis=0, keepdims=False)
+
+            if mask_coh_test is None:
+                s_t_i = np.std(best_scores_i, axis=0, keepdims=False)
+            else:
+                norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=False)
+                mu_t_i /= norm
+                s_t_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=0, keepdims=False) / norm
+                    - mu_t_i ** 2
+                )
+
+            s_t_i = np.clip(s_t_i, a_min=1e-5, a_max=None)
+            if not self.norm_var:
+                s_t_i = 1.0
 
-            mu_z = np.mean(scores_coh_test[best_idx_i, :], axis=0, keepdims=True)
+            scores_t_norm[i, :] = (scores[i, :] - mu_t_i) / s_t_i
+            if return_stats:
+                mu_t[i, :] = mu_t_i
+                if self.norm_var:
+                    s_t[i, :] = s_t_i
+
+        scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2)
+        if return_stats:
+            return scores_norm, mu_z, s_z, mu_t, s_t
+        else:
+            return scores_norm
+
+    def _norm_highest_same_side(
+        self,
+        scores,
+        scores_coh_test,
+        scores_enr_coh,
+        mask_coh_test,
+        mask_enr_coh,
+        return_stats,
+        nbest,
+    ):
+
+        if return_stats:
+            mu_z = np.zeros_like(scores)
+            mu_t = np.zeros_like(scores)
+            if self.norm_var:
+                s_z = np.zeros_like(scores)
+                s_t = np.zeros_like(scores)
+            else:
+                s_z = s_t = 1.0
+
+        best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[
+            :, self.nbest_discard : self.nbest_discard + nbest
+        ]
+
+        scores_z_norm = np.zeros_like(scores)
+        for i in range(scores.shape[0]):
+            best_idx_i = best_idx[i]
+            best_scores_i = scores_enr_coh[:, best_idx_i]
+            mu_z_i = np.mean(best_scores_i, axis=1, keepdims=False)
-            if mask_coh_test is None:
-                s_z = np.std(scores_coh_test[best_idx_i, :], axis=0, keepdims=True)
+            if mask_enr_coh is None:
+                s_z_i = np.std(best_scores_i, axis=1, keepdims=False)
             else:
-                norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=True)
-                mu_z /= norm
-                s_z = np.sqrt(
-                    np.mean(scores_coh_test[best_idx_i, :] ** 2, axis=0, keepdims=True)
-                    / norm
-                    - mu_z ** 2
+                norm = np.mean(mask_enr_coh[:, best_idx_i], axis=1, keepdims=False)
+                mu_z_i /= norm
+                s_z_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=1, keepdims=False) / norm
+                    - mu_z_i ** 2
                 )
 
-            s_z = np.clip(s_z, a_min=1e-5, a_max=None)
-            scores_t_norm[i, :] = (scores[i, :] - mu_z) / s_z
+            s_z_i = np.clip(s_z_i, a_min=1e-5, a_max=None)
+            if not self.norm_var:
+                s_z_i = 1.0
+
+            scores_z_norm[:, i] = (scores[:, i] - mu_z_i) / s_z_i
+            if return_stats:
+                mu_z[:, i] = mu_z_i
+                if self.norm_var:
+                    s_z[:, i] = s_z_i
+
+        best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[
+            self.nbest_discard : self.nbest_discard + nbest
+        ]
+        scores_t_norm = np.zeros_like(scores)
+        for i in range(scores.shape[1]):
+            best_idx_i = best_idx[:, i]
 
-        return (scores_z_norm + scores_t_norm) / np.sqrt(2)
+            best_scores_i = scores_coh_test[best_idx_i, :]
+            mu_t_i = np.mean(best_scores_i, axis=0, keepdims=False)
+
+            if mask_coh_test is None:
+                s_t_i = np.std(best_scores_i, axis=0, keepdims=False)
+            else:
+                norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=False)
+                mu_t_i /= norm
+                s_t_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=0, keepdims=False) / norm
+                    - mu_t_i ** 2
+                )
+
+            s_t_i = np.clip(s_t_i, a_min=1e-5, a_max=None)
+            if not self.norm_var:
+                s_t_i = 1.0
+
+            scores_t_norm[i, :] = (scores[i, :] - mu_t_i) / s_t_i
+            if return_stats:
+                mu_t[i, :] = mu_t_i
+                if self.norm_var:
+                    s_t[i, :] = s_t_i
+
+        scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2)
+        if return_stats:
+            return scores_norm, mu_z, s_z, mu_t, s_t
+        else:
+            return scores_norm

diff --git a/hyperion/np/score_norm/score_norm.py b/hyperion/np/score_norm/score_norm.py
index e2fa1814..9b40c7d7 100644
--- a/hyperion/np/score_norm/score_norm.py
+++ b/hyperion/np/score_norm/score_norm.py
@@ -15,14 +15,15 @@ class ScoreNorm(NPModel):
       std_floor: floor for standard deviations.
""" - def __init__(self, std_floor=1e-5, **kwargs): + def __init__(self, norm_var=True, std_floor=1e-5, **kwargs): super().__init__(*kwargs) + self.norm_var = norm_var self.std_floor = std_floor def forward(self, **kwargs): """Overloads predict function.""" return self.predict(**kwargs) - def __call__(self, *kwargs): + def __call__(self, *args, **kwargs): """Overloads predict function.""" - return self.predict(**kwargs) + return self.predict(*args, **kwargs) diff --git a/hyperion/np/score_norm/t_norm.py b/hyperion/np/score_norm/t_norm.py index ac87c8ac..a5a80def 100644 --- a/hyperion/np/score_norm/t_norm.py +++ b/hyperion/np/score_norm/t_norm.py @@ -24,16 +24,22 @@ def predict(self, scores, scores_coh_test, mask=None): """ if mask is None: mu_t = np.mean(scores_coh_test, axis=0, keepdims=True) - s_t = np.std(scores_coh_test, axis=0, keepdims=True) + if self.norm_var: + s_t = np.std(scores_coh_test, axis=0, keepdims=True) else: scores_coh_test[mask == False] = 0 n_t = np.mean(mask, axis=0, keepdims=True) mu_t = np.mean(scores_coh_test, axis=0, keepdims=True) / n_t - s_t = np.sqrt( - np.mean(scores_coh_test ** 2, axis=0, keepdims=True) / n_t - mu_t ** 2 - ) - - s_t[s_t < self.std_floor] = self.std_floor + if self.norm_var: + s_t = np.sqrt( + np.mean(scores_coh_test ** 2, axis=0, keepdims=True) / n_t + - mu_t ** 2 + ) + + if self.norm_var: + s_t[s_t < self.std_floor] = self.std_floor + else: + s_t = 1.0 scores_norm = (scores - mu_t) / s_t return scores_norm diff --git a/hyperion/np/score_norm/z_norm.py b/hyperion/np/score_norm/z_norm.py index 98189e06..7b9e32d8 100644 --- a/hyperion/np/score_norm/z_norm.py +++ b/hyperion/np/score_norm/z_norm.py @@ -25,16 +25,22 @@ def predict(self, scores, scores_enr_coh, mask=None): """ if mask is None: mu_z = np.mean(scores_enr_coh, axis=1, keepdims=True) - s_z = np.std(scores_enr_coh, axis=1, keepdims=True) + if self.norm_var: + s_z = np.std(scores_enr_coh, axis=1, keepdims=True) else: scores_enr_coh[mask == False] = 0 n_z = np.mean(mask, axis=1, keepdims=True) mu_z = np.mean(scores_enr_coh, axis=1, keepdims=True) / n_z - s_z = np.sqrt( - np.mean(scores_enr_coh ** 2, axis=1, keepdims=True) / n_z - mu_z ** 2 - ) - - s_z[s_z < self.std_floor] = self.std_floor + if self.norm_var: + s_z = np.sqrt( + np.mean(scores_enr_coh ** 2, axis=1, keepdims=True) / n_z + - mu_z ** 2 + ) + + if self.norm_var: + s_z[s_z < self.std_floor] = self.std_floor + else: + s_z = 1.0 scores_norm = (scores - mu_z) / s_z return scores_norm diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index 4deb3f25..752cf0f5 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -10,4 +10,5 @@ from .audio_dataset import AudioDataset # samplers -from .weighted_seq_sampler import ClassWeightedSeqSampler +# from .weighted_seq_sampler import ClassWeightedSeqSampler +from .seg_sampler_factory import SegSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index f86ad0a2..8875676f 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -4,7 +4,7 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionYesNo, ArgumentParser, ActionParser import time import math @@ -21,8 +21,10 @@ from torch.utils.data import Dataset import torch.distributed as dist +from hyperion.np import augment -class AudioDataset(Dataset): + +class AudioDataset1(Dataset): def __init__( self, audio_file, @@ -443,3 +445,323 @@ def 
add_class_args(parser, prefix=None, skip={"audio_file", "key_file"}): # help='audio dataset options') add_argparse_args = add_class_args + + +from ...utils.class_info import ClassInfo +from ...utils.segment_set import SegmentSet + + +class AudioDataset(Dataset): + def __init__( + self, + audio_file, + segments_file, + class_names=None, + class_files=None, + time_durs_file=None, + aug_cfgs=None, + num_augs=1, + return_segment_info=None, + return_orig=False, + wav_scale=2 ** 15 - 1, + is_val=False, + ): + + super().__init__() + try: + rank = dist.get_rank() + world_size = dist.get_world_size() + except: + rank = 0 + world_size = 1 + + self.rank = rank + self.world_size = world_size + self.epoch = 0 + + if rank == 0: + logging.info("opening audio reader %s", audio_file) + + self.r = AR(audio_file, wav_scale=wav_scale) + + if rank == 0: + logging.info("loading segments file %s" % segments_file) + + self.seg_set = SegmentSet.load(segments_file) + if rank == 0: + logging.info("dataset contains %d seqs" % len(self.seg_set)) + + self.is_val = is_val + if time_durs_file is not None: + if rank == 0: + logging.info("loading durations file %s" % time_durs_file) + + time_durs = SegmentSet.load(time_durs_file) + self.seg_set["duration"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(np.float, copy=False) + else: + assert "duration" in self.seg_set + + logging.info("loading class-info files") + self._load_class_infos(class_names, class_files, is_val) + + self.return_segment_info = ( + [] if return_segment_info is None else return_segment_info + ) + self.return_orig = return_orig + + self.num_augs = num_augs + self._create_augmenters(aug_cfgs) + + def _load_class_infos(self, class_names, class_files, is_val): + self.class_info = {} + if class_names is None: + assert class_files is None + return + + assert len(class_names) == len(class_files) + for name, file in zip(class_names, class_files): + assert ( + name in self.seg_set + ), f"class_name {name} not present in the segment set" + if self.rank == 0: + logging.info("loading class-info file %s" % file) + table = ClassInfo.load(file) + self.class_info[name] = table + if not is_val: + # check that all classes are present in the training segments + class_ids = table["id"] + segment_class_ids = self.seg_set[name].unique() + for c_id in class_ids: + if c_id not in segment_class_ids: + logging.warning( + "%s class: %s not present in dataset", name, c_id + ) + + def _create_augmenters(self, aug_cfgs): + self.augmenters = [] + self.reverb_context = 0 + if aug_cfgs is None: + return + + for aug_cfg in aug_cfgs: + logging.info(f"loading augmentation={aug_cfg}") + augmenter = SpeechAugment.create( + aug_cfg, random_seed=112358 + 1000 * self.rank + ) + self.augmenters.append(augmenter) + self.reverb_context = max(augmenter.max_reverb_context, self.reverb_context) + + def set_epoch(self, epoch): + self.epoch = epoch + + @property + def wav_scale(self): + return self.r.wav_scale + + @property + def num_seqs(self): + return len(self.seg_set) + + def __len__(self): + return self.num_seqs + + @property + def seq_lengths(self): + return self.seg_set["duration"] + + @property + def total_length(self): + return np.sum(self.seq_lengths) + + @property + def min_seq_length(self): + return np.min(self.seq_lengths) + + @property + def max_seq_length(self): + return np.max(self.seq_lengths) + + @property + def num_classes(self): + return {k: t.num_classes for k, t in self.class_info.items()} + + def _parse_segment_item(self, segment): + if isinstance(segment, 
(tuple, list)): + seg_id, start, duration = segment + assert duration <= self.seg_set.loc[seg_id].duration + else: + seg_id, start, duration = segment, 0, 0 + + if "start" in self.seg_set: + start += self.seg_set.loc[seg_id].start + + return seg_id, start, duration + + def _read_audio(self, seg_id, start, duration): + # how much extra audio we need to load to + # calculate the reverb of the first part of the audio + reverb_context = min(self.reverb_context, start) + start -= reverb_context + read_duration = duration + reverb_context + + # read audio + recording_id = self.seg_set.recording_ids(seg_id) + x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration) + return x[0], fs[0] + + def _apply_augs(self, x, num_samples, reverb_context_samples): + x_augs = [] + # for each type of augmentation + for i, augmenter in enumerate(self.augmenters): + # we do n_augs per augmentation type + for j in range(self.num_augs): + # augment x + x_aug, aug_info = augmenter(x) + # remove the extra left context used to compute the reverberation. + x_aug = x_aug[reverb_context_samples : len(x)] + x_augs.append(x_aug) + + return x_augs + + def _get_segment_info(self, seg_id): + r = [] + # converts the class_ids to integers + for info_name in self.return_segment_info: + seg_info = self.seg_set.loc[seg_id, info_name] + if info_name in self.class_info: + # if the type of information is a class-id + # we use the class information table to + # convert from id to integer + class_info = self.class_info[info_name] + idx = class_info.loc[seg_info, "class_idx"] + seg_info = idx + + r.append(seg_info) + + return r + + def __getitem__(self, segment): + + seg_id, start, duration = self._parse_segment_item(segment) + x, fs = self._read_audio(seg_id, start, duration) + if self.augmenters: + # augmentations + num_samples = int(duration * fs) + reverb_context_samples = len(x) - num_samples + x_augs = self._apply_augs(x, num_samples, reverb_context_samples) + r = x_augs + + # add original non augmented audio + if self.return_orig: + x_orig = x[reverb_context_samples:] + r.append(x_orig) + + else: + r = [x] + + # adds the segment labels + seg_info = self._get_segment_info(seg_id) + r.extend(seg_info) + + return (*r,) + + @staticmethod + def filter_args(**kwargs): + + ar_args = AR.filter_args(**kwargs) + valid_args = ( + "audio_file", + "segments_file", + "aug_cfgs", + "num_augs", + "class_names", + "class_files", + "return_segment_info", + "return_orig", + "time_durs_file", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args.update(ar_args) + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip={}): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "audio_file" not in skip: + parser.add_argument( + "--audio-file", + required=True, + help=("audio manifest file"), + ) + + if "segments_file" not in skip: + parser.add_argument( + "--segments-file", + required=True, + help=("segments manifest file"), + ) + + parser.add_argument( + "--class-names", + default=None, + nargs="+", + help=( + "list with the names of the types of classes in the datasets, e.g., speaker, language" + ), + ) + + parser.add_argument( + "--class-files", + default=None, + nargs="+", + help=("list of class info files"), + ) + + parser.add_argument( + "--time-durs-file", + default=None, + help=( + "segment to duration in secs file, if durations are not in segments_file" + ), + ) + + parser.add_argument( + "--aug-cfgs", + default=None, + nargs="+", + 
help=("augmentation configuration file."), + ) + + parser.add_argument( + "--num-augs", + default=1, + help=("number of augmentations per segment and augmentation type"), + ) + parser.add_argument( + "--return-segment-info", + default=None, + nargs="+", + help=( + "list of columns of the segment file which should be returned as supervisions" + ), + ) + parser.add_argument( + "--return-orig", + default=False, + action=ActionYesNo, + help=( + "when using augmentation, whether or not to return also the original audio" + ), + ) + + AR.add_class_args(parser) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='audio dataset options') + + add_argparse_args = add_class_args diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 1a9f98b8..91d592bc 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -27,17 +27,20 @@ def __init__( num_chunks_per_seg_epoch="auto", num_segs_per_class=1, num_chunks_per_seg=1, + weight_exponent=1.0, + weight_mode="custom", num_hard_prototypes=0, affinity_matrix=None, - class_column="class_id", - length_column="duration", + class_name="class_id", + length_name="duration", + shuffle=False, iters_per_epoch=None, batch_size=None, seed=1234, ): - super().__init__(seed=seed) - self.class_column = class_column - self.length_column = length_column + super().__init__(shuffle=shuffle, seed=seed) + self.class_name = class_name + self.length_name = length_name self.seg_set = seg_set self.class_info = class_info self.min_chunk_length = min_chunk_length @@ -71,6 +74,9 @@ def __init__( self.num_segs_per_class = num_segs_per_class self.num_chunks_per_seg = num_chunks_per_seg + self.weight_exponent = weight_exponent + self.weight_mode = weight_mode + self.num_hard_prototypes = num_hard_prototypes self.batch = 0 @@ -83,6 +89,7 @@ def __init__( self._compute_len() self._gather_class_info() + self._set_class_weights() self.set_hard_prototypes(affinity_matrix) @@ -97,17 +104,20 @@ def __init__( ) def _set_seed(self): - self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.rank) + if self.shuffle: + self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.rank) + else: + self.rng.manual_seed(self.seed + 100 * self.rank) def _set_num_chunks_per_seg_epoch(self, num_chunks_per_seg_epoch): if num_chunks_per_seg_epoch == "auto": self._compute_num_chunks_per_seg_epoch_auto() else: - self.num_egs_per_utt_epoch = num_chunks_per_seg_epoch + self.num_chunks_per_seg_epoch = num_chunks_per_seg_epoch def _compute_num_chunks_per_seg_epoch_auto(self): seg_set = self.seg_set - avg_seg_length = np.mean(seg_set[self.length_column]) + avg_seg_length = np.mean(seg_set[self.length_name]) avg_chunk_length = (self.max_chunk_length + self.min_chunk_length) / 2 self.num_chunks_per_seg_epoch = math.ceil(avg_seg_length / avg_chunk_length) logging.debug( @@ -133,27 +143,42 @@ def _gather_class_info(self): # we need the maximum/minimum segment duration for each class. 
max_dur = np.zeros(len(self.class_info))
         min_dur = np.zeros(len(self.class_info))
+        total_dur = np.zeros(len(self.class_info))
         for i, c in enumerate(self.class_info["id"]):
-            seg_idx = self.seg_set[self.class_column] == c
-            durs_i = self.seg_set.loc[seg_idx]
-            max_dur[i] = durs_i.max()
-            min_dur[i] = durs_i.min()
+            seg_idx = self.seg_set[self.class_name] == c
+            if seg_idx.sum() > 0:
+                durs_i = self.seg_set.loc[seg_idx, self.length_name]
+                max_dur[i] = durs_i.max()
+                min_dur[i] = durs_i.min()
+                total_dur[i] = durs_i.sum()
+            else:
+                max_dur[i] = min_dur[i] = total_dur[i] = 0
 
         self.class_info["max_seg_duration"] = max_dur
         self.class_info["min_seg_duration"] = min_dur
+        self.class_info["total_duration"] = total_dur
+
+        self.map_idx_to_ids = self.class_info[["class_idx", "id"]]
+        self.map_idx_to_ids.set_index("class_idx", inplace=True)
+
+    def _set_class_weights(self):
+        if self.weight_mode == "uniform":
+            self.class_info.set_uniform_weights()
+        elif self.weight_mode == "dataset-prior":
+            weights = self.class_info["total_duration"].values
+            self.class_info.set_weights(weights)
+
+        if self.weight_exponent != 1.0:
+            self.class_info.exp_weights(self.weight_exponent)
 
         zero_weight = self.class_info["min_seg_duration"] < self.min_chunk_length
         if np.any(zero_weight):
-            self.class_info.loc[zero_weight, "weights"] = 0
-            self.class_info.loc["weights"] /= self.class_info["weights"].sum()
+            self.class_info.set_zero_weight(zero_weight)
 
         self.var_weights = np.any(
-            self.seg_set[self.length_column] < self.max_chunk_length
+            self.seg_set[self.length_name] < self.max_chunk_length
         )
 
-        self.map_idx_to_ids = self.class_info[["class_idx", "ids"]]
-        self.map_idx_to_ids.set_index("class_idx", inplace=True)
-
     @property
     def hard_prototype_mining(self):
         return self.num_hard_prototypes > 1
@@ -186,14 +211,14 @@ def _compute_batch_size(self, chunk_length):
         return int(self.min_batch_size * self.max_chunk_length / chunk_length)
 
     def _compute_num_classes_per_batch(self, batch_size):
-        num_classes = batch_size / self.num_segs_per_class / self.num_egs_per_utt
+        num_classes = batch_size / self.num_segs_per_class / self.num_chunks_per_seg
         if self.hard_prototype_mining:
             num_classes /= self.num_hard_prototypes
         return int(math.ceil(num_classes))
 
     def _get_class_weights(self, chunk_length):
         if not self.var_weights:
-            return self.class_info["weights"].values
+            return torch.as_tensor(self.class_info["weights"].values)
 
         # get classes where all segments are shorter than
         # chunk length and put weight to 0
@@ -205,7 +230,7 @@ def _get_class_weights(self, chunk_length):
         class_weights[zero_idx] = 0.0
         # renormalize weights
         class_weights /= class_weights.sum()
-        return class_weights
+        return torch.as_tensor(class_weights)
 
     def _sample_classes(self, num_classes, chunk_length):
         weights = self._get_class_weights(chunk_length)
@@ -213,7 +238,8 @@ def _sample_classes(self, num_classes, chunk_length):
             weights,
             num_samples=num_classes,
             replacement=True,
-        )
+            generator=self.rng,
+        ).numpy()
 
         class_ids = self.class_info.iloc[row_idx].id.values
         if self.hard_prototype_mining:
@@ -231,29 +257,34 @@ def _sample_segs(self, class_ids, chunk_length):
         for c in class_ids:
             # for each class we sample segments longer than chunk length
             # get segments belonging to c
-            seg_mask = (self.seg_set[self.class_column] == c) & (
-                self.seg_set[self.length_column] > chunk_length
+            seg_mask = (self.seg_set[self.class_name] == c) & (
+                self.seg_set[self.length_name] >= chunk_length
             )
-            seg_ids_c = self.seg_set.loc[seg_mask, "id"]
+            seg_ids_c = self.seg_set.loc[seg_mask, "id"].values
 
             # sample num_segs_per_class random segments
+            if len(seg_ids_c) == 0:
+                logging.warning(
+                    "no segments of class=%s longer than chunk-length=%.2f: %s",
+                    c,
+                    chunk_length,
+                    self.class_info.loc[c],
+                )
             sel_seg_idx_c = torch.randint(
                 low=0,
                 high=len(seg_ids_c),
                 size=(self.num_segs_per_class,),
                 generator=self.rng,
-            )
-            sel_seg_ids_c = seg_ids_c[sel_seg_idx_c]
+            ).numpy()
+            sel_seg_ids_c = list(seg_ids_c[sel_seg_idx_c])
             seg_ids.extend(sel_seg_ids_c)
 
         return seg_ids
 
     def _sample_chunks(self, seg_ids, chunk_length):
         chunks = []
-        scale = self.seg_set.loc[seg_ids, self.length_column] - chunk_length
+        scale = (
+            torch.as_tensor(self.seg_set.loc[seg_ids, self.length_name].values)
+            - chunk_length
+        )
         for i in range(self.num_chunks_per_seg):
             start = scale * torch.rand(size=(len(seg_ids),), generator=self.rng)
-            chunks_i = [(id, s, chunk_length) for id, s in zip(seg_ids, start)]
-            chunks.expand(chunks_i)
+            chunks_i = [(id, s.item(), chunk_length) for id, s in zip(seg_ids, start)]
+            chunks.extend(chunks_i)
 
         return chunks
 
@@ -263,7 +294,7 @@ def __next__(self):
             raise StopIteration
 
         chunk_length = self._sample_chunk_length()
-        batch_size = self._compute_batch_size()
+        batch_size = self._compute_batch_size(chunk_length)
        num_classes = self._compute_num_classes_per_batch(batch_size)
         class_ids = self._sample_classes(num_classes, chunk_length)
         seg_ids = self._sample_segs(class_ids, chunk_length)
@@ -286,9 +317,11 @@ def filter_args(**kwargs):
             "num_chunks_per_seg_epoch",
             "num_segs_per_class",
             "num_chunks_per_seg",
+            "weight_exponent",
+            "weight_mode",
             "num_hard_prototypes",
-            "class_column",
-            "length_column",
+            "class_name",
+            "length_name",
             "iters_per_epoch",
             "batch_size",
             "shuffle",
@@ -373,6 +406,20 @@ def add_class_args(parser, prefix=None):
             default=1,
             help=("number of chunks per segment in batch"),
         )
+
+        parser.add_argument(
+            "--weight-exponent",
+            default=1.0,
+            type=float,
+            help=("exponent for class weights"),
+        )
+        parser.add_argument(
+            "--weight-mode",
+            default="custom",
+            choices=["custom", "uniform", "dataset-prior"],
+            help=("method used to define the class weights"),
+        )
+
         parser.add_argument(
             "--num-hard-prototypes",
             type=int,
@@ -394,12 +441,12 @@ def add_class_args(parser, prefix=None):
         )
 
         parser.add_argument(
-            "--length-column",
+            "--length-name",
             default="duration",
             help="which column in the segment table indicates the duration of the segment",
         )
         parser.add_argument(
-            "--class-column",
+            "--class-name",
             default="class_id",
             help="which column in the segment table indicates the class of the segment",
         )

diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py
index a971f8ce..2f5cc610 100644
--- a/hyperion/torch/data/seg_chunk_sampler.py
+++ b/hyperion/torch/data/seg_chunk_sampler.py
@@ -11,6 +11,7 @@
 import pandas as pd
 
 import torch
+from ...utils.segment_set import SegmentSet
 from .hyp_sampler import HypSampler
 from .seg_sampler import SegSampler
 import torch.distributed as dist
@@ -23,10 +24,10 @@ def __init__(
         min_chunk_length,
         max_chunk_length=None,
         base_sampler=SegSampler,
-        length_column="duration",
+        length_name="duration",
         shuffle=False,
         seed=1234,
-        **base_kwargs
+        **base_kwargs,
     ):
         super().__init__(shuffle=shuffle, seed=seed)
@@ -37,15 +38,17 @@ def __init__(
         )
         self.avg_chunk_length = (max_chunk_length + min_chunk_length) / 2
         self.chunk_set = None
-        self.length_column = length_column
+        self.length_name = length_name
         self.chunk_sampler = base_sampler
+        if "subbase_sampler" in base_kwargs:
+            base_kwargs["base_sampler"] = base_kwargs.pop("subbase_sampler")
+
         self.base_kwargs = base_kwargs
         self.base_kwargs["seed"]
= seed self.base_kwargs["shuffle"] = shuffle - if "subbase_sampler" in base_kwargs: - base_kwargs["base_sampler"] = base_kwargs.pop("subbase_sampler") self.__iter__() + self.avg_batch_size = self._seg_sampler.avg_batch_size def __len__(self): return len(self._seg_sampler) @@ -78,7 +81,7 @@ def get_random_duration(self): def _create_chunks(self): chunks = [] - for id, len in zip(self.seg_set["id"], self.seg_set[self.length_column]): + for id, len in zip(self.seg_set["id"], self.seg_set[self.length_name]): if len < self.min_chunk_length: # discard too short sequences continue @@ -88,51 +91,46 @@ def _create_chunks(self): start = 0 for i in range(num_chunks - 1): dur = self.get_random_duration() - chunk = (id, start, dur) + chunk = (f"{id}-{i}", id, start, dur) chunks.append(chunk) start += dur # special treatment for last chunk we get from the recording remainder = len - start + chunk_id = f"{id}-{num_chunks - 1}" if remainder > self.max_chunk_length: # here we discard part of the end - chunk = (id, start, self.max_chunk_length) + chunk = (chunk_id, id, start, self.max_chunk_length) elif remainder < self.min_chunk_length: # here we overlap with second last chunk - chunk = (id, len - self.min_chunk_length, self.min_chunk_length) + chunk = ( + chunk_id, + id, + len - self.min_chunk_length, + self.min_chunk_length, + ) else: # here the last chunk is what it is left - chunk = (id, start, remainder) + chunk = (chunk_id, id, start, remainder) chunks.append(chunk) - self.chunk_set = pd.DataFrame( - chunks, columns=["id", "chunk_start", self.length_column] + chunk_set = pd.DataFrame( + chunks, columns=["id", "seg_id", "chunk_start", self.length_name] ) + self.chunk_set = SegmentSet(chunk_set) def __iter__(self): super().__iter__() self._create_chunks() - self._seg_sampler = SegSampler(self.chunk_set, self._base_kwargs) + self._seg_sampler = SegSampler(self.chunk_set, **self.base_kwargs) self._seg_sampler.set_epoch(self.epoch) self._seg_sampler.__iter__() return self def __next__(self): - return next(self._seg_sampler) - # if self.batch == self._len: - # raise StopIteration - - # start = (self.batch -1)*self.batch_size - # chunks = self.chunks[start:start+self.batch_size] - - # if self.batch == 0: - # logging.info("batch 0 chunks=%s", str(chunks[:10])) - - # self.batch +=1 - # return chunks @staticmethod def filter_args(**kwargs): @@ -140,7 +138,7 @@ def filter_args(**kwargs): valid_args = ( "min_chunk_length", "max_chunk_length", - "length_column", + "length_name", "shuffle", "seed", ) diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 6802cc8e..73319dca 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -20,7 +20,7 @@ def __init__( min_batch_size=1, max_batch_size=None, max_batch_length=None, - length_column="duration", + length_name="duration", shuffle=False, drop_last=False, seed=1234, @@ -31,31 +31,31 @@ def __init__( self.max_batch_size = max_batch_size self.max_batch_length = max_batch_length self.var_batch_size = max_batch_length is not None - self.length_column = length_column + self.length_name = length_name if self.var_batch_size: avg_batch_size = max_batch_length / torch.mean( - self.seg_set[self.length_column] + self.seg_set[self.length_name] ) else: avg_batch_size = min_batch_size - len = len(self.seg_set) / avg_batch_size / self.world_size + self.avg_batch_size = avg_batch_size + + num_batches = len(self.seg_set) / avg_batch_size / self.world_size if drop_last: - self._len = int(len) + self._len = 
int(num_batches) else: - self._len = int(math.ceil(len)) + self._len = int(math.ceil(num_batches)) self._permutation = None - @property - def seg_set(self): - return self.dataset - def __len__(self): return self._len def _shuffle_segs(self): - self._permutation = torch.randperm(len(self.seg_set), generator=self.rng) + self._permutation = torch.randperm( + len(self.seg_set), generator=self.rng + ).numpy() def __iter__(self): super().__iter__() @@ -95,25 +95,31 @@ def __next__(self): assert len(idxs) > self.min_batch_size else: - stop = min(self.start + self.min_batch_size, len(self.seg_set)) + stop = min( + self.start + self.world_size * self.min_batch_size, len(self.seg_set) + ) if self.shuffle: - idx = self._permutation[self.start : stop] + idx = self._permutation[self.start : stop : self.world_size] else: - idx = slice(self.start, stop) - self.start + idx = slice(self.start, stop, self.world_size) + + self.start += self.world_size * self.min_batch_size - seg_ids = self.seg_set.iloc[idx].id + if "chunk_start" in self.seg_set: + chunks = self.seg_set.iloc[idx] + seg_ids = [ + (id, s, d) + for id, s, d in zip( + chunks.seg_id, chunks.chunk_start, chunks[self.length_name] + ) + ] + else: + seg_ids = self.seg_set.iloc[idx].id if self.batch == 0: logging.info("batch 0 chunks=%s", str(seg_ids[:10])) self.batch += 1 - if "chunk_start" in self.seg_set: - chunks = self.seg_set.loc[ - seg_ids, ["chunk_start", self.length_column] - ].values - return [(id, chunk[0], chunk[1]) for id, chunk in zip(seg_ids, chunks)] - return seg_ids @staticmethod @@ -123,7 +129,7 @@ def filter_args(**kwargs): "min_batch_size", "max_batch_size", "max_batch_length", - "length_column", + "length_name", "shuffle", "drop_last", "seed", @@ -181,7 +187,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--length-column", + "--length-name", default="duration", help="which column in the segment table indicates the duration of the file", ) diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index e3ba84f8..f09095e6 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -56,7 +56,11 @@ def create( sampler_kwargs.update(base_sampler_kwargs) if sampler_type in ["class_weighted_random_seg_chunk_sampler"]: - sampler_kwargs["class_info"] = dataset.class_info + try: + class_name = sampler_kwargs["class_name"] + except: + class_name = "class_id" + sampler_kwargs["class_info"] = dataset.class_info[class_name] logging.info(f"sampler-args={sampler_kwargs}") @@ -77,8 +81,8 @@ def filter_args(**kwargs): "num_segs_per_class", "num_chunks_per_seg", "num_hard_prototypes", - "class_column", - "length_column", + "class_name", + "length_name", "iters_per_epoch", "batch_size", "shuffle", @@ -115,12 +119,6 @@ def add_class_args(parser, prefix=None): help=("minimum length of the segment chunks"), ) - parser.add_argument( - "--min-chunk-length", - type=float, - default=4.0, - help=("minimum length of the segment chunks"), - ) parser.add_argument( "--max-chunk-length", type=float, @@ -211,12 +209,12 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--length-column", + "--length-name", default="duration", help="which column in the segment table indicates the duration of the segment", ) parser.add_argument( - "--class-column", + "--class-name", default="class_id", help="which column in the segment table indicates the class of the segment", ) diff --git a/hyperion/torch/lr_schedulers/factory.py 
b/hyperion/torch/lr_schedulers/factory.py
index 1a542bf2..3fef6e93 100644
--- a/hyperion/torch/lr_schedulers/factory.py
+++ b/hyperion/torch/lr_schedulers/factory.py
@@ -133,7 +133,7 @@ def create(
                 update_lr_on_opt_step=update_lr_on_opt_step,
             )
 
-        if lrsch_type == "cos_lr":
+        if lrsch_type == "triangular":
             return TriangularLR(
                 optimizer,
                 t,
@@ -251,7 +251,7 @@ def add_class_args(parser, prefix=None):
         )
         parser.add_argument(
             "--gamma",
-            default=1 / 100,
+            default=1.0,
             type=float,
             help=("LR decay rate for each restart in cos/triangular lr"),
         )

diff --git a/hyperion/torch/lr_schedulers/triangular_lr.py b/hyperion/torch/lr_schedulers/triangular_lr.py
index c2b66c42..f2578e1d 100644
--- a/hyperion/torch/lr_schedulers/triangular_lr.py
+++ b/hyperion/torch/lr_schedulers/triangular_lr.py
@@ -84,9 +84,9 @@ def get_lr(self, step):
         )
 
         alpha = self.gamma ** self.num_restarts
-        x = math.abs(2 * x / self.T - 1)
+        x = abs(2 * x / self.T - 1)
         return [
-            eta_min + (alpha * eta_max - eta_min) * math.max(0, 1 - x)
+            eta_min + (alpha * eta_max - eta_min) * max(0, 1 - x)
             for eta_max, eta_min in zip(self.base_lrs, self.min_lrs)
         ]

diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py
index 4e29dab5..5f573904 100644
--- a/hyperion/torch/trainers/torch_trainer.py
+++ b/hyperion/torch/trainers/torch_trainer.py
@@ -207,6 +207,17 @@ def __init__(
             self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs
         )
 
+    def set_epoch(self, data_loader):
+        try:
+            data_loader.dataset.set_epoch(self.cur_epoch)
+        except AttributeError:
+            logging.warning("dataset doesn't have set_epoch member function")
+
+        try:
+            data_loader.batch_sampler.set_epoch(self.cur_epoch)
+        except AttributeError:
+            logging.warning("sampler doesn't have set_epoch member function")
+
     def fit(self, train_data, val_data=None):
         """Training function, it performs the training and validation epochs
 
@@ -223,7 +234,7 @@ def fit(self, train_data, val_data=None):
         val_logs = {}
         self.loggers.on_train_begin(epochs=self.epochs)
         for epoch in range(self.cur_epoch, self.epochs):
-
+            self.set_epoch(train_data)
             self.loggers.on_epoch_begin(epoch, batches=len(train_data))
             if self.lr_scheduler is not None:
                 # this is needed by cosine scheduler
@@ -232,6 +243,7 @@ def fit(self, train_data, val_data=None):
 
             logs = self.train_epoch(train_data)
             if val_data is not None:
+                self.set_epoch(val_data)
                 val_logs = self.validation_epoch(val_data)
                 logs.update(val_logs)
 
@@ -262,7 +274,6 @@ def fit(self, train_data, val_data=None):
             self.save_swa_model(logs)
 
     def set_train_mode(self):
-        # self.model.train_mode = self.train_mode
         self.model.set_train_mode(self.train_mode)
 
     def train_epoch(self, data_loader):

diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py
index 2aed18c1..ff98c7c5 100644
--- a/hyperion/utils/class_info.py
+++ b/hyperion/utils/class_info.py
@@ -2,6 +2,10 @@
  Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
 
 from .info_table import InfoTable
 
@@ -13,16 +17,57 @@ def __init__(self, df):
             self.add_class_idx()
 
         if "weights" not in self.df:
-            self.add_equal_weights()
+            self.set_uniform_weights()
         else:
-            self.df['weights'] /= self.df['weigths'].sum()
+            self.df["weights"] /= self.df["weights"].sum()
 
     def add_class_idx(self):
         self.df["class_idx"] = [i for i in range(len(self.df))]
 
-    def add_equal_weights(self):
+    def set_uniform_weights(self):
         self.df["weights"] = 1 / len(self.df)
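 
+    # The weight helpers below re-normalize self.df["weights"] after every
+    # update, so that the class weights always sum to 1.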
+ def set_weights(self, weights): + self.df["weights"] = weights / weights.sum() + + def exp_weights(self, x): + weights = self.df["weights"] ** x + self.set_weights(weights) + + def set_zero_weight(self, id): + self.df.loc[id, "weights"] = 0 + self.df["weights"] /= self.df["weights"].sum() + @property def weights(self, id): return self.df.loc[id, "weights"] + + @property + def num_classes(self): + return self.df["class_idx"].values.max() + 1 + + @classmethod + def load(cls, file_path, sep=None): + """Loads utt2info list from text file. + + Args: + file_path: File to read the list. + sep: Separator between the key and file_path in the text file. + dtype: Dictionary with the dtypes of each column. + Returns: + Utt2Info object + """ + file_path = Path(file_path) + ext = file_path.suffix + if ext == "": + # if no extension we load as kaldi utt2spk file + df = pd.read_csv( + file_path, + sep=" ", + header=None, + names=["id"], + dtype={"id": np.str}, + ) + return cls(df) + + return super().load(file_path, sep) diff --git a/hyperion/utils/feature_set.py b/hyperion/utils/feature_set.py index 456cf99b..986a21b9 100644 --- a/hyperion/utils/feature_set.py +++ b/hyperion/utils/feature_set.py @@ -26,7 +26,7 @@ def save(self, file_path, sep=None): file_path = Path(file_path) file_path.parent.mkdir(parents=True, exist_ok=True) ext = file_path.suffix - if ext == "": + if ext == ".scp": # if no extension we save as kaldi feats.scp file from .scp_list import SCPList @@ -55,7 +55,7 @@ def load(cls, file_path, sep=None): """ file_path = Path(file_path) ext = file_path.suffix - if ext == "": + if ext == ".scp": # if no extension we load as kaldi feats.scp file from .scp_list import SCPList diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 247001c0..25632941 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -24,7 +24,7 @@ class InfoTable(object): def __init__(self, df): self.df = df - assert "id" in df + assert "id" in df, f"info_table={df}" self.df.set_index("id", drop=False, inplace=True) def copy(self): @@ -59,6 +59,10 @@ def loc(self): def __getitem__(self): return self.df.__getitem__ + @property + def __setitem__(self): + return self.df.__setitem__ + @property def __contains__(self): return self.df.__contains__ @@ -73,7 +77,7 @@ def save(self, file_path, sep=None): file_path = Path(file_path) file_path.parent.mkdir(parents=True, exist_ok=True) ext = file_path.suffix - if ext == "": + if ext in ["", ".scp"]: # if no extension we save as kaldi utt2spk file self.df.to_csv(file_path, sep=" ", header=False, index=False) return @@ -96,7 +100,7 @@ def load(cls, file_path, sep=None): """ file_path = Path(file_path) ext = file_path.suffix - if ext == "": + if ext in ["", ".scp"]: # if no extension we load as kaldi utt2spk file df = pd.read_csv( file_path, @@ -105,11 +109,12 @@ def load(cls, file_path, sep=None): names=["id", "class_id"], dtype={"id": np.str, "class_id": np.str}, ) + else: + if sep is None: + sep = "\t" if ".tsv" in ext else "," - if sep is None: - sep = "\t" if ".tsv" in ext else "," + df = pd.read_csv(file_path, sep=sep) - df = pd.read_csv(file_path, sep=sep) return cls(df) def sort(self, column="id", ascending=True): diff --git a/hyperion/utils/recording_set.py b/hyperion/utils/recording_set.py index ad6f65f6..9695cef3 100644 --- a/hyperion/utils/recording_set.py +++ b/hyperion/utils/recording_set.py @@ -26,7 +26,7 @@ def save(self, file_path, sep=None): file_path = Path(file_path) file_path.parent.mkdir(parents=True, exist_ok=True) ext 
= file_path.suffix - if ext == "": + if ext == ".scp": # if no extension we save as kaldi feats.scp file from .scp_list import SCPList @@ -48,7 +48,7 @@ def load(cls, file_path, sep=None): """ file_path = Path(file_path) ext = file_path.suffix - if ext == "": + if ext == ".scp": # if no extension we load as kaldi feats.scp file from .scp_list import SCPList diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index 4332dea3..f9da69fa 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -9,3 +9,9 @@ class SegmentSet(InfoTable): def __init__(self, df): super().__init__(df) + + def recording_ids(self, ids): + if "recording_id" in self.df: + return self.df.loc[ids, "recording_id"] + + return ids diff --git a/hyperion/utils/trial_ndx.py b/hyperion/utils/trial_ndx.py index 783f39c4..58a36aa7 100644 --- a/hyperion/utils/trial_ndx.py +++ b/hyperion/utils/trial_ndx.py @@ -320,7 +320,7 @@ def __ne__(self, other): def __cmp__(self, other): """Comparison operator""" - if self.__eq__(oher): + if self.__eq__(other): return 0 return 1 diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index 19e17190..164b39df 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -402,7 +402,7 @@ def __ne__(self, other): def __cmp__(self, other): """Comparison operator""" - if self.__eq__(oher): + if self.__eq__(other): return 0 return 1 From 80a24987a71047f7195b806f0920752b6795f04c Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sun, 4 Sep 2022 17:53:57 -0400 Subject: [PATCH 024/154] modified sparse scores --- hyperion/utils/sparse_trial_scores.py | 41 +++++++++++++++------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/hyperion/utils/sparse_trial_scores.py b/hyperion/utils/sparse_trial_scores.py index d269c629..0684c57e 100644 --- a/hyperion/utils/sparse_trial_scores.py +++ b/hyperion/utils/sparse_trial_scores.py @@ -34,7 +34,7 @@ class SparseTrialScores(TrialScores): """ def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None): - super(SparseTrialScores, self).__init__(model_set, seg_set, scores, score_mask) + super().__init__(model_set, seg_set, scores, score_mask) def save_h5(self, file_path): raise NotImplementedError() @@ -123,7 +123,7 @@ def validate(self): assert len(np.unique(self.seg_set)) == len(self.seg_set) if self.scores is None: self.scores = sparse.csr_matrix( - (len(model_set), len(seg_set)), dtype=float_cpu() + (len(self.model_set), len(self.seg_set)), dtype=float_cpu() ) else: assert self.scores.shape == (len(self.model_set), len(self.seg_set)) @@ -165,21 +165,6 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): if raise_missing: raise Exception("some scores were not computed") - # model_set = self.model_set[mod_idx] - # set_set = self.seg_set[seg_idx] - # ix = np.ix_(mod_idx, seg_idx) - - # logging.info('hola1') - # new_src = [[self.scores[r,c], i, j] for i,r in enumerate(mod_idx) for j,c in enumerate(seg_idx) if self.score_mask[r,c]] - # logging.info('hola2') - # new_data = np.array([r[0] for r in new_src], dtype=float_cpu()) - # new_row = np.array([r[1] for r in new_src], dtype=np.int) - # new_col = np.array([r[2] for r in new_src], dtype=np.int) - # logging.info('hola3') - # shape = (len(model_set), len(seg_set)) - # scores = sparse.coo_matrix((new_data, (new_row, new_col)), shape=shape).tocsr() - # score_mask = sparse.coo_matrix((np.ones(new_data.shape, dtype=np.bool), (new_row, new_col)), shape=shape).tocsr() - num_mod = 
len(model_set)
         num_seg = len(seg_set)
         shape = (num_mod, num_seg)
@@ -288,9 +273,29 @@ def get_tar_non(self, key):
             non = np.array(scr.scores[non_mask])[0]
         return tar, non
 
+    def get_valid_scores(self, ndx=None):
+        if ndx is None:
+            scr = self
+        else:
+            scr = self.align_with_ndx(ndx)
+
+        scores = np.array(scr.scores[scr.score_mask])[0]
+        return scores
+
+    def set_valid_scores(self, scores, ndx=None):
+        if ndx is not None:
+            scr = self.align_with_ndx(ndx)
+            self.model_set = scr.model_set
+            self.seg_set = scr.seg_set
+            self.scores = scr.scores
+            self.score_mask = scr.score_mask
+
+        self.scores[self.score_mask] = scores
+
     @classmethod
     def from_trial_scores(cls, scr):
-        scores = sparse.csr_matrix(scr.scores)
+        scores = scr.scores * scr.score_mask
+        scores = sparse.csr_matrix(scores)
         score_mask = sparse.csr_matrix(scr.score_mask)
         scores.eliminate_zeros()
         score_mask.eliminate_zeros()

From 9f1f73b30ed615ebfcc56c809106dd138c60e7c3 Mon Sep 17 00:00:00 2001
From: neillu23
Date: Mon, 3 Oct 2022 19:45:43 -0400
Subject: [PATCH 025/154] software structure for transducer

---
 hyperion/bin/train_wav2vec2transducer.py      | 199 +++++++++
 hyperion/bin/train_wav2vec2xvector.py         |  23 +-
 hyperion/torch/data/__init__.py               |   2 +-
 hyperion/torch/models/__init__.py             |   3 +
 hyperion/torch/models/transducer/__init__.py  |   7 +
 .../torch/models/transducer/transducer.py     | 126 ++++++
 .../torch/models/wav2transducer/__init__.py   |   7 +
 .../wav2transducer/hf_wav2transducer.py       | 387 ++++++++++++++++++
 .../wav2transducer/hf_wav2vec2_transducer.py  | 387 ++++++++++++++++++
 hyperion/torch/trainers/__init__.py           |   3 +
 hyperion/torch/trainers/transducer_trainer.py | 160 ++++++++
 11 files changed, 1294 insertions(+), 10 deletions(-)
 create mode 100755 hyperion/bin/train_wav2vec2transducer.py
 create mode 100644 hyperion/torch/models/transducer/__init__.py
 create mode 100644 hyperion/torch/models/transducer/transducer.py
 create mode 100644 hyperion/torch/models/wav2transducer/__init__.py
 create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2transducer.py
 create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py
 create mode 100644 hyperion/torch/trainers/transducer_trainer.py

diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py
new file mode 100755
index 00000000..7f6fffef
--- /dev/null
+++ b/hyperion/bin/train_wav2vec2transducer.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import sys
+import os
+from pathlib import Path
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import time
+import logging
+import multiprocessing
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+
+from hyperion.hyp_defs import config_logger, set_float_cpu
+from hyperion.torch.utils import ddp
+from hyperion.torch.trainers import TransducerTrainer as Trainer
+from hyperion.torch.data import AudioDataset as AD
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import HFWav2Vec2Transducer
+
+model_dict = {
+    "hf_wav2vec2transducer": HFWav2Vec2Transducer,
+}
+
+
+def init_data(partition, rank, num_gpus, **kwargs):
+
+    kwargs = kwargs["data"][partition]
+    ad_args = AD.filter_args(**kwargs["dataset"])
+    sampler_args = Sampler.filter_args(**kwargs["sampler"])
+    if rank == 0:
+        logging.info("{} audio dataset
args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + ad_args["is_val"] = partition == "val" + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = Sampler(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_model(num_classes, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["num_classes"] = num_classes + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(train_loader.dataset.num_classes, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + Sampler.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + Sampler.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument("--data.train.dataset.class_file", action=ActionParser(parser=data_parser)) + parser.add_argument("--data.val.dataset.class_file", action=ActionParser(parser=data_parser)) + parser.add_argument("--data.train.data_loader.num_workers", action=ActionParser(parser=data_parser)) + parser.add_argument("--data.val.data_loader.num_workers", action=ActionParser(parser=data_parser)) + # parser.link_arguments( + # "data.train.dataset.class_file", "data.val.dataset.class_file" + # ) + # parser.link_arguments( + # "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + # ) 
+ # parser.link_arguments( + # "data.train.sampler.batch_size", "data.val.sampler.batch_size" + # ) + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index e92b9a1a..08913605 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -143,15 +143,20 @@ def make_parser(model_class): data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.link_arguments( - "data.train.dataset.class_file", "data.val.dataset.class_file" - ) - parser.link_arguments( - "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" - ) - parser.link_arguments( - "data.train.sampler.batch_size", "data.val.sampler.batch_size" - ) + + parser.add_argument("--data.train.dataset.class_file", action=ActionParser(parser=data_parser)) + parser.add_argument("--data.val.dataset.class_file", action=ActionParser(parser=data_parser)) + parser.add_argument("--data.train.data_loader.num_workers", action=ActionParser(parser=data_parser)) + parser.add_argument("--data.val.data_loader.num_workers", action=ActionParser(parser=data_parser)) + # parser.link_arguments( + # "data.train.dataset.class_file", "data.val.dataset.class_file" + # ) + # parser.link_arguments( + # "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + # ) + # parser.link_arguments( + # "data.train.sampler.batch_size", "data.val.sampler.batch_size" + # ) model_class.add_class_args(parser, prefix="model") Trainer.add_class_args( diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index 752cf0f5..aebcfe8a 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -10,5 +10,5 @@ from .audio_dataset import AudioDataset # samplers -# from .weighted_seq_sampler import ClassWeightedSeqSampler +from .weighted_seq_sampler import ClassWeightedSeqSampler from .seg_sampler_factory import SegSamplerFactory diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index e953f58c..5a1368e2 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -18,5 +18,8 @@ 
 HFWavLM2ResNet1dXVector,
 )
+
+from .wav2transducer import HFWav2Vec2Transducer
+
 from .vae.vae import VAE
 from .vae.vq_vae import VQVAE
diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py
new file mode 100644
index 00000000..20372911
--- /dev/null
+++ b/hyperion/torch/models/transducer/__init__.py
@@ -0,0 +1,7 @@
+"""
+ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+
+from .transducer import Transducer
diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py
new file mode 100644
index 00000000..8305248c
--- /dev/null
+++ b/hyperion/torch/models/transducer/transducer.py
@@ -0,0 +1,126 @@
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Note we use `rnnt_loss` from torchaudio, which exists only in
+torchaudio >= v0.10.0, which in turn requires torch >= v1.10.0.
+"""
+import k2
+import torch
+import torch.nn as nn
+import torchaudio
+import torchaudio.functional
+from encoder_interface import EncoderInterface
+
+from icefall.utils import add_sos
+
+
+class Transducer(nn.Module):
+    """It implements https://arxiv.org/pdf/1211.3711.pdf
+    "Sequence Transduction with Recurrent Neural Networks"
+    """
+
+    def __init__(
+        self,
+        encoder: EncoderInterface,
+        decoder: nn.Module,
+        joiner: nn.Module,
+    ):
+        """
+        Args:
+          encoder:
+            It is the transcription network in the paper. It accepts
+            two inputs: `x` of (N, T, C) and `x_lens` of shape (N,).
+            It returns two tensors: `logits` of shape (N, T, C) and
+            `logit_lens` of shape (N,).
+          decoder:
+            It is the prediction network in the paper. Its input shape
+            is (N, U) and its output shape is (N, U, C). It should contain
+            one attribute: `blank_id`.
+          joiner:
+            It has two inputs with shapes: (N, T, C) and (N, U, C). Its
+            output shape is (N, T, U, C). Note that its output contains
+            unnormalized probs, i.e., not processed by log-softmax.
+        """
+        super().__init__()
+        assert isinstance(encoder, EncoderInterface)
+        assert hasattr(decoder, "blank_id")
+
+        self.encoder = encoder
+        self.decoder = decoder
+        self.joiner = joiner
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        y: k2.RaggedTensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+          x:
+            A 3-D tensor of shape (N, T, C).
+          x_lens:
+            A 1-D tensor of shape (N,). It contains the number of frames in `x`
+            before padding.
+          y:
+            A ragged tensor with 2 axes [utt][label]. It contains labels of each
+            utterance.
+        Returns:
+          Returns the transducer loss.
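(The forward body below computes this loss with `torchaudio.functional.rnnt_loss`. A minimal, self-contained sketch of that call, with all shapes and sizes invented for illustration:)

    import torch
    import torchaudio.functional as taf

    N, T, U, C = 2, 10, 3, 5  # batch, frames, label length, vocab size (toy values)
    logits = torch.randn(N, T, U + 1, C, requires_grad=True)  # joiner-style output
    targets = torch.randint(1, C, (N, U), dtype=torch.int32)  # labels, no blanks
    logit_lengths = torch.full((N,), T, dtype=torch.int32)
    target_lengths = torch.full((N,), U, dtype=torch.int32)
    loss = taf.rnnt_loss(
        logits, targets, logit_lengths, target_lengths, blank=0, reduction="sum"
    )
    loss.backward()  # scalar loss; gradients flow back to the logits

(The real forward() additionally derives the target lengths from the ragged label tensor, prepends the blank symbol as SOS for the prediction network, and zero-pads the targets.)
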
+ """ + assert x.ndim == 3, x.shape + assert x_lens.ndim == 1, x_lens.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lens.size(0) == y.dim0 + + encoder_out, x_lens = self.encoder(x, x_lens) + assert torch.all(x_lens > 0) + + # Now for the decoder, i.e., the prediction network + row_splits = y.shape.row_splits(1) + y_lens = row_splits[1:] - row_splits[:-1] + + blank_id = self.decoder.blank_id + sos_y = add_sos(y, sos_id=blank_id) + + sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) + sos_y_padded = sos_y_padded.to(torch.int64) + + decoder_out, _ = self.decoder(sos_y_padded) + + logits = self.joiner(encoder_out, decoder_out) + + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + + assert hasattr(torchaudio.functional, "rnnt_loss"), ( + f"Current torchaudio version: {torchaudio.__version__}\n" + "Please install a version >= 0.10.0" + ) + + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded, + logit_lengths=x_lens, + target_lengths=y_lens, + blank=blank_id, + reduction="sum", + ) + + return loss diff --git a/hyperion/torch/models/wav2transducer/__init__.py b/hyperion/torch/models/wav2transducer/__init__.py new file mode 100644 index 00000000..5346bc78 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/__init__.py @@ -0,0 +1,7 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" + +from .hf_wav2vec2_transducer import HFWav2Vec2Transducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py new file mode 100644 index 00000000..3fed7143 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py @@ -0,0 +1,387 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import contextlib +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +# import torch.nn.functional as nnf + +from ...torch_model import TorchModel +from ...utils import remove_silence + + +class HFWav2XVector(TorchModel): + """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + xvector: x-vector model object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
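(For intuition, the default "weighted-avg" fusion described above is a softmax-normalized, learned convex combination of the hidden layers. A standalone sketch, with layer count and dimensions assumed for illustration:)

    import torch
    import torch.nn as nn

    L, B, T, D = 13, 2, 50, 768  # layers, batch, frames, hidden size (assumed)
    hid_feats = [torch.randn(B, T, D) for _ in range(L)]
    feat_fuser = nn.Parameter(torch.zeros(L))      # zeros -> uniform weights at init
    stacked = torch.stack(hid_feats, dim=-1)       # (B, T, D, L)
    weights = nn.functional.softmax(feat_fuser, dim=-1)
    fused = torch.sum(stacked * weights, dim=-1)   # (B, T, D)
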
+ """ + + def __init__( + self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): + + super().__init__() + self.hf_feats = hf_feats + self.xvector = xvector + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. + + Returns: + Tensor of fused features (batch, channels, time) + """ + if len(hid_feats) == 1: + # There is only one layer of features + return hid_feats[0] + + hid_feats = hid_feats[self.feat_fusion_start :] + if self.feat_fusion_method == "weighted-avg": + hid_feats = torch.stack(hid_feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(hid_feats * norm_weights, dim=-1) + elif self.feat_fusion_method == "linear": + hid_feats = torch.stack(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + elif self.feat_fusion_method == "cat": + hid_feats = torch.cat(hid_feats, dim=-1) + feats = self.feat_fuser(hid_feats) + elif self.feat_fusion_method == "last": + feats = hid_feats[-1] + + return feats + + def compute_prototype_affinity(self): + return self.xvector.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.xvector.update_loss_margin(epoch) + + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + ): + self.xvector.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) + + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + feats = self._fuse_hid_feats(hid_feats) + else: + hid_feats = None + feats = hf_output["last_hidden_state"] + + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. 
+            hid_feats = [
+                f.transpose(1, 2)
+                for i, f in enumerate(hid_feats)
+                if i in return_feat_layers
+            ]
+        else:
+            hid_feats = None
+
+        return feats, hid_feats, feat_lengths
+
+    def forward(
+        self,
+        x,
+        x_lengths=None,
+        y=None,
+        return_feat_layers=None,
+        return_enc_layers=None,
+        return_classif_layers=None,
+        return_logits=True,
+    ):
+        """Forward function. It returns the logit posteriors of the classes.
+        It can also return the hidden representations in the wav2vec feature extractor,
+        the x-vector encoder and the
+        classification head. In this case the output variable is a dictionary.
+
+        Args:
+          x: input features tensor with shape=(batch, in_feats, time)
+          x_lengths: time lengths of the features with shape=(batch,)
+          y: target classes torch.long tensor with shape=(batch,)
+          return_feat_layers: list of integers indicating, which wav2vec layers
+            we should return. If None, no wav2vec layers are returned.
+          return_enc_layers: list of integers indicating, which encoder layers
+            we should return. If None, no encoder layers are returned.
+          return_classif_layers: list of integers indicating, which classification head layers
+            we should return. If None, no head layers are returned.
+          return_logits: if True, it adds the logits to the output dictionary.
+        Returns:
+          Tensor with class logits with shape=(batch, num_classes) or
+          Dictionary with "logits", "h_enc" (list of hidden encoder layers),
+          "h_classif" (list of hidden classification head layers), "h_feats" (wav2vec features)
+        """
+        feats, hid_feats, feat_lengths = self.forward_feats(
+            x, x_lengths, return_feat_layers
+        )
+        output = self.xvector(
+            feats,
+            feat_lengths,
+            y,
+            return_enc_layers=return_enc_layers,
+            return_classif_layers=return_classif_layers,
+            return_logits=return_logits,
+        )
+
+        if not return_feat_layers:
+            return output
+
+        if not isinstance(output, dict):
+            # if the xvector just returned the logits, we put them into a dictionary
+            # to append the hid feats later.
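(Usage-wise, the method returns a plain logits tensor unless hidden layers are requested, in which case everything comes back in a dictionary. A hedged sketch, assuming `model` is a constructed instance and `x`, `x_lengths` are batched waveforms:)

    out = model(x, x_lengths, return_feat_layers=[2, 5, 8])
    logits = out["logits"]    # (batch, num_classes)
    w2v_hid = out["h_feats"]  # list of (batch, channels, time) wav2vec layers
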
+ output["logits"] = output + + output["h_feats"] = hid_feats + return output + + def extract_embed( + self, + x, + x_lengths=None, + vad_samples=None, + hf_chunk_length=0, + xvec_chunk_length=0, + embed_layer=None, + detach_chunks=False, + ): + + if vad_samples is not None: + x, x_lengths = remove_silence(x, x_lengths) + + feats, _, feat_lengths = self.forward_feats( + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.xvector.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_feat_fuser() + self.freeze_hf_feats() + self.xvector.freeze_preembed_layers() + elif mode in ["ft-xvector", "ft-xvector-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode == "ft-embed-affine": + self.hf_feats.train() + self.xvector._train("ft-embed_affine") + elif train_mode in [ + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.xvector._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "xvector", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + + hf_cfg = self.hf_feats.get_config() + xvec_cfg = self.xvector.get_config() + del hf_cfg["class_name"] + del xvec_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "xvector": xvec_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, xvector): + logging.info("changing hf wav2xvector config") + self.hf_feats.change_config(**hf_feats) + self.xvector.change_config(**xvector) + + @staticmethod + def 
add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=( + "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers" + ), + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + help="xvector options", + ) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py new file mode 100644 index 00000000..e83dcb8c --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py @@ -0,0 +1,387 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import contextlib +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +# import torch.nn.functional as nnf + +from ...torch_model import TorchModel +from ...utils import remove_silence + + +class HFWav2Vec2Transducer(HFWav2Transducer): + """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + xvector: x-vector model object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): + + super().__init__() + self.hf_feats = hf_feats + self.xvector = xvector + self.feat_fusion_start = feat_fusion_start + self.feat_fusion_method = feat_fusion_method + self._hf_context = contextlib.nullcontext() + self._make_fuser() + + def _make_fuser(self): + if self.feat_fusion_method == "last": + self.feat_fuser = None + return + + num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + layer_dim = self.hf_feats.hidden_size + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + elif self.feat_fusion_method == "linear": + self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + elif self.feat_fusion_method == "cat": + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + + def _fuse_hid_feats(self, hid_feats): + """Fuses the hidden features from the Wav2Vec model. + + Args: + hid_feats: list of hidden features Tensors from Wav2Vec model. 
+
+        Returns:
+          Tensor of fused features (batch, channels, time)
+        """
+        if len(hid_feats) == 1:
+            # There is only one layer of features
+            return hid_feats[0]
+
+        hid_feats = hid_feats[self.feat_fusion_start :]
+        if self.feat_fusion_method == "weighted-avg":
+            hid_feats = torch.stack(hid_feats, dim=-1)
+            norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1)
+            feats = torch.sum(hid_feats * norm_weights, dim=-1)
+        elif self.feat_fusion_method == "linear":
+            hid_feats = torch.stack(hid_feats, dim=-1)
+            feats = self.feat_fuser(hid_feats).squeeze(dim=-1)
+        elif self.feat_fusion_method == "cat":
+            hid_feats = torch.cat(hid_feats, dim=-1)
+            feats = self.feat_fuser(hid_feats)
+        elif self.feat_fusion_method == "last":
+            feats = hid_feats[-1]
+
+        return feats
+
+    def compute_prototype_affinity(self):
+        return self.xvector.compute_prototype_affinity()
+
+    def update_loss_margin(self, epoch):
+        """Updates the value of the margin in AAM/AM-softmax losses
+        given the epoch number
+
+        Args:
+          epoch: epoch which is about to start
+        """
+        self.xvector.update_loss_margin(epoch)
+
+    def rebuild_output_layer(
+        self,
+        num_classes=None,
+        loss_type="arc-softmax",
+        cos_scale=64,
+        margin=0.3,
+        margin_warmup_epochs=10,
+        intertop_k=5,
+        intertop_margin=0.0,
+        num_subcenters=2,
+    ):
+        self.xvector.rebuild_output_layer(
+            num_classes=num_classes,
+            loss_type=loss_type,
+            cos_scale=cos_scale,
+            margin=margin,
+            margin_warmup_epochs=margin_warmup_epochs,
+            intertop_k=intertop_k,
+            intertop_margin=intertop_margin,
+            num_subcenters=num_subcenters,
+        )
+
+    def forward_feats(
+        self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False
+    ):
+        return_hid_states = (
+            False
+            if return_feat_layers is None and self.feat_fusion_method == "last"
+            else True
+        )
+        with self._hf_context:
+            hf_output = self.hf_feats(
+                x,
+                x_lengths,
+                return_hid_states=return_hid_states,
+                chunk_length=chunk_length,
+                detach_chunks=detach_chunks,
+            )
+        feat_lengths = hf_output["hidden_states_lengths"]
+        if return_hid_states:
+            hid_feats = hf_output["hidden_states"]
+            feats = self._fuse_hid_feats(hid_feats)
+        else:
+            hid_feats = None
+            feats = hf_output["last_hidden_state"]
+
+        feats = feats.transpose(1, 2)
+        if return_feat_layers is not None:
+            # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time)
+            # as the hidden features of the x-vector encoder.
+            hid_feats = [
+                f.transpose(1, 2)
+                for i, f in enumerate(hid_feats)
+                if i in return_feat_layers
+            ]
+        else:
+            hid_feats = None
+
+        return feats, hid_feats, feat_lengths
+
+    def forward(
+        self,
+        x,
+        x_lengths=None,
+        y=None,
+        return_feat_layers=None,
+        return_enc_layers=None,
+        return_classif_layers=None,
+        return_logits=True,
+    ):
+        """Forward function. It returns the logit posteriors of the classes.
+        It can also return the hidden representations in the wav2vec feature extractor,
+        the x-vector encoder and the
+        classification head. In this case the output variable is a dictionary.
+
+        Args:
+          x: input features tensor with shape=(batch, in_feats, time)
+          x_lengths: time lengths of the features with shape=(batch,)
+          y: target classes torch.long tensor with shape=(batch,)
+          return_feat_layers: list of integers indicating, which wav2vec layers
+            we should return. If None, no wav2vec layers are returned.
+          return_enc_layers: list of integers indicating, which encoder layers
+            we should return. If None, no encoder layers are returned.
+          return_classif_layers: list of integers indicating, which classification head layers
+            we should return. If None, no head layers are returned.
+          return_logits: if True, it adds the logits to the output dictionary.
+        Returns:
+          Tensor with class logits with shape=(batch, num_classes) or
+          Dictionary with "logits", "h_enc" (list of hidden encoder layers),
+          "h_classif" (list of hidden classification head layers), "h_feats" (wav2vec features)
+        """
+        feats, hid_feats, feat_lengths = self.forward_feats(
+            x, x_lengths, return_feat_layers
+        )
+        output = self.xvector(
+            feats,
+            feat_lengths,
+            y,
+            return_enc_layers=return_enc_layers,
+            return_classif_layers=return_classif_layers,
+            return_logits=return_logits,
+        )
+
+        if not return_feat_layers:
+            return output
+
+        if not isinstance(output, dict):
+            # if the xvector just returned the logits, we put them into a dictionary
+            # to append the hid feats later.
+            output["logits"] = output
+
+        output["h_feats"] = hid_feats
+        return output
+
+    def extract_embed(
+        self,
+        x,
+        x_lengths=None,
+        vad_samples=None,
+        hf_chunk_length=0,
+        xvec_chunk_length=0,
+        embed_layer=None,
+        detach_chunks=False,
+    ):
+
+        if vad_samples is not None:
+            x, x_lengths = remove_silence(x, x_lengths)
+
+        feats, _, feat_lengths = self.forward_feats(
+            x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks
+        )
+        xvec_chunk_length = int(
+            xvec_chunk_length
+            * self.hf_feats.sample_frequency
+            * feats.size(-1)
+            // x.size(-1)
+        )
+        return self.xvector.extract_embed(
+            feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks
+        )
+
+    def freeze_feat_fuser(self):
+        if self.feat_fuser is None:
+            return
+
+        if self.feat_fusion_method == "weighted-avg":
+            self.feat_fuser.requires_grad = False
+            return
+
+        for param in self.feat_fuser.parameters():
+            param.requires_grad = False
+
+    def freeze_hf_feats(self):
+        self.hf_feats.freeze()
+
+    def freeze_hf_feature_encoder(self):
+        self.hf_feats.freeze_feature_encoder()
+
+    def set_train_mode(self, mode):
+        if mode == self._train_mode:
+            return
+
+        if mode == "full":
+            self.unfreeze()
+        elif mode == "frozen":
+            self.freeze()
+        elif mode == "ft-embed-affine":
+            self.unfreeze()
+            self.freeze_feat_fuser()
+            self.freeze_hf_feats()
+            self.xvector.freeze_preembed_layers()
+        elif mode in ["ft-xvector", "ft-xvector-nograd"]:
+            self.unfreeze()
+            self.freeze_hf_feats()
+            self.freeze_feat_fuser()
+        elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]:
+            self.unfreeze()
+            self.freeze_hf_feats()
+        elif mode == "hf-feat-extractor-frozen":
+            self.unfreeze()
+            self.freeze_hf_feature_encoder()
+        else:
+            raise ValueError(f"invalid train_mode={mode}")
+
+        logging.info("train mode set to %s", mode)
+
+        if "nograd" in mode:
+            logging.info("using torch.no_grad for hf_feats")
+            self._hf_context = torch.no_grad()
+        else:
+            self._hf_context = contextlib.nullcontext()
+
+        self._train_mode = mode
+
+    def _train(self, train_mode: str):
+
+        if train_mode in ["full", "frozen"]:
+            super()._train(train_mode)
+        elif train_mode == "ft-embed-affine":
+            self.hf_feats.train()
+            self.xvector._train("ft-embed_affine")
+        elif train_mode in [
+            "ft-xvector",
+            "hf-feats-frozen",
+            "ft-xvector-nograd",
+            "hf-feats-frozen-nograd",
+            "hf-feat-extractor-frozen",
+        ]:
+            self.hf_feats.train()
+            self.xvector._train("full")
+        else:
+            raise ValueError(f"invalid train_mode={train_mode}")
+
+    @staticmethod
+    def valid_train_modes():
+        return [
+            "full",
+            "frozen",
+            "ft-embed-affine",
+            "ft-xvector",
+            "hf-feats-frozen",
+            "ft-xvector-nograd",
"hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "xvector", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + + hf_cfg = self.hf_feats.get_config() + xvec_cfg = self.xvector.get_config() + del hf_cfg["class_name"] + del xvec_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "xvector": xvec_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, xvector): + logging.info("changing hf wav2xvector config") + self.hf_feats.change_config(**hf_feats) + self.xvector.change_config(**xvector) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=( + "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers" + ), + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + help="xvector options", + ) diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index 8fef7df5..593cfa1f 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -5,6 +5,9 @@ from .torch_trainer import TorchTrainer + +from .transducer_trainer import TransducerTrainer + from .xvector_trainer import XVectorTrainer from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg from .xvector_adv_trainer import XVectorAdvTrainer diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py new file mode 100644 index 00000000..a67da181 --- /dev/null +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -0,0 +1,160 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import os +from collections import OrderedDict as ODict + +import logging + +import torch +import torch.nn as nn + +from ..utils import MetricAcc +from .torch_trainer import TorchTrainer +from torch.distributed.elastic.multiprocessing.errors import record + + +class TransducerTrainer(TorchTrainer): + """Trainer to train ASR style models. + + Attributes: + model: ASR model object. + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. 
+ ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp + """ + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="full", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + if loss is None: + # TODO: Check and Modify loss + loss = nn.CrossEntropyLoss() + super().__init__( + model, + loss, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + eff_batch_size=eff_batch_size, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. 
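(The epoch loop below interleaves gradient accumulation with the optimizer steps. Reduced to its skeleton — `model`, `loss_fn`, `optimizer`, and `loader` are assumed to exist:)

    grad_acc_steps = 4  # illustrative value
    for batch, (data, target) in enumerate(loader):
        if batch % grad_acc_steps == 0:
            optimizer.zero_grad()
        # scale so the accumulated gradients match one large-batch step
        loss = loss_fn(model(data), target) / grad_acc_steps
        loss.backward()
        if (batch + 1) % grad_acc_steps == 0:
            optimizer.step()
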
+ """ + + self.model.update_loss_margin(self.cur_epoch) + + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + for batch, (data, target) in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + # TODO: Check and Modify data, target + data, target = data.to(self.device), target.to(self.device) + batch_size = data.shape[0] + + with self.amp_autocast(): + output = self.model(data, y=target) + loss = self.loss(output, target).mean() / self.grad_acc_steps + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + self.update_model() + + batch_metrics["loss"] = loss.item() * self.grad_acc_steps + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + logs["lr"] = self._get_lr() + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) + + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() + return logs From b8ffac41c60b13872025caf7e6dfd9dfe1d87347 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Mon, 3 Oct 2022 19:55:35 -0400 Subject: [PATCH 026/154] Librispeech recognition recipe --- egs/librispeech/v1/cmd.sh | 28 +++++ egs/librispeech/v1/conf/fbank80_16k.yaml | 7 ++ .../conf/wav2vec2base960h_ecapatdnn512x2.yaml | 37 ++++++ egs/librispeech/v1/datapath.sh | 22 ++++ egs/librispeech/v1/feats | 1 + ...nn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh | 55 +++++++++ egs/librispeech/v1/hyp_utils | 1 + egs/librispeech/v1/local/data_prep.sh | 85 ++++++++++++++ egs/librispeech/v1/path.sh | 5 + egs/librispeech/v1/run_001_prepare_data.sh | 52 ++++++++ egs/librispeech/v1/run_003_compute_fbank.sh | 67 +++++++++++ egs/librispeech/v1/run_004_compute_bpe.sh | 84 +++++++++++++ .../v1/run_010_prepare_asr_train_data.sh | 42 +++++++ egs/librispeech/v1/run_011_train_asr.sh | 111 ++++++++++++++++++ egs/librispeech/v1/run_030_inference.sh | 74 ++++++++++++ egs/librispeech/v1/run_040_eval_wer.sh | 103 ++++++++++++++++ egs/librispeech/v1/steps | 1 + egs/librispeech/v1/steps_be | 1 + egs/librispeech/v1/steps_pyfe | 1 + egs/librispeech/v1/utils | 1 + egs/librispeech/v1/xvectors | 1 + 21 files changed, 779 insertions(+) create mode 100755 egs/librispeech/v1/cmd.sh create mode 100644 egs/librispeech/v1/conf/fbank80_16k.yaml create mode 100644 egs/librispeech/v1/conf/wav2vec2base960h_ecapatdnn512x2.yaml create mode 100644 egs/librispeech/v1/datapath.sh create mode 120000 egs/librispeech/v1/feats create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh create mode 120000 egs/librispeech/v1/hyp_utils create mode 100755 egs/librispeech/v1/local/data_prep.sh create mode 100755 egs/librispeech/v1/path.sh create mode 100755 egs/librispeech/v1/run_001_prepare_data.sh create mode 100755 egs/librispeech/v1/run_003_compute_fbank.sh create mode 100755 egs/librispeech/v1/run_004_compute_bpe.sh create mode 100755 egs/librispeech/v1/run_010_prepare_asr_train_data.sh create mode 100755 egs/librispeech/v1/run_011_train_asr.sh create mode 100755 egs/librispeech/v1/run_030_inference.sh create mode 100755 egs/librispeech/v1/run_040_eval_wer.sh create mode 120000 egs/librispeech/v1/steps create mode 120000 egs/librispeech/v1/steps_be 
create mode 120000 egs/librispeech/v1/steps_pyfe create mode 120000 egs/librispeech/v1/utils create mode 120000 egs/librispeech/v1/xvectors diff --git a/egs/librispeech/v1/cmd.sh b/egs/librispeech/v1/cmd.sh new file mode 100755 index 00000000..71f3bae0 --- /dev/null +++ b/egs/librispeech/v1/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +if [ "$(hostname -d)" == "cm.gemini" ];then + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/librispeech/v1/conf/fbank80_16k.yaml b/egs/librispeech/v1/conf/fbank80_16k.yaml new file mode 100644 index 00000000..88bae69e --- /dev/null +++ b/egs/librispeech/v1/conf/fbank80_16k.yaml @@ -0,0 +1,7 @@ +sample_frequency: 16000 +frame_length: 25 +low_freq: 20 +high_freq: 7600 +num_filters: 80 +snip_edges: false +use_energy: false diff --git a/egs/librispeech/v1/conf/wav2vec2base960h_ecapatdnn512x2.yaml b/egs/librispeech/v1/conf/wav2vec2base960h_ecapatdnn512x2.yaml new file mode 100644 index 00000000..85964372 --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2base960h_ecapatdnn512x2.yaml @@ -0,0 +1,37 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +feat_fusion_start: 2 +feat_fusion_method: weighted-avg diff --git a/egs/librispeech/v1/datapath.sh b/egs/librispeech/v1/datapath.sh new file mode 100644 index 00000000..4c7987ef --- /dev/null +++ b/egs/librispeech/v1/datapath.sh @@ -0,0 +1,22 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + 
librispeech_root=/export/corpora5/LibriSpeech + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + # voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + # voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + # musan_root=/expscratch/dgromero/corpora-open/musan + echo "Put your database paths here" + exit 1 +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/librispeech/v1/feats b/egs/librispeech/v1/feats new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/librispeech/v1/feats @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh new file mode 100644 index 00000000..942fb336 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh @@ -0,0 +1,55 @@ +# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 + +# hugging face model +hf_model_name=wav2vec2base + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +dropout=0 +embed_dim=256 +lr=0.05 +s=30 +margin_warmup=20 +margin=0.3 +nnet_num_epochs=70 + + +lr=0.001 +#lr=0.005 +xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml +xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --model conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2" + +nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v12 #v1 + +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0060.pth +nnet=$nnet_dir/swa_model_ep0076.pth +nnet=$nnet_dir/model_ep0060.pth +nnet=$nnet_dir/model_ep0030.pth +nnet=$nnet_dir/model_ep0040.pth +nnet=$nnet_dir/model_ep0020.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=6 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/librispeech/v1/hyp_utils b/egs/librispeech/v1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/librispeech/v1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/librispeech/v1/local/data_prep.sh b/egs/librispeech/v1/local/data_prep.sh new file mode 100755 index 00000000..c903d45b --- /dev/null +++ b/egs/librispeech/v1/local/data_prep.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +# Copyright 2014 Vassil Panayotov +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" + exit 1 +fi + +src=$1 +dst=$2 + +# all utterances are FLAC compressed +if ! which flac >&/dev/null; then + echo "Please install 'flac' on ALL worker nodes!" + exit 1 +fi + +spk_file=$src/../SPEAKERS.TXT + +mkdir -p $dst || exit 1 + +[ ! 
-d $src ] && echo "$0: no such directory $src" && exit 1 +[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1 + + +wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp +trans=$dst/text; [[ -f "$trans" ]] && rm $trans +utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk +spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender + +for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do + reader=$(basename $reader_dir) + if ! [ $reader -eq $reader ]; then # not integer. + echo "$0: unexpected subdirectory name $reader" + exit 1 + fi + + reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}') + if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then + echo "Unexpected gender: '$reader_gender'" + exit 1 + fi + + for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do + chapter=$(basename $chapter_dir) + if ! [ "$chapter" -eq "$chapter" ]; then + echo "$0: unexpected chapter-subdirectory name $chapter" + exit 1 + fi + + find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ + awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 + + chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt + [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 + cat $chapter_trans >>$trans + + # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered + # to be a different speaker. This is done for simplicity and because we want + # e.g. the CMVN to be calculated per-chapter + awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ + <$chapter_trans >>$utt2spk || exit 1 + + # reader -> gender map (again using per-chapter granularity) + echo "${reader}-${chapter} $reader_gender" >>$spk2gender + done +done + +spk2utt=$dst/spk2utt +utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 + +ntrans=$(wc -l <$trans) +nutt2spk=$(wc -l <$utt2spk) +! [ "$ntrans" -eq "$nutt2spk" ] && \ + echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1 + +utils/validate_data_dir.sh --no-feats $dst || exit 1 + +echo "$0: successfully prepared data in $dst" + +exit 0 diff --git a/egs/librispeech/v1/path.sh b/egs/librispeech/v1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/librispeech/v1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/librispeech/v1/run_001_prepare_data.sh b/egs/librispeech/v1/run_001_prepare_data.sh new file mode 100755 index 00000000..c6c15692 --- /dev/null +++ b/egs/librispeech/v1/run_001_prepare_data.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. ./datapath.sh + + +nj=6 + +mkdir -p data + + +if [ ${stage} -le 1 ]; then + ### Task dependent. You have to make data the following preparation part by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 0: Data preparation" + for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do + # use underscore-separated names in data directories. 
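(The prep script above ends by inverting utt2spk into spk2utt with Kaldi's utt2spk_to_spk2utt.pl; the conversion is just a map inversion, sketched here in Python with an illustrative path:)

    from collections import defaultdict

    spk2utt = defaultdict(list)
    with open("data/dev_clean/utt2spk") as f:  # path is illustrative
        for line in f:
            utt, spk = line.split()
            spk2utt[spk].append(utt)
    with open("data/dev_clean/spk2utt", "w") as f:
        for spk, utts in sorted(spk2utt.items()):
            f.write(f"{spk} {' '.join(utts)}\n")
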
+ local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_} + done +fi + +# if [ $stage -le 1 ]; then +# echo "Stage 1: Prepare LibriSpeech manifest" +# # We assume that you have downloaded the LibriSpeech corpus +# # to $librispeech_root +# mkdir -p data/manifests +# if [ ! -e data/manifests/.librispeech.done ]; then +# lhotse prepare librispeech -j $nj $librispeech_root data/manifests +# touch data/manifests/.librispeech.done +# fi +# fi + +# if [ $stage -le 2 ]; then +# echo "Stage 2: Prepare musan manifest" +# # We assume that you have downloaded the musan corpus +# # to $musan_root +# mkdir -p data/manifests +# if [ ! -e data/manifests/.musan.done ]; then +# lhotse prepare musan $musan_root data/manifests +# touch data/manifests/.musan.done +# fi +# fi diff --git a/egs/librispeech/v1/run_003_compute_fbank.sh b/egs/librispeech/v1/run_003_compute_fbank.sh new file mode 100755 index 00000000..0f5966a8 --- /dev/null +++ b/egs/librispeech/v1/run_003_compute_fbank.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +storage_name=$(date +'%m_%d_%H_%M') +fbankdir=`pwd`/exp/fbank + +stage=1 +config_file=default_config.sh +feat_vers="numpy" + +. parse_options.sh || exit 1; + +if [ "$feat_vers" == "kaldi" ];then + make_fbank=steps/make_fbank.sh + fbank_cfg=conf/fbank80_16k.conf +else + fbank_cfg=conf/fbank80_16k.yaml + if [ "$feat_vers" == "numpy" ];then + make_fbank=steps_pyfe/make_fbank.sh + else + make_fbank=steps_pyfe/make_torch_fbank.sh + fi +fi + + +# Make filterbanks +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $fbankdir/storage ]; then + dir_name=$USER/hyp-data/librispeech/v1/$storage_name/fbank/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $fbankdir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $fbankdir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $fbankdir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $fbankdir/storage + else + echo "we don't distribute data between multiple machines" + fi + fi +fi + +if [ $stage -le 2 ];then + for name in dev_clean test_clean dev_other test_other train_clean_100 train_clean_360 train_other_500; + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? $num_spk:40)) + $make_fbank --write-utt2num-frames true --fbank-config $fbank_cfg --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_fbank/$name $fbankdir + utils/fix_data_dir.sh data/${name} + done + +fi + diff --git a/egs/librispeech/v1/run_004_compute_bpe.sh b/egs/librispeech/v1/run_004_compute_bpe.sh new file mode 100755 index 00000000..571205a8 --- /dev/null +++ b/egs/librispeech/v1/run_004_compute_bpe.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +storage_name=$(date +'%m_%d_%H_%M') + + +dl_dir=$PWD/download + +stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + + +if [ $stage -le 1 ]; then + echo "Stage 1: Download LM" + mkdir -p $dl_dir/lm + if [ ! 
-e $dl_dir/lm/.done ]; then + ./local/download_lm.py --out-dir=$dl_dir/lm + touch $dl_dir/lm/.done + fi +fi + +if [ $stage -le 2 ]; then + echo "Stage 2: Prepare phone based lang" + lang_dir=data/lang_phone + mkdir -p $lang_dir + + (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) | + cat - $dl_dir/lm/librispeech-lexicon.txt | + sort | uniq > $lang_dir/lexicon.txt + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang.py --lang-dir $lang_dir + fi +fi + + +if [ $stage -le 3 ]; then + echo "Stage 3: Prepare BPE based lang" + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + mkdir -p $lang_dir + # We reuse words.txt from phone based lexicon + # so that the two can share G.pt later. + cp data/lang_phone/words.txt $lang_dir + + if [ ! -f $lang_dir/transcript_words.txt ]; then + echo "Generate data for BPE training" + files=$( + find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" + find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" + find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" + ) + for f in ${files[@]}; do + cat $f | cut -d " " -f 2- + done > $lang_dir/transcript_words.txt + fi + + if [ ! -f $lang_dir/bpe.model ]; then + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size \ + --transcript $lang_dir/transcript_words.txt + fi + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang_bpe.py --lang-dir $lang_dir + + echo "Validating $lang_dir/lexicon.txt" + ./local/validate_bpe_lexicon.py \ + --lexicon $lang_dir/lexicon.txt \ + --bpe-model $lang_dir/bpe.model + fi + done +fi diff --git a/egs/librispeech/v1/run_010_prepare_asr_train_data.sh b/egs/librispeech/v1/run_010_prepare_asr_train_data.sh new file mode 100755 index 00000000..5936fbf4 --- /dev/null +++ b/egs/librispeech/v1/run_010_prepare_asr_train_data.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + +if [ $stage -le 2 ]; then + # This script preprocess audio for x-vector training + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil + hyp_utils/kaldi/utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 3 ]; then + # Now, we remove files with less than 4s + hyp_utils/remove_short_audios.sh --min-len 4 data/${nnet_data}_proc_audio_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 4 utterances. + hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 4 ]; then + # Prepare train and validation lists for x-vectors + local/make_train_lists_sup_embed_with_augm.sh \ + data/${nnet_data}_proc_audio_no_sil \ + data/${nnet_data}_proc_audio_no_sil/lists_xvec +fi + +exit diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh new file mode 100755 index 00000000..dc4e1dee --- /dev/null +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. 
./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +list_dir=data/${nnet_data}_proc_audio_no_sil + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_s1_dir $args \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir $args \ + --num-gpus $ngpu \ + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir $args \ + --num-gpus $ngpu \ + +fi + diff --git a/egs/librispeech/v1/run_030_inference.sh b/egs/librispeech/v1/run_030_inference.sh new file mode 100755 index 00000000..67122f85 --- /dev/null +++ b/egs/librispeech/v1/run_030_inference.sh @@ -0,0 +1,74 
@@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=2 +config_file=default_config.sh +use_gpu=false +nnet_stage=3 +hf_chunk_length=120 #seconds +xvec_chunk_length=120 #seconds +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 6G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [ $stage -le 1 ]; then + # Extract xvectors for training LDA/PLDA + for name in voxceleb2cat_train + do + if [ $plda_num_augs -eq 0 ]; then + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + $nnet data/${name} \ + $xvector_dir/${name} + else + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ + --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --aug-config $plda_aug_config --num-augs $plda_num_augs \ + $nnet data/${name} \ + $xvector_dir/${name}_augx${plda_num_augs} \ + data/${name}_augx${plda_num_augs} + fi + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + for name in voxceleb1_test + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj $nj ${xvec_args} \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + +exit diff --git a/egs/librispeech/v1/run_040_eval_wer.sh b/egs/librispeech/v1/run_040_eval_wer.sh new file mode 100755 index 00000000..ac561344 --- /dev/null +++ b/egs/librispeech/v1/run_040_eval_wer.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +# By default we evaluate the nnet after finetuning stage 3 and only with cosine scoring +stage=3 +config_file=default_config.sh +nnet_stage=3 + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name/${be_name} +score_plda_dir=$score_dir/plda +score_cosine_dir=exp/scores/$nnet_name/cosine + +if [ $stage -le 1 ]; then + echo "Train PLDA on Voxceleb2" + steps_be/train_be_v1.sh \ + --cmd "$train_cmd" \ + --lda_dim $lda_dim \ + --plda_type $plda_type \ + --y_dim $plda_y_dim --z_dim $plda_z_dim \ + $xvector_dir/$plda_data/xvector.scp \ + data/$plda_data \ + $be_dir & + + wait +fi + + +if [ $stage -le 2 ];then + + echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" + steps_be/eval_be_v1.sh \ + --cmd "$train_cmd" --plda_type $plda_type \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $be_dir/lda_lnorm.h5 \ + $be_dir/plda.h5 \ + $score_plda_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + +score_plda_dir=$score_cosine_dir + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_plda_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + + +exit + diff --git a/egs/librispeech/v1/steps b/egs/librispeech/v1/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/librispeech/v1/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/librispeech/v1/steps_be b/egs/librispeech/v1/steps_be new file mode 120000 index 00000000..b2098c2a --- /dev/null +++ b/egs/librispeech/v1/steps_be @@ -0,0 +1 @@ +../v1/steps_be \ No newline at end of file diff --git a/egs/librispeech/v1/steps_pyfe b/egs/librispeech/v1/steps_pyfe new file mode 120000 index 00000000..7b9d122a --- /dev/null +++ b/egs/librispeech/v1/steps_pyfe @@ -0,0 +1 @@ +hyp_utils/feats \ No newline at end of file diff --git a/egs/librispeech/v1/utils b/egs/librispeech/v1/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/librispeech/v1/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/librispeech/v1/xvectors b/egs/librispeech/v1/xvectors new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/librispeech/v1/xvectors @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file From 09354a45d0afa6de79099193db1d1a23fe2d70eb Mon Sep 17 00:00:00 2001 From: neillu23 Date: Mon, 3 Oct 2022 20:25:26 -0400 Subject: [PATCH 027/154] discard the change of train_wav2vec2xvector --- hyperion/bin/train_wav2vec2xvector.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 08913605..8c30faaf 100755 --- 
a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -144,19 +144,15 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.train.dataset.class_file", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.val.dataset.class_file", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.train.data_loader.num_workers", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.val.data_loader.num_workers", action=ActionParser(parser=data_parser)) - # parser.link_arguments( - # "data.train.dataset.class_file", "data.val.dataset.class_file" - # ) - # parser.link_arguments( - # "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" - # ) - # parser.link_arguments( - # "data.train.sampler.batch_size", "data.val.sampler.batch_size" - # ) + parser.link_arguments( + "data.train.dataset.class_file", "data.val.dataset.class_file" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + parser.link_arguments( + "data.train.sampler.batch_size", "data.val.sampler.batch_size" + ) model_class.add_class_args(parser, prefix="model") Trainer.add_class_args( From ac99960641cbb8e021fdf8bbfbd3b45512621235 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 5 Oct 2022 08:51:26 -0400 Subject: [PATCH 028/154] improved utils --- egs/voxceleb/v1/steps_be/eval-be-v1.py | 4 +- egs/voxceleb/v1/steps_be/eval-be-v2.py | 4 +- .../preprocess_audios_for_nnet_train.sh | 2 +- .../data/class_weighted_seg_chunk_sampler.py | 40 ++++++++++++++----- hyperion/torch/data/seg_sampler_factory.py | 24 +++++++++++ hyperion/utils/__init__.py | 7 +++- hyperion/utils/class_info.py | 2 +- hyperion/utils/feature_set.py | 6 ++- hyperion/utils/info_table.py | 21 ++++++++++ hyperion/utils/recording_set.py | 2 +- 10 files changed, 95 insertions(+), 17 deletions(-) diff --git a/egs/voxceleb/v1/steps_be/eval-be-v1.py b/egs/voxceleb/v1/steps_be/eval-be-v1.py index f7d26390..da77f8f3 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-v1.py +++ b/egs/voxceleb/v1/steps_be/eval-be-v1.py @@ -20,6 +20,7 @@ import numpy as np from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.list_utils import ismember from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F @@ -78,7 +79,8 @@ def eval_plda( if num_model_parts > 1 or num_seg_parts > 1: score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx) logging.info("saving scores to %s" % (score_file)) - s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask) + f, loc = ismember(enroll, ndx.model_set) + s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask[loc]) s.save_txt(score_file) diff --git a/egs/voxceleb/v1/steps_be/eval-be-v2.py b/egs/voxceleb/v1/steps_be/eval-be-v2.py index d5cd6a55..413ca313 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-v2.py +++ b/egs/voxceleb/v1/steps_be/eval-be-v2.py @@ -18,6 +18,7 @@ import numpy as np from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.list_utils import ismember from hyperion.utils import TrialNdx, TrialScores from hyperion.utils.math import cosine_scoring from hyperion.helpers import TrialDataReader as TDR @@ -72,7 +73,8 @@ def eval_plda( if num_model_parts > 1 or num_seg_parts > 1: 
score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx) logging.info("saving scores to %s" % (score_file)) - s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask) + f, loc = ismember(enroll, ndx.model_set) + s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask[loc]) s.save_txt(score_file) diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 1a1fd7ad..7c35b234 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -45,7 +45,7 @@ mkdir -p $data_out output_dir=$(utils/make_absolute.sh $dir) if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $output_dir/storage ]; then - dir_name=$USER/hyp-data/xvectors/$storage_name/xvector_audio/storage + dir_name=$USER/hyp-data/$storage_name/xvector_audio/storage if [ "$nodes" == "b0" ];then utils/create_split_dir.pl \ utils/create_split_dir.pl \ diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 91d592bc..7dfb8a35 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -29,6 +29,7 @@ def __init__( num_chunks_per_seg=1, weight_exponent=1.0, weight_mode="custom", + seg_weight_mode="uniform", num_hard_prototypes=0, affinity_matrix=None, class_name="class_id", @@ -76,6 +77,7 @@ def __init__( self.weight_exponent = weight_exponent self.weight_mode = weight_mode + self.seg_weight_mode = seg_weight_mode self.num_hard_prototypes = num_hard_prototypes self.batch = 0 @@ -164,7 +166,7 @@ def _gather_class_info(self): def _set_class_weights(self): if self.weight_mode == "uniform": self.class_info.set_uniform_weights() - elif self.weight_mode == "dataset-prior": + elif self.weight_mode == "data-prior": weights = self.class_info["total_duration"].values self.class_info.set_weights(self, weights) @@ -264,12 +266,24 @@ def _sample_segs(self, class_ids, chunk_length): # sample num_segs_per_class random segments if len(seg_ids_c) == 0: print(chunk_length, c, self.class_info.loc[c], flush=True) - sel_seg_idx_c = torch.randint( - low=0, - high=len(seg_ids_c), - size=(self.num_segs_per_class,), - generator=self.rng, - ).numpy() + if self.seg_weight_mode == "uniform": + sel_seg_idx_c = torch.randint( + low=0, + high=len(seg_ids_c), + size=(self.num_segs_per_class,), + generator=self.rng, + ).numpy() + elif self.seg_weight_mode == "data-prior": + weights = self.seg_set.loc[seg_mask, self.length_name].values + weights /= weights.sum() + sel_seg_idx_c = torch.multinomial( + torch.from_numpy(weights), + num_samples=self.num_segs_per_class, + replacement=True, + generator=self.rng, + ).numpy() + else: + raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) sel_seg_ids_c = list(seg_ids_c[sel_seg_idx_c]) seg_ids.extend(sel_seg_ids_c) @@ -319,6 +333,7 @@ def filter_args(**kwargs): "num_chunks_per_seg", "weight_exponent", "weight_mode", + "seg_weight_mode", "num_hard_prototypes", "class_name", "length_name", @@ -416,8 +431,15 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--weight-mode", default="custom", - choices=["custom", "uniform", "dataset-prior"], - help=("exponent for class weights"), + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--seg-weight-mode", + default="uniform", + choices=["uniform", "data-prior"], + 
help=("method to sample segments given a class"), ) parser.add_argument( diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index f09095e6..3093a532 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -80,6 +80,9 @@ def filter_args(**kwargs): "num_chunks_per_seg_epoch", "num_segs_per_class", "num_chunks_per_seg", + "weight_mode", + "weight_exponent", + "seg_weight_mode", "num_hard_prototypes", "class_name", "length_name", @@ -183,6 +186,27 @@ def add_class_args(parser, prefix=None): default=1, help=("number of chunks per segment in batch"), ) + + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--seg-weight-mode", + default="uniform", + choices=["uniform", "data-prior"], + help=("method to sample segments given a class"), + ) + parser.add_argument( "--num-hard-prototypes", type=int, diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index bfd81028..251361ae 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -10,7 +10,12 @@ from .sparse_trial_scores import SparseTrialScores from .scp_list import SCPList from .utt2info import Utt2Info -from .ext_segment_list import ExtSegmentList + +# from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList from .kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix from .rttm import RTTM +from .recording_set import RecordingSet +from .class_info import ClassInfo +from .segment_set import SegmentSet +from .feature_set import FeatureSet diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index ff98c7c5..f1eaf665 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -19,7 +19,7 @@ def __init__(self, df): if "weights" not in self.df: self.set_uniform_weights() else: - self.df["weights"] /= self.df["weigths"].sum() + self.df["weights"] /= self.df["weights"].sum() def add_class_idx(self): self.df["class_idx"] = [i for i in range(len(self.df))] diff --git a/hyperion/utils/feature_set.py b/hyperion/utils/feature_set.py index 986a21b9..2b2f0aaf 100644 --- a/hyperion/utils/feature_set.py +++ b/hyperion/utils/feature_set.py @@ -30,14 +30,16 @@ def save(self, file_path, sep=None): # if no extension we save as kaldi feats.scp file from .scp_list import SCPList - offset = self.df["storage_byte"] if "storage_byte" is not None else None + offset = self.df["storage_byte"] if "storage_byte" in self.df else None range = None if "start" and "num_frames" in self.df: range = [ np.array([s, n], dtype=np.int64) for s, n in self.df[["start", "num_frames"]] ] - scp = SCPList(self.df["id"], self.df["storage_path"], offset, range) + scp = SCPList( + self.df["id"].values, self.df["storage_path"].values, offset, range + ) scp.save(file_path) return diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 25632941..61033d16 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -385,4 +385,25 @@ def shuffle(self, seed=1024, rng=None): self.df = self.df.iloc[index] return index + def set_index(self, keys, inplace=True): + if inplace: + self.df.set_index(keys, drop=False, inplace=True) + return + + df = self.df.set_index(keys, drop=False, inplace=False) + return type(self)(df) 
+
+    def reset_index(self):
+        self.df.set_index("id", drop=False, inplace=True)
+
+    def get_loc(self, keys):
+        loc = self.df.index.get_loc(keys)
+        if isinstance(loc, int):
+            return loc
+        elif isinstance(loc, np.ndarray) and loc.dtype == bool:
+            return np.nonzero(loc)[0]
+        else:
+            return list(range(loc.start, loc.stop, loc.step))
+
\ No newline at end of file
diff --git a/hyperion/utils/recording_set.py b/hyperion/utils/recording_set.py
index 9695cef3..8346315c 100644
--- a/hyperion/utils/recording_set.py
+++ b/hyperion/utils/recording_set.py
@@ -30,7 +30,7 @@ def save(self, file_path, sep=None):
         # if no extension we save as kaldi feats.scp file
         from .scp_list import SCPList
 
-        scp = SCPList(self.df["id"], self.df["storage_path"])
+        scp = SCPList(self.df["id"].values, self.df["storage_path"].values)
         scp.save(file_path)
         return
 

From 42daf5cee831f9b117bc53d40244f19b4a721891 Mon Sep 17 00:00:00 2001
From: neillu23
Date: Thu, 13 Oct 2022 22:51:05 -0400
Subject: [PATCH 029/154] update data preparation

---
 egs/librispeech/v1/local/prepare_lang.py      | 413 ++++++++
 egs/librispeech/v1/local/prepare_lang_bpe.py  | 261 +++++
 egs/librispeech/v1/local/train_bpe_model.py   |  97 ++
 hyperion/torch/models/__init__.py             |   2 +-
 .../models/transducer/encoder_interface.py    |  43 +
 .../torch/models/transducer/transducer.py     |   4 +-
 .../wav2transducer/hf_wav2transducer.py       |   2 +-
 .../wav2transducer/hf_wav2vec2_transducer.py  |   1 +
 hyperion/utils/lexicon.py                     | 277 +++++
 hyperion/utils/utils.py                       | 978 ++++++++++++++++++
 10 files changed, 2074 insertions(+), 4 deletions(-)
 create mode 100755 egs/librispeech/v1/local/prepare_lang.py
 create mode 100755 egs/librispeech/v1/local/prepare_lang_bpe.py
 create mode 100755 egs/librispeech/v1/local/train_bpe_model.py
 create mode 100644 hyperion/torch/models/transducer/encoder_interface.py
 create mode 100644 hyperion/utils/lexicon.py
 create mode 100644 hyperion/utils/utils.py

diff --git a/egs/librispeech/v1/local/prepare_lang.py b/egs/librispeech/v1/local/prepare_lang.py
new file mode 100755
index 00000000..74e09629
--- /dev/null
+++ b/egs/librispeech/v1/local/prepare_lang.py
@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input a lexicon file "data/lang_phone/lexicon.txt"
+consisting of words and tokens (i.e., phones) and does the following:
+
+1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt
+
+2. Generate tokens.txt, the token table mapping a token to a unique integer.
+
+3. Generate words.txt, the word table mapping a word to a unique integer.
+
+4. Generate L.pt, in k2 format. It can be loaded by
+
+     d = torch.load("L.pt")
+     lexicon = k2.Fsa.from_dict(d)
+
+5. Generate L_disambig.pt, in k2 format.
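+
+Usage (an illustrative sketch; the flag is defined in get_args below)::
+
+    ./local/prepare_lang.py --lang-dir data/lang_phone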
+""" +import argparse +import math +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import k2 +import torch + +from hyperion.utils.lexicon import read_lexicon, write_lexicon +from hyperion.utils.utils import str2bool + +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--lang-dir", + type=str, + help="""Input and output directory. + It should contain a file lexicon.txt. + Generated files by this script are saved into this directory. + """, + ) + + parser.add_argument( + "--debug", + type=str2bool, + default=False, + help="""True for debugging, which will generate + a visualization of the lexicon FST. + + Caution: If your lexicon contains hundreds of thousands + of lines, please set it to False! + """, + ) + + return parser.parse_args() + + +def write_mapping(filename: str, sym2id: Dict[str, int]) -> None: + """Write a symbol to ID mapping to a file. + + Note: + No need to implement `read_mapping` as it can be done + through :func:`k2.SymbolTable.from_file`. + + Args: + filename: + Filename to save the mapping. + sym2id: + A dict mapping symbols to IDs. + Returns: + Return None. + """ + with open(filename, "w", encoding="utf-8") as f: + for sym, i in sym2id.items(): + f.write(f"{sym} {i}\n") + + +def get_tokens(lexicon: Lexicon) -> List[str]: + """Get tokens from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique tokens. + """ + ans = set() + for _, tokens in lexicon: + ans.update(tokens) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def get_words(lexicon: Lexicon) -> List[str]: + """Get words from a lexicon. + + Args: + lexicon: + It is the return value of :func:`read_lexicon`. + Returns: + Return a list of unique words. + """ + ans = set() + for word, _ in lexicon: + ans.add(word) + sorted_ans = sorted(list(ans)) + return sorted_ans + + +def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]: + """It adds pseudo-token disambiguation symbols #1, #2 and so on + at the ends of tokens to ensure that all pronunciations are different, + and that none is a prefix of another. + + See also add_lex_disambig.pl from kaldi. + + Args: + lexicon: + It is returned by :func:`read_lexicon`. + Returns: + Return a tuple with two elements: + + - The output lexicon with disambiguation symbols + - The ID of the max disambiguation symbol that appears + in the lexicon + """ + + # (1) Work out the count of each token-sequence in the + # lexicon. + count = defaultdict(int) + for _, tokens in lexicon: + count[" ".join(tokens)] += 1 + + # (2) For each left sub-sequence of each token-sequence, note down + # that it exists (for identifying prefixes of longer strings). + issubseq = defaultdict(int) + for _, tokens in lexicon: + tokens = tokens.copy() + tokens.pop() + while tokens: + issubseq[" ".join(tokens)] = 1 + tokens.pop() + + # (3) For each entry in the lexicon: + # if the token sequence is unique and is not a + # prefix of another word, no disambig symbol. + # Else output #1, or #2, #3, ... if the same token-seq + # has already been assigned a disambig symbol. 
+ ans = [] + + # We start with #1 since #0 has its own purpose + first_allowed_disambig = 1 + max_disambig = first_allowed_disambig - 1 + last_used_disambig_symbol_of = defaultdict(int) + + for word, tokens in lexicon: + tokenseq = " ".join(tokens) + assert tokenseq != "" + if issubseq[tokenseq] == 0 and count[tokenseq] == 1: + ans.append((word, tokens)) + continue + + cur_disambig = last_used_disambig_symbol_of[tokenseq] + if cur_disambig == 0: + cur_disambig = first_allowed_disambig + else: + cur_disambig += 1 + + if cur_disambig > max_disambig: + max_disambig = cur_disambig + last_used_disambig_symbol_of[tokenseq] = cur_disambig + tokenseq += f" #{cur_disambig}" + ans.append((word, tokenseq.split())) + return ans, max_disambig + + +def generate_id_map(symbols: List[str]) -> Dict[str, int]: + """Generate ID maps, i.e., map a symbol to a unique ID. + + Args: + symbols: + A list of unique symbols. + Returns: + A dict containing the mapping between symbols and IDs. + """ + return {sym: i for i, sym in enumerate(symbols)} + + +def add_self_loops( + arcs: List[List[Any]], disambig_token: int, disambig_word: int +) -> List[List[Any]]: + """Adds self-loops to states of an FST to propagate disambiguation symbols + through it. They are added on each state with non-epsilon output symbols + on at least one arc out of the state. + + See also fstaddselfloops.pl from Kaldi. One difference is that + Kaldi uses OpenFst style FSTs and it has multiple final states. + This function uses k2 style FSTs and it does not need to add self-loops + to the final state. + + The input label of a self-loop is `disambig_token`, while the output + label is `disambig_word`. + + Args: + arcs: + A list-of-list. The sublist contains + `[src_state, dest_state, label, aux_label, score]` + disambig_token: + It is the token ID of the symbol `#0`. + disambig_word: + It is the word ID of the symbol `#0`. + + Return: + Return new `arcs` containing self-loops. + """ + states_needs_self_loops = set() + for arc in arcs: + src, dst, ilabel, olabel, score = arc + if olabel != 0: + states_needs_self_loops.add(src) + + ans = [] + for s in states_needs_self_loops: + ans.append([s, s, disambig_token, disambig_word, 0]) + + return arcs + ans + + +def lexicon_to_fst( + lexicon: Lexicon, + token2id: Dict[str, int], + word2id: Dict[str, int], + sil_token: str = "SIL", + sil_prob: float = 0.5, + need_self_loops: bool = False, +) -> k2.Fsa: + """Convert a lexicon to an FST (in k2 format) with optional silence at + the beginning and end of each word. + + Args: + lexicon: + The input lexicon. See also :func:`read_lexicon` + token2id: + A dict mapping tokens to IDs. + word2id: + A dict mapping words to IDs. + sil_token: + The silence token. + sil_prob: + The probability for adding a silence at the beginning and end + of the word. + need_self_loops: + If True, add self-loop to states with non-epsilon output symbols + on at least one arc out of the state. The input label for this + self loop is `token2id["#0"]` and the output label is `word2id["#0"]`. + Returns: + Return an instance of `k2.Fsa` representing the given lexicon. + """ + assert sil_prob > 0.0 and sil_prob < 1.0 + # CAUTION: we use score, i.e, negative cost. + sil_score = math.log(sil_prob) + no_sil_score = math.log(1.0 - sil_prob) + + start_state = 0 + loop_state = 1 # words enter and leave from here + sil_state = 2 # words terminate here when followed by silence; this state + # has a silence transition to loop_state. 
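+    # Illustrative sketch of the topology built below (arcs written as
+    # input/output, weights are log-probabilities):
+    #   start --eps/eps--> loop                      score log(1 - sil_prob)
+    #   start --eps/eps--> sil --SIL/eps--> loop     score log(sil_prob)
+    # Each word's token arcs leave loop_state and end in loop_state
+    # (no trailing silence) or sil_state (trailing silence).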
+    next_state = 3  # the next un-allocated state, will be incremented as we go.
+    arcs = []
+
+    assert token2id["<eps>"] == 0
+    assert word2id["<eps>"] == 0
+
+    eps = 0
+
+    sil_token = token2id[sil_token]
+
+    arcs.append([start_state, loop_state, eps, eps, no_sil_score])
+    arcs.append([start_state, sil_state, eps, eps, sil_score])
+    arcs.append([sil_state, loop_state, sil_token, eps, 0])
+
+    for word, tokens in lexicon:
+        assert len(tokens) > 0, f"{word} has no pronunciations"
+        cur_state = loop_state
+
+        word = word2id[word]
+        tokens = [token2id[i] for i in tokens]
+
+        for i in range(len(tokens) - 1):
+            w = word if i == 0 else eps
+            arcs.append([cur_state, next_state, tokens[i], w, 0])
+
+            cur_state = next_state
+            next_state += 1
+
+        # now for the last token of this word
+        # It has two out-going arcs, one to the loop state,
+        # the other one to the sil_state.
+        i = len(tokens) - 1
+        w = word if i == 0 else eps
+        arcs.append([cur_state, loop_state, tokens[i], w, no_sil_score])
+        arcs.append([cur_state, sil_state, tokens[i], w, sil_score])
+
+    if need_self_loops:
+        disambig_token = token2id["#0"]
+        disambig_word = word2id["#0"]
+        arcs = add_self_loops(
+            arcs,
+            disambig_token=disambig_token,
+            disambig_word=disambig_word,
+        )
+
+    final_state = next_state
+    arcs.append([loop_state, final_state, -1, -1, 0])
+    arcs.append([final_state])
+
+    arcs = sorted(arcs, key=lambda arc: arc[0])
+    arcs = [[str(i) for i in arc] for arc in arcs]
+    arcs = [" ".join(arc) for arc in arcs]
+    arcs = "\n".join(arcs)
+
+    fsa = k2.Fsa.from_str(arcs, acceptor=False)
+    return fsa
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+    lexicon_filename = lang_dir / "lexicon.txt"
+    sil_token = "SIL"
+    sil_prob = 0.5
+
+    lexicon = read_lexicon(lexicon_filename)
+    tokens = get_tokens(lexicon)
+    words = get_words(lexicon)
+
+    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
+
+    for i in range(max_disambig + 1):
+        disambig = f"#{i}"
+        assert disambig not in tokens
+        tokens.append(f"#{i}")
+
+    assert "<eps>" not in tokens
+    tokens = ["<eps>"] + tokens
+
+    assert "<eps>" not in words
+    assert "#0" not in words
+    assert "<s>" not in words
+    assert "</s>" not in words
+
+    words = ["<eps>"] + words + ["#0", "<s>", "</s>"]
+
+    token2id = generate_id_map(tokens)
+    word2id = generate_id_map(words)
+
+    write_mapping(lang_dir / "tokens.txt", token2id)
+    write_mapping(lang_dir / "words.txt", word2id)
+    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
+
+    L = lexicon_to_fst(
+        lexicon,
+        token2id=token2id,
+        word2id=word2id,
+        sil_token=sil_token,
+        sil_prob=sil_prob,
+    )
+
+    L_disambig = lexicon_to_fst(
+        lexicon_disambig,
+        token2id=token2id,
+        word2id=word2id,
+        sil_token=sil_token,
+        sil_prob=sil_prob,
+        need_self_loops=True,
+    )
+    torch.save(L.as_dict(), lang_dir / "L.pt")
+    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
+
+    if args.debug:
+        labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
+        aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+        L.labels_sym = labels_sym
+        L.aux_labels_sym = aux_labels_sym
+        L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
+
+        L_disambig.labels_sym = labels_sym
+        L_disambig.aux_labels_sym = aux_labels_sym
+        L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/v1/local/prepare_lang_bpe.py b/egs/librispeech/v1/local/prepare_lang_bpe.py
new file mode 100755
index 00000000..d8cee8ed
--- /dev/null
+++ b/egs/librispeech/v1/local/prepare_lang_bpe.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
+
+"""
+
+This script takes as input `lang_dir`, which should contain::
+
+    - lang_dir/bpe.model,
+    - lang_dir/words.txt
+
+and generates the following files in the directory `lang_dir`:
+
+    - lexicon.txt
+    - lexicon_disambig.txt
+    - L.pt
+    - L_disambig.pt
+    - tokens.txt
+"""
+
+import argparse
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+from prepare_lang import (
+    Lexicon,
+    add_disambig_symbols,
+    add_self_loops,
+    write_lexicon,
+    write_mapping,
+)
+
+from hyperion.utils.utils import str2bool
+
+
+def lexicon_to_fst_no_sil(
+    lexicon: Lexicon,
+    token2id: Dict[str, int],
+    word2id: Dict[str, int],
+    need_self_loops: bool = False,
+) -> k2.Fsa:
+    """Convert a lexicon to an FST (in k2 format).
+
+    Args:
+      lexicon:
+        The input lexicon. See also :func:`read_lexicon`
+      token2id:
+        A dict mapping tokens to IDs.
+      word2id:
+        A dict mapping words to IDs.
+      need_self_loops:
+        If True, add self-loop to states with non-epsilon output symbols
+        on at least one arc out of the state. The input label for this
+        self loop is `token2id["#0"]` and the output label is `word2id["#0"]`.
+    Returns:
+      Return an instance of `k2.Fsa` representing the given lexicon.
+    """
+    loop_state = 0  # words enter and leave from here
+    next_state = 1  # the next un-allocated state, will be incremented as we go
+
+    arcs = []
+
+    # The blank symbol <blk> is defined in local/train_bpe_model.py
+    assert token2id["<blk>"] == 0
+    assert word2id["<eps>"] == 0
+
+    eps = 0
+
+    for word, pieces in lexicon:
+        assert len(pieces) > 0, f"{word} has no pronunciations"
+        cur_state = loop_state
+
+        word = word2id[word]
+        pieces = [token2id[i] for i in pieces]
+
+        for i in range(len(pieces) - 1):
+            w = word if i == 0 else eps
+            arcs.append([cur_state, next_state, pieces[i], w, 0])
+
+            cur_state = next_state
+            next_state += 1
+
+        # now for the last piece of this word
+        i = len(pieces) - 1
+        w = word if i == 0 else eps
+        arcs.append([cur_state, loop_state, pieces[i], w, 0])
+
+    if need_self_loops:
+        disambig_token = token2id["#0"]
+        disambig_word = word2id["#0"]
+        arcs = add_self_loops(
+            arcs,
+            disambig_token=disambig_token,
+            disambig_word=disambig_word,
+        )
+
+    final_state = next_state
+    arcs.append([loop_state, final_state, -1, -1, 0])
+    arcs.append([final_state])
+
+    arcs = sorted(arcs, key=lambda arc: arc[0])
+    arcs = [[str(i) for i in arc] for arc in arcs]
+    arcs = [" ".join(arc) for arc in arcs]
+    arcs = "\n".join(arcs)
+
+    fsa = k2.Fsa.from_str(arcs, acceptor=False)
+    return fsa
+
+
+def generate_lexicon(
+    model_file: str, words: List[str]
+) -> Tuple[Lexicon, Dict[str, int]]:
+    """Generate a lexicon from a BPE model.
+
+    Args:
+      model_file:
+        Path to a sentencepiece model.
+      words:
+        A list of strings representing words.
+    Returns:
+      Return a tuple with two elements:
+        - A dict whose keys are words and values are the corresponding
+          word pieces.
+        - A dict representing the token symbol, mapping from tokens to IDs.
+    """
+    sp = spm.SentencePieceProcessor()
+    sp.load(str(model_file))
+
+    # Convert word to word piece IDs instead of word piece strings
+    # to avoid OOV tokens.
+    words_pieces_ids: List[List[int]] = sp.encode(words, out_type=int)
+
+    # Now convert word piece IDs back to word piece strings.
+    words_pieces: List[List[str]] = [
+        sp.id_to_piece(ids) for ids in words_pieces_ids
+    ]
+
+    lexicon = []
+    for word, pieces in zip(words, words_pieces):
+        lexicon.append((word, pieces))
+
+    # The OOV word is <UNK>
+    lexicon.append(("<UNK>", [sp.id_to_piece(sp.unk_id())]))
+
+    token2id: Dict[str, int] = dict()
+    for i in range(sp.vocab_size()):
+        token2id[sp.id_to_piece(i)] = i
+
+    return lexicon, token2id
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        It should contain the bpe.model and words.txt
+        """,
+    )
+
+    parser.add_argument(
+        "--debug",
+        type=str2bool,
+        default=False,
+        help="""True for debugging, which will generate
+        a visualization of the lexicon FST.
+
+        Caution: If your lexicon contains hundreds of thousands
+        of lines, please set it to False!
+
+        See "test/test_bpe_lexicon.py" for usage.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+    model_file = lang_dir / "bpe.model"
+
+    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+    words = word_sym_table.symbols
+
+    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"]
+    for w in excluded:
+        if w in words:
+            words.remove(w)
+
+    lexicon, token_sym_table = generate_lexicon(model_file, words)
+
+    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
+
+    next_token_id = max(token_sym_table.values()) + 1
+    for i in range(max_disambig + 1):
+        disambig = f"#{i}"
+        assert disambig not in token_sym_table
+        token_sym_table[disambig] = next_token_id
+        next_token_id += 1
+
+    word_sym_table.add("#0")
+    word_sym_table.add("<s>")
+    word_sym_table.add("</s>")
+
+    write_mapping(lang_dir / "tokens.txt", token_sym_table)
+
+    write_lexicon(lang_dir / "lexicon.txt", lexicon)
+    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
+
+    L = lexicon_to_fst_no_sil(
+        lexicon,
+        token2id=token_sym_table,
+        word2id=word_sym_table,
+    )
+
+    L_disambig = lexicon_to_fst_no_sil(
+        lexicon_disambig,
+        token2id=token_sym_table,
+        word2id=word_sym_table,
+        need_self_loops=True,
+    )
+    torch.save(L.as_dict(), lang_dir / "L.pt")
+    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
+
+    if args.debug:
+        labels_sym = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
+        aux_labels_sym = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+        L.labels_sym = labels_sym
+        L.aux_labels_sym = aux_labels_sym
+        L.draw(f"{lang_dir / 'L.svg'}", title="L.pt")
+
+        L_disambig.labels_sym = labels_sym
+        L_disambig.aux_labels_sym = aux_labels_sym
+        L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/v1/local/train_bpe_model.py b/egs/librispeech/v1/local/train_bpe_model.py
new file mode 100755
index 00000000..42aba957
--- /dev/null
+++ b/egs/librispeech/v1/local/train_bpe_model.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# You can install sentencepiece via:
+#
+#  pip install sentencepiece
+#
+# Due to an issue reported in
+# https://github.com/google/sentencepiece/pull/642#issuecomment-857972030
+#
+# Please install a version >=0.1.96
+
+import argparse
+import shutil
+from pathlib import Path
+
+import sentencepiece as spm
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        The generated bpe.model is saved to this directory.
+        """,
+    )
+
+    parser.add_argument(
+        "--transcript",
+        type=str,
+        help="Training transcript.",
+    )
+
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        help="Vocabulary size for BPE training",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    vocab_size = args.vocab_size
+    lang_dir = Path(args.lang_dir)
+
+    model_type = "unigram"
+
+    model_prefix = f"{lang_dir}/{model_type}_{vocab_size}"
+    train_text = args.transcript
+    character_coverage = 1.0
+    input_sentence_size = 100000000
+
+    user_defined_symbols = ["<blk>", "<sos/eos>"]
+    unk_id = len(user_defined_symbols)
+    # Note: unk_id is fixed to 2.
+    # If you change it, you should also change other
+    # places that are using it.
+
+    model_file = Path(model_prefix + ".model")
+    if not model_file.is_file():
+        spm.SentencePieceTrainer.train(
+            input=train_text,
+            vocab_size=vocab_size,
+            model_type=model_type,
+            model_prefix=model_prefix,
+            input_sentence_size=input_sentence_size,
+            character_coverage=character_coverage,
+            user_defined_symbols=user_defined_symbols,
+            unk_id=unk_id,
+            bos_id=-1,
+            eos_id=-1,
+        )
+
+    shutil.copyfile(model_file, f"{lang_dir}/bpe.model")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py
index 5a1368e2..44ff171d 100644
--- a/hyperion/torch/models/__init__.py
+++ b/hyperion/torch/models/__init__.py
@@ -19,7 +19,7 @@
 )
 
 
-from .transducer import HFWav2Vec2Transducer
+from .wav2transducer import HFWav2Vec2Transducer
 
 from .vae.vae import VAE
 from .vae.vq_vae import VQVAE
diff --git a/hyperion/torch/models/transducer/encoder_interface.py b/hyperion/torch/models/transducer/encoder_interface.py
new file mode 100644
index 00000000..257facce
--- /dev/null
+++ b/hyperion/torch/models/transducer/encoder_interface.py
@@ -0,0 +1,43 @@
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import torch +import torch.nn as nn + + +class EncoderInterface(nn.Module): + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + A tensor of shape (batch_size, input_seq_len, num_features) + containing the input features. + x_lens: + A tensor of shape (batch_size,) containing the number of frames + in `x` before padding. + Returns: + Return a tuple containing two tensors: + - encoder_out, a tensor of (batch_size, out_seq_len, output_dim) + containing unnormalized probabilities, i.e., the output of a + linear layer. + - encoder_out_lens, a tensor of shape (batch_size,) containing + the number of frames in `encoder_out` before padding. + """ + raise NotImplementedError("Please implement it in a subclass") diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index 8305248c..ff12ef18 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -23,9 +23,9 @@ import torch.nn as nn import torchaudio import torchaudio.functional -from encoder_interface import EncoderInterface +from .encoder_interface import EncoderInterface -from icefall.utils import add_sos +from hyperion.utils.utils import add_sos class Transducer(nn.Module): diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py index 3fed7143..1e038f17 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py @@ -15,7 +15,7 @@ from ...utils import remove_silence -class HFWav2XVector(TorchModel): +class HFWav2Transducer(TorchModel): """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. Attributes: diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py index e83dcb8c..e2c6e1be 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py @@ -13,6 +13,7 @@ from ...torch_model import TorchModel from ...utils import remove_silence +from .hf_wav2transducer import HFWav2Transducer class HFWav2Vec2Transducer(HFWav2Transducer): diff --git a/hyperion/utils/lexicon.py b/hyperion/utils/lexicon.py new file mode 100644 index 00000000..80bd7c1e --- /dev/null +++ b/hyperion/utils/lexicon.py @@ -0,0 +1,277 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+import re
+import sys
+from pathlib import Path
+from typing import List, Tuple
+
+import k2
+import torch
+
+
+def read_lexicon(filename: str) -> List[Tuple[str, List[str]]]:
+    """Read a lexicon from `filename`.
+
+    Each line in the lexicon contains "word p1 p2 p3 ...".
+    That is, the first field is a word and the remaining
+    fields are tokens. Fields are separated by space(s).
+
+    Args:
+      filename:
+        Path to the lexicon.txt
+
+    Returns:
+      A list of tuples, e.g., [('w', ['p1', 'p2']), ('w1', ['p3', 'p4'])]
+    """
+    ans = []
+
+    with open(filename, "r", encoding="utf-8") as f:
+        whitespace = re.compile("[ \t]+")
+        for line in f:
+            a = whitespace.split(line.strip(" \t\r\n"))
+            if len(a) == 0:
+                continue
+
+            if len(a) < 2:
+                logging.info(
+                    f"Found bad line {line} in lexicon file {filename}"
+                )
+                logging.info(
+                    "Every line is expected to contain at least 2 fields"
+                )
+                sys.exit(1)
+            word = a[0]
+            if word == "<eps>":
+                logging.info(
+                    f"Found bad line {line} in lexicon file {filename}"
+                )
+                logging.info("<eps> should not be a valid word")
+                sys.exit(1)
+
+            tokens = a[1:]
+            ans.append((word, tokens))
+
+    return ans
+
+
+def write_lexicon(filename: str, lexicon: List[Tuple[str, List[str]]]) -> None:
+    """Write a lexicon to a file.
+
+    Args:
+      filename:
+        Path to the lexicon file to be generated.
+      lexicon:
+        It can be the return value of :func:`read_lexicon`.
+    """
+    with open(filename, "w", encoding="utf-8") as f:
+        for word, tokens in lexicon:
+            f.write(f"{word} {' '.join(tokens)}\n")
+
+
+def convert_lexicon_to_ragged(
+    filename: str, word_table: k2.SymbolTable, token_table: k2.SymbolTable
+) -> k2.RaggedTensor:
+    """Read a lexicon and convert it to a ragged tensor.
+
+    The ragged tensor has two axes: [word][token].
+
+    Caution:
+      We assume that each word has a unique pronunciation.
+
+    Args:
+      filename:
+        Filename of the lexicon. It has a format that can be read
+        by :func:`read_lexicon`.
+      word_table:
+        The word symbol table.
+      token_table:
+        The token symbol table.
+    Returns:
+      A k2 ragged tensor with two axes [word][token].
+    """
+    disambig_id = word_table["#0"]
+    # We reuse the same words.txt from the phone based lexicon
+    # so that we can share the same G.fst. Here, we have to
+    # exclude some words present only in the phone based lexicon.
+    excluded_words = ["<eps>", "!SIL", "<SPOKEN_NOISE>"]
+
+    # epsilon is not a word, but it occupies a position
+    #
+    row_splits = [0]
+    token_ids_list = []
+
+    lexicon_tmp = read_lexicon(filename)
+    lexicon = dict(lexicon_tmp)
+    if len(lexicon_tmp) != len(lexicon):
+        raise RuntimeError(
+            "It's assumed that each word has a unique pronunciation"
+        )
+
+    for i in range(disambig_id):
+        w = word_table[i]
+        if w in excluded_words:
+            row_splits.append(row_splits[-1])
+            continue
+        tokens = lexicon[w]
+        token_ids = [token_table[k] for k in tokens]
+
+        row_splits.append(row_splits[-1] + len(token_ids))
+        token_ids_list.extend(token_ids)
+
+    cached_tot_size = row_splits[-1]
+    row_splits = torch.tensor(row_splits, dtype=torch.int32)
+
+    shape = k2.ragged.create_ragged_shape2(
+        row_splits,
+        None,
+        cached_tot_size,
+    )
+    values = torch.tensor(token_ids_list, dtype=torch.int32)
+
+    return k2.RaggedTensor(shape, values)
+
+
+class Lexicon(object):
+    """Phone based lexicon."""
+
+    def __init__(
+        self,
+        lang_dir: Path,
+        disambig_pattern: str = re.compile(r"^#\d+$"),
+    ):
+        """
+        Args:
+          lang_dir:
+            Path to the lang directory. It is expected to contain the following
+            files:
+              - tokens.txt
+              - words.txt
+              - L.pt
+            The above files are produced by the script `prepare.sh`. You
+            should have run that before running the training code.
+          disambig_pattern:
+            It contains the pattern for disambiguation symbols.
+        """
+        lang_dir = Path(lang_dir)
+        self.token_table = k2.SymbolTable.from_file(lang_dir / "tokens.txt")
+        self.word_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
+
+        if (lang_dir / "Linv.pt").exists():
+            logging.info(f"Loading pre-compiled {lang_dir}/Linv.pt")
+            L_inv = k2.Fsa.from_dict(torch.load(lang_dir / "Linv.pt"))
+        else:
+            logging.info("Converting L.pt to Linv.pt")
+            L = k2.Fsa.from_dict(torch.load(lang_dir / "L.pt"))
+            L_inv = k2.arc_sort(L.invert())
+            torch.save(L_inv.as_dict(), lang_dir / "Linv.pt")
+
+        # We save L_inv instead of L because it will be used to intersect with
+        # transcript FSAs, both of whose labels are word IDs.
+        self.L_inv = L_inv
+        self.disambig_pattern = disambig_pattern
+
+    @property
+    def tokens(self) -> List[int]:
+        """Return a list of token IDs excluding those from
+        disambiguation symbols.
+
+        Caution:
+          0 is not a token ID so it is excluded from the return value.
+        """
+        symbols = self.token_table.symbols
+        ans = []
+        for s in symbols:
+            if not self.disambig_pattern.match(s):
+                ans.append(self.token_table[s])
+        if 0 in ans:
+            ans.remove(0)
+        ans.sort()
+        return ans
+
+
+class UniqLexicon(Lexicon):
+    def __init__(
+        self,
+        lang_dir: Path,
+        uniq_filename: str = "uniq_lexicon.txt",
+        disambig_pattern: str = re.compile(r"^#\d+$"),
+    ):
+        """
+        Refer to the help information in Lexicon.__init__.
+
+        uniq_filename: It is assumed to be inside the given `lang_dir`.
+
+        Each word in the lexicon is assumed to have a unique pronunciation.
+        """
+        lang_dir = Path(lang_dir)
+        super().__init__(lang_dir=lang_dir, disambig_pattern=disambig_pattern)
+
+        self.ragged_lexicon = convert_lexicon_to_ragged(
+            filename=lang_dir / uniq_filename,
+            word_table=self.word_table,
+            token_table=self.token_table,
+        )
+        # TODO: should we move it to a certain device ?
+
+    def texts_to_token_ids(
+        self, texts: List[str], oov: str = "<UNK>"
+    ) -> k2.RaggedTensor:
+        """
+        Args:
+          texts:
+            A list of transcripts. Each transcript contains space(s)
+            separated words. An example of `texts` is::
+
+                ['HELLO k2', 'HELLO icefall']
+          oov:
+            The OOV word. If a word in `texts` is not in the lexicon, it is
+            replaced with `oov`.
+        Returns:
+          Return a ragged int tensor with 2 axes [utterance][token_id]
+        """
+        oov_id = self.word_table[oov]
+
+        word_ids_list = []
+        for text in texts:
+            word_ids = []
+            for word in text.split():
+                if word in self.word_table:
+                    word_ids.append(self.word_table[word])
+                else:
+                    word_ids.append(oov_id)
+            word_ids_list.append(word_ids)
+        ragged_indexes = k2.RaggedTensor(word_ids_list, dtype=torch.int32)
+        ans = self.ragged_lexicon.index(ragged_indexes)
+        ans = ans.remove_axis(ans.num_axes - 2)
+        return ans
+
+    def words_to_token_ids(self, words: List[str]) -> k2.RaggedTensor:
+        """Convert a list of words to a ragged tensor containing token IDs.
+
+        We assume there are no OOVs in "words".
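+
+        Example (an illustrative sketch; `uniq_lexicon` is a hypothetical
+        UniqLexicon instance)::
+
+            ragged = uniq_lexicon.words_to_token_ids(["HELLO", "WORLD"])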
+ """ + word_ids = [self.word_table[w] for w in words] + word_ids = torch.tensor(word_ids, dtype=torch.int32) + + ragged, _ = self.ragged_lexicon.index( + indexes=word_ids, + axis=0, + need_value_indexes=False, + ) + return ragged diff --git a/hyperion/utils/utils.py b/hyperion/utils/utils.py new file mode 100644 index 00000000..1663fb03 --- /dev/null +++ b/hyperion/utils/utils.py @@ -0,0 +1,978 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang +# Mingshuang Luo) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import collections +import logging +import os +import re +import subprocess +from collections import defaultdict +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path +from typing import Dict, Iterable, List, TextIO, Tuple, Union + +import k2 +import k2.version +import kaldialign +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.utils.tensorboard import SummaryWriter + +from hyperion.utils.checkpoint import average_checkpoints + +Pathlike = Union[str, Path] + + +# Pytorch issue: https://github.com/pytorch/pytorch/issues/47379 +# Fixed: https://github.com/pytorch/pytorch/pull/49853 +# The fix was included in v1.9.0 +# https://github.com/pytorch/pytorch/releases/tag/v1.9.0 +def is_jit_tracing(): + if torch.jit.is_scripting(): + return False + elif torch.jit.is_tracing(): + return True + return False + + +@contextmanager +def get_executor(): + # We'll either return a process pool or a distributed worker pool. + # Note that this has to be a context manager because we might use multiple + # context manager ("with" clauses) inside, and this way everything will + # free up the resources at the right time. + try: + # If this is executed on the CLSP grid, we will try to use the + # Grid Engine to distribute the tasks. + # Other clusters can also benefit from that, provided a + # cluster-specific wrapper. + # (see https://github.com/pzelasko/plz for reference) + # + # The following must be installed: + # $ pip install dask distributed + # $ pip install git+https://github.com/pzelasko/plz + name = subprocess.check_output("hostname -f", shell=True, text=True) + if name.strip().endswith(".clsp.jhu.edu"): + import plz + from distributed import Client + + with plz.setup_cluster() as cluster: + cluster.scale(80) + yield Client(cluster) + return + except Exception: + pass + # No need to return anything - compute_and_store_features + # will just instantiate the pool itself. 
+ yield None + + +def str2bool(v): + """Used in argparse.ArgumentParser.add_argument to indicate + that a type is a bool type and user can enter + + - yes, true, t, y, 1, to represent True + - no, false, f, n, 0, to represent False + + See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa + """ + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def setup_logger( + log_filename: Pathlike, + log_level: str = "info", + use_console: bool = True, +) -> None: + """Setup log level. + + Args: + log_filename: + The filename to save the log. + log_level: + The log level to use, e.g., "debug", "info", "warning", "error", + "critical" + use_console: + True to also print logs to console. + """ + now = datetime.now() + date_time = now.strftime("%Y-%m-%d-%H-%M-%S") + if dist.is_available() and dist.is_initialized(): + world_size = dist.get_world_size() + rank = dist.get_rank() + formatter = f"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] ({rank}/{world_size}) %(message)s" # noqa + log_filename = f"{log_filename}-{date_time}-{rank}" + else: + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + log_filename = f"{log_filename}-{date_time}" + + os.makedirs(os.path.dirname(log_filename), exist_ok=True) + + level = logging.ERROR + if log_level == "debug": + level = logging.DEBUG + elif log_level == "info": + level = logging.INFO + elif log_level == "warning": + level = logging.WARNING + elif log_level == "critical": + level = logging.CRITICAL + + logging.basicConfig( + filename=log_filename, + format=formatter, + level=level, + filemode="w", + ) + if use_console: + console = logging.StreamHandler() + console.setLevel(level) + console.setFormatter(logging.Formatter(formatter)) + logging.getLogger("").addHandler(console) + + +class AttributeDict(dict): + def __getattr__(self, key): + if key in self: + return self[key] + raise AttributeError(f"No such attribute '{key}'") + + def __setattr__(self, key, value): + self[key] = value + + def __delattr__(self, key): + if key in self: + del self[key] + return + raise AttributeError(f"No such attribute '{key}'") + + +def encode_supervisions( + supervisions: dict, subsampling_factor: int +) -> Tuple[torch.Tensor, List[str]]: + """ + Encodes Lhotse's ``batch["supervisions"]`` dict into + a pair of torch Tensor, and a list of transcription strings. + + The supervision tensor has shape ``(batch_size, 3)``. + Its second dimension contains information about sequence index [0], + start frames [1] and num frames [2]. + + The batch items might become re-ordered during this operation -- the + returned tensor and list of strings are guaranteed to be consistent with + each other. + """ + supervision_segments = torch.stack( + ( + supervisions["sequence_idx"], + supervisions["start_frame"] // subsampling_factor, + supervisions["num_frames"] // subsampling_factor, + ), + 1, + ).to(torch.int32) + + indices = torch.argsort(supervision_segments[:, 2], descending=True) + supervision_segments = supervision_segments[indices] + texts = supervisions["text"] + texts = [texts[idx] for idx in indices] + + return supervision_segments, texts + + +def get_texts( + best_paths: k2.Fsa, return_ragged: bool = False +) -> Union[List[List[int]], k2.RaggedTensor]: + """Extract the texts (as word IDs) from the best-path FSAs. 
+ Args: + best_paths: + A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e. + containing multiple FSAs, which is expected to be the result + of k2.shortest_path (otherwise the returned values won't + be meaningful). + return_ragged: + True to return a ragged tensor with two axes [utt][word_id]. + False to return a list-of-list word IDs. + Returns: + Returns a list of lists of int, containing the label sequences we + decoded. + """ + if isinstance(best_paths.aux_labels, k2.RaggedTensor): + # remove 0's and -1's. + aux_labels = best_paths.aux_labels.remove_values_leq(0) + # TODO: change arcs.shape() to arcs.shape + aux_shape = best_paths.arcs.shape().compose(aux_labels.shape) + + # remove the states and arcs axes. + aux_shape = aux_shape.remove_axis(1) + aux_shape = aux_shape.remove_axis(1) + aux_labels = k2.RaggedTensor(aux_shape, aux_labels.values) + else: + # remove axis corresponding to states. + aux_shape = best_paths.arcs.shape().remove_axis(1) + aux_labels = k2.RaggedTensor(aux_shape, best_paths.aux_labels) + # remove 0's and -1's. + aux_labels = aux_labels.remove_values_leq(0) + + assert aux_labels.num_axes == 2 + if return_ragged: + return aux_labels + else: + return aux_labels.tolist() + + +def get_alignments(best_paths: k2.Fsa, kind: str) -> List[List[int]]: + """Extract labels or aux_labels from the best-path FSAs. + + Args: + best_paths: + A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e. + containing multiple FSAs, which is expected to be the result + of k2.shortest_path (otherwise the returned values won't + be meaningful). + kind: + Possible values are: "labels" and "aux_labels". Caution: When it is + "labels", the resulting alignments contain repeats. + Returns: + Returns a list of lists of int, containing the token sequences we + decoded. For `ans[i]`, its length equals to the number of frames + after subsampling of the i-th utterance in the batch. + + Example: + When `kind` is `labels`, one possible alignment example is (with + repeats):: + + c c c blk a a blk blk t t t blk blk + + If `kind` is `aux_labels`, the above example changes to:: + + c blk blk blk a blk blk blk t blk blk blk blk + + """ + assert kind in ("labels", "aux_labels") + # arc.shape() has axes [fsa][state][arc], we remove "state"-axis here + token_shape = best_paths.arcs.shape().remove_axis(1) + # token_shape has axes [fsa][arc] + tokens = k2.RaggedTensor( + token_shape, getattr(best_paths, kind).contiguous() + ) + tokens = tokens.remove_values_eq(-1) + return tokens.tolist() + + +def save_alignments( + alignments: Dict[str, List[int]], + subsampling_factor: int, + filename: str, +) -> None: + """Save alignments to a file. + + Args: + alignments: + A dict containing alignments. Keys of the dict are utterances and + values are the corresponding framewise alignments after subsampling. + subsampling_factor: + The subsampling factor of the model. + filename: + Path to save the alignments. + Returns: + Return None. + """ + ali_dict = { + "subsampling_factor": subsampling_factor, + "alignments": alignments, + } + torch.save(ali_dict, filename) + + +def load_alignments(filename: str) -> Tuple[int, Dict[str, List[int]]]: + """Load alignments from a file. + + Args: + filename: + Path to the file containing alignment information. + The file should be saved by :func:`save_alignments`. + Returns: + Return a tuple containing: + - subsampling_factor: The subsampling_factor used to compute + the alignments. + - alignments: A dict containing utterances and their corresponding + framewise alignment, after subsampling. 
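+
+    Round-trip sketch (``ali.pt`` is a hypothetical path)::
+
+        save_alignments(ali, subsampling_factor=4, filename="ali.pt")
+        subsampling_factor, ali2 = load_alignments("ali.pt")
+        assert subsampling_factor == 4 and ali2 == ali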
+    """
+    ali_dict = torch.load(filename)
+    subsampling_factor = ali_dict["subsampling_factor"]
+    alignments = ali_dict["alignments"]
+    return subsampling_factor, alignments
+
+
+def store_transcripts(
+    filename: Pathlike, texts: Iterable[Tuple[str, str, str]]
+) -> None:
+    """Save predicted results and reference transcripts to a file.
+
+    Args:
+      filename:
+        File to save the results to.
+      texts:
+        An iterable of tuples. The first element is the cut_id, the second is
+        the reference transcript and the third element is the predicted result.
+    Returns:
+      Return None.
+    """
+    with open(filename, "w") as f:
+        for cut_id, ref, hyp in texts:
+            print(f"{cut_id}:\tref={ref}", file=f)
+            print(f"{cut_id}:\thyp={hyp}", file=f)
+
+
+def write_error_stats(
+    f: TextIO,
+    test_set_name: str,
+    results: List[Tuple[str, List[str], List[str]]],
+    enable_log: bool = True,
+) -> float:
+    """Write statistics based on predicted results and reference transcripts.
+
+    It will write the following to the given file:
+
+        - WER
+        - number of insertions, deletions, substitutions, corrects and total
+          reference words. For example::
+
+              Errors: 23 insertions, 57 deletions, 212 substitutions, over 2606
+              reference words (2337 correct)
+
+        - The difference between the reference transcript and predicted result.
+          An instance is given below::
+
+              THE ASSOCIATION OF (EDISON->ADDISON) ILLUMINATING COMPANIES
+
+          The above example shows that the reference word is `EDISON`,
+          but it is predicted as `ADDISON` (a substitution error).
+
+          Another example is::
+
+              FOR THE FIRST DAY (SIR->*) I THINK
+
+          The reference word `SIR` is missing in the predicted
+          results (a deletion error).
+
+    Args:
+      f:
+        An opened text stream to write the statistics to.
+      test_set_name:
+        Name of the test set; used only in the log message and header.
+      results:
+        An iterable of tuples. The first element is the cut_id, the second is
+        the reference transcript and the third element is the predicted result.
+      enable_log:
+        If True, also print detailed WER to the console.
+        Otherwise, it is written only to the given file.
+    Returns:
+      Return the total word error rate (in percent) as a float.
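+
+    Usage sketch (the file name is hypothetical)::
+
+        with open("errs-test-clean.txt", "w") as f:
+            wer = write_error_stats(f, "test-clean", results)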
+ """ + subs: Dict[Tuple[str, str], int] = defaultdict(int) + ins: Dict[str, int] = defaultdict(int) + dels: Dict[str, int] = defaultdict(int) + + # `words` stores counts per word, as follows: + # corr, ref_sub, hyp_sub, ins, dels + words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0]) + num_corr = 0 + ERR = "*" + for cut_id, ref, hyp in results: + ali = kaldialign.align(ref, hyp, ERR) + for ref_word, hyp_word in ali: + if ref_word == ERR: + ins[hyp_word] += 1 + words[hyp_word][3] += 1 + elif hyp_word == ERR: + dels[ref_word] += 1 + words[ref_word][4] += 1 + elif hyp_word != ref_word: + subs[(ref_word, hyp_word)] += 1 + words[ref_word][1] += 1 + words[hyp_word][2] += 1 + else: + words[ref_word][0] += 1 + num_corr += 1 + ref_len = sum([len(r) for _, r, _ in results]) + sub_errs = sum(subs.values()) + ins_errs = sum(ins.values()) + del_errs = sum(dels.values()) + tot_errs = sub_errs + ins_errs + del_errs + tot_err_rate = "%.2f" % (100.0 * tot_errs / ref_len) + + if enable_log: + logging.info( + f"[{test_set_name}] %WER {tot_errs / ref_len:.2%} " + f"[{tot_errs} / {ref_len}, {ins_errs} ins, " + f"{del_errs} del, {sub_errs} sub ]" + ) + + print(f"%WER = {tot_err_rate}", file=f) + print( + f"Errors: {ins_errs} insertions, {del_errs} deletions, " + f"{sub_errs} substitutions, over {ref_len} reference " + f"words ({num_corr} correct)", + file=f, + ) + print( + "Search below for sections starting with PER-UTT DETAILS:, " + "SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:", + file=f, + ) + + print("", file=f) + print("PER-UTT DETAILS: corr or (ref->hyp) ", file=f) + for cut_id, ref, hyp in results: + ali = kaldialign.align(ref, hyp, ERR) + combine_successive_errors = True + if combine_successive_errors: + ali = [[[x], [y]] for x, y in ali] + for i in range(len(ali) - 1): + if ali[i][0] != ali[i][1] and ali[i + 1][0] != ali[i + 1][1]: + ali[i + 1][0] = ali[i][0] + ali[i + 1][0] + ali[i + 1][1] = ali[i][1] + ali[i + 1][1] + ali[i] = [[], []] + ali = [ + [ + list(filter(lambda a: a != ERR, x)), + list(filter(lambda a: a != ERR, y)), + ] + for x, y in ali + ] + ali = list(filter(lambda x: x != [[], []], ali)) + ali = [ + [ + ERR if x == [] else " ".join(x), + ERR if y == [] else " ".join(y), + ] + for x, y in ali + ] + + print( + f"{cut_id}:\t" + + " ".join( + ( + ref_word + if ref_word == hyp_word + else f"({ref_word}->{hyp_word})" + for ref_word, hyp_word in ali + ) + ), + file=f, + ) + + print("", file=f) + print("SUBSTITUTIONS: count ref -> hyp", file=f) + + for count, (ref, hyp) in sorted( + [(v, k) for k, v in subs.items()], reverse=True + ): + print(f"{count} {ref} -> {hyp}", file=f) + + print("", file=f) + print("DELETIONS: count ref", file=f) + for count, ref in sorted([(v, k) for k, v in dels.items()], reverse=True): + print(f"{count} {ref}", file=f) + + print("", file=f) + print("INSERTIONS: count hyp", file=f) + for count, hyp in sorted([(v, k) for k, v in ins.items()], reverse=True): + print(f"{count} {hyp}", file=f) + + print("", file=f) + print( + "PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp", file=f + ) + for _, word, counts in sorted( + [(sum(v[1:]), k, v) for k, v in words.items()], reverse=True + ): + (corr, ref_sub, hyp_sub, ins, dels) = counts + tot_errs = ref_sub + hyp_sub + ins + dels + ref_count = corr + ref_sub + dels + hyp_count = corr + hyp_sub + ins + + print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f) + return float(tot_err_rate) + + +class MetricsTracker(collections.defaultdict): + def __init__(self): + # Passing the 
type 'int' to the base-class constructor + # makes undefined items default to int() which is zero. + # This class will play a role as metrics tracker. + # It can record many metrics, including but not limited to loss. + super(MetricsTracker, self).__init__(int) + + def __add__(self, other: "MetricsTracker") -> "MetricsTracker": + ans = MetricsTracker() + for k, v in self.items(): + ans[k] = v + for k, v in other.items(): + ans[k] = ans[k] + v + return ans + + def __mul__(self, alpha: float) -> "MetricsTracker": + ans = MetricsTracker() + for k, v in self.items(): + ans[k] = v * alpha + return ans + + def __str__(self) -> str: + ans_frames = "" + ans_utterances = "" + for k, v in self.norm_items(): + norm_value = "%.4g" % v + if "utt_" not in k: + ans_frames += str(k) + "=" + str(norm_value) + ", " + else: + ans_utterances += str(k) + "=" + str(norm_value) + if k == "utt_duration": + ans_utterances += " frames, " + elif k == "utt_pad_proportion": + ans_utterances += ", " + else: + raise ValueError(f"Unexpected key: {k}") + frames = "%.2f" % self["frames"] + ans_frames += "over " + str(frames) + " frames. " + if ans_utterances != "": + utterances = "%.2f" % self["utterances"] + ans_utterances += "over " + str(utterances) + " utterances." + + return ans_frames + ans_utterances + + def norm_items(self) -> List[Tuple[str, float]]: + """ + Returns a list of pairs, like: + [('ctc_loss', 0.1), ('att_loss', 0.07)] + """ + num_frames = self["frames"] if "frames" in self else 1 + num_utterances = self["utterances"] if "utterances" in self else 1 + ans = [] + for k, v in self.items(): + if k == "frames" or k == "utterances": + continue + norm_value = ( + float(v) / num_frames + if "utt_" not in k + else float(v) / num_utterances + ) + ans.append((k, norm_value)) + return ans + + def reduce(self, device): + """ + Reduce using torch.distributed, which I believe ensures that + all processes get the total. + """ + keys = sorted(self.keys()) + s = torch.tensor([float(self[k]) for k in keys], device=device) + dist.all_reduce(s, op=dist.ReduceOp.SUM) + for k, v in zip(keys, s.cpu().tolist()): + self[k] = v + + def write_summary( + self, + tb_writer: SummaryWriter, + prefix: str, + batch_idx: int, + ) -> None: + """Add logging information to a TensorBoard writer. + + Args: + tb_writer: a TensorBoard writer + prefix: a prefix for the name of the loss, e.g. "train/valid_", + or "train/current_" + batch_idx: The current batch index, used as the x-axis of the plot. + """ + for k, v in self.norm_items(): + tb_writer.add_scalar(prefix + k, v, batch_idx) + + +def concat( + ragged: k2.RaggedTensor, value: int, direction: str +) -> k2.RaggedTensor: + """Prepend a value to the beginning of each sublist or append a value. + to the end of each sublist. + + Args: + ragged: + A ragged tensor with two axes. + value: + The value to prepend or append. + direction: + It can be either "left" or "right". If it is "left", we + prepend the value to the beginning of each sublist; + if it is "right", we append the value to the end of each + sublist. + + Returns: + Return a new ragged tensor, whose sublists either start with + or end with the given value. 
+ + >>> a = k2.RaggedTensor([[1, 3], [5]]) + >>> a + [ [ 1 3 ] [ 5 ] ] + >>> concat(a, value=0, direction="left") + [ [ 0 1 3 ] [ 0 5 ] ] + >>> concat(a, value=0, direction="right") + [ [ 1 3 0 ] [ 5 0 ] ] + + """ + dtype = ragged.dtype + device = ragged.device + + assert ragged.num_axes == 2, f"num_axes: {ragged.num_axes}" + pad_values = torch.full( + size=(ragged.tot_size(0), 1), + fill_value=value, + device=device, + dtype=dtype, + ) + pad = k2.RaggedTensor(pad_values) + + if direction == "left": + ans = k2.ragged.cat([pad, ragged], axis=1) + elif direction == "right": + ans = k2.ragged.cat([ragged, pad], axis=1) + else: + raise ValueError( + f'Unsupported direction: {direction}. " \ + "Expect either "left" or "right"' + ) + return ans + + +def add_sos(ragged: k2.RaggedTensor, sos_id: int) -> k2.RaggedTensor: + """Add SOS to each sublist. + + Args: + ragged: + A ragged tensor with two axes. + sos_id: + The ID of the SOS symbol. + + Returns: + Return a new ragged tensor, where each sublist starts with SOS. + + >>> a = k2.RaggedTensor([[1, 3], [5]]) + >>> a + [ [ 1 3 ] [ 5 ] ] + >>> add_sos(a, sos_id=0) + [ [ 0 1 3 ] [ 0 5 ] ] + + """ + return concat(ragged, sos_id, direction="left") + + +def add_eos(ragged: k2.RaggedTensor, eos_id: int) -> k2.RaggedTensor: + """Add EOS to each sublist. + + Args: + ragged: + A ragged tensor with two axes. + eos_id: + The ID of the EOS symbol. + + Returns: + Return a new ragged tensor, where each sublist ends with EOS. + + >>> a = k2.RaggedTensor([[1, 3], [5]]) + >>> a + [ [ 1 3 ] [ 5 ] ] + >>> add_eos(a, eos_id=0) + [ [ 1 3 0 ] [ 5 0 ] ] + + """ + return concat(ragged, eos_id, direction="right") + + +def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: + """ + Args: + lengths: + A 1-D tensor containing sentence lengths. + Returns: + Return a 2-D bool tensor, where masked positions + are filled with `True` and non-masked positions are + filled with `False`. 
+
+    >>> lengths = torch.tensor([1, 3, 2, 5])
+    >>> make_pad_mask(lengths)
+    tensor([[False, True, True, True, True],
+            [False, False, False, True, True],
+            [False, False, True, True, True],
+            [False, False, False, False, False]])
+    """
+    assert lengths.ndim == 1, lengths.ndim
+
+    max_len = lengths.max()
+    n = lengths.size(0)
+
+    expanded_lengths = torch.arange(max_len).expand(n, max_len).to(lengths)
+
+    return expanded_lengths >= lengths.unsqueeze(1)
+
+
+# Copied and modified from https://github.com/wenet-e2e/wenet/blob/main/wenet/utils/mask.py
+def subsequent_chunk_mask(
+    size: int,
+    chunk_size: int,
+    num_left_chunks: int = -1,
+    device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    """Create a mask for subsequent steps (size, size) with the given chunk
+    size; this is for a streaming encoder.
+    Args:
+        size (int): size of mask
+        chunk_size (int): size of chunk
+        num_left_chunks (int): number of left chunks
+            <0: use full chunk
+            >=0: use num_left_chunks
+        device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+    Returns:
+        torch.Tensor: mask
+    Examples:
+        >>> subsequent_chunk_mask(4, 2)
+        [[1, 1, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 1],
+         [1, 1, 1, 1]]
+    """
+    ret = torch.zeros(size, size, device=device, dtype=torch.bool)
+    for i in range(size):
+        if num_left_chunks < 0:
+            start = 0
+        else:
+            start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
+        ending = min((i // chunk_size + 1) * chunk_size, size)
+        ret[i, start:ending] = True
+    return ret
+
+
+def l1_norm(x):
+    return torch.sum(torch.abs(x))
+
+
+def l2_norm(x):
+    # NOTE: this is the *squared* L2 norm (sum of squares).
+    return torch.sum(torch.pow(x, 2))
+
+
+def linf_norm(x):
+    return torch.max(torch.abs(x))
+
+
+def measure_weight_norms(
+    model: nn.Module, norm: str = "l2"
+) -> Dict[str, float]:
+    """
+    Compute the norms of the model's parameters.
+
+    :param model: a torch.nn.Module instance
+    :param norm: how to compute the norm. Available values: 'l1', 'l2', 'linf'
+    :return: a dict mapping from parameter's name to its norm.
+    """
+    with torch.no_grad():
+        norms = {}
+        for name, param in model.named_parameters():
+            if norm == "l1":
+                val = l1_norm(param)
+            elif norm == "l2":
+                val = l2_norm(param)
+            elif norm == "linf":
+                val = linf_norm(param)
+            else:
+                raise ValueError(f"Unknown norm type: {norm}")
+            norms[name] = val.item()
+        return norms
+
+
+def measure_gradient_norms(
+    model: nn.Module, norm: str = "l1"
+) -> Dict[str, float]:
+    """
+    Compute the norms of the gradients for each of model's parameters.
+
+    :param model: a torch.nn.Module instance
+    :param norm: how to compute the norm. Available values: 'l1', 'l2', 'linf'
+    :return: a dict mapping from parameter's name to its gradient's norm.
+    """
+    with torch.no_grad():
+        norms = {}
+        for name, param in model.named_parameters():
+            if norm == "l1":
+                val = l1_norm(param.grad)
+            elif norm == "l2":
+                val = l2_norm(param.grad)
+            elif norm == "linf":
+                val = linf_norm(param.grad)
+            else:
+                raise ValueError(f"Unknown norm type: {norm}")
+            norms[name] = val.item()
+        return norms
+
+
+def optim_step_and_measure_param_change(
+    model: nn.Module,
+    old_parameters: Dict[str, nn.parameter.Parameter],
+) -> Dict[str, float]:
+    """
+    Measure the "relative change in parameters per minibatch."
+    It is understood as the ratio between the squared L2 norm of the
+    difference between the original and updated parameters, and the squared
+    L2 norm of the original parameter. It is given by the formula:
+
+    .. math::
+
+        \begin{aligned}
+            \delta = \frac{\Vert\theta - \theta_{new}\Vert^2}{\Vert\theta\Vert^2}
+        \end{aligned}
+
+    This function is supposed to be used as follows:
+
+    .. code-block:: python
+
+        old_parameters = {
+            n: p.detach().clone() for n, p in model.named_parameters()
+        }
+
+        optimizer.step()
+
+        deltas = optim_step_and_measure_param_change(model, old_parameters)
+
+    Args:
+        model: A torch.nn.Module instance.
+        old_parameters:
+            A Dict of named_parameters before optimizer.step().
+
+    Return:
+        A Dict containing the relative change for each parameter.
+    """
+    relative_change = {}
+    with torch.no_grad():
+        for n, p_new in model.named_parameters():
+            p_orig = old_parameters[n]
+            delta = l2_norm(p_orig - p_new) / l2_norm(p_orig)
+            relative_change[n] = delta.item()
+    return relative_change
+
+
+def load_averaged_model(
+    model_dir: str,
+    model: torch.nn.Module,
+    epoch: int,
+    avg: int,
+    device: torch.device,
+):
+    """
+    Load a model whose weights are the average of the last `avg` checkpoints
+    up to and including `epoch`.
+
+    :param model_dir: a str of the experiment directory
+    :param model: a torch.nn.Module instance
+    :param epoch: the last epoch to load from
+    :param avg: how many models to average from
+    :param device: move model to this device
+
+    :return: the model with averaged weights loaded
+    """
+
+    # start cannot be negative
+    start = max(epoch - avg + 1, 0)
+    filenames = [f"{model_dir}/epoch-{i}.pt" for i in range(start, epoch + 1)]
+
+    logging.info(f"averaging {filenames}")
+    model.to(device)
+    model.load_state_dict(average_checkpoints(filenames, device=device))
+
+    return model
+
+
+def tokenize_by_bpe_model(
+    sp: spm.SentencePieceProcessor,
+    txt: str,
+) -> str:
+    """
+    Tokenize text with a BPE model. This function is from
+    https://github1s.com/wenet-e2e/wenet/blob/main/wenet/dataset/processor.py#L322-L342.
+    Args:
+        sp: spm.SentencePieceProcessor.
+        txt: str
+
+    Return:
+        A new string which includes chars and bpes.
+    """
+    tokens = []
+    # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref:
+    # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    pattern = re.compile(r"([\u4e00-\u9fff])")
+    # Example:
+    #   txt   = "你好 ITS'S OKAY 的"
+    #   chars = ["你", "好", " ITS'S OKAY ", "的"]
+    chars = pattern.split(txt.upper())
+    mix_chars = [w for w in chars if len(w.strip()) > 0]
+    for ch_or_w in mix_chars:
+        # ch_or_w is a single CJK character (e.g., "你"), do nothing.
+        if pattern.fullmatch(ch_or_w) is not None:
+            tokens.append(ch_or_w)
+        # ch_or_w contains non-CJK characters (e.g., " IT'S OKAY "),
+        # encode ch_or_w using the BPE model.
+        else:
+            for p in sp.encode_as_pieces(ch_or_w):
+                tokens.append(p)
+    txt_with_bpe = "/".join(tokens)
+
+    return txt_with_bpe
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Display the batch statistics and save the batch to disk.
+
+    Args:
+        batch:
+            A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+            for the content in it.
+        params:
+            Parameters for training. See :func:`get_params`.
+        sp:
+            The BPE model.
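+
+    A sketch of the intended call site (``compute_loss`` here is a
+    placeholder for whatever loss function the recipe defines)::
+
+        try:
+            loss = compute_loss(params, model, sp, batch)
+        except Exception:
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise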
+ """ + from lhotse.utils import uuid4 + + filename = f"{params.exp_dir}/batch-{uuid4()}.pt" + logging.info(f"Saving batch to {filename}") + torch.save(batch, filename) + + supervisions = batch["supervisions"] + features = batch["inputs"] + + logging.info(f"features shape: {features.shape}") + + y = sp.encode(supervisions["text"], out_type=int) + num_tokens = sum(len(i) for i in y) + logging.info(f"num tokens: {num_tokens}") From 85a62dcb1de1b954486c0fa54dca38c27cc60dc8 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Sat, 15 Oct 2022 23:52:33 -0400 Subject: [PATCH 030/154] bpe data preparation and training --- .../v1/local/validate_bpe_lexicon.py | 77 +++ egs/librispeech/v1/run_004_compute_bpe.sh | 16 +- hyperion/torch/data/asr_datamodule.py | 454 ++++++++++++++++++ 3 files changed, 544 insertions(+), 3 deletions(-) create mode 100755 egs/librispeech/v1/local/validate_bpe_lexicon.py create mode 100644 hyperion/torch/data/asr_datamodule.py diff --git a/egs/librispeech/v1/local/validate_bpe_lexicon.py b/egs/librispeech/v1/local/validate_bpe_lexicon.py new file mode 100755 index 00000000..36962933 --- /dev/null +++ b/egs/librispeech/v1/local/validate_bpe_lexicon.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script checks that there are no OOV tokens in the BPE-based lexicon. + +Usage example: + + python3 ./local/validate_bpe_lexicon.py \ + --lexicon /path/to/lexicon.txt \ + --bpe-model /path/to/bpe.model +""" + +import argparse +from pathlib import Path +from typing import List, Tuple + +import sentencepiece as spm + +from hyperion.utils.lexicon import read_lexicon + +# Map word to word pieces +Lexicon = List[Tuple[str, List[str]]] + + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--lexicon", + required=True, + type=Path, + help="Path to lexicon.txt", + ) + + parser.add_argument( + "--bpe-model", + required=True, + type=Path, + help="Path to bpe.model", + ) + + return parser.parse_args() + + +def main(): + args = get_args() + assert args.lexicon.is_file(), args.lexicon + assert args.bpe_model.is_file(), args.bpe_model + + lexicon = read_lexicon(args.lexicon) + + sp = spm.SentencePieceProcessor() + sp.load(str(args.bpe_model)) + + word_pieces = set(sp.id_to_piece(list(range(sp.vocab_size())))) + for word, pieces in lexicon: + for p in pieces: + if p not in word_pieces: + raise ValueError(f"The word {word} contains an OOV token {p}") + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/v1/run_004_compute_bpe.sh b/egs/librispeech/v1/run_004_compute_bpe.sh index 571205a8..f1fa36b1 100755 --- a/egs/librispeech/v1/run_004_compute_bpe.sh +++ b/egs/librispeech/v1/run_004_compute_bpe.sh @@ -5,9 +5,19 @@ # . ./cmd.sh . ./path.sh + + set -e nodes=fs01 storage_name=$(date +'%m_%d_%H_%M') +. 
./datapath.sh + +vocab_sizes=( + # 5000 + 2000 + 1000 + 500 +) dl_dir=$PWD/download @@ -56,9 +66,9 @@ if [ $stage -le 3 ]; then if [ ! -f $lang_dir/transcript_words.txt ]; then echo "Generate data for BPE training" files=$( - find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" + find "$librispeech_root/train-clean-100" -name "*.trans.txt" + find "$librispeech_root/train-clean-360" -name "*.trans.txt" + find "$librispeech_root/train-other-500" -name "*.trans.txt" ) for f in ${files[@]}; do cat $f | cut -d " " -f 2- diff --git a/hyperion/torch/data/asr_datamodule.py b/hyperion/torch/data/asr_datamodule.py new file mode 100644 index 00000000..355ccc99 --- /dev/null +++ b/hyperion/torch/data/asr_datamodule.py @@ -0,0 +1,454 @@ +# Copyright 2021 Piotr Żelasko +# Copyright 2022 Xiaomi Corporation (Author: Mingshuang Luo) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import inspect +import logging +from functools import lru_cache +from pathlib import Path +from typing import Any, Dict, Optional + +import torch +from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy +from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures + CutConcatenate, + CutMix, + DynamicBucketingSampler, + K2SpeechRecognitionDataset, + PrecomputedFeatures, + SingleCutSampler, + SpecAugment, +) +from lhotse.dataset.input_strategies import ( # noqa F401 For AudioSamples + AudioSamples, + OnTheFlyFeatures, +) +from lhotse.utils import fix_random_seed +from torch.utils.data import DataLoader + +from icefall.utils import str2bool + + +class _SeedWorkers: + def __init__(self, seed: int): + self.seed = seed + + def __call__(self, worker_id: int): + fix_random_seed(self.seed + worker_id) + + +class LibriSpeechAsrDataModule: + """ + DataModule for k2 ASR experiments. + It assumes there is always one train and valid dataloader, + but there can be multiple test dataloaders (e.g. LibriSpeech test-clean + and test-other). + + It contains all the common data pipeline modules used in ASR + experiments, e.g.: + - dynamic batch size, + - bucketing samplers, + - cut concatenation, + - augmentation, + - on-the-fly feature extraction + + This class should be derived for specific corpora used in ASR tasks. + """ + + def __init__(self, args: argparse.Namespace): + self.args = args + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser): + group = parser.add_argument_group( + title="ASR data related options", + description="These options are used for the preparation of " + "PyTorch DataLoaders from Lhotse CutSet's -- they control the " + "effective batch sizes, sampling strategies, applied data " + "augmentations, etc.", + ) + group.add_argument( + "--full-libri", + type=str2bool, + default=True, + help="When enabled, use 960h LibriSpeech. 
" + "Otherwise, use 100h subset.", + ) + group.add_argument( + "--manifest-dir", + type=Path, + default=Path("data/fbank"), + help="Path to directory with train/valid/test cuts.", + ) + group.add_argument( + "--max-duration", + type=int, + default=200.0, + help="Maximum pooled recordings duration (seconds) in a " + "single batch. You can reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--bucketing-sampler", + type=str2bool, + default=True, + help="When enabled, the batches will come from buckets of " + "similar duration (saves padding frames).", + ) + group.add_argument( + "--num-buckets", + type=int, + default=30, + help="The number of buckets for the DynamicBucketingSampler" + "(you might want to increase it for larger datasets).", + ) + group.add_argument( + "--concatenate-cuts", + type=str2bool, + default=False, + help="When enabled, utterances (cuts) will be concatenated " + "to minimize the amount of padding.", + ) + group.add_argument( + "--duration-factor", + type=float, + default=1.0, + help="Determines the maximum duration of a concatenated cut " + "relative to the duration of the longest cut in a batch.", + ) + group.add_argument( + "--gap", + type=float, + default=1.0, + help="The amount of padding (in seconds) inserted between " + "concatenated cuts. This padding is filled with noise when " + "noise augmentation is used.", + ) + group.add_argument( + "--on-the-fly-feats", + type=str2bool, + default=False, + help="When enabled, use on-the-fly cut mixing and feature " + "extraction. Will drop existing precomputed feature manifests " + "if available.", + ) + group.add_argument( + "--shuffle", + type=str2bool, + default=True, + help="When enabled (=default), the examples will be " + "shuffled for each epoch.", + ) + group.add_argument( + "--drop-last", + type=str2bool, + default=True, + help="Whether to drop last batch. Used by sampler.", + ) + group.add_argument( + "--return-cuts", + type=str2bool, + default=True, + help="When enabled, each batch will have the " + "field: batch['supervisions']['cut'] with the cuts that " + "were used to construct it.", + ) + + group.add_argument( + "--num-workers", + type=int, + default=2, + help="The number of training dataloader workers that " + "collect the batches.", + ) + + group.add_argument( + "--enable-spec-aug", + type=str2bool, + default=True, + help="When enabled, use SpecAugment for training dataset.", + ) + + group.add_argument( + "--spec-aug-time-warp-factor", + type=int, + default=80, + help="Used only when --enable-spec-aug is True. " + "It specifies the factor for time warping in SpecAugment. " + "Larger values mean more warping. " + "A value less than 1 means to disable time warp.", + ) + + group.add_argument( + "--enable-musan", + type=str2bool, + default=True, + help="When enabled, select noise from MUSAN and mix it" + "with training dataset. ", + ) + + group.add_argument( + "--input-strategy", + type=str, + default="PrecomputedFeatures", + help="AudioSamples or PrecomputedFeatures", + ) + + def train_dataloaders( + self, + cuts_train: CutSet, + sampler_state_dict: Optional[Dict[str, Any]] = None, + ) -> DataLoader: + """ + Args: + cuts_train: + CutSet for training. + sampler_state_dict: + The state dict for the training sampler. 
+ """ + transforms = [] + if self.args.enable_musan: + logging.info("Enable MUSAN") + logging.info("About to get Musan cuts") + cuts_musan = load_manifest( + self.args.manifest_dir / "musan_cuts.jsonl.gz" + ) + transforms.append( + CutMix( + cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True + ) + ) + else: + logging.info("Disable MUSAN") + + if self.args.concatenate_cuts: + logging.info( + f"Using cut concatenation with duration factor " + f"{self.args.duration_factor} and gap {self.args.gap}." + ) + # Cut concatenation should be the first transform in the list, + # so that if we e.g. mix noise in, it will fill the gaps between + # different utterances. + transforms = [ + CutConcatenate( + duration_factor=self.args.duration_factor, gap=self.args.gap + ) + ] + transforms + + input_transforms = [] + if self.args.enable_spec_aug: + logging.info("Enable SpecAugment") + logging.info( + f"Time warp factor: {self.args.spec_aug_time_warp_factor}" + ) + # Set the value of num_frame_masks according to Lhotse's version. + # In different Lhotse's versions, the default of num_frame_masks is + # different. + num_frame_masks = 10 + num_frame_masks_parameter = inspect.signature( + SpecAugment.__init__ + ).parameters["num_frame_masks"] + if num_frame_masks_parameter.default == 1: + num_frame_masks = 2 + logging.info(f"Num frame mask: {num_frame_masks}") + input_transforms.append( + SpecAugment( + time_warp_factor=self.args.spec_aug_time_warp_factor, + num_frame_masks=num_frame_masks, + features_mask_size=27, + num_feature_masks=2, + frames_mask_size=100, + ) + ) + else: + logging.info("Disable SpecAugment") + + logging.info("About to create train dataset") + train = K2SpeechRecognitionDataset( + input_strategy=eval(self.args.input_strategy)(), + cut_transforms=transforms, + input_transforms=input_transforms, + return_cuts=self.args.return_cuts, + ) + + if self.args.on_the_fly_feats: + # NOTE: the PerturbSpeed transform should be added only if we + # remove it from data prep stage. + # Add on-the-fly speed perturbation; since originally it would + # have increased epoch size by 3, we will apply prob 2/3 and use + # 3x more epochs. + # Speed perturbation probably should come first before + # concatenation, but in principle the transforms order doesn't have + # to be strict (e.g. could be randomized) + # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa + # Drop feats to be on the safe side. + train = K2SpeechRecognitionDataset( + cut_transforms=transforms, + input_strategy=OnTheFlyFeatures( + Fbank(FbankConfig(num_mel_bins=80)) + ), + input_transforms=input_transforms, + return_cuts=self.args.return_cuts, + ) + + if self.args.bucketing_sampler: + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + num_buckets=self.args.num_buckets, + drop_last=self.args.drop_last, + ) + else: + logging.info("Using SingleCutSampler.") + train_sampler = SingleCutSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + ) + logging.info("About to create train dataloader") + + if sampler_state_dict is not None: + logging.info("Loading sampler state dict") + train_sampler.load_state_dict(sampler_state_dict) + + # 'seed' is derived from the current random state, which will have + # previously been set in the main process. 
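+        # _SeedWorkers adds worker_id to this seed, so every dataloader
+        # worker draws a distinct but reproducible augmentation stream.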
+ seed = torch.randint(0, 100000, ()).item() + worker_init_fn = _SeedWorkers(seed) + + train_dl = DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=self.args.num_workers, + persistent_workers=False, + worker_init_fn=worker_init_fn, + ) + + return train_dl + + def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: + transforms = [] + if self.args.concatenate_cuts: + transforms = [ + CutConcatenate( + duration_factor=self.args.duration_factor, gap=self.args.gap + ) + ] + transforms + + logging.info("About to create dev dataset") + if self.args.on_the_fly_feats: + validate = K2SpeechRecognitionDataset( + cut_transforms=transforms, + input_strategy=OnTheFlyFeatures( + Fbank(FbankConfig(num_mel_bins=80)) + ), + return_cuts=self.args.return_cuts, + ) + else: + validate = K2SpeechRecognitionDataset( + cut_transforms=transforms, + return_cuts=self.args.return_cuts, + ) + valid_sampler = DynamicBucketingSampler( + cuts_valid, + max_duration=self.args.max_duration, + shuffle=False, + ) + logging.info("About to create dev dataloader") + valid_dl = DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=2, + persistent_workers=False, + ) + + return valid_dl + + def test_dataloaders(self, cuts: CutSet) -> DataLoader: + logging.debug("About to create test dataset") + test = K2SpeechRecognitionDataset( + input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) + if self.args.on_the_fly_feats + else eval(self.args.input_strategy)(), + return_cuts=self.args.return_cuts, + ) + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, + ) + logging.debug("About to create test dataloader") + test_dl = DataLoader( + test, + batch_size=None, + sampler=sampler, + num_workers=self.args.num_workers, + ) + return test_dl + + @lru_cache() + def train_clean_100_cuts(self) -> CutSet: + logging.info("About to get train-clean-100 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz" + ) + + @lru_cache() + def train_clean_360_cuts(self) -> CutSet: + logging.info("About to get train-clean-360 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz" + ) + + @lru_cache() + def train_other_500_cuts(self) -> CutSet: + logging.info("About to get train-other-500 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz" + ) + + @lru_cache() + def dev_clean_cuts(self) -> CutSet: + logging.info("About to get dev-clean cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz" + ) + + @lru_cache() + def dev_other_cuts(self) -> CutSet: + logging.info("About to get dev-other cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz" + ) + + @lru_cache() + def test_clean_cuts(self) -> CutSet: + logging.info("About to get test-clean cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz" + ) + + @lru_cache() + def test_other_cuts(self) -> CutSet: + logging.info("About to get test-other cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz" + ) From 7e61cb0730cb06a0361840d5633ae8200f47b0c6 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Sun, 16 Oct 2022 02:39:05 -0400 Subject: [PATCH 031/154] remove redundent file --- hyperion/torch/data/asr_datamodule.py | 454 -------------------------- 1 file 
changed, 454 deletions(-) delete mode 100644 hyperion/torch/data/asr_datamodule.py diff --git a/hyperion/torch/data/asr_datamodule.py b/hyperion/torch/data/asr_datamodule.py deleted file mode 100644 index 355ccc99..00000000 --- a/hyperion/torch/data/asr_datamodule.py +++ /dev/null @@ -1,454 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# Copyright 2022 Xiaomi Corporation (Author: Mingshuang Luo) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import inspect -import logging -from functools import lru_cache -from pathlib import Path -from typing import Any, Dict, Optional - -import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy -from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures - CutConcatenate, - CutMix, - DynamicBucketingSampler, - K2SpeechRecognitionDataset, - PrecomputedFeatures, - SingleCutSampler, - SpecAugment, -) -from lhotse.dataset.input_strategies import ( # noqa F401 For AudioSamples - AudioSamples, - OnTheFlyFeatures, -) -from lhotse.utils import fix_random_seed -from torch.utils.data import DataLoader - -from icefall.utils import str2bool - - -class _SeedWorkers: - def __init__(self, seed: int): - self.seed = seed - - def __call__(self, worker_id: int): - fix_random_seed(self.seed + worker_id) - - -class LibriSpeechAsrDataModule: - """ - DataModule for k2 ASR experiments. - It assumes there is always one train and valid dataloader, - but there can be multiple test dataloaders (e.g. LibriSpeech test-clean - and test-other). - - It contains all the common data pipeline modules used in ASR - experiments, e.g.: - - dynamic batch size, - - bucketing samplers, - - cut concatenation, - - augmentation, - - on-the-fly feature extraction - - This class should be derived for specific corpora used in ASR tasks. - """ - - def __init__(self, args: argparse.Namespace): - self.args = args - - @classmethod - def add_arguments(cls, parser: argparse.ArgumentParser): - group = parser.add_argument_group( - title="ASR data related options", - description="These options are used for the preparation of " - "PyTorch DataLoaders from Lhotse CutSet's -- they control the " - "effective batch sizes, sampling strategies, applied data " - "augmentations, etc.", - ) - group.add_argument( - "--full-libri", - type=str2bool, - default=True, - help="When enabled, use 960h LibriSpeech. " - "Otherwise, use 100h subset.", - ) - group.add_argument( - "--manifest-dir", - type=Path, - default=Path("data/fbank"), - help="Path to directory with train/valid/test cuts.", - ) - group.add_argument( - "--max-duration", - type=int, - default=200.0, - help="Maximum pooled recordings duration (seconds) in a " - "single batch. 
You can reduce it if it causes CUDA OOM.", - ) - group.add_argument( - "--bucketing-sampler", - type=str2bool, - default=True, - help="When enabled, the batches will come from buckets of " - "similar duration (saves padding frames).", - ) - group.add_argument( - "--num-buckets", - type=int, - default=30, - help="The number of buckets for the DynamicBucketingSampler" - "(you might want to increase it for larger datasets).", - ) - group.add_argument( - "--concatenate-cuts", - type=str2bool, - default=False, - help="When enabled, utterances (cuts) will be concatenated " - "to minimize the amount of padding.", - ) - group.add_argument( - "--duration-factor", - type=float, - default=1.0, - help="Determines the maximum duration of a concatenated cut " - "relative to the duration of the longest cut in a batch.", - ) - group.add_argument( - "--gap", - type=float, - default=1.0, - help="The amount of padding (in seconds) inserted between " - "concatenated cuts. This padding is filled with noise when " - "noise augmentation is used.", - ) - group.add_argument( - "--on-the-fly-feats", - type=str2bool, - default=False, - help="When enabled, use on-the-fly cut mixing and feature " - "extraction. Will drop existing precomputed feature manifests " - "if available.", - ) - group.add_argument( - "--shuffle", - type=str2bool, - default=True, - help="When enabled (=default), the examples will be " - "shuffled for each epoch.", - ) - group.add_argument( - "--drop-last", - type=str2bool, - default=True, - help="Whether to drop last batch. Used by sampler.", - ) - group.add_argument( - "--return-cuts", - type=str2bool, - default=True, - help="When enabled, each batch will have the " - "field: batch['supervisions']['cut'] with the cuts that " - "were used to construct it.", - ) - - group.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of training dataloader workers that " - "collect the batches.", - ) - - group.add_argument( - "--enable-spec-aug", - type=str2bool, - default=True, - help="When enabled, use SpecAugment for training dataset.", - ) - - group.add_argument( - "--spec-aug-time-warp-factor", - type=int, - default=80, - help="Used only when --enable-spec-aug is True. " - "It specifies the factor for time warping in SpecAugment. " - "Larger values mean more warping. " - "A value less than 1 means to disable time warp.", - ) - - group.add_argument( - "--enable-musan", - type=str2bool, - default=True, - help="When enabled, select noise from MUSAN and mix it" - "with training dataset. ", - ) - - group.add_argument( - "--input-strategy", - type=str, - default="PrecomputedFeatures", - help="AudioSamples or PrecomputedFeatures", - ) - - def train_dataloaders( - self, - cuts_train: CutSet, - sampler_state_dict: Optional[Dict[str, Any]] = None, - ) -> DataLoader: - """ - Args: - cuts_train: - CutSet for training. - sampler_state_dict: - The state dict for the training sampler. - """ - transforms = [] - if self.args.enable_musan: - logging.info("Enable MUSAN") - logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "musan_cuts.jsonl.gz" - ) - transforms.append( - CutMix( - cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True - ) - ) - else: - logging.info("Disable MUSAN") - - if self.args.concatenate_cuts: - logging.info( - f"Using cut concatenation with duration factor " - f"{self.args.duration_factor} and gap {self.args.gap}." - ) - # Cut concatenation should be the first transform in the list, - # so that if we e.g. 
mix noise in, it will fill the gaps between - # different utterances. - transforms = [ - CutConcatenate( - duration_factor=self.args.duration_factor, gap=self.args.gap - ) - ] + transforms - - input_transforms = [] - if self.args.enable_spec_aug: - logging.info("Enable SpecAugment") - logging.info( - f"Time warp factor: {self.args.spec_aug_time_warp_factor}" - ) - # Set the value of num_frame_masks according to Lhotse's version. - # In different Lhotse's versions, the default of num_frame_masks is - # different. - num_frame_masks = 10 - num_frame_masks_parameter = inspect.signature( - SpecAugment.__init__ - ).parameters["num_frame_masks"] - if num_frame_masks_parameter.default == 1: - num_frame_masks = 2 - logging.info(f"Num frame mask: {num_frame_masks}") - input_transforms.append( - SpecAugment( - time_warp_factor=self.args.spec_aug_time_warp_factor, - num_frame_masks=num_frame_masks, - features_mask_size=27, - num_feature_masks=2, - frames_mask_size=100, - ) - ) - else: - logging.info("Disable SpecAugment") - - logging.info("About to create train dataset") - train = K2SpeechRecognitionDataset( - input_strategy=eval(self.args.input_strategy)(), - cut_transforms=transforms, - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if self.args.on_the_fly_feats: - # NOTE: the PerturbSpeed transform should be added only if we - # remove it from data prep stage. - # Add on-the-fly speed perturbation; since originally it would - # have increased epoch size by 3, we will apply prob 2/3 and use - # 3x more epochs. - # Speed perturbation probably should come first before - # concatenation, but in principle the transforms order doesn't have - # to be strict (e.g. could be randomized) - # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa - # Drop feats to be on the safe side. - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures( - Fbank(FbankConfig(num_mel_bins=80)) - ), - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if self.args.bucketing_sampler: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - drop_last=self.args.drop_last, - ) - else: - logging.info("Using SingleCutSampler.") - train_sampler = SingleCutSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - ) - logging.info("About to create train dataloader") - - if sampler_state_dict is not None: - logging.info("Loading sampler state dict") - train_sampler.load_state_dict(sampler_state_dict) - - # 'seed' is derived from the current random state, which will have - # previously been set in the main process. 
- seed = torch.randint(0, 100000, ()).item() - worker_init_fn = _SeedWorkers(seed) - - train_dl = DataLoader( - train, - sampler=train_sampler, - batch_size=None, - num_workers=self.args.num_workers, - persistent_workers=False, - worker_init_fn=worker_init_fn, - ) - - return train_dl - - def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: - transforms = [] - if self.args.concatenate_cuts: - transforms = [ - CutConcatenate( - duration_factor=self.args.duration_factor, gap=self.args.gap - ) - ] + transforms - - logging.info("About to create dev dataset") - if self.args.on_the_fly_feats: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures( - Fbank(FbankConfig(num_mel_bins=80)) - ), - return_cuts=self.args.return_cuts, - ) - else: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - return_cuts=self.args.return_cuts, - ) - valid_sampler = DynamicBucketingSampler( - cuts_valid, - max_duration=self.args.max_duration, - shuffle=False, - ) - logging.info("About to create dev dataloader") - valid_dl = DataLoader( - validate, - sampler=valid_sampler, - batch_size=None, - num_workers=2, - persistent_workers=False, - ) - - return valid_dl - - def test_dataloaders(self, cuts: CutSet) -> DataLoader: - logging.debug("About to create test dataset") - test = K2SpeechRecognitionDataset( - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if self.args.on_the_fly_feats - else eval(self.args.input_strategy)(), - return_cuts=self.args.return_cuts, - ) - sampler = DynamicBucketingSampler( - cuts, - max_duration=self.args.max_duration, - shuffle=False, - ) - logging.debug("About to create test dataloader") - test_dl = DataLoader( - test, - batch_size=None, - sampler=sampler, - num_workers=self.args.num_workers, - ) - return test_dl - - @lru_cache() - def train_clean_100_cuts(self) -> CutSet: - logging.info("About to get train-clean-100 cuts") - return load_manifest_lazy( - self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz" - ) - - @lru_cache() - def train_clean_360_cuts(self) -> CutSet: - logging.info("About to get train-clean-360 cuts") - return load_manifest_lazy( - self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz" - ) - - @lru_cache() - def train_other_500_cuts(self) -> CutSet: - logging.info("About to get train-other-500 cuts") - return load_manifest_lazy( - self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz" - ) - - @lru_cache() - def dev_clean_cuts(self) -> CutSet: - logging.info("About to get dev-clean cuts") - return load_manifest_lazy( - self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz" - ) - - @lru_cache() - def dev_other_cuts(self) -> CutSet: - logging.info("About to get dev-other cuts") - return load_manifest_lazy( - self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz" - ) - - @lru_cache() - def test_clean_cuts(self) -> CutSet: - logging.info("About to get test-clean cuts") - return load_manifest_lazy( - self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz" - ) - - @lru_cache() - def test_other_cuts(self) -> CutSet: - logging.info("About to get test-other cuts") - return load_manifest_lazy( - self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz" - ) From d8efa4f77885b4784783a482ce07787041efae08 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 17 Oct 2022 14:16:43 -0400 Subject: [PATCH 032/154] added script for tsne plots of lre --- hyperion/bin/plot-vector-tsne.py | 209 ----------- 
hyperion/bin/plot_embedding_tsne.py | 345 ++++++++++++++++++ hyperion/bin/plot_embedding_tsne_per_class.py | 198 ++++++++++ .../apply-mvn-select-frames.py | 0 .../compute-mfcc-feats.py | 0 hyperion/{bin => bin_deprec2}/copy-feats.py | 0 .../{bin => bin_deprec2}/eval-cos-1vs1.py | 0 .../eval-linear-gbe-up.py | 0 .../{bin => bin_deprec2}/eval-linear-gbe.py | 0 .../{bin => bin_deprec2}/eval-linear-svmc.py | 0 .../eval-logistic-regression.py | 0 .../{bin => bin_deprec2}/eval-plda-1vs1.py | 0 .../{bin => bin_deprec2}/eval-plda-nvs1.py | 0 .../{bin => bin_deprec2}/merge-h5-files.py | 0 .../{bin => bin_deprec2}/pack-audio-files.py | 0 .../{bin => bin_deprec2}/plot-vector-hist.py | 0 .../{bin => bin_deprec2}/rttm-to-bin-vad.py | 0 .../segments-to-bin-vad.py | 0 .../torch-adv-finetune-xvec-from-wav.py | 0 .../torch-adv-finetune-xvec.py | 0 .../torch-compute-mfcc-feats.py | 0 .../{bin => bin_deprec2}/torch-eval-vae.py | 0 ...osine-scoring-from-adv-test-wav-wavegan.py | 0 ...l-xvec-cosine-scoring-from-adv-test-wav.py | 0 ...l-xvec-cosine-scoring-from-art-test-wav.py | 0 ...-eval-xvec-cosine-scoring-from-test-wav.py | 0 ...sine-scoring-from-transfer-adv-test-wav.py | 0 ...sine-scoring-from-transfer-art-test-wav.py | 0 .../torch-eval-xvec-logits-from-wav.py | 0 ...rch-extract-xvectors-from-wav-with-rttm.py | 0 ...torch-extract-xvectors-slidwin-from-wav.py | 0 .../torch-extract-xvectors-slidwin.py | 0 .../torch-extract-xvectors-vae-preproc.py | 0 .../torch-extract-xvectors.py | 0 ...ch-generate-adv-attacks-xvector-classif.py | 0 ...orch-generate-adv-attacks-xvector-verif.py | 0 .../torch-train-dc1d-ae.py | 0 .../{bin => bin_deprec2}/torch-train-dvae.py | 0 .../torch-train-efficientnet-xvec-from-wav.py | 0 .../torch-train-efficientnet-xvec.py | 0 .../torch-train-resnet-xvec-from-wav.py | 0 .../torch-train-resnet-xvec.py | 0 .../torch-train-resnet1d-xvec-from-wav.py | 0 .../torch-train-spinenet-xvec-from-wav.py | 0 .../torch-train-tdnn-xvec-from-wav.py | 0 .../torch-train-tdnn-xvec.py | 0 ...orch-train-transformer-xvec-v1-from-wav.py | 0 .../torch-train-transformer-xvec-v1.py | 0 .../{bin => bin_deprec2}/torch-train-vae.py | 0 .../torch-train-vq-dvae.py | 0 .../torch-train-vq-vae.py | 0 hyperion/{bin => bin_deprec2}/train-cw-up.py | 0 hyperion/{bin => bin_deprec2}/train-cw.py | 0 .../train-gaussianizer.py | 0 hyperion/{bin => bin_deprec2}/train-lda.py | 0 .../train-linear-gbe-up.py | 0 .../{bin => bin_deprec2}/train-linear-gbe.py | 0 .../{bin => bin_deprec2}/train-linear-svmc.py | 0 .../train-logistic-regression.py | 0 hyperion/{bin => bin_deprec2}/train-mvn.py | 0 hyperion/{bin => bin_deprec2}/train-nda.py | 0 hyperion/{bin => bin_deprec2}/train-pca.py | 0 hyperion/{bin => bin_deprec2}/train-plda.py | 0 .../data/class_weighted_seg_chunk_sampler.py | 106 +++++- hyperion/utils/info_table.py | 3 + 65 files changed, 643 insertions(+), 218 deletions(-) delete mode 100755 hyperion/bin/plot-vector-tsne.py create mode 100755 hyperion/bin/plot_embedding_tsne.py create mode 100755 hyperion/bin/plot_embedding_tsne_per_class.py rename hyperion/{bin => bin_deprec2}/apply-mvn-select-frames.py (100%) rename hyperion/{bin => bin_deprec2}/compute-mfcc-feats.py (100%) rename hyperion/{bin => bin_deprec2}/copy-feats.py (100%) rename hyperion/{bin => bin_deprec2}/eval-cos-1vs1.py (100%) rename hyperion/{bin => bin_deprec2}/eval-linear-gbe-up.py (100%) rename hyperion/{bin => bin_deprec2}/eval-linear-gbe.py (100%) rename hyperion/{bin => bin_deprec2}/eval-linear-svmc.py (100%) rename hyperion/{bin => 
bin_deprec2}/eval-logistic-regression.py (100%) rename hyperion/{bin => bin_deprec2}/eval-plda-1vs1.py (100%) rename hyperion/{bin => bin_deprec2}/eval-plda-nvs1.py (100%) rename hyperion/{bin => bin_deprec2}/merge-h5-files.py (100%) rename hyperion/{bin => bin_deprec2}/pack-audio-files.py (100%) rename hyperion/{bin => bin_deprec2}/plot-vector-hist.py (100%) rename hyperion/{bin => bin_deprec2}/rttm-to-bin-vad.py (100%) rename hyperion/{bin => bin_deprec2}/segments-to-bin-vad.py (100%) rename hyperion/{bin => bin_deprec2}/torch-adv-finetune-xvec-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-adv-finetune-xvec.py (100%) rename hyperion/{bin => bin_deprec2}/torch-compute-mfcc-feats.py (100%) rename hyperion/{bin => bin_deprec2}/torch-eval-vae.py (100%) rename hyperion/{bin => bin_deprec2}/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py (100%) rename hyperion/{bin => bin_deprec2}/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-eval-xvec-cosine-scoring-from-art-test-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-eval-xvec-cosine-scoring-from-test-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-eval-xvec-logits-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-extract-xvectors-from-wav-with-rttm.py (100%) rename hyperion/{bin => bin_deprec2}/torch-extract-xvectors-slidwin-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-extract-xvectors-slidwin.py (100%) rename hyperion/{bin => bin_deprec2}/torch-extract-xvectors-vae-preproc.py (100%) rename hyperion/{bin => bin_deprec2}/torch-extract-xvectors.py (100%) rename hyperion/{bin => bin_deprec2}/torch-generate-adv-attacks-xvector-classif.py (100%) rename hyperion/{bin => bin_deprec2}/torch-generate-adv-attacks-xvector-verif.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-dc1d-ae.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-dvae.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-efficientnet-xvec-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-efficientnet-xvec.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-resnet-xvec-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-resnet-xvec.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-resnet1d-xvec-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-spinenet-xvec-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-tdnn-xvec-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-tdnn-xvec.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-transformer-xvec-v1-from-wav.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-transformer-xvec-v1.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-vae.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-vq-dvae.py (100%) rename hyperion/{bin => bin_deprec2}/torch-train-vq-vae.py (100%) rename hyperion/{bin => bin_deprec2}/train-cw-up.py (100%) rename hyperion/{bin => bin_deprec2}/train-cw.py (100%) rename hyperion/{bin => bin_deprec2}/train-gaussianizer.py (100%) rename hyperion/{bin => bin_deprec2}/train-lda.py (100%) rename hyperion/{bin => bin_deprec2}/train-linear-gbe-up.py (100%) rename hyperion/{bin => bin_deprec2}/train-linear-gbe.py (100%) rename hyperion/{bin => 
bin_deprec2}/train-linear-svmc.py (100%) rename hyperion/{bin => bin_deprec2}/train-logistic-regression.py (100%) rename hyperion/{bin => bin_deprec2}/train-mvn.py (100%) rename hyperion/{bin => bin_deprec2}/train-nda.py (100%) rename hyperion/{bin => bin_deprec2}/train-pca.py (100%) rename hyperion/{bin => bin_deprec2}/train-plda.py (100%) diff --git a/hyperion/bin/plot-vector-tsne.py b/hyperion/bin/plot-vector-tsne.py deleted file mode 100755 index c4c30302..00000000 --- a/hyperion/bin/plot-vector-tsne.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import sys -import os -import argparse -import time -import logging - -import numpy as np -import matplotlib - -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D as plt3d - -from sklearn.manifold import TSNE - -from hyperion.hyp_defs import config_logger -from hyperion.io import DataWriterFactory as DWF -from hyperion.helpers import VectorClassReader as VCR -from hyperion.np.transforms import TransformList, PCA - -colors = ["b", "g", "r", "c", "m", "y", "k"] -markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] - - -def plot_vector_tsne( - iv_file, - v_list, - preproc_file, - output_path, - save_embed, - output_dim, - perplexity, - exag, - lr, - num_iter, - init_method, - rng_seed, - verbose, - pca_dim, - max_classes, - **kwargs -): - - if preproc_file is not None: - preproc = TransformList.load(preproc_file) - else: - preproc = None - - vr_args = VCR.filter_args(**kwargs) - vcr = VCR(iv_file, v_list, preproc, **vr_args) - - x, class_ids = vcr.read() - - t1 = time.time() - - if pca_dim > 0: - pca = PCA(pca_dim=pca_dim) - pca.fit(x) - x = pca.predict(x) - - if not os.path.exists(output_path): - os.makedirs(ouput_path) - - tsne_obj = lambda n: TSNE( - n_components=n, - perplexity=perplexity, - early_exaggeration=exag, - learning_rate=lr, - n_iter=num_iter, - init=init_method, - random_state=rng_seed, - verbose=verbose, - ) - - if max_classes > 0: - index = class_ids < max_classes - x = x[index] - class_ids = class_ids[index] - - if output_dim > 3: - tsne = tsne_obj(output_dim) - y = tsne.fit_transform(x) - - if save_embed: - h5_file = "%s/embed_%dd.h5" % (output_path, ouput_dim) - hw = DWF.create(h5_file) - hw.write(vcr.u2c.key, y) - - tsne = tsne_obj(2) - y = tsne.fit_transform(x) - if save_embed: - h5_file = "%s/embed_2d.h5" % output_path - hw = DWF.create(h5_file) - hw.write(vcr.u2c.key, y) - - fig_file = "%s/tsne_2d.pdf" % (output_path) - # plt.scatter(y[:,0], y[:,1], c=class_ids, marker='x') - - color_marker = [(c, m) for m in markers for c in colors] - for c in np.unique(class_ids): - idx = class_ids == c - plt.scatter( - y[idx, 0], - y[idx, 1], - c=color_marker[c][0], - marker=color_marker[c][1], - label=vcr.class_names[c], - ) - - plt.legend() - plt.grid(True) - plt.show() - plt.savefig(fig_file) - plt.clf() - - # if max_classes > 0: - # fig_file = '%s/tsne_2d_n%d.pdf' % (output_path, max_classes) - # index = class_ids < max_classes - # plt.scatter(y[index,0], y[index,1], c=class_ids[index], marker='x') - # plt.grid(True) - # plt.show() - # plt.savefig(fig_file) - # plt.clf() - - tsne = tsne_obj(3) - y = tsne.fit_transform(x) - if save_embed: - h5_file = "%s/embed_3d.h5" % output_path - hw = DWF.create(h5_file) - hw.write(vcr.u2c.key, y) - - fig_file = "%s/tsne_3d.pdf" % (output_path) - fig = plt.figure() - ax = fig.add_subplot(111, 
projection="3d") - # ax.scatter(y[:,0], y[:,1], y[:,2], c=class_ids, marker='x') - for c in np.unique(class_ids): - idx = class_ids == c - ax.scatter( - y[idx, 0], - y[idx, 1], - y[idx, 2], - c=color_marker[c][0], - marker=color_marker[c][1], - label=vcr.class_names[c], - ) - - plt.grid(True) - plt.show() - plt.savefig(fig_file) - plt.clf() - - # if max_classes > 0: - # fig_file = '%s/tsne_3d_n%d.pdf' % (output_path, max_classes) - # index = class_ids < max_classes - # ax = fig.add_subplot(111, projection='3d') - # ax.scatter(y[index,0], y[index,1], y[index,2], c=class_ids[index], marker='x') - # plt.grid(True) - # plt.show() - # plt.savefig(fig_file) - # plt.clf() - - logging.info("Elapsed time: %.2f s." % (time.time() - t1)) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars="@", - description="Plots TSNE embeddings", - ) - - parser.add_argument("--iv-file", dest="iv_file", required=True) - parser.add_argument("--v-list", dest="v_list", required=True) - parser.add_argument("--preproc-file", dest="preproc_file", default=None) - - VCR.add_argparse_args(parser) - - parser.add_argument("--output-path", dest="output_path", required=True) - parser.add_argument( - "--save-embed", dest="save_embed", default=False, action="store_true" - ) - - parser.add_argument("--output-dim", dest="output_dim", type=int, default=3) - parser.add_argument("--perplexity", dest="perplexity", type=float, default=30) - parser.add_argument("--exag", dest="exag", type=float, default=12) - parser.add_argument("--lr", dest="lr", type=float, default=200) - parser.add_argument("--num-iter", dest="num_iter", type=int, default=1000) - parser.add_argument( - "--init-method", dest="init_method", default="pca", choices=["random", "pca"] - ) - parser.add_argument("--rng-seed", dest="rng_seed", type=int, default=1024) - parser.add_argument("--pca-dim", dest="pca_dim", type=int, default=50) - parser.add_argument("--max-classes", dest="max_classes", type=int, default=10) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - logging.debug(args) - - plot_vector_tsne(**vars(args)) diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py new file mode 100755 index 00000000..e514252f --- /dev/null +++ b/hyperion/bin/plot_embedding_tsne.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np +import pandas as pd +import matplotlib + +import matplotlib.pyplot as plt + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import PCA, SklTSNE, LNorm + +matplotlib.use("Agg") +colors = ["b", "g", "r", "c", "m", "y", "k"] +markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] + +color_marker = [(c, m) for m in markers for c in colors] + + +def plot_embedding_tsne( + train_v_file, + train_list, + pca_var_r, + prob_plot, + lnorm, + title, + max_classes, + unlabeled, + plot_class_names, + output_dir, + **kwargs, +): + + output_dir = 
Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    logging.info("loading data")
+    train_segs = SegmentSet.load(train_list)
+    train_reader = DRF.create(train_v_file)
+    x_trn = train_reader.read(train_segs["id"], squeeze=True)
+    del train_reader
+    logging.info("loaded %d samples", x_trn.shape[0])
+    if lnorm:
+        x_trn = LNorm().predict(x_trn)
+
+    if pca_var_r < 1:
+        pca = PCA(pca_var_r=pca_var_r)
+        pca.fit(x_trn)
+        x_pca = pca.predict(x_trn)
+        logging.info("pca-dim=%d", x_pca.shape[1])
+    else:
+        x_pca = x_trn
+
+    tsne_args = SklTSNE.filter_args(**kwargs["tsne"])
+    tsne = SklTSNE(**tsne_args)
+    x_tsne = tsne.fit(x_pca)
+    p = np.random.rand(x_tsne.shape[0]) <= prob_plot
+    x_tsne = x_tsne[p]
+    logging.info("plots %d samples", x_tsne.shape[0])
+
+    if unlabeled:
+        plot_class_names = ["none"]
+
+    for col in plot_class_names:
+        fig_file = f"{output_dir}/train_tsne_{col}.png"
+        if not unlabeled:
+            classes = train_segs.loc[p, col]
+            classes, class_ids = np.unique(classes, return_inverse=True)
+            if max_classes is not None:
+                index = class_ids < max_classes
+                x_tsne_filtered = x_tsne[index]
+                class_ids = class_ids[index]
+            else:
+                x_tsne_filtered = x_tsne
+
+        else:
+            # all samples go into a single dummy class
+            x_tsne_filtered = x_tsne
+            class_ids = np.zeros((x_tsne.shape[0],), dtype=int)
+            classes = [None]
+
+        for c in range(np.max(class_ids) + 1):
+            idx = class_ids == c
+            if not unlabeled:
+                logging.info("plot class %s with %d samples", classes[c], np.sum(idx))
+            plt.scatter(
+                x_tsne_filtered[idx, 0],
+                x_tsne_filtered[idx, 1],
+                c=color_marker[c][0],
+                marker=color_marker[c][1],
+                label=classes[c],
+            )
+
+        if not unlabeled:
+            plt.legend()
+        plt.grid(True)
+        plt.title(title)
+        plt.savefig(fig_file)
+        plt.clf()
+
+    # fig_file = "%s/tsne_3d.pdf" % (output_dir)
+    # fig = plt.figure()
+    # ax = fig.add_subplot(111, projection="3d")
+    # # ax.scatter(y[:,0], y[:,1], y[:,2], c=class_ids, marker='x')
+    # for c in np.unique(class_ids):
+    #     idx = class_ids == c
+    #     ax.scatter(
+    #         y[idx, 0],
+    #         y[idx, 1],
+    #         y[idx, 2],
+    #         c=color_marker[c][0],
+    #         marker=color_marker[c][1],
+    #         label=vcr.class_names[c],
+    #     )
+
+    # plt.grid(True)
+    # plt.show()
+    # plt.savefig(fig_file)
+    # plt.clf()
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Projects embeddings using TSNE")
+
+    parser.add_argument("--train-v-file", required=True)
+    parser.add_argument("--train-list", required=True)
+
+    parser.add_argument("--pca-var-r", default=0.95, type=float)
+    parser.add_argument("--prob-plot", default=0.1, type=float)
+    parser.add_argument("--lnorm", default=False, action=ActionYesNo)
+    parser.add_argument("--unlabeled", default=False, action=ActionYesNo)
+    parser.add_argument(
+        "--plot-class-names",
+        default=["class_id"],
+        nargs="+",
+        help="names of the class columns we plot",
+    )
+    parser.add_argument("--title", default="")
+    SklTSNE.add_class_args(parser, prefix="tsne")
+
+    parser.add_argument(
+        "--max-classes", default=None, type=int, help="max number of classes to plot"
+    )
+    parser.add_argument("--output-dir", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    plot_embedding_tsne(**namespace_to_dict(args))
+
+
+# #!/usr/bin/env python
+# """
+#  Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# """
+
+# import sys
+# import os
+# from jsonargparse import (
+#     ArgumentParser,
+#     ActionConfigFile,
+# ActionParser, +# namespace_to_dict, +# ) +# import time +# import logging + +# import numpy as np +# import pandas as pd +# import matplotlib + +# import matplotlib.pyplot as plt +# from mpl_toolkits.mplot3d import Axes3D as plt3d + +# from sklearn.manifold import TSNE + +# from hyperion.hyp_defs import config_logger +# from hyperion.io import DataWriterFactory as DWF +# from hyperion.helpers import VectorClassReader as VCR +# from hyperion.np.transforms import TransformList, PCA + +# matplotlib.use("Agg") +# colors = ["b", "g", "r", "c", "m", "y", "k"] +# markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] + + +# def plot_embedding_tsne( +# v_file, +# v_list, +# preproc_file, +# output_dir, +# save_embed, +# output_dim, +# perplexity, +# exag, +# lr, +# num_iter, +# init_method, +# rng_seed, +# verbose, +# pca_dim, +# max_classes, +# **kwargs +# ): + +# if preproc_file is not None: +# preproc = TransformList.load(preproc_file) +# else: +# preproc = None + +# vr_args = VCR.filter_args(**kwargs) +# vcr = VCR(iv_file, v_list, preproc, **vr_args) + +# x, class_ids = vcr.read() + +# t1 = time.time() + +# if pca_dim > 0: +# pca = PCA(pca_dim=pca_dim) +# pca.fit(x) +# x = pca.predict(x) + +# if not os.path.exists(output_path): +# os.makedirs(ouput_path) + +# tsne_obj = lambda n: TSNE( +# n_components=n, +# perplexity=perplexity, +# early_exaggeration=exag, +# learning_rate=lr, +# n_iter=num_iter, +# init=init_method, +# random_state=rng_seed, +# verbose=verbose, +# ) + +# if max_classes > 0: +# index = class_ids < max_classes +# x = x[index] +# class_ids = class_ids[index] + +# if output_dim > 3: +# tsne = tsne_obj(output_dim) +# y = tsne.fit_transform(x) + +# if save_embed: +# h5_file = "%s/embed_%dd.h5" % (output_path, ouput_dim) +# hw = DWF.create(h5_file) +# hw.write(vcr.u2c.key, y) + +# tsne = tsne_obj(2) +# y = tsne.fit_transform(x) +# if save_embed: +# h5_file = "%s/embed_2d.h5" % output_path +# hw = DWF.create(h5_file) +# hw.write(vcr.u2c.key, y) + +# fig_file = "%s/tsne_2d.pdf" % (output_path) +# # plt.scatter(y[:,0], y[:,1], c=class_ids, marker='x') + +# color_marker = [(c, m) for m in markers for c in colors] +# for c in np.unique(class_ids): +# idx = class_ids == c +# plt.scatter( +# y[idx, 0], +# y[idx, 1], +# c=color_marker[c][0], +# marker=color_marker[c][1], +# label=vcr.class_names[c], +# ) + +# plt.legend() +# plt.grid(True) +# plt.show() +# plt.savefig(fig_file) +# plt.clf() + +# # if max_classes > 0: +# # fig_file = '%s/tsne_2d_n%d.pdf' % (output_path, max_classes) +# # index = class_ids < max_classes +# # plt.scatter(y[index,0], y[index,1], c=class_ids[index], marker='x') +# # plt.grid(True) +# # plt.show() +# # plt.savefig(fig_file) +# # plt.clf() + +# tsne = tsne_obj(3) +# y = tsne.fit_transform(x) +# if save_embed: +# h5_file = "%s/embed_3d.h5" % output_path +# hw = DWF.create(h5_file) +# hw.write(vcr.u2c.key, y) + +# fig_file = "%s/tsne_3d.pdf" % (output_path) +# fig = plt.figure() +# ax = fig.add_subplot(111, projection="3d") +# # ax.scatter(y[:,0], y[:,1], y[:,2], c=class_ids, marker='x') +# for c in np.unique(class_ids): +# idx = class_ids == c +# ax.scatter( +# y[idx, 0], +# y[idx, 1], +# y[idx, 2], +# c=color_marker[c][0], +# marker=color_marker[c][1], +# label=vcr.class_names[c], +# ) + +# plt.grid(True) +# plt.show() +# plt.savefig(fig_file) +# plt.clf() + +# # if max_classes > 0: +# # fig_file = '%s/tsne_3d_n%d.pdf' % (output_path, max_classes) +# # index = class_ids < max_classes +# # ax = fig.add_subplot(111, projection='3d') +# # 
ax.scatter(y[index,0], y[index,1], y[index,2], c=class_ids[index], marker='x')
+# #     plt.grid(True)
+# #     plt.show()
+# #     plt.savefig(fig_file)
+# #     plt.clf()
+
+#     logging.info("Elapsed time: %.2f s." % (time.time() - t1))
diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py
new file mode 100755
index 00000000..5730cc06
--- /dev/null
+++ b/hyperion/bin/plot_embedding_tsne_per_class.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+import sys
+import os
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+    ActionYesNo,
+)
+import time
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import matplotlib
+
+import matplotlib.pyplot as plt
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import SegmentSet
+from hyperion.utils.math import cosine_scoring
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.np.transforms import PCA, SklTSNE, LNorm
+from hyperion.np.clustering import AHC
+
+
+matplotlib.use("Agg")
+colors = ["b", "g", "r", "c", "m", "y", "k"]
+markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"]
+
+color_marker = [(c, m) for m in markers for c in colors]
+
+
+def plot_embedding_tsne(
+    train_v_file,
+    train_list,
+    pca_var_r,
+    prob_plot,
+    lnorm,
+    title,
+    max_classes,
+    plot_class_name,
+    do_ahc,
+    cluster_tsne,
+    num_clusters,
+    ahc_thr,
+    output_dir,
+    **kwargs,
+):
+
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    logging.info("loading data")
+    train_segs = SegmentSet.load(train_list)
+    train_reader = DRF.create(train_v_file)
+    x_trn = train_reader.read(train_segs["id"], squeeze=True)
+    del train_reader
+    logging.info("loaded %d samples", x_trn.shape[0])
+    if lnorm:
+        x_trn = LNorm().predict(x_trn)
+
+    if pca_var_r < 1:
+        pca = PCA(pca_var_r=pca_var_r)
+        pca.fit(x_trn)
+        x_pca = pca.predict(x_trn)
+        logging.info("pca-dim=%d", x_pca.shape[1])
+    else:
+        x_pca = x_trn
+
+    class_ids = train_segs[plot_class_name]
+    classes, class_idx = np.unique(class_ids, return_inverse=True)
+    if max_classes is not None:
+        index = class_idx < max_classes
+        x_pca = x_pca[index]
+        class_idx = class_idx[index]
+
+    tsne_args = SklTSNE.filter_args(**kwargs["tsne"])
+    tsne = SklTSNE(**tsne_args)
+    if do_ahc:
+        ahc = AHC()
+        global_subclass_idx = np.zeros_like(class_idx)
+
+    for c in range(np.max(class_idx) + 1):
+        fig_file = f"{output_dir}/train_tsne_{plot_class_name}_{classes[c]}.png"
+        idx = class_idx == c
+        logging.info("plot class %s with %d samples", classes[c], np.sum(idx))
+        x_c = x_pca[idx]
+        x_tsne = tsne.fit(x_c)
+        if do_ahc:
+            if cluster_tsne:
+                # in the low dim space, we cannot use cosine scoring
+                x2 = np.sum(x_tsne ** 2, axis=1)[:, None]
+                d2 = x2 - 2 * np.dot(x_tsne, x_tsne.T) + x2.T
+                # clip tiny negative values caused by rounding before the sqrt
+                scores = -np.sqrt(np.maximum(d2, 0))
+            else:
+                scores = cosine_scoring(x_c, x_c)
+            ahc.fit(scores)
+            if num_clusters is None:
+                subclass_idx_c = ahc.get_flat_clusters(ahc_thr)
+            else:
+                subclass_idx_c = ahc.get_flat_clusters(num_clusters, "num_clusters")
+            global_subclass_idx[idx] = subclass_idx_c
+
+        p = np.random.rand(x_tsne.shape[0]) <= prob_plot
+        x_tsne = x_tsne[p]
+        logging.info("plots %d samples", x_tsne.shape[0])
+        if do_ahc:
+            subclass_idx_c = subclass_idx_c[p]
+            for sc in range(min(np.max(subclass_idx_c) + 1, len(color_marker))):
+                idx_sc = subclass_idx_c == sc
+                plt.scatter(
+                    x_tsne[idx_sc, 0],
+                    x_tsne[idx_sc, 1],
+                    c=color_marker[sc][0],
+                    marker=color_marker[sc][1],
+                )
+        else:
+            plt.scatter(
+                x_tsne[:, 0],
+                x_tsne[:, 1],
+                c=color_marker[0][0],
+                marker=color_marker[0][1],
+            )
+
+        # plt.legend()
+        plt.grid(True)
+        plt.title(f"{title} {classes[c]}")
+        plt.savefig(fig_file)
+        plt.clf()
+
+    if do_ahc:
+        # subclass_ids = [f"{a}-{b}" for a, b in zip(class_ids, global_subclass_idx)]
+        # _, subclass_idx = np.unique(subclass_ids, return_inverse=True)
+        # train_segs["subclass_id"] = subclass_ids
+        train_segs["subclass_idx"] = global_subclass_idx
+        train_segs.save(output_dir / "segments.csv")
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(
+        description=(
+            "Projects embeddings using TSNE, "
+            "plots a TSNE per class to discover subclusters inside the classes"
+        )
+    )
+
+    parser.add_argument("--train-v-file", required=True)
+    parser.add_argument("--train-list", required=True)
+
+    parser.add_argument("--pca-var-r", default=0.95, type=float)
+    parser.add_argument("--prob-plot", default=0.1, type=float)
+    parser.add_argument("--lnorm", default=False, action=ActionYesNo)
+    parser.add_argument(
+        "--plot-class-name",
+        default="class_id",
+        help="name of the class column we plot",
+    )
+    parser.add_argument("--title", default="")
+    SklTSNE.add_class_args(parser, prefix="tsne")
+
+    parser.add_argument(
+        "--max-classes", default=None, type=int, help="max number of classes to plot"
+    )
+    parser.add_argument(
+        "--do-ahc", default=False, action=ActionYesNo, help="Do AHC on each class"
+    )
+    parser.add_argument(
+        "--cluster-tsne",
+        default=False,
+        action=ActionYesNo,
+        help="if true, clustering is done after TSNE, otherwise after PCA",
+    )
+
+    parser.add_argument(
+        "--num-clusters",
+        default=None,
+        type=int,
+        help="if given, number of clusters for AHC; overrides ahc-thr",
+    )
+    parser.add_argument("--ahc-thr", default=0.7, type=float, help="AHC threshold")
+    parser.add_argument("--output-dir", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    plot_embedding_tsne(**namespace_to_dict(args))
diff --git a/hyperion/bin/apply-mvn-select-frames.py b/hyperion/bin_deprec2/apply-mvn-select-frames.py
similarity index 100%
rename from hyperion/bin/apply-mvn-select-frames.py
rename to hyperion/bin_deprec2/apply-mvn-select-frames.py
diff --git a/hyperion/bin/compute-mfcc-feats.py b/hyperion/bin_deprec2/compute-mfcc-feats.py
similarity index 100%
rename from hyperion/bin/compute-mfcc-feats.py
rename to hyperion/bin_deprec2/compute-mfcc-feats.py
diff --git a/hyperion/bin/copy-feats.py b/hyperion/bin_deprec2/copy-feats.py
similarity index 100%
rename from hyperion/bin/copy-feats.py
rename to hyperion/bin_deprec2/copy-feats.py
diff --git a/hyperion/bin/eval-cos-1vs1.py b/hyperion/bin_deprec2/eval-cos-1vs1.py
similarity index 100%
rename from hyperion/bin/eval-cos-1vs1.py
rename to hyperion/bin_deprec2/eval-cos-1vs1.py
diff --git a/hyperion/bin/eval-linear-gbe-up.py b/hyperion/bin_deprec2/eval-linear-gbe-up.py
similarity index 100%
rename from hyperion/bin/eval-linear-gbe-up.py
rename to hyperion/bin_deprec2/eval-linear-gbe-up.py
diff --git a/hyperion/bin/eval-linear-gbe.py b/hyperion/bin_deprec2/eval-linear-gbe.py
similarity index 100%
rename from hyperion/bin/eval-linear-gbe.py
rename to hyperion/bin_deprec2/eval-linear-gbe.py
diff --git
a/hyperion/bin/eval-linear-svmc.py b/hyperion/bin_deprec2/eval-linear-svmc.py similarity index 100% rename from hyperion/bin/eval-linear-svmc.py rename to hyperion/bin_deprec2/eval-linear-svmc.py diff --git a/hyperion/bin/eval-logistic-regression.py b/hyperion/bin_deprec2/eval-logistic-regression.py similarity index 100% rename from hyperion/bin/eval-logistic-regression.py rename to hyperion/bin_deprec2/eval-logistic-regression.py diff --git a/hyperion/bin/eval-plda-1vs1.py b/hyperion/bin_deprec2/eval-plda-1vs1.py similarity index 100% rename from hyperion/bin/eval-plda-1vs1.py rename to hyperion/bin_deprec2/eval-plda-1vs1.py diff --git a/hyperion/bin/eval-plda-nvs1.py b/hyperion/bin_deprec2/eval-plda-nvs1.py similarity index 100% rename from hyperion/bin/eval-plda-nvs1.py rename to hyperion/bin_deprec2/eval-plda-nvs1.py diff --git a/hyperion/bin/merge-h5-files.py b/hyperion/bin_deprec2/merge-h5-files.py similarity index 100% rename from hyperion/bin/merge-h5-files.py rename to hyperion/bin_deprec2/merge-h5-files.py diff --git a/hyperion/bin/pack-audio-files.py b/hyperion/bin_deprec2/pack-audio-files.py similarity index 100% rename from hyperion/bin/pack-audio-files.py rename to hyperion/bin_deprec2/pack-audio-files.py diff --git a/hyperion/bin/plot-vector-hist.py b/hyperion/bin_deprec2/plot-vector-hist.py similarity index 100% rename from hyperion/bin/plot-vector-hist.py rename to hyperion/bin_deprec2/plot-vector-hist.py diff --git a/hyperion/bin/rttm-to-bin-vad.py b/hyperion/bin_deprec2/rttm-to-bin-vad.py similarity index 100% rename from hyperion/bin/rttm-to-bin-vad.py rename to hyperion/bin_deprec2/rttm-to-bin-vad.py diff --git a/hyperion/bin/segments-to-bin-vad.py b/hyperion/bin_deprec2/segments-to-bin-vad.py similarity index 100% rename from hyperion/bin/segments-to-bin-vad.py rename to hyperion/bin_deprec2/segments-to-bin-vad.py diff --git a/hyperion/bin/torch-adv-finetune-xvec-from-wav.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py similarity index 100% rename from hyperion/bin/torch-adv-finetune-xvec-from-wav.py rename to hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py diff --git a/hyperion/bin/torch-adv-finetune-xvec.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py similarity index 100% rename from hyperion/bin/torch-adv-finetune-xvec.py rename to hyperion/bin_deprec2/torch-adv-finetune-xvec.py diff --git a/hyperion/bin/torch-compute-mfcc-feats.py b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py similarity index 100% rename from hyperion/bin/torch-compute-mfcc-feats.py rename to hyperion/bin_deprec2/torch-compute-mfcc-feats.py diff --git a/hyperion/bin/torch-eval-vae.py b/hyperion/bin_deprec2/torch-eval-vae.py similarity index 100% rename from hyperion/bin/torch-eval-vae.py rename to hyperion/bin_deprec2/torch-eval-vae.py diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py similarity index 100% rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py similarity index 100% rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py diff --git 
a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py similarity index 100% rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py similarity index 100% rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py similarity index 100% rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py similarity index 100% rename from hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py rename to hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py diff --git a/hyperion/bin/torch-eval-xvec-logits-from-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py similarity index 100% rename from hyperion/bin/torch-eval-xvec-logits-from-wav.py rename to hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py diff --git a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py similarity index 100% rename from hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py rename to hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py diff --git a/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py similarity index 100% rename from hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py rename to hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py diff --git a/hyperion/bin/torch-extract-xvectors-slidwin.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py similarity index 100% rename from hyperion/bin/torch-extract-xvectors-slidwin.py rename to hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py diff --git a/hyperion/bin/torch-extract-xvectors-vae-preproc.py b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py similarity index 100% rename from hyperion/bin/torch-extract-xvectors-vae-preproc.py rename to hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py diff --git a/hyperion/bin/torch-extract-xvectors.py b/hyperion/bin_deprec2/torch-extract-xvectors.py similarity index 100% rename from hyperion/bin/torch-extract-xvectors.py rename to hyperion/bin_deprec2/torch-extract-xvectors.py diff --git a/hyperion/bin/torch-generate-adv-attacks-xvector-classif.py b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py similarity index 100% rename from hyperion/bin/torch-generate-adv-attacks-xvector-classif.py rename to hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py diff --git a/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py similarity index 100% rename from 
hyperion/bin/torch-generate-adv-attacks-xvector-verif.py rename to hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py diff --git a/hyperion/bin/torch-train-dc1d-ae.py b/hyperion/bin_deprec2/torch-train-dc1d-ae.py similarity index 100% rename from hyperion/bin/torch-train-dc1d-ae.py rename to hyperion/bin_deprec2/torch-train-dc1d-ae.py diff --git a/hyperion/bin/torch-train-dvae.py b/hyperion/bin_deprec2/torch-train-dvae.py similarity index 100% rename from hyperion/bin/torch-train-dvae.py rename to hyperion/bin_deprec2/torch-train-dvae.py diff --git a/hyperion/bin/torch-train-efficientnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py similarity index 100% rename from hyperion/bin/torch-train-efficientnet-xvec-from-wav.py rename to hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py diff --git a/hyperion/bin/torch-train-efficientnet-xvec.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py similarity index 100% rename from hyperion/bin/torch-train-efficientnet-xvec.py rename to hyperion/bin_deprec2/torch-train-efficientnet-xvec.py diff --git a/hyperion/bin/torch-train-resnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py similarity index 100% rename from hyperion/bin/torch-train-resnet-xvec-from-wav.py rename to hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py diff --git a/hyperion/bin/torch-train-resnet-xvec.py b/hyperion/bin_deprec2/torch-train-resnet-xvec.py similarity index 100% rename from hyperion/bin/torch-train-resnet-xvec.py rename to hyperion/bin_deprec2/torch-train-resnet-xvec.py diff --git a/hyperion/bin/torch-train-resnet1d-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py similarity index 100% rename from hyperion/bin/torch-train-resnet1d-xvec-from-wav.py rename to hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py diff --git a/hyperion/bin/torch-train-spinenet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py similarity index 100% rename from hyperion/bin/torch-train-spinenet-xvec-from-wav.py rename to hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py diff --git a/hyperion/bin/torch-train-tdnn-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py similarity index 100% rename from hyperion/bin/torch-train-tdnn-xvec-from-wav.py rename to hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py diff --git a/hyperion/bin/torch-train-tdnn-xvec.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py similarity index 100% rename from hyperion/bin/torch-train-tdnn-xvec.py rename to hyperion/bin_deprec2/torch-train-tdnn-xvec.py diff --git a/hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py similarity index 100% rename from hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py rename to hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py diff --git a/hyperion/bin/torch-train-transformer-xvec-v1.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py similarity index 100% rename from hyperion/bin/torch-train-transformer-xvec-v1.py rename to hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py diff --git a/hyperion/bin/torch-train-vae.py b/hyperion/bin_deprec2/torch-train-vae.py similarity index 100% rename from hyperion/bin/torch-train-vae.py rename to hyperion/bin_deprec2/torch-train-vae.py diff --git a/hyperion/bin/torch-train-vq-dvae.py b/hyperion/bin_deprec2/torch-train-vq-dvae.py similarity index 
100% rename from hyperion/bin/torch-train-vq-dvae.py rename to hyperion/bin_deprec2/torch-train-vq-dvae.py diff --git a/hyperion/bin/torch-train-vq-vae.py b/hyperion/bin_deprec2/torch-train-vq-vae.py similarity index 100% rename from hyperion/bin/torch-train-vq-vae.py rename to hyperion/bin_deprec2/torch-train-vq-vae.py diff --git a/hyperion/bin/train-cw-up.py b/hyperion/bin_deprec2/train-cw-up.py similarity index 100% rename from hyperion/bin/train-cw-up.py rename to hyperion/bin_deprec2/train-cw-up.py diff --git a/hyperion/bin/train-cw.py b/hyperion/bin_deprec2/train-cw.py similarity index 100% rename from hyperion/bin/train-cw.py rename to hyperion/bin_deprec2/train-cw.py diff --git a/hyperion/bin/train-gaussianizer.py b/hyperion/bin_deprec2/train-gaussianizer.py similarity index 100% rename from hyperion/bin/train-gaussianizer.py rename to hyperion/bin_deprec2/train-gaussianizer.py diff --git a/hyperion/bin/train-lda.py b/hyperion/bin_deprec2/train-lda.py similarity index 100% rename from hyperion/bin/train-lda.py rename to hyperion/bin_deprec2/train-lda.py diff --git a/hyperion/bin/train-linear-gbe-up.py b/hyperion/bin_deprec2/train-linear-gbe-up.py similarity index 100% rename from hyperion/bin/train-linear-gbe-up.py rename to hyperion/bin_deprec2/train-linear-gbe-up.py diff --git a/hyperion/bin/train-linear-gbe.py b/hyperion/bin_deprec2/train-linear-gbe.py similarity index 100% rename from hyperion/bin/train-linear-gbe.py rename to hyperion/bin_deprec2/train-linear-gbe.py diff --git a/hyperion/bin/train-linear-svmc.py b/hyperion/bin_deprec2/train-linear-svmc.py similarity index 100% rename from hyperion/bin/train-linear-svmc.py rename to hyperion/bin_deprec2/train-linear-svmc.py diff --git a/hyperion/bin/train-logistic-regression.py b/hyperion/bin_deprec2/train-logistic-regression.py similarity index 100% rename from hyperion/bin/train-logistic-regression.py rename to hyperion/bin_deprec2/train-logistic-regression.py diff --git a/hyperion/bin/train-mvn.py b/hyperion/bin_deprec2/train-mvn.py similarity index 100% rename from hyperion/bin/train-mvn.py rename to hyperion/bin_deprec2/train-mvn.py diff --git a/hyperion/bin/train-nda.py b/hyperion/bin_deprec2/train-nda.py similarity index 100% rename from hyperion/bin/train-nda.py rename to hyperion/bin_deprec2/train-nda.py diff --git a/hyperion/bin/train-pca.py b/hyperion/bin_deprec2/train-pca.py similarity index 100% rename from hyperion/bin/train-pca.py rename to hyperion/bin_deprec2/train-pca.py diff --git a/hyperion/bin/train-plda.py b/hyperion/bin_deprec2/train-plda.py similarity index 100% rename from hyperion/bin/train-plda.py rename to hyperion/bin_deprec2/train-plda.py diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 7dfb8a35..620d4d36 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -6,6 +6,7 @@ import math from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import logging +import time import numpy as np import pandas as pd @@ -90,6 +91,12 @@ def __init__( self._set_num_chunks_per_seg_epoch(num_chunks_per_seg_epoch) self._compute_len() + # fast mapping from classes to segments + self.map_class_to_segs = self.seg_set.df[ + ["id", self.class_name, self.length_name] + ] + self.map_class_to_segs.set_index(self.class_name, drop=False, inplace=True) + self._gather_class_info() self._set_class_weights() @@ -160,8 +167,21 @@ def _gather_class_info(self): 
self.class_info["min_seg_duration"] = min_dur self.class_info["total_duration"] = total_dur - self.map_idx_to_ids = self.class_info[["class_idx", "id"]] - self.map_idx_to_ids.set_index("class_idx", inplace=True) + # we need the mapping from class index to id + self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]] + self.map_class_idx_to_ids.set_index("class_idx", inplace=True) + + # we need the list of segments from each class + # to speed up segment sampling + # searching then in each batch, it is too slow + map_class_to_segs = self.seg_set.df[["id", self.class_name]].set_index( + self.class_name + ) + self.map_class_to_segs_idx = {} + for class_id in self.class_info["id"].values: + seg_ids = map_class_to_segs.loc[class_id, "id"] + seg_idx = self.seg_set.get_loc(seg_ids) + self.map_class_to_segs_idx[class_id] = seg_idx def _set_class_weights(self): if self.weight_mode == "uniform": @@ -249,20 +269,27 @@ def _sample_classes(self, num_classes, chunk_length): class_idx = self.class_info.loc[class_ids, "class_idx"] class_idx = self.get_hard_prototypes(class_idx) # map back to class ids - class_ids = self.map_idx_to_ids.loc[class_idx] + class_ids = self.map_class_idx_to_ids.loc[class_idx] return class_ids - def _sample_segs(self, class_ids, chunk_length): + def _sample_segs0(self, class_ids, chunk_length): seg_ids = [] for c in class_ids: # for each class we sample segments longer than chunk length # get segments belonging to c - seg_mask = (self.seg_set[self.class_name] == c) & ( - self.seg_set[self.length_name] >= chunk_length - ) - seg_ids_c = self.seg_set.loc[seg_mask, "id"].values + # t1 = time.time() + segs_c = self.map_class_to_segs.loc[c] + # seg_idx_c = self.map_class_to_segs.index.get_loc(c) + if self.class_info.loc[c, "min_seg_duration"] < chunk_length: + segs_c = segs_c[segs_c[self.length_name] >= chunk_length] + # seg_idx_c = seg_idx_c[self.seg_set.loc[seg_idx_c, self.length_name]>chunk_length] + + # t2 = time.time() + seg_ids_c = segs_c["id"].values + # seg_ids_c = self.seg_set.loc[seg_idx_c, "id"].values + # t3 = time.time() # sample num_segs_per_class random segments if len(seg_ids_c) == 0: print(chunk_length, c, self.class_info.loc[c], flush=True) @@ -273,8 +300,11 @@ def _sample_segs(self, class_ids, chunk_length): size=(self.num_segs_per_class,), generator=self.rng, ).numpy() + elif self.seg_weight_mode == "data-prior": - weights = self.seg_set.loc[seg_mask, self.length_name].values + # weights = self.seg_set.loc[seg_mask, self.length_name].values + weights = segs_c[self.length_name].values + # weights = self.seg_set.loc[seg_idx_c, self.length_name].values weights /= weights.sum() sel_seg_idx_c = torch.multinomial( torch.from_numpy(weights), @@ -282,10 +312,68 @@ def _sample_segs(self, class_ids, chunk_length): replacement=True, generator=self.rng, ).numpy() + # t4 = time.time() else: raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) sel_seg_ids_c = list(seg_ids_c[sel_seg_idx_c]) + # t5 = time.time() seg_ids.extend(sel_seg_ids_c) + # t6 = time.time() + # logging.info( + # "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 + # ) + + return seg_ids + + def _sample_segs(self, class_ids, chunk_length): + + dur_col_idx = self.seg_set.get_col_idx(self.length_name) + id_col_idx = self.seg_set.get_col_idx("id") + + seg_ids = [] + for c in class_ids: + # for each class we sample segments longer than chunk length + # get segments belonging to c + t1 = time.time() + seg_idx_c = self.map_class_to_segs_idx[c] + t2 = time.time() + durs = 
self.seg_set.iloc[seg_idx_c, dur_col_idx].values + if self.class_info.loc[c, "min_seg_duration"] < chunk_length: + mask = durs >= chunk_length + seg_idx_c = seg_idx_c[mask] + durs = durs[mask] + + t3 = time.time() + # sample num_segs_per_class random segments + if len(seg_idx_c) == 0: + print(chunk_length, c, self.class_info.loc[c], flush=True) + if self.seg_weight_mode == "uniform": + sel_seg_idx_c = torch.randint( + low=0, + high=len(seg_idx_c), + size=(self.num_segs_per_class,), + generator=self.rng, + ).numpy() + + elif self.seg_weight_mode == "data-prior": + weights = durs / durs.sum() + sel_seg_idx_c = torch.multinomial( + torch.from_numpy(weights), + num_samples=self.num_segs_per_class, + replacement=True, + generator=self.rng, + ).numpy() + t4 = time.time() + else: + raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) + + sel_seg_ids_c = list(self.seg_set.iloc[sel_seg_idx_c, id_col_idx]) + t5 = time.time() + seg_ids.extend(sel_seg_ids_c) + t6 = time.time() + logging.info( + "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 + ) return seg_ids diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 61033d16..b38bd3fe 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -404,6 +404,9 @@ def get_loc(self, keys): return np.nonzero(loc)[0] else: return list(range(loc.start, loc.stop, loc.step)) + + def get_col_idx(self, keys): + return self.df.columns.get_loc(keys) \ No newline at end of file From 7a6dca1645abef247a9c696c5295fa80db644976 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Mon, 17 Oct 2022 15:58:32 -0400 Subject: [PATCH 033/154] add data preparation for recognition --- egs/librispeech/v1/default_config.sh | 1 + .../v1/global_conf/config_transducer_v1.sh | 50 ++++++++ egs/librispeech/v1/run_011_train_asr.sh | 27 +++-- hyperion/bin/train_wav2vec2transducer.py | 14 ++- hyperion/torch/data/audio_dataset.py | 59 ++++++++- hyperion/utils/text_info.py | 114 ++++++++++++++++++ 6 files changed, 248 insertions(+), 17 deletions(-) create mode 120000 egs/librispeech/v1/default_config.sh create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v1.sh create mode 100644 hyperion/utils/text_info.py diff --git a/egs/librispeech/v1/default_config.sh b/egs/librispeech/v1/default_config.sh new file mode 120000 index 00000000..2b6239b6 --- /dev/null +++ b/egs/librispeech/v1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_transducer_v1.sh \ No newline at end of file diff --git a/egs/librispeech/v1/global_conf/config_transducer_v1.sh b/egs/librispeech/v1/global_conf/config_transducer_v1.sh new file mode 100644 index 00000000..c0a07257 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v1.sh @@ -0,0 +1,50 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + 
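+# note: the second nnet_s3 assignment below overrides the first,
+# so the epoch-5 checkpoint is the one that gets used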
+nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage3_v1.0.yaml
+nnet_s3_args=""
+nnet_s3_name=${nnet_name}.s3
+nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name
+nnet_s3=$nnet_s3_dir/model_ep0002.pth
+nnet_s3=$nnet_s3_dir/model_ep0005.pth
+
+# back-end
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+    plda_data=voxceleb2cat_train
+else
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh
index dc4e1dee..cd68587e 100755
--- a/egs/librispeech/v1/run_011_train_asr.sh
+++ b/egs/librispeech/v1/run_011_train_asr.sh
@@ -8,7 +8,7 @@ set -e
 stage=1
-ngpu=4
+ngpu=1
 config_file=default_config.sh
 interactive=false
 num_workers=""
@@ -19,11 +19,13 @@ use_wandb=false
 . $config_file
 . datapath.sh
 
-list_dir=data/${nnet_data}_proc_audio_no_sil
+train_dir=data/${nnet_data}/
+val_dir=data/dev_clean/
 
 #add extra args from the command line arguments
 if [ -n "$num_workers" ];then
     extra_args="--data.train.data_loader.num-workers $num_workers"
+    extra_args="$extra_args --data.val.data_loader.num-workers $num_workers"
 fi
 if [ "$use_tb" == "true" ];then
     extra_args="$extra_args --trainer.use-tensorboard"
@@ -33,9 +35,9 @@ if [ "$interactive" == "true" ];then
     export cuda_cmd=run.pl
 fi
 
-if [ "$use_wandb" == "true" ];then
-    extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)"
-fi
+# if [ "$use_wandb" == "true" ];then
+#     extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)"
+# fi
 
 # Network Training
@@ -45,15 +47,14 @@ if [ $stage -le 1 ]; then
     $cuda_cmd \
         --gpu $ngpu $nnet_s1_dir/log/train.log \
         hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
-        train_wav2vec2xvector.py $nnet_type \
+        train_wav2vec2transducer.py $nnet_type \
         --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \
-        --data.train.dataset.audio-file $list_dir/wav.scp \
-        --data.train.dataset.time-durs-file $list_dir/utt2dur \
-        --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \
-        --data.train.dataset.class-file $list_dir/lists_xvec/class2int \
-        --data.val.dataset.audio-file $list_dir/wav.scp \
-        --data.val.dataset.time-durs-file $list_dir/utt2dur \
-        --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \
+        --data.train.dataset.audio-file $train_dir/wav.scp \
+        --data.train.dataset.time-durs-file $train_dir/utt2dur \
+        --data.train.dataset.text-file $train_dir/text \
+        --data.val.dataset.audio-file $val_dir/wav.scp \
+        --data.val.dataset.time-durs-file $val_dir/utt2dur \
+        --data.val.dataset.text-file $val_dir/text \
         --trainer.exp-path $nnet_s1_dir $args \
         --num-gpus $ngpu
 
diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py
index 7f6fffef..01db6960 100755
--- a/hyperion/bin/train_wav2vec2transducer.py
+++ b/hyperion/bin/train_wav2vec2transducer.py
@@ -25,6 +25,7 @@
 from hyperion.torch.utils import ddp
 from hyperion.torch.trainers import TransducerTrainer as Trainer
 from hyperion.torch.data import AudioDataset as AD
+# from hyperion.torch.data import LibriSpeechAsrDataModule as ASRD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
 from hyperion.torch.models import HFWav2Vec2Transducer
@@ -35,7 +36,6 @@ def init_data(partition, rank, num_gpus, **kwargs):
-
kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = Sampler.filter_args(**kwargs["sampler"]) @@ -139,10 +139,18 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.train.dataset.class_file", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.val.dataset.class_file", action=ActionParser(parser=data_parser)) + parser.add_argument("--data.train.dataset.text_file", action=ActionParser(parser=data_parser)) + parser.add_argument("--data.val.dataset.text_file", action=ActionParser(parser=data_parser)) parser.add_argument("--data.train.data_loader.num_workers", action=ActionParser(parser=data_parser)) parser.add_argument("--data.val.data_loader.num_workers", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + # parser.link_arguments( # "data.train.dataset.class_file", "data.val.dataset.class_file" # ) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 8875676f..058b7902 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -18,10 +18,15 @@ from ...utils.utt2info import Utt2Info from ...np.augment import SpeechAugment + +import k2 +import sentencepiece as spm + from torch.utils.data import Dataset import torch.distributed as dist from hyperion.np import augment +from hyperion.utils.util import read_2column_text class AudioDataset1(Dataset): @@ -458,6 +463,8 @@ def __init__( segments_file, class_names=None, class_files=None, + bpe_model=None, + text_files=None, time_durs_file=None, aug_cfgs=None, num_augs=1, @@ -506,6 +513,15 @@ def __init__( logging.info("loading class-info files") self._load_class_infos(class_names, class_files, is_val) + + if bpe_model is not None: + logging.info("loading bpe models") + self._load_bpe_model(bpe_model, is_val) + + if text_files is not None: + logging.info("loading text files") + self._load_text(text_files, is_val) + self.return_segment_info = ( [] if return_segment_info is None else return_segment_info ) @@ -514,6 +530,23 @@ def __init__( self.num_augs = num_augs self._create_augmenters(aug_cfgs) + + def _load_bpe_model(self, bpe_model, is_val): + self.sp = spm.SentencePieceProcessor() + self.sp.load(params.bpe_model) + blank_id = self.sp.piece_to_id("") + vocab_size = self.sp.get_piece_size() + + def _load_text(self, text_file, is_val): + #TODO: load bpe and text into data structure + if text_file is None: + return + if self.rank == 0: + logging.info("loading text file %s" % text_file) + self.text_info = TextInfo.load(text_file, self.sp) + + + def _load_class_infos(self, class_names, class_files, is_val): self.class_info = {} if class_names is None: @@ -642,8 +675,26 @@ def _get_segment_info(self, seg_id): return r - def __getitem__(self, segment): + def _get_text_info(self, seg_id): + #TODO: bpe labels from data structure for getitem + r = [] + # converts the class_ids to integers + for info_name in self.return_segment_info: + seg_info = self.seg_set.loc[seg_id, info_name] + if info_name in self.class_info: + # if the type of information is a class-id + # we use the class information table to + # convert from id to integer + class_info = self.class_info[info_name] + idx = class_info.loc[seg_info, "class_idx"] + seg_info = idx + + 
r.append(seg_info) + + return r + + def __getitem__(self, segment): seg_id, start, duration = self._parse_segment_item(segment) x, fs = self._read_audio(seg_id, start, duration) if self.augmenters: @@ -665,6 +716,11 @@ def __getitem__(self, segment): seg_info = self._get_segment_info(seg_id) r.extend(seg_info) + # adds the text labels + text_info = self._get_text_info(seg_id) + r.extend(text_info) + + return (*r,) @staticmethod @@ -678,6 +734,7 @@ def filter_args(**kwargs): "num_augs", "class_names", "class_files", + "text_files", "return_segment_info", "return_orig", "time_durs_file", diff --git a/hyperion/utils/text_info.py b/hyperion/utils/text_info.py new file mode 100644 index 00000000..44a71d0a --- /dev/null +++ b/hyperion/utils/text_info.py @@ -0,0 +1,114 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from pathlib import Path + +import numpy as np +import pandas as pd + +from .info_table import InfoTable + + +def read_2column_text(path: Union[Path, str]) -> Dict[str, str]: + """Read a text file having 2 column as dict object. + + Examples: + wav.scp: + key1 /some/path/a.wav + key2 /some/path/b.wav + + >>> read_2column_text('wav.scp') + {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'} + + """ + assert check_argument_types() + + data = {} + with Path(path).open("r", encoding="utf-8") as f: + for linenum, line in enumerate(f, 1): + sps = line.rstrip().split(maxsplit=1) + if len(sps) == 1: + k, v = sps[0], "" + else: + k, v = sps + if k in data: + raise RuntimeError(f"{k} is duplicated ({path}:{linenum})") + data[k] = v + return data + + + +class TextInfo(InfoTable): + def __init__(self, df): + super().__init__(df) + if "class_idx" not in self.df: + self.add_class_idx() + + if "weights" not in self.df: + self.set_uniform_weights() + else: + self.df["weights"] /= self.df["weigths"].sum() + + + def add_class_idx(self): + self.df["class_idx"] = [i for i in range(len(self.df))] + + def set_uniform_weights(self): + self.df["weights"] = 1 / len(self.df) + + def set_weights(self, weights): + self.df["weights"] = weights / weights.sum() + + def exp_weights(self, x): + weights = self.df["weights"] ** x + self.set_weights(weights) + + def set_zero_weight(self, id): + self.df.loc[id, "weights"] = 0 + self.df["weights"] /= self.df["weights"].sum() + + @property + def weights(self, id): + return self.df.loc[id, "weights"] + + @property + def num_classes(self): + return self.df["class_idx"].values.max() + 1 + + @classmethod + def load(cls, file_path, sp): + #TODO: load text information + """Loads utt2info list from text file. + + Args: + file_path: File to read the list. + sp: SentencePieceProcessor for bpe. + Returns: + Utt2Info object + """ + # y: k2.RaggedTensor, + # A ragged tensor with 2 axes [utt][label]. It contains labels of each utterance. 
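+        # illustrative sketch (not wired into the pipeline yet), assuming a
+        # trained BPE model:
+        #   y = sp.encode(["HELLO WORLD", "HI"], out_type=int)
+        #   # y is a list of lists of token ids, rows of different length
+        #   y = k2.RaggedTensor(y)  # 2 axes: [utt][label]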
+
+        texts = read_2column_text(file_path)
+        # e.g. {'utt1': 'transcript one', 'utt2': 'transcript two'}
+
+        # encode the transcripts into BPE token ids as a k2 ragged tensor
+        y = sp.encode(list(texts.values()), out_type=int)
+        y = k2.RaggedTensor(y)
+
+        file_path = Path(file_path)
+        ext = file_path.suffix
+        if ext == "":
+            # if no extension we load as kaldi utt2spk file
+            df = pd.read_csv(
+                file_path,
+                sep=" ",
+                header=None,
+                names=["id"],
+                dtype={"id": str},
+            )
+            return cls(df)
+
+        return super().load(file_path)

From a10083941d9c91a5382c2a62d132985771217eaf Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Mon, 17 Oct 2022 22:10:20 -0400
Subject: [PATCH 034/154] updated finetune xvector script

---
 hyperion/bin/finetune_xvector_from_wav.py | 163 ++++++++++++++++++----
 1 file changed, 133 insertions(+), 30 deletions(-)

diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py
index b5a7f63b..a960ebeb 100755
--- a/hyperion/bin/finetune_xvector_from_wav.py
+++ b/hyperion/bin/finetune_xvector_from_wav.py
@@ -20,32 +20,52 @@
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch.utils import ddp
-from hyperion.torch.models import XVector as XVec
+
xvec_args["num_classes"] = num_classes - model = TML.load(in_model_path) - model.rebuild_output_layer(**xvec_args) + model = TML.load(in_model_file) + model.change_config(**xvec_args) if rank == 0: logging.info("x-vector-model={}".format(model)) return model @@ -99,9 +119,9 @@ def train_xvec(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) feat_extractor = init_feats(**kwargs) - model = init_xvector(train_loader.dataset.num_classes, **kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) - trn_args = Trainer.filter_args(**kwargs) + trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} @@ -119,14 +139,15 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == "__main__": +def make_parser(xvec_class): + parser = ArgumentParser() - parser = ArgumentParser(description="Fine-tune x-vector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) - Sampler.add_class_args(train_parser, prefix="sampler") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", type=int, @@ -136,7 +157,7 @@ def train_xvec(gpu_id, args): val_parser = ArgumentParser(prog="") AD.add_class_args(val_parser, prefix="dataset", skip={}) - Sampler.add_class_args(val_parser, prefix="sampler") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", type=int, @@ -148,41 +169,123 @@ def train_xvec(gpu_id, args): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) parser.link_arguments( - "data.train.dataset.class_file", "data.val.dataset.class_file" + "data.train.dataset.class_files", "data.val.dataset.class_files" ) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" ) - parser.link_arguments( - "data.train.sampler.batch_size", "data.val.sampler.batch_size" - ) AF.add_class_args(parser, prefix="feats") - parser.add_argument("--in-model-path", required=True) - - XVec.add_finetune_args(parser, prefix="model") + xvec_class.add_finetune_args(parser, prefix="model") + parser.add_argument("--in-model-file", required=True) Trainer.add_class_args( - parser, prefix="trainer", train_modes=XVec.valid_train_modes() + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", type=int, default=1123581321, help="random seed") parser.add_argument( "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int ) - parser.add_argument("--local_rank", default=0, type=int) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) args = parser.parse_args() - gpu_id = args.local_rank - del args.local_rank + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] if gpu_id == 0: try: - config_file = 
Path(args.exp_path) / "config.yaml" + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass + args_sc.xvec_class = xvec_dict[xvec_type] # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") - train_xvec(gpu_id, args) + train_xvec(gpu_id, args_sc) + + +# if __name__ == "__main__": + +# parser = ArgumentParser(description="Fine-tune x-vector model from audio files") +# parser.add_argument("--cfg", action=ActionConfigFile) + +# train_parser = ArgumentParser(prog="") +# AD.add_class_args(train_parser, prefix="dataset", skip={}) +# Sampler.add_class_args(train_parser, prefix="sampler") +# train_parser.add_argument( +# "--data_loader.num-workers", +# type=int, +# default=5, +# help="num_workers of data loader", +# ) + +# val_parser = ArgumentParser(prog="") +# AD.add_class_args(val_parser, prefix="dataset", skip={}) +# Sampler.add_class_args(val_parser, prefix="sampler") +# val_parser.add_argument( +# "--data_loader.num-workers", +# type=int, +# default=5, +# help="num_workers of data loader", +# ) +# data_parser = ArgumentParser(prog="") +# data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) +# data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) +# parser.add_argument("--data", action=ActionParser(parser=data_parser)) +# parser.link_arguments( +# "data.train.dataset.class_file", "data.val.dataset.class_file" +# ) +# parser.link_arguments( +# "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" +# ) +# parser.link_arguments( +# "data.train.sampler.batch_size", "data.val.sampler.batch_size" +# ) + +# AF.add_class_args(parser, prefix="feats") +# parser.add_argument("--in-model-path", required=True) + +# XVec.add_finetune_args(parser, prefix="model") +# Trainer.add_class_args( +# parser, prefix="trainer", train_modes=XVec.valid_train_modes() +# ) +# ddp.add_ddp_args(parser) + +# parser.add_argument("--seed", type=int, default=1123581321, help="random seed") +# parser.add_argument( +# "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int +# ) +# parser.add_argument("--local_rank", default=0, type=int) + +# args = parser.parse_args() +# gpu_id = args.local_rank +# del args.local_rank + +# if gpu_id == 0: +# try: +# config_file = Path(args.exp_path) / "config.yaml" +# parser.save(args, str(config_file), format="yaml", overwrite=True) +# except: +# pass + +# # torch docs recommend using forkserver +# multiprocessing.set_start_method("forkserver") +# train_xvec(gpu_id, args) From 566341669c2f0f50b3159d2b5b9a72852a8855d3 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 20 Oct 2022 11:07:01 -0400 Subject: [PATCH 035/154] added labels to gbe and lsvmc --- hyperion/np/classifiers/linear_gbe.py | 84 +++++++++++++---------- hyperion/np/classifiers/linear_svmc.py | 93 ++++++++++++++------------ hyperion/np/metrics/acc.py | 4 +- hyperion/np/np_model.py | 8 ++- hyperion/np/transforms/cent_whiten.py | 23 ++++--- hyperion/np/transforms/pca.py | 33 ++++----- 6 files changed, 140 insertions(+), 105 deletions(-) diff --git a/hyperion/np/classifiers/linear_gbe.py b/hyperion/np/classifiers/linear_gbe.py index c786cb50..00a8b1bf 100644 --- a/hyperion/np/classifiers/linear_gbe.py +++ b/hyperion/np/classifiers/linear_gbe.py @@ -5,6 +5,7 @@ import logging import numpy as np +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo from scipy.special import gammaln from ...hyp_defs import float_cpu 
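Patch 035 below reworks the old `p1 = "--" + prefix + "."` argument registration into
jsonargparse's nested-parser idiom. A minimal, self-contained sketch of that idiom,
outside this patch (illustrative option names, assuming a recent jsonargparse release):

    from jsonargparse import ActionParser, ActionYesNo, ArgumentParser

    def add_class_args(parser, prefix=None):
        # build the options on an inner parser, then graft it under --<prefix>
        if prefix is not None:
            outer_parser = parser
            parser = ArgumentParser(prog="")
        # ActionYesNo registers paired boolean yes/no flags for the option
        parser.add_argument("--update-mu", default=True, action=ActionYesNo)
        if prefix is not None:
            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))

    parser = ArgumentParser()
    add_class_args(parser, prefix="lgbe")
    args = parser.parse_args([])
    print(args.lgbe.update_mu)  # True (the default)

The payoff of the pattern is that option prefixes become real nested namespaces
(args.lgbe.update_mu) instead of string-concatenated flag names.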
@@ -30,6 +31,7 @@ class LinearGBE(NPModel):
         prior_nu: if given, it overwrites nu in the prior object.
         post_beta: if given, it fixes the value of beta in the posterior, overwriting the beta computed by the fit function.
         post_nu: if given, it fixes the value of nu in the posterior, overwriting the nu computed by the fit function.
+        labels: list of class labels.
     """

     def __init__(
@@ -48,6 +50,7 @@ def __init__(
         prior_nu=None,
         post_beta=None,
         post_nu=None,
+        labels=None,
         **kwargs
     ):

@@ -73,8 +76,15 @@ def __init__(
         self.post_beta = post_beta
         self.post_nu = post_nu

+        self.set_labels(labels)
         self._compute_Ab()

+    def set_labels(self, labels):
+        if isinstance(labels, np.ndarray):
+            labels = list(labels)
+
+        self.labels = labels
+
     def get_config(self):
         """
         Returns:
@@ -90,6 +100,7 @@ def get_config(self):
             "prior_nu": self.prior_nu,
             "post_beta": self.post_beta,
             "post_nu": self.post_nu,
+            "labels": self.labels,
         }

         base_config = super().get_config()
@@ -259,7 +270,6 @@ def fit(self, x, class_ids=None, p_theta=None, sample_weight=None):
             p_theta = sample_weight[:, None] * p_theta

         N = np.sum(p_theta, axis=0)
-
         F = np.dot(p_theta.T, x)

         if self.update_mu:
@@ -337,8 +347,6 @@ def filter_class_args(**kwargs):
         valid_args = (
             "update_mu",
             "update_W",
-            "no_update_mu",
-            "no_update_W",
             "balance_class_weight",
             "prior",
             "prior_beta",
@@ -348,11 +356,6 @@ def filter_class_args(**kwargs):
             "name",
         )
         d = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
-        if "no_update_mu" in d:
-            d["update_mu"] = not d["no_update_mu"]
-        if "no_update_W" in d:
-            d["update_W"] = not d["no_update_W"]
-
         return d

     filter_train_args = filter_class_args
@@ -364,61 +367,67 @@ def add_class_args(parser, prefix=None):
             parser: jsonargparse object
             prefix: argument prefix.
         """
-        if prefix is None:
-            p1 = "--"
-        else:
-            p1 = "--" + prefix + "."
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")

         parser.add_argument(
-            p1 + "no-update-mu",
-            default=False,
-            action="store_true",
-            help="do not update mu",
+            "--update-mu",
+            default=True,
+            action=ActionYesNo,
+            nargs="?",
+            help="whether to update mu",
         )
         parser.add_argument(
-            p1 + "no-update-W",
-            default=False,
-            action="store_true",
-            help="do not update W",
+            "--update-W",
+            default=True,
+            action=ActionYesNo,
+            nargs="?",
+            help="whether to update W",
         )
         parser.add_argument(
-            p1 + "balance-class-weight",
+            "--balance-class-weight",
             default=False,
-            action="store_true",
+            action=ActionYesNo,
+            nargs="?",
             help="Balances the weight of each class when computing W",
         )
         parser.add_argument(
-            p1 + "prior", default=None, help="prior file for MAP adaptation"
+            "--prior", default=None, help="prior file for MAP adaptation"
        )
         parser.add_argument(
-            p1 + "prior-beta",
+            "--prior-beta",
             default=16,
             type=float,
             help="relevance factor for the means",
         )
         parser.add_argument(
-            p1 + "prior-nu",
+            "--prior-nu",
             default=16,
             type=float,
             help="relevance factor for the variances",
         )
         parser.add_argument(
-            p1 + "post-beta",
+            "--post-beta",
             default=None,
             type=float,
             help="relevance factor for the means",
         )
         parser.add_argument(
-            p1 + "post-nu",
+            "--post-nu",
             default=None,
             type=float,
             help="relevance factor for the variances",
         )

-        parser.add_argument(p1 + "name", default="lgbe", help="model name")
+        parser.add_argument("--name", default="lgbe", help="model name")
+        if prefix is not None:
+            outer_parser.add_argument(
+                "--" + prefix, action=ActionParser(parser=parser),
+            )

     @staticmethod
-    def filter_eval_args(prefix, **kwargs):
+    def filter_eval_args(**kwargs):
         """Extracts the evaluation time hyperparams of the class from a dictionary.

         Returns:
@@ -434,20 +443,19 @@ def add_eval_args(parser, prefix=None):
             parser: jsonargparse object
             prefix: argument prefix.
         """
-        if prefix is None:
-            p1 = "--"
-        else:
-            p1 = "--" + prefix + "."
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")

-        parser.add_argument(p1 + "model-file", required=True, help=("model file"))
         parser.add_argument(
-            p1 + "normalize",
+            "--normalize",
             default=False,
-            action="store_true",
-            help=("normalizes the ouput probabilities to sum to one"),
+            action=ActionYesNo,
+            nargs="?",
+            help=("normalizes the output probabilities to sum to one"),
         )
         parser.add_argument(
-            p1 + "eval-method",
+            "--eval-method",
             default="linear",
             choices=["linear", "llk", "predictive"],
             help=(
@@ -455,6 +463,10 @@ def add_eval_args(parser, prefix=None):
                 "or predictive distribution"
             ),
         )
+        if prefix is not None:
+            outer_parser.add_argument(
+                "--" + prefix, action=ActionParser(parser=parser),
+            )

     add_argparse_args = add_class_args
     add_argparse_train_args = add_class_args
diff --git a/hyperion/np/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py
index df14a16e..cb95e903 100644
--- a/hyperion/np/classifiers/linear_svmc.py
+++ b/hyperion/np/classifiers/linear_svmc.py
@@ -5,6 +5,7 @@
 import logging

 import numpy as np
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
 from sklearn.svm import LinearSVC as SVC


@@ -61,6 +62,7 @@ class LinearSVMC(NPModel):
         verbose: int, default: 0
         balance_class_weight: if True and class_weight is None, it makes class_weight="balanced".
-        lr_seed: seed form RandomState, used when random_state is None.
+        lr_seed: seed for RandomState, used when random_state is None.
+ labels: list of class labels """ def __init__( @@ -81,7 +83,8 @@ def __init__( verbose=0, balance_class_weight=True, lr_seed=1024, - **kwargs + labels=None, + **kwargs, ): super().__init__(**kwargs) @@ -95,7 +98,6 @@ def __init__( self.use_bias = use_bias self.bias_scaling = bias_scaling self.balance_class_weight = balance_class_weight - logging.debug(class_weight) self.svm = SVC( penalty=penalty, C=C, @@ -117,6 +119,8 @@ def __init__( if b is not None: self.svm.intercept_ = b + self.set_labels(labels) + @property def A(self): return self.svm.coef_.T @@ -125,6 +129,12 @@ def A(self): def b(self): return self.svm.intercept_ * self.bias_scaling + def set_labels(self, labels): + if isinstance(labels, np.ndarray): + labels = list(labels) + + self.labels = labels + def get_config(self): """Gets configuration hyperparams. Returns: @@ -134,8 +144,9 @@ def get_config(self): "use_bias": self.use_bias, "bias_scaling": self.bias_scaling, "balance_class_weight": self.balance_class_weight, + "labels": self.labels, } - base_config = super(LinearSVMC, self).get_config() + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def predict(self, x, eval_type="logit"): @@ -203,7 +214,7 @@ def load_params(cls, f, config): return cls(**kwargs) @staticmethod - def filter_class_args(prefix=None, **kwargs): + def filter_class_args(**kwargs): """Extracts the hyperparams of the class from a dictionary. Returns: @@ -236,42 +247,35 @@ def add_class_args(parser, prefix=None): parser: jsonargparse object prefix: argument prefix. """ - if prefix is None: - p1 = "--" - p2 = "" - else: - p1 = "--" + prefix + "." - p2 = prefix + "." + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "penalty", + "--penalty", default="l2", choices=["l2", "l1"], help="used to specify the norm used in the penalization", ) parser.add_argument( - p1 + "c", - dest=(p2 + "C"), + "--c", + dest="C", default=1.0, type=float, help="inverse of regularization strength", ) parser.add_argument( - p1 + "loss", + "--loss", default="squared_hinge", choices=["hinge", "squared_hinge"], help="type of loss", ) parser.add_argument( - p1 + "no-use-bias", - dest=(p2 + "use_bias"), - default=True, - action="store_false", - help="Not use bias", + "--use-bias", default=True, action=ActionYesNo, nargs="?", help="Use bias", ) parser.add_argument( - p1 + "bias-scaling", + "--bias-scaling", default=1.0, type=float, help=( @@ -280,19 +284,19 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "lr-seed", default=1024, type=int, help="random number generator seed" + "--lr-seed", default=1024, type=int, help="random number generator seed" ) parser.add_argument( - p1 + "max-iter", + "--max-iter", default=100, type=int, help="only for the newton-cg, sag and lbfgs solvers", ) parser.add_argument( - p1 + "no-dual", - dest=(p2 + "dual"), + "--dual", default=True, - action="store_false", + action=ActionYesNo, + nargs="?", help=( "dual or primal formulation. 
" "Dual formulation is only implemented for " @@ -300,10 +304,10 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "tol", default=1e-4, type=float, help="tolerance for stopping criteria" + "--tol", default=1e-4, type=float, help="tolerance for stopping criteria" ) parser.add_argument( - p1 + "multi-class", + "--multi-class", default="ovr", choices=["ovr", "crammer_singer"], help=( @@ -312,29 +316,33 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "verbose", + "--verbose", default=0, type=int, help="For the liblinear and lbfgs solvers", ) parser.add_argument( - p1 + "balance-class-weight", + "--balance-class-weight", default=False, - action="store_true", + action=ActionYesNo, help="Balances the weight of each class when computing W", ) - parser.add_argument(p1 + "name", default="svc", help="model name") + parser.add_argument("--name", default="svc", help="model name") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) @staticmethod - def filter_eval_args(prefix, **kwargs): + def filter_eval_args(**kwargs): """Extracts the evaluation time hyperparams of the class from a dictionary. Returns: Hyperparameters to evaluate the class. """ - valid_args = ("model_file", "eval_type") + valid_args = "eval_type" return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod @@ -344,21 +352,22 @@ def add_eval_args(parser, prefix=None): parser: jsonargparse object prefix: argument prefix. """ - if prefix is None: - p1 = "--" - p2 = "" - else: - p1 = "--" + prefix + "." - p2 = prefix + "." - - parser.add_argument(p1 + "model-file", required=True, help=("model file")) + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + parser.add_argument( - p1 + "eval-type", + "--eval-type", default="logit", choices=["logit", "bin-logpost", "bin-post", "cat-logpost", "cat-post"], help=("type of evaluation"), ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + # for backward compatibility filter_train_args = filter_class_args add_argparse_args = add_class_args diff --git a/hyperion/np/metrics/acc.py b/hyperion/np/metrics/acc.py index daea183e..148981f5 100644 --- a/hyperion/np/metrics/acc.py +++ b/hyperion/np/metrics/acc.py @@ -22,4 +22,6 @@ def compute_accuracy(y_true, y_pred, normalize=True, sample_weight=None): Returns: Accuracy or number of correctly classified samples. 
""" - return accuracy_score(y_true, y_pred, normalize, sample_weight) + return accuracy_score( + y_true, y_pred, normalize=normalize, sample_weight=sample_weight + ) diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py index a53135e6..db49f6d5 100644 --- a/hyperion/np/np_model.py +++ b/hyperion/np/np_model.py @@ -195,8 +195,14 @@ def get_config(self): def to_json(self, **kwargs): """Returns model config as json string.""" - # Piece of code borrowed from keras + def get_json_type(obj): + # if obj is a np list of strings + if isinstance(obj, np.ndarray) and obj.ndim == 1: + if isinstance(obj[0], str): + return list(obj) + + # Piece of code borrowed from keras # if obj is any numpy type if type(obj).__module__ == np.__name__: return obj.item() diff --git a/hyperion/np/transforms/cent_whiten.py b/hyperion/np/transforms/cent_whiten.py index e700dbe8..5f71c173 100644 --- a/hyperion/np/transforms/cent_whiten.py +++ b/hyperion/np/transforms/cent_whiten.py @@ -2,7 +2,7 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import numpy as np import h5py @@ -155,25 +155,28 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "update-mu", - default=True, + "--update-mu", + default=ActionYesNo, type=bool, help=("updates centering parameter"), ) parser.add_argument( - p1 + "update-T", + "--update-T", default=True, - type=bool, + type=ActionYesNo, help=("updates whitening parameter"), ) - parser.add_argument(p1 + "name", default="lnorm") + parser.add_argument("--name", default="lnorm") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args diff --git a/hyperion/np/transforms/pca.py b/hyperion/np/transforms/pca.py index 6d6ff7b1..36f6012b 100644 --- a/hyperion/np/transforms/pca.py +++ b/hyperion/np/transforms/pca.py @@ -2,6 +2,7 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import numpy as np import h5py @@ -186,11 +187,7 @@ def load_params(cls, f, config): """ param_list = ["mu", "T"] params = cls._load_params_to_dict(f, config["name"], param_list) - return cls( - mu=params["mu"], - T=params["T"], - **config, - ) + return cls(mu=params["mu"], T=params["T"], **config,) @classmethod def load_mat(cls, file_path): @@ -211,35 +208,39 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "update-mu", + "--update-mu", default=True, - type=bool, + action=ActionYesNo, help=("updates centering parameter"), ) parser.add_argument( - p1 + "update-T", + "--update-T", default=True, - type=bool, + action=ActionYesNo, help=("updates whitening parameter"), ) parser.add_argument( - p1 + "pca-dim", default=None, type=int, help=("output dimension of PCA") + "--pca-dim", default=None, type=int, help=("output dimension of PCA") ) parser.add_argument( - p1 + "pca-var-r", + "--pca-var-r", default=None, - type=int, + type=float, help=("proportion of variance to keep when choosing the PCA dimension"), ) parser.add_argument("--name", dest="name", default="pca") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args From 2121d87860f3c8271259c48e1623a67c92c96fc0 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 21 Oct 2022 20:36:24 -0400 Subject: [PATCH 036/154] fixed class weighted sampler --- hyperion/bin/finetune_xvector_from_wav.py | 17 +++ hyperion/torch/data/audio_dataset.py | 6 +- .../data/class_weighted_seg_chunk_sampler.py | 121 +++++++----------- hyperion/torch/data/seg_sampler_factory.py | 4 +- hyperion/utils/info_table.py | 11 ++ 5 files changed, 82 insertions(+), 77 deletions(-) diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index a960ebeb..0f23fb0a 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -102,6 +102,22 @@ def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): return model +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + if not train_loader.batch_sampler.hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + if not val_loader.batch_sampler.hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + def train_xvec(gpu_id, args): config_logger(args.verbose) @@ -120,6 +136,7 @@ def train_xvec(gpu_id, args): val_loader = init_data(partition="val", **kwargs) feat_extractor = init_feats(**kwargs) model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 8875676f..8c69c3e1 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -590,7 +590,11 @@ def num_classes(self): def _parse_segment_item(self, segment): if isinstance(segment, (tuple, list)): seg_id, start, duration = segment - assert duration <= self.seg_set.loc[seg_id].duration + assert duration <= self.seg_set.loc[seg_id].duration, ( + f"{seg_id} with start={start} duration " + f"({self.seg_set.loc[seg_id].duration}) < " + f"chunk duration ({duration})" + ) else: seg_id, start, duration = segment, 0, 0 diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 620d4d36..27ad4d33 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ 
b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -91,11 +91,11 @@ def __init__( self._set_num_chunks_per_seg_epoch(num_chunks_per_seg_epoch) self._compute_len() - # fast mapping from classes to segments - self.map_class_to_segs = self.seg_set.df[ - ["id", self.class_name, self.length_name] - ] - self.map_class_to_segs.set_index(self.class_name, drop=False, inplace=True) + # # fast mapping from classes to segments + # self.map_class_to_segs = self.seg_set.df[ + # ["id", self.class_name, self.length_name] + # ] + # self.map_class_to_segs.set_index(self.class_name, drop=False, inplace=True) self._gather_class_info() self._set_class_weights() @@ -103,7 +103,10 @@ def __init__( self.set_hard_prototypes(affinity_matrix) logging.info( - "batches/epoch=%d min-batch-size=%d, max-batch-size=%d avg-batch-size/gpu=%.2f avg-classes/batch=%.2f samples/(seg*epoch)=%d", + ( + "sampler batches/epoch=%d min-batch-size=%d, max-batch-size=%d " + "avg-batch-size/gpu=%.2f avg-classes/batch=%.2f samples/(seg*epoch)=%d" + ), self._len, self.min_batch_size, self.max_batch_size, @@ -179,7 +182,7 @@ def _gather_class_info(self): ) self.map_class_to_segs_idx = {} for class_id in self.class_info["id"].values: - seg_ids = map_class_to_segs.loc[class_id, "id"] + seg_ids = map_class_to_segs.loc[class_id, "id"].values seg_idx = self.seg_set.get_loc(seg_ids) self.map_class_to_segs_idx[class_id] = seg_idx @@ -246,7 +249,7 @@ def _get_class_weights(self, chunk_length): # chunk length and put weight to 0 zero_idx = self.class_info["max_seg_duration"] < chunk_length if not np.any(zero_idx): - return self.class_info["weights"].values + return torch.as_tensor(self.class_info["weights"].values) class_weights = self.class_info["weights"].values.copy() class_weights[zero_idx] = 0.0 @@ -266,65 +269,13 @@ def _sample_classes(self, num_classes, chunk_length): class_ids = self.class_info.iloc[row_idx].id.values if self.hard_prototype_mining: # map class ids to class indexes - class_idx = self.class_info.loc[class_ids, "class_idx"] + class_idx = self.class_info.loc[class_ids, "class_idx"].values class_idx = self.get_hard_prototypes(class_idx) # map back to class ids - class_ids = self.map_class_idx_to_ids.loc[class_idx] + class_ids = self.map_class_idx_to_ids.loc[class_idx, "id"].values return class_ids - def _sample_segs0(self, class_ids, chunk_length): - - seg_ids = [] - for c in class_ids: - # for each class we sample segments longer than chunk length - # get segments belonging to c - # t1 = time.time() - segs_c = self.map_class_to_segs.loc[c] - # seg_idx_c = self.map_class_to_segs.index.get_loc(c) - if self.class_info.loc[c, "min_seg_duration"] < chunk_length: - segs_c = segs_c[segs_c[self.length_name] >= chunk_length] - # seg_idx_c = seg_idx_c[self.seg_set.loc[seg_idx_c, self.length_name]>chunk_length] - - # t2 = time.time() - seg_ids_c = segs_c["id"].values - # seg_ids_c = self.seg_set.loc[seg_idx_c, "id"].values - # t3 = time.time() - # sample num_segs_per_class random segments - if len(seg_ids_c) == 0: - print(chunk_length, c, self.class_info.loc[c], flush=True) - if self.seg_weight_mode == "uniform": - sel_seg_idx_c = torch.randint( - low=0, - high=len(seg_ids_c), - size=(self.num_segs_per_class,), - generator=self.rng, - ).numpy() - - elif self.seg_weight_mode == "data-prior": - # weights = self.seg_set.loc[seg_mask, self.length_name].values - weights = segs_c[self.length_name].values - # weights = self.seg_set.loc[seg_idx_c, self.length_name].values - weights /= weights.sum() - sel_seg_idx_c = torch.multinomial( - 
torch.from_numpy(weights), - num_samples=self.num_segs_per_class, - replacement=True, - generator=self.rng, - ).numpy() - # t4 = time.time() - else: - raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) - sel_seg_ids_c = list(seg_ids_c[sel_seg_idx_c]) - # t5 = time.time() - seg_ids.extend(sel_seg_ids_c) - # t6 = time.time() - # logging.info( - # "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 - # ) - - return seg_ids - def _sample_segs(self, class_ids, chunk_length): dur_col_idx = self.seg_set.get_col_idx(self.length_name) @@ -334,21 +285,21 @@ def _sample_segs(self, class_ids, chunk_length): for c in class_ids: # for each class we sample segments longer than chunk length # get segments belonging to c - t1 = time.time() + # t1 = time.time() seg_idx_c = self.map_class_to_segs_idx[c] - t2 = time.time() + # t2 = time.time() durs = self.seg_set.iloc[seg_idx_c, dur_col_idx].values if self.class_info.loc[c, "min_seg_duration"] < chunk_length: mask = durs >= chunk_length seg_idx_c = seg_idx_c[mask] durs = durs[mask] - t3 = time.time() + # t3 = time.time() # sample num_segs_per_class random segments if len(seg_idx_c) == 0: - print(chunk_length, c, self.class_info.loc[c], flush=True) + logging.error("no segments found with class=%s dur=%d", c, chunk_length) if self.seg_weight_mode == "uniform": - sel_seg_idx_c = torch.randint( + sel_idx = torch.randint( low=0, high=len(seg_idx_c), size=(self.num_segs_per_class,), @@ -357,23 +308,24 @@ def _sample_segs(self, class_ids, chunk_length): elif self.seg_weight_mode == "data-prior": weights = durs / durs.sum() - sel_seg_idx_c = torch.multinomial( + sel_idx = torch.multinomial( torch.from_numpy(weights), num_samples=self.num_segs_per_class, replacement=True, generator=self.rng, ).numpy() - t4 = time.time() + # t4 = time.time() else: raise ValueError("unknown seg-weight-mode=%s", self.seg_weight_mode) + sel_seg_idx_c = seg_idx_c[sel_idx] sel_seg_ids_c = list(self.seg_set.iloc[sel_seg_idx_c, id_col_idx]) - t5 = time.time() + # t5 = time.time() seg_ids.extend(sel_seg_ids_c) - t6 = time.time() - logging.info( - "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 - ) + # t6 = time.time() + # logging.info( + # "stime %f %f %f %f %f", t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5 + # ) return seg_ids @@ -395,12 +347,33 @@ def __next__(self): if self.batch == self._len: raise StopIteration + # t1 = time.time() chunk_length = self._sample_chunk_length() + # t2 = time.time() batch_size = self._compute_batch_size(chunk_length) + # t3 = time.time() num_classes = self._compute_num_classes_per_batch(batch_size) + # t4 = time.time() class_ids = self._sample_classes(num_classes, chunk_length) + # t5 = time.time() seg_ids = self._sample_segs(class_ids, chunk_length) + # t6 = time.time() chunks = self._sample_chunks(seg_ids, chunk_length) + # t7 = time.time() + # print( + # "next", + # t2 - t1, + # t3 - t2, + # t4 - t3, + # t5 - t4, + # t6 - t5, + # t7 - t6, + # batch_size, + # num_classes, + # self.min_batch_size, + # len(chunks), + # flush=True, + # ) if self.batch == 0: logging.info("batch 0 uttidx=%s", str(chunks[:10])) diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index 3093a532..251d937b 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -132,7 +132,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--min-batch-size", type=int, - default=1, + default=64, help=("minimum batch size per gpu"), ) 
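The sampler rework in this patch routes all weighted picks through torch.multinomial.
A toy, self-contained sketch of the data-prior segment draw (illustrative durations,
not the sampler's real fields):

    import numpy as np
    import torch

    durs = np.array([2.0, 6.0, 12.0])   # toy segment durations in seconds
    weights = durs / durs.sum()         # data-prior mode: longer segments drawn more often
    sel = torch.multinomial(
        torch.from_numpy(weights), num_samples=4, replacement=True
    )
    print(sel.tolist())                 # e.g. [2, 1, 2, 0]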
        parser.add_argument(
@@ -146,7 +146,7 @@

     parser.add_argument(
         "--batch-size",
-        default=128,
+        default=None,
         type=int,
         help=("deprecated, use min-batch-size instead"),
     )
diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py
index b38bd3fe..217f1f9a 100644
--- a/hyperion/utils/info_table.py
+++ b/hyperion/utils/info_table.py
@@ -39,6 +39,12 @@ def clone(self):
     def __len__(self):
         return self.df.__len__

+    def __str__(self):
+        return self.df.__str__()
+
+    def __repr__(self):
+        return self.df.__repr__()
+
     @property
     def iat(self):
         return self.df.iat
@@ -397,6 +403,9 @@ def reset_index(self):
         self.df.set_index("id", drop=False, inplace=True)

     def get_loc(self, keys):
+        if isinstance(keys, (list, np.ndarray)):
+            return self.df.index.get_indexer(keys)
+
         loc = self.df.index.get_loc(keys)
         if isinstance(loc, int):
             return loc
From 115d00ed9d8bb47ff0d997b6ad20fea639ec03bd Mon Sep 17 00:00:00 2001
From: neillu23
Date: Mon, 24 Oct 2022 00:21:01 -0400
Subject: [PATCH 037/154] Add Mix Musan noise, and update the transducer data
 preparation

---
 egs/librispeech/v1/conf/clsp.conf             |  11 +
 egs/librispeech/v1/conf/fbank80_16k.yaml      |   7 -
 egs/librispeech/v1/conf/reverb_noise_aug.yaml |  35 ++++
 ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml |  45 +++++
 .../conf/wav2vec2base960h_ecapatdnn512x2.yaml |  37 ----
 .../v1/global_conf/config_transducer_v1.sh    |   8 +-
 ...nn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh |  55 -----
 egs/librispeech/v1/local/make_musan.py        | 189 ++++++++++++++++++
 egs/librispeech/v1/local/make_musan.sh        |  48 +++++
 egs/librispeech/v1/local/make_rirs_data.sh    |  29 +++
 egs/librispeech/v1/run_003_compute_fbank.sh   |  67 -------
 .../v1/run_003_prepare_noises_rirs.sh         |  67 +++++++
 .../v1/run_010_prepare_asr_train_data.sh      |  42 ----
 egs/librispeech/v1/run_011_train_asr.sh       |   4 +-
 egs/librispeech/v1/steps_xvec                 |   1 +
 hyperion/bin/train_wav2vec2transducer.py      |  18 +-
 hyperion/bin/train_wav2vec2xvector.py         |   2 +-
 hyperion/torch/data/audio_dataset.py          |  68 ++++---
 hyperion/utils/info_table.py                  |   7 +-
 hyperion/utils/text_info.py                   |  56 +++---
 20 files changed, 511 insertions(+), 285 deletions(-)
 create mode 100644 egs/librispeech/v1/conf/clsp.conf
 delete mode 100644 egs/librispeech/v1/conf/fbank80_16k.yaml
 create mode 100644 egs/librispeech/v1/conf/reverb_noise_aug.yaml
 create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml
 delete mode 100644 egs/librispeech/v1/conf/wav2vec2base960h_ecapatdnn512x2.yaml
 delete mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh
 create mode 100755 egs/librispeech/v1/local/make_musan.py
 create mode 100755 egs/librispeech/v1/local/make_musan.sh
 create mode 100755 egs/librispeech/v1/local/make_rirs_data.sh
 delete mode 100755 egs/librispeech/v1/run_003_compute_fbank.sh
 create mode 100755 egs/librispeech/v1/run_003_prepare_noises_rirs.sh
 delete mode 100755 egs/librispeech/v1/run_010_prepare_asr_train_data.sh
 create mode 120000 egs/librispeech/v1/steps_xvec

diff --git a/egs/librispeech/v1/conf/clsp.conf b/egs/librispeech/v1/conf/clsp.conf
new file mode 100644
index 00000000..4ed38246
--- /dev/null
+++ b/egs/librispeech/v1/conf/clsp.conf
@@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1 # Do not add anything to qsub_opts
+option
max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/librispeech/v1/conf/fbank80_16k.yaml b/egs/librispeech/v1/conf/fbank80_16k.yaml deleted file mode 100644 index 88bae69e..00000000 --- a/egs/librispeech/v1/conf/fbank80_16k.yaml +++ /dev/null @@ -1,7 +0,0 @@ -sample_frequency: 16000 -frame_length: 25 -low_freq: 20 -high_freq: 7600 -num_filters: 80 -snip_edges: false -use_energy: false diff --git a/egs/librispeech/v1/conf/reverb_noise_aug.yaml b/egs/librispeech/v1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/librispeech/v1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml new file mode 100644 index 00000000..737f42cf --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -0,0 +1,45 @@ +data: + train: + dataset: + # max_chunk_length: 3.0 + # min_chunk_length: 3.0 + aug_cfgs: [conf/reverb_noise_aug.yaml] + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 + val: + dataset: + # max_chunk_length: 4.0 + # min_chunk_length: 4.0 + aug_cfgs: [conf/reverb_noise_aug.yaml] + wav_scale: 1 + sampler: + batch_size: 32 + iters_per_epoch: 6 + data_loader: + num_workers: 8 +model: +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-4 + warmup_steps: 1500 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/wav2vec2base960h_ecapatdnn512x2.yaml b/egs/librispeech/v1/conf/wav2vec2base960h_ecapatdnn512x2.yaml deleted file mode 100644 index 85964372..00000000 --- a/egs/librispeech/v1/conf/wav2vec2base960h_ecapatdnn512x2.yaml +++ /dev/null @@ -1,37 +0,0 @@ -hf_feats: - pretrained_model_path: facebook/wav2vec2-base-960h -xvector: - resnet_enc: - in_feats: 80 - in_conv_channels: 512 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - resb_channels: - - 512 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - resb_strides: - - 1 - res2net_width_factor: 1 - res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 1536 - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 256 - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 20.0 - dropout_rate: 0.0 -feat_fusion_start: 2 -feat_fusion_method: weighted-avg diff --git 
a/egs/librispeech/v1/global_conf/config_transducer_v1.sh b/egs/librispeech/v1/global_conf/config_transducer_v1.sh index c0a07257..e6f7eac0 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v1.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v1.sh @@ -4,14 +4,14 @@ hf_model_name=wav2vec2xlsr300m #vad -vad_config=conf/vad_16k.yaml +# vad_config=conf/vad_16k.yaml # x-vector training nnet_data=train_clean_100 # x-vector cfg -nnet_type=hf_wav2vec2resnet1d +nnet_type=hf_wav2vec2transducer nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s1_args="" @@ -22,13 +22,13 @@ nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/model_ep0060.pth -nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v1.0.yaml +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name nnet_s2=$nnet_s2_dir/model_ep0020.pth -nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage3_v1.0.yaml +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s3_args="" nnet_s3_name=${nnet_name}.s3 nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh deleted file mode 100644 index 942fb336..00000000 --- a/egs/librispeech/v1/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh +++ /dev/null @@ -1,55 +0,0 @@ -# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wav2vec2base - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wav2vec2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - - -lr=0.001 -#lr=0.005 -xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --model conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v12 #v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/model_ep0030.pth -nnet=$nnet_dir/model_ep0040.pth -nnet=$nnet_dir/model_ep0020.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/librispeech/v1/local/make_musan.py b/egs/librispeech/v1/local/make_musan.py new file mode 100755 index 00000000..b0ae6846 --- /dev/null +++ b/egs/librispeech/v1/local/make_musan.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. 
+# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + + +def prepare_music(root_dir, fs, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_speech(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def prepare_noise(root_dir, fs): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had 
missing wav data", + ) + return utt2spk_str, utt2wav_str + + +def main(): + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/v1/local/make_musan.sh b/egs/librispeech/v1/local/make_musan.sh new file mode 100755 index 00000000..4a6d30f9 --- /dev/null +++ b/egs/librispeech/v1/local/make_musan.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +use_vocals='Y' + +. parse_options.sh || exit 1; + +if [ $# -ne 3 ];then + echo "Usage: $0 [options] "; + echo "e.g.: $0 /export/corpora/JHU/musan 8 data" + exit 1; +fi + +in_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir/musan.tmp + +echo "Preparing ${data_dir}/musan..." +mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf $data_dir/musan.tmp + diff --git a/egs/librispeech/v1/local/make_rirs_data.sh b/egs/librispeech/v1/local/make_rirs_data.sh new file mode 100755 index 00000000..c6652eda --- /dev/null +++ b/egs/librispeech/v1/local/make_rirs_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# +# Apache 2.0. 
+set -e + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom" +fi + +rir_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir + +rir_list=$rir_dir/rir_list +if [ "$fs" -eq 16 ];then + awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp +else + awk '{ +key=$5; sub(/.*\//,"",key); +print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \ + $rir_list > $data_dir/wav.scp +fi +awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room + diff --git a/egs/librispeech/v1/run_003_compute_fbank.sh b/egs/librispeech/v1/run_003_compute_fbank.sh deleted file mode 100755 index 0f5966a8..00000000 --- a/egs/librispeech/v1/run_003_compute_fbank.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -nodes=fs01 -storage_name=$(date +'%m_%d_%H_%M') -fbankdir=`pwd`/exp/fbank - -stage=1 -config_file=default_config.sh -feat_vers="numpy" - -. parse_options.sh || exit 1; - -if [ "$feat_vers" == "kaldi" ];then - make_fbank=steps/make_fbank.sh - fbank_cfg=conf/fbank80_16k.conf -else - fbank_cfg=conf/fbank80_16k.yaml - if [ "$feat_vers" == "numpy" ];then - make_fbank=steps_pyfe/make_fbank.sh - else - make_fbank=steps_pyfe/make_torch_fbank.sh - fi -fi - - -# Make filterbanks -if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $fbankdir/storage ]; then - dir_name=$USER/hyp-data/librispeech/v1/$storage_name/fbank/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $fbankdir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $fbankdir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $fbankdir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $fbankdir/storage - else - echo "we don't distribute data between multiple machines" - fi - fi -fi - -if [ $stage -le 2 ];then - for name in dev_clean test_clean dev_other test_other train_clean_100 train_clean_360 train_other_500; - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - $make_fbank --write-utt2num-frames true --fbank-config $fbank_cfg --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_fbank/$name $fbankdir - utils/fix_data_dir.sh data/${name} - done - -fi - diff --git a/egs/librispeech/v1/run_003_prepare_noises_rirs.sh b/egs/librispeech/v1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..a448af9a --- /dev/null +++ b/egs/librispeech/v1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. 
+ local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/librispeech/v1/run_010_prepare_asr_train_data.sh b/egs/librispeech/v1/run_010_prepare_asr_train_data.sh deleted file mode 100755 index 5936fbf4..00000000 --- a/egs/librispeech/v1/run_010_prepare_asr_train_data.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - -if [ $stage -le 2 ]; then - # This script preprocess audio for x-vector training - steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ - --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ - data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil - hyp_utils/kaldi/utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil - -fi - -if [ $stage -le 3 ]; then - # Now, we remove files with less than 4s - hyp_utils/remove_short_audios.sh --min-len 4 data/${nnet_data}_proc_audio_no_sil - - # We also want several utterances per speaker. Now we'll throw out speakers - # with fewer than 4 utterances. 
- hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh \ - data/${nnet_data}_proc_audio_no_sil \ - data/${nnet_data}_proc_audio_no_sil/lists_xvec -fi - -exit diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh index cd68587e..85d2e918 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -51,12 +51,12 @@ if [ $stage -le 1 ]; then --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.time-durs-file $train_dir/utt2dur \ - --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ --data.val.dataset.time-durs-file $val_dir/utt2dur \ - --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ --num-gpus $ngpu + # --data.train.dataset.text-file $train_dir/text \ + # --data.val.dataset.text-file $val_dir/text \ fi diff --git a/egs/librispeech/v1/steps_xvec b/egs/librispeech/v1/steps_xvec new file mode 120000 index 00000000..289276b7 --- /dev/null +++ b/egs/librispeech/v1/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors/ \ No newline at end of file diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 01db6960..07a6a31a 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -116,7 +116,7 @@ def make_parser(model_class): parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") - AD.add_class_args(train_parser, prefix="dataset", skip={}) + AD.add_class_args(train_parser, prefix="dataset", skip={"segments_file"}) Sampler.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", @@ -126,7 +126,7 @@ def make_parser(model_class): ) val_parser = ArgumentParser(prog="") - AD.add_class_args(val_parser, prefix="dataset", skip={}) + AD.add_class_args(val_parser, prefix="dataset", skip={"segments_file"}) Sampler.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", @@ -139,10 +139,16 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.train.dataset.text_file", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.val.dataset.text_file", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.train.data_loader.num_workers", action=ActionParser(parser=data_parser)) - parser.add_argument("--data.val.data_loader.num_workers", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + parser.add_argument("--data.val.dataset.text_file", type=str) + parser.add_argument("--data.train.data_loader.num_workers", type=int, + default=5,) + parser.add_argument("--data.val.data_loader.num_workers", type=int, + default=5,) parser.add_argument( "--bpe-model", diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 8c30faaf..d2ef9715 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -145,7 +145,7 @@ def make_parser(model_class): parser.add_argument("--data", 
action=ActionParser(parser=data_parser)) parser.link_arguments( - "data.train.dataset.class_file", "data.val.dataset.class_file" + "data.train.dataset.class_files", "data.val.dataset.class_files" ) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 058b7902..58905ef8 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -26,7 +26,6 @@ import torch.distributed as dist from hyperion.np import augment -from hyperion.utils.util import read_2column_text class AudioDataset1(Dataset): @@ -460,7 +459,7 @@ class AudioDataset(Dataset): def __init__( self, audio_file, - segments_file, + segments_file=None, class_names=None, class_files=None, bpe_model=None, @@ -491,24 +490,25 @@ def __init__( self.r = AR(audio_file, wav_scale=wav_scale) + if rank == 0: logging.info("loading segments file %s" % segments_file) - - self.seg_set = SegmentSet.load(segments_file) - if rank == 0: - logging.info("dataset contains %d seqs" % len(self.seg_set)) - - self.is_val = is_val - if time_durs_file is not None: + if segments_file is not None: + self.seg_set = SegmentSet.load(segments_file) if rank == 0: - logging.info("loading durations file %s" % time_durs_file) + logging.info("dataset contains %d seqs" % len(self.seg_set)) - time_durs = SegmentSet.load(time_durs_file) - self.seg_set["duration"] = time_durs.loc[ - self.seg_set["id"] - ].class_id.values.astype(np.float, copy=False) - else: - assert "duration" in self.seg_set + self.is_val = is_val + if time_durs_file is not None: + if rank == 0: + logging.info("loading durations file %s" % time_durs_file) + + time_durs = SegmentSet.load(time_durs_file) + self.seg_set["duration"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(np.float, copy=False) + else: + assert "duration" in self.seg_set logging.info("loading class-info files") self._load_class_infos(class_names, class_files, is_val) @@ -520,7 +520,7 @@ def __init__( if text_files is not None: logging.info("loading text files") - self._load_text(text_files, is_val) + self._load_text_infos(text_files, is_val) self.return_segment_info = ( [] if return_segment_info is None else return_segment_info @@ -537,7 +537,7 @@ def _load_bpe_model(self, bpe_model, is_val): blank_id = self.sp.piece_to_id("") vocab_size = self.sp.get_piece_size() - def _load_text(self, text_file, is_val): + def _load_text_infos(self, text_file, is_val): #TODO: load bpe and text into data structure if text_file is None: return @@ -682,19 +682,22 @@ def _get_text_info(self, seg_id): # converts the class_ids to integers for info_name in self.return_segment_info: seg_info = self.seg_set.loc[seg_id, info_name] - if info_name in self.class_info: - # if the type of information is a class-id - # we use the class information table to - # convert from id to integer - class_info = self.class_info[info_name] - idx = class_info.loc[seg_info, "class_idx"] + if info_name in self.text_info: + # if the type of information is a text + # we use the text information table to + # convert from id to text labels + text_info = self.text_info[info_name] + idx = text_info.loc[seg_info, "class_idx"] seg_info = idx + y = sp.encode(text, out_type=int) + y = k2.RaggedTensor(y).to(device) - r.append(seg_info) + r.append(y) return r def __getitem__(self, segment): + #TODO: check the start/end time for Recognition seg_id, start, duration = self._parse_segment_item(segment) x, fs = 
self._read_audio(seg_id, start, duration) if self.augmenters: @@ -712,9 +715,10 @@ def __getitem__(self, segment): else: r = [x] - # adds the segment labels - seg_info = self._get_segment_info(seg_id) - r.extend(seg_info) + #TODO: Add it back for both case + # # adds the segment labels + # seg_info = self._get_segment_info(seg_id) + # r.extend(seg_info) # adds the text labels text_info = self._get_text_info(seg_id) @@ -787,6 +791,14 @@ def add_class_args(parser, prefix=None, skip={}): ), ) + parser.add_argument( + "--text-file", + default=None, + help=( + "text file" + ), + ) + parser.add_argument( "--aug-cfgs", default=None, diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 25632941..4433327d 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -88,13 +88,14 @@ def save(self, file_path, sep=None): self.df.to_csv(file_path, sep=sep, index=False) @classmethod - def load(cls, file_path, sep=None): + def load(cls, file_path, sep=None, name="class_id"): """Loads utt2info list from text file. Args: file_path: File to read the list. sep: Separator between the key and file_path in the text file. dtype: Dictionary with the dtypes of each column. + name: name for the data to be loaded Returns: Utt2Info object """ @@ -106,8 +107,8 @@ def load(cls, file_path, sep=None): file_path, sep=" ", header=None, - names=["id", "class_id"], - dtype={"id": np.str, "class_id": np.str}, + names=["id", name], + dtype={"id": np.str, name: np.str}, ) else: if sep is None: diff --git a/hyperion/utils/text_info.py b/hyperion/utils/text_info.py index 44a71d0a..4d7e2c3a 100644 --- a/hyperion/utils/text_info.py +++ b/hyperion/utils/text_info.py @@ -42,18 +42,11 @@ def read_2column_text(path: Union[Path, str]) -> Dict[str, str]: class TextInfo(InfoTable): def __init__(self, df): super().__init__(df) - if "class_idx" not in self.df: - self.add_class_idx() - if "weights" not in self.df: self.set_uniform_weights() else: self.df["weights"] /= self.df["weigths"].sum() - - def add_class_idx(self): - self.df["class_idx"] = [i for i in range(len(self.df))] - def set_uniform_weights(self): self.df["weights"] = 1 / len(self.df) @@ -77,38 +70,35 @@ def num_classes(self): return self.df["class_idx"].values.max() + 1 @classmethod - def load(cls, file_path, sp): - #TODO: load text information + def load(cls, file_path, sp, sep=None): """Loads utt2info list from text file. Args: file_path: File to read the list. - sp: SentencePieceProcessor for bpe. + sp: SentencePieceProcessor from the BPE model + sep: Separator between the key and file_path in the text file. + dtype: Dictionary with the dtypes of each column. Returns: Utt2Info object """ - # y: k2.RaggedTensor, - # A ragged tensor with 2 axes [utt][label]. It contains labels of each utterance. - - texts = read_2column_text(file_path) - # {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'} - for utterance_id in texts: - texts[utterance_id] - - y = sp.encode(texts, out_type=int) - y = k2.RaggedTensor(y).to(device) + #TODO: load text information + """Loads utt2info list from text file. + Args: + file_path: File to read the list. + sp: SentencePieceProcessor for bpe. + Returns: + Utt2Info object + """ + # # y: k2.RaggedTensor, + # # A ragged tensor with 2 axes [utt][label]. It contains labels of each utterance. 
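For reference, the encoding sketched in the surrounding commented-out lines would look roughly like this once re-enabled (illustrative only: the BPE model path is taken from the recipe config, and `texts` stands in for the loaded transcripts; this is not part of the patch):

    import k2
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load("data/lang_bpe_1000/bpe.model")      # bpe_model path from the recipe config
    texts = ["HELLO WORLD", "GOOD MORNING"]      # one transcript per utterance
    token_ids = sp.encode(texts, out_type=int)   # list of lists of BPE token ids
    y = k2.RaggedTensor(token_ids)               # ragged tensor with axes [utt][label]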
+ # y = sp.encode(texts, out_type=int) + # y = k2.RaggedTensor(y).to(device) file_path = Path(file_path) - ext = file_path.suffix - if ext == "": - # if no extension we load as kaldi utt2spk file - df = pd.read_csv( - file_path, - sep=" ", - header=None, - names=["id"], - dtype={"id": np.str}, - ) - return cls(df) - - return super().load(file_path, sep) + text_df = super().load(file_path, sep, name="text_label") + # for i, text in enumerate(text_df["text_label"]): + # y = sp.encode(text, out_type=int) + # y = k2.RaggedTensor(y).to(device) + # text_df["text_label"][i] = y + + return text_df From dac0cc5e3c152b19e7fe23c75a7499b77eaa6bfb Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 25 Oct 2022 10:53:36 -0400 Subject: [PATCH 038/154] fix weighted sampler for absent classes, and change bucket creation --- hyperion/torch/data/bucketing_seg_sampler.py | 19 +++++++------------ .../data/class_weighted_seg_chunk_sampler.py | 16 ++++++++++++---- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 85b6772e..224660bb 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -36,30 +36,25 @@ def __init__( self._compute_len() self.depleted_buckets = torch.zeros((num_buckets,), dtype=torch.bool) - @staticmethod - def create_buckets(self, seg_ids, seg_lengths): - sort_idx = torch.argsort(seg_lengths) - sort_ids = seg_ids[sort_idx] - sort_lengths = seg_lengths[sort_ids] - cum_lengths = torch.cumsum(sort_lengths) + def create_buckets(self): + sort_idx = torch.argsort(self.seg_set[self.length_column].values) + sorted_seg_set = self.seg_set.iloc[sort_idx] + cum_lengths = torch.cumsum(sorted_seg_set[self.length_column].values) + bucket_length = cum_lengths[-1] / self.num_buckets buckets = [] for i in range(self.num_buckets): bucket_idx = (cum_lengths <= bucket_length) & (cum_lengths > 0) - bucket_i = sort_ids[bucket_idx] + bucket_i = sorted_seg_set.loc[bucket_idx] buckets.append(bucket_i) cum_lengths -= bucket_length return buckets def _create_bucket_samplers(self): - buckets = self.create_buckets( - self.dataset["ids"], self.dataset[self.length_column] - ) + buckets = self.create_buckets() bucket_samplers = [] for i in range(self.num_buckets): - dataset_i = self.dataset.create_bucket(buckets[i]) - sampler_i = self.base_sampler(dataset_i, self.seed, **self.base_kwargs) + sampler_i = self.base_sampler(buckets[i], self.seed, **self.base_kwargs) bucket_samplers.append(sampler_i) self.bucket_samplers = bucket_samplers diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 27ad4d33..05b222c7 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -213,6 +213,17 @@ def set_hard_prototypes(self, affinity_matrix): self.hard_prototypes = None return + # don't sample hard negs from classes with zero weight or absent + zero_w = self.class_info["weights"] == 0 + if np.any(zero_w): + zero_w_idx = self.class_info.loc[zero_w, "class_idx"].values + affinity_matrix[:, zero_w_idx] = -1000 + + for i in range(affinity_matrix.size(1)): + mask_i = self.class_info["class_idx"] == i + if np.all(mask_i == 0): + affinity_matrix[:, i] = -1000 + + # affinity_matrix[np.diag(affinity_matrix.shape[0])] = -1.0 # hard prototypes for a class are itself and k-1 closest to it.
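# The per-class loop above can also be done without a Python loop over all
# class indices; a vectorized sketch (illustrative only, not part of the patch):
#   present = torch.zeros(affinity_matrix.size(1), dtype=torch.bool)
#   present[torch.as_tensor(self.class_info["class_idx"].values)] = True
#   affinity_matrix[:, ~present] = -1000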
self.hard_prototypes = torch.topk( @@ -260,10 +271,7 @@ def _get_class_weights(self, chunk_length): def _sample_classes(self, num_classes, chunk_length): weights = self._get_class_weights(chunk_length) row_idx = torch.multinomial( - weights, - num_samples=num_classes, - replacement=True, - generator=self.rng, + weights, num_samples=num_classes, replacement=True, generator=self.rng, ).numpy() class_ids = self.class_info.iloc[row_idx].id.values From 1ff37201ef388859084b9c1cc7153803359eb489 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Tue, 25 Oct 2022 22:49:57 -0400 Subject: [PATCH 039/154] update loading text and batch into the transducer --- ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 2 + .../v1/global_conf/config_transducer_v1.sh | 1 + egs/librispeech/v1/local/data_prep.sh | 3 + egs/librispeech/v1/run_011_train_asr.sh | 28 +++++-- hyperion/bin/train_wav2vec2transducer.py | 83 +++++++++++++------ hyperion/torch/data/__init__.py | 1 + hyperion/torch/data/audio_dataset.py | 32 ++++--- hyperion/torch/data/bucketing_seg_sampler.py | 15 ++-- hyperion/utils/utils.py | 1 + 9 files changed, 118 insertions(+), 48 deletions(-) diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index 737f42cf..3c9385c7 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -6,6 +6,7 @@ data: aug_cfgs: [conf/reverb_noise_aug.yaml] wav_scale: 1 sampler: + sampler_type: 'bucketing_seg_sampler' batch_size: 32 iters_per_epoch: 6 data_loader: @@ -17,6 +18,7 @@ data: aug_cfgs: [conf/reverb_noise_aug.yaml] wav_scale: 1 sampler: + sampler_type: 'bucketing_seg_sampler' batch_size: 32 iters_per_epoch: 6 data_loader: diff --git a/egs/librispeech/v1/global_conf/config_transducer_v1.sh b/egs/librispeech/v1/global_conf/config_transducer_v1.sh index e6f7eac0..00b34870 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v1.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v1.sh @@ -9,6 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=train_clean_100 +bpe_model=data/lang_bpe_1000/bpe.model # x-vector cfg nnet_type=hf_wav2vec2transducer diff --git a/egs/librispeech/v1/local/data_prep.sh b/egs/librispeech/v1/local/data_prep.sh index c903d45b..0923aeab 100755 --- a/egs/librispeech/v1/local/data_prep.sh +++ b/egs/librispeech/v1/local/data_prep.sh @@ -72,6 +72,9 @@ done spk2utt=$dst/spk2utt utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 +utils/data/get_utt2dur.sh $dst +awk 'sub(/ *$/, "", $0)' $dst/utt2dur > $dst/utt2dur2 +mv $dst/utt2dur2 $dst/utt2dur ntrans=$(wc -l <$trans) nutt2spk=$(wc -l <$utt2spk) diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh index 85d2e918..61f00be4 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -35,9 +35,9 @@ if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi -# if [ "$use_wandb" == "true" ];then -# extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" -# fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi # Network Training @@ -50,14 +50,26 @@ if [ $stage -le 1 ]; then train_wav2vec2transducer.py $nnet_type \ --cfg 
$nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ - --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ --data.val.dataset.audio-file $val_dir/wav.scp \ - --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ --num-gpus $ngpu - # --data.train.dataset.text-file $train_dir/text \ - # --data.val.dataset.text-file $val_dir/text \ - + +# --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ +# --data.train.dataset.audio-file $list_dir/wav.scp \ +# --data.train.dataset.time-durs-file $list_dir/utt2dur \ +# --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ +# --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ +# --data.val.dataset.audio-file $list_dir/wav.scp \ +# --data.val.dataset.time-durs-file $list_dir/utt2dur \ +# --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ +# --trainer.exp-path $nnet_dir $args \ fi if [ $stage -le 2 ]; then diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 07a6a31a..360c2c0d 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -26,7 +26,8 @@ from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.data import AudioDataset as AD # from hyperion.torch.data import LibriSpeechAsrDataModule as ASRD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +# from hyperion.torch.data import BucketingSegSampler as Sampler +from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import HFWav2Vec2Transducer @@ -38,19 +39,22 @@ def init_data(partition, rank, num_gpus, **kwargs): kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) - sampler_args = Sampler.filter_args(**kwargs["sampler"]) + sampler_args = kwargs["sampler"] if rank == 0: logging.info("{} audio dataset args={}".format(partition, ad_args)) logging.info("{} sampler args={}".format(partition, sampler_args)) logging.info("init %s dataset", partition) - ad_args["is_val"] = partition == "val" + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + print("ad_args", ad_args) dataset = AD(**ad_args) if rank == 0: logging.info("init %s samplers", partition) - - sampler = Sampler(dataset, **sampler_args) + print("sampler_args", sampler_args) + sampler = SegSamplerFactory.create(dataset, **sampler_args) if rank == 0: logging.info("init %s dataloader", partition) @@ -69,7 +73,7 @@ def init_model(num_classes, rank, model_class, **kwargs): if rank == 0: logging.info("model network args={}".format(model_args)) # TODO: check model_args - model_args["transducer"]["num_classes"] = num_classes + model_args["num_classes"] = num_classes model = model_class(**model_args) if rank == 0: logging.info("model={}".format(model)) @@ -86,13 +90,20 @@ def train_model(gpu_id, args): torch.manual_seed(args.seed) set_float_cpu("float32") - ddp_args = ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = 
rank + # ddp_args = ddp.filter_ddp_args(**kwargs) + # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + # kwargs["rank"] = rank + # for Debug + rank = 0 + kwargs["rank"] = 0 + device = "cpu" + world_size=1 + train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.num_classes, **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) + # model = init_model(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -116,8 +127,9 @@ def make_parser(model_class): parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") - AD.add_class_args(train_parser, prefix="dataset", skip={"segments_file"}) - Sampler.add_class_args(train_parser, prefix="sampler") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + # Sampler.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", type=int, @@ -126,8 +138,9 @@ def make_parser(model_class): ) val_parser = ArgumentParser(prog="") - AD.add_class_args(val_parser, prefix="dataset", skip={"segments_file"}) - Sampler.add_class_args(val_parser, prefix="sampler") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + # Sampler.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", type=int, @@ -144,19 +157,41 @@ def make_parser(model_class): "--data.train.dataset.text_file", type=str, ) - parser.add_argument("--data.val.dataset.text_file", type=str) - parser.add_argument("--data.train.data_loader.num_workers", type=int, - default=5,) - parser.add_argument("--data.val.data_loader.num_workers", type=int, - default=5,) parser.add_argument( - "--bpe-model", - type=str, - default="data/lang_bpe_500/bpe.model", - help="Path to the BPE model", + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + + # parser.add_argument( + # "--data.val.dataset.bpe_model", + # type=str, + # ) + + + # parser.add_argument("--data.train.data_loader.num_workers", type=int, + # default=5,) + # parser.add_argument("--data.val.data_loader.num_workers", type=int, + # default=5,) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" ) + + # parser.add_argument( + # "--bpe-model", + # type=str, + # default="data/lang_bpe_500/bpe.model", + # help="Path to the BPE model", + # ) + # parser.link_arguments( # "data.train.dataset.class_file", "data.val.dataset.class_file" # ) @@ -209,5 +244,5 @@ def make_parser(model_class): args_sc.model_class = model_dict[model_type] # torch docs recommend using forkserver - multiprocessing.set_start_method("forkserver") + # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index aebcfe8a..e289acbf 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -10,5 +10,6 @@ from .audio_dataset import AudioDataset # samplers +from .bucketing_seg_sampler import BucketingSegSampler from .weighted_seq_sampler import ClassWeightedSeqSampler from .seg_sampler_factory import 
SegSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 7ec30bc3..11e8cede 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -26,7 +26,7 @@ import torch.distributed as dist from hyperion.np import augment - +import pdb class AudioDataset1(Dataset): def __init__( @@ -453,7 +453,7 @@ def add_class_args(parser, prefix=None, skip={"audio_file", "key_file"}): from ...utils.class_info import ClassInfo from ...utils.segment_set import SegmentSet - +from ...utils.text import read_text class AudioDataset(Dataset): def __init__( @@ -463,7 +463,7 @@ def __init__( class_names=None, class_files=None, bpe_model=None, - text_files=None, + text_file=None, time_durs_file=None, aug_cfgs=None, num_augs=1, @@ -518,10 +518,9 @@ def __init__( logging.info("loading bpe models") self._load_bpe_model(bpe_model, is_val) - if text_files is not None: + if text_file is not None: logging.info("loading text files") - self._load_text_infos(text_files, is_val) - + self._load_text_infos(text_file, is_val) self.return_segment_info = ( [] if return_segment_info is None else return_segment_info ) @@ -532,8 +531,10 @@ def __init__( def _load_bpe_model(self, bpe_model, is_val): + if self.rank == 0: + logging.info("loading bpe file %s" % bpe_model) self.sp = spm.SentencePieceProcessor() - self.sp.load(params.bpe_model) + self.sp.load(bpe_model) blank_id = self.sp.piece_to_id("") vocab_size = self.sp.get_piece_size() @@ -543,7 +544,9 @@ def _load_text_infos(self, text_file, is_val): return if self.rank == 0: logging.info("loading text file %s" % text_file) - self.text_info = TextInfo.load(text_file, self.sp) + + text = read_text(text_file) + self.seg_set["text"] = text.loc[self.seg_set["id"]].text @@ -742,7 +745,8 @@ def filter_args(**kwargs): "num_augs", "class_names", "class_files", - "text_files", + "bpe_model", + "text_file", "return_segment_info", "return_orig", "time_durs_file", @@ -795,11 +799,19 @@ def add_class_args(parser, prefix=None, skip={}): ), ) + parser.add_argument( + "--bpe-model", + default=None, + help=( + "bpe model for the text label" + ), + ) + parser.add_argument( "--text-file", default=None, help=( - "text file" + "text file with words labels for each utterances" ), ) diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 224660bb..37794377 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -8,7 +8,6 @@ import logging import numpy as np - import torch from .hyp_sampler import HypSampler from .seg_sampler import SegSampler @@ -37,14 +36,18 @@ def __init__( self.depleted_buckets = torch.zeros((num_buckets,), dtype=torch.bool) def create_buckets(self): - sort_idx = torch.argsort(self.seg_set[self.length_column].values) + sort_idx = torch.argsort(torch.from_numpy(self.seg_set[self.length_column].values)) sorted_seg_set = self.seg_set.iloc[sort_idx] - cum_lengths = torch.cumsum(sorted_seg_set[self.length_column].values) + cum_lengths = torch.cumsum(torch.from_numpy(sorted_seg_set[self.length_column].values),dim=0) bucket_length = cum_lengths[-1] / self.num_buckets buckets = [] for i in range(self.num_buckets): - bucket_idx = (cum_lengths <= bucket_length) & (cum_lengths > 0) - bucket_i = sorted_seg_set.loc[bucket_idx] + bucket_bool = (cum_lengths <= bucket_length) & (cum_lengths > 0) + bucket_idx = [] + for i, bo in enumerate(bucket_bool): + if bo: + bucket_idx.append(i) + bucket_i = 
sorted_seg_set.iloc[bucket_idx] buckets.append(bucket_i) cum_lengths -= bucket_length @@ -54,7 +57,7 @@ def _create_bucket_samplers(self): buckets = self.create_buckets() bucket_samplers = [] for i in range(self.num_buckets): - sampler_i = self.base_sampler(buckets[i], self.seed, **self.base_kwargs) + sampler_i = self.base_sampler(buckets[i], **self.base_kwargs) bucket_samplers.append(sampler_i) self.bucket_samplers = bucket_samplers diff --git a/hyperion/utils/utils.py b/hyperion/utils/utils.py index 1663fb03..2da78581 100644 --- a/hyperion/utils/utils.py +++ b/hyperion/utils/utils.py @@ -352,6 +352,7 @@ def store_transcripts( print(f"{cut_id}:\thyp={hyp}", file=f) + def write_error_stats( f: TextIO, test_set_name: str, From a7f1e53063f76f03941cfbedd0ce62bec6b6cc91 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Tue, 25 Oct 2022 23:23:53 -0400 Subject: [PATCH 040/154] initial loading transducer model --- ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 2 +- .../v1/conf/wav2vec2xlsr300m_transducer.yaml | 40 + hyperion/bin/train_wav2vec2transducer.py | 178 +- hyperion/torch/models/transducer/__init__.py | 3 + hyperion/torch/models/transducer/conformer.py | 1450 +++++++++++++++++ hyperion/torch/models/transducer/decoder.py | 97 ++ hyperion/torch/models/transducer/joiner.py | 54 + .../wav2transducer/hf_wav2vec2_transducer.py | 48 +- hyperion/utils/text_info.py | 104 -- 9 files changed, 1814 insertions(+), 162 deletions(-) create mode 100644 egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml create mode 100644 hyperion/torch/models/transducer/conformer.py create mode 100644 hyperion/torch/models/transducer/decoder.py create mode 100644 hyperion/torch/models/transducer/joiner.py delete mode 100644 hyperion/utils/text_info.py diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index 3c9385c7..e7cfc8ef 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -23,7 +23,7 @@ data: iters_per_epoch: 6 data_loader: num_workers: 8 -model: +model: wav2vec2xlsr300m_transducer.yaml trainer: optim: opt_type: sgd diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml new file mode 100644 index 00000000..fe71a40c --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -0,0 +1,40 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +transducer: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 5 + intertop_margin: 0.1 + dropout_rate: 0.0 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 360c2c0d..7e87c180 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -4,6 +4,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import sys +import pdb 
import os from pathlib import Path from jsonargparse import ( @@ -25,11 +26,14 @@ from hyperion.torch.utils import ddp from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.data import AudioDataset as AD -# from hyperion.torch.data import LibriSpeechAsrDataModule as ASRD -# from hyperion.torch.data import BucketingSegSampler as Sampler from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import HFWav2Vec2Transducer +from hyperion.torch.models.transducer import Conformer +from hyperion.torch.models.transducer import Decoder +from hyperion.torch.models.transducer import Joiner + + model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, @@ -80,6 +84,133 @@ def init_model(num_classes, rank, model_class, **kwargs): return model + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. + + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - feature_dim: The model input dim. It has to match the one used + in computing features. + + - subsampling_factor: The subsampling factor for the model. + + - attention_dim: Hidden dim for multi-head attention model. + + - num_decoder_layers: Number of decoder layer of transformer decoder. + + - warm_step: The warm_step for Noam optimizer. 
+ """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 50, + "reset_interval": 200, + "valid_interval": 3000, # For the 100h subset, use 800 + # parameters for conformer + "feature_dim": 80, + "encoder_out_dim": 512, + "subsampling_factor": 4, + "attention_dim": 512, + "nhead": 8, + "dim_feedforward": 2048, + "num_encoder_layers": 12, + "vgg_frontend": False, + # decoder params + "decoder_embedding_dim": 1024, + "num_decoder_layers": 2, + "decoder_hidden_dim": 512, + # parameters for Noam + "warm_step": 80000, # For the 100h subset, use 8k + "env_info": get_env_info(), + } + ) + + return params + + +def get_encoder_model(params: AttributeDict): + # TODO: We can add an option to switch between Conformer and Transformer + encoder = Conformer( + num_features=params.feature_dim, + output_dim=params.encoder_out_dim, + subsampling_factor=params.subsampling_factor, + d_model=params.attention_dim, + nhead=params.nhead, + dim_feedforward=params.dim_feedforward, + num_encoder_layers=params.num_encoder_layers, + vgg_frontend=params.vgg_frontend, + ) + return encoder + + +def get_decoder_model(params: AttributeDict): + decoder = Decoder( + vocab_size=params.vocab_size, + embedding_dim=params.decoder_embedding_dim, + blank_id=params.blank_id, + num_layers=params.num_decoder_layers, + hidden_dim=params.decoder_hidden_dim, + output_dim=params.encoder_out_dim, + ) + return decoder + + +def get_joiner_model(params: AttributeDict): + joiner = Joiner( + input_dim=params.encoder_out_dim, + output_dim=params.vocab_size, + ) + return joiner + + +def get_transducer_model(params: AttributeDict): + encoder = get_encoder_model(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = Transducer( + encoder=encoder, + decoder=decoder, + joiner=joiner, + ) + return model + + + def train_model(gpu_id, args): config_logger(args.verbose) @@ -90,20 +221,20 @@ def train_model(gpu_id, args): torch.manual_seed(args.seed) set_float_cpu("float32") - # ddp_args = ddp.filter_ddp_args(**kwargs) - # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - # kwargs["rank"] = rank + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank - # for Debug - rank = 0 - kwargs["rank"] = 0 - device = "cpu" - world_size=1 + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) - # model = init_model(train_loader.dataset.num_classes, **kwargs) + # model = init_model(train_loader.dataset.num_classes.values())[0], **kwargs) + model = init_model(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -163,19 +294,8 @@ def make_parser(model_class): type=str, ) - parser.add_argument("--data.val.dataset.text_file", type=str) - - - # parser.add_argument( - # "--data.val.dataset.bpe_model", - # type=str, - # ) - - - # parser.add_argument("--data.train.data_loader.num_workers", type=int, - # default=5,) - # parser.add_argument("--data.val.data_loader.num_workers", type=int, - # default=5,) + parser.add_argument("--data.val.dataset.text_file", type=str) + parser.link_arguments( "data.train.data_loader.num_workers", 
"data.val.data_loader.num_workers" ) @@ -184,14 +304,6 @@ def make_parser(model_class): "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" ) - - # parser.add_argument( - # "--bpe-model", - # type=str, - # default="data/lang_bpe_500/bpe.model", - # help="Path to the BPE model", - # ) - # parser.link_arguments( # "data.train.dataset.class_file", "data.val.dataset.class_file" # ) diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py index 20372911..ee711a8d 100644 --- a/hyperion/torch/models/transducer/__init__.py +++ b/hyperion/torch/models/transducer/__init__.py @@ -5,3 +5,6 @@ """ from .transducer import Transducer +from .conformer import Conformer +from .decoder import Decoder +from .joiner import Joiner \ No newline at end of file diff --git a/hyperion/torch/models/transducer/conformer.py b/hyperion/torch/models/transducer/conformer.py new file mode 100644 index 00000000..2bf63320 --- /dev/null +++ b/hyperion/torch/models/transducer/conformer.py @@ -0,0 +1,1450 @@ +#!/usr/bin/env python3 +# Copyright (c) 2021 University of Chinese Academy of Sciences (author: Han Zhu) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import math +import warnings +from typing import List, Optional, Tuple + +import torch +from torch import Tensor, nn +from transformer import Transformer + +from icefall.utils import make_pad_mask, subsequent_chunk_mask + + +class Conformer(Transformer): + """ + Args: + num_features (int): Number of input features + output_dim (int): Number of output dimension + subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers) + d_model (int): attention dimension + nhead (int): number of head + dim_feedforward (int): feedforward dimention + num_encoder_layers (int): number of encoder layers + dropout (float): dropout rate + cnn_module_kernel (int): Kernel size of convolution module + normalize_before (bool): whether to use layer_norm before the first block. + vgg_frontend (bool): whether to use vgg frontend. + dynamic_chunk_training (bool): whether to use dynamic chunk training, if + you want to train a streaming model, this is expected to be True. + When setting True, it will use a masking strategy to make the attention + see only limited left and right context. + short_chunk_threshold (float): a threshold to determinize the chunk size + to be used in masking training, if the randomly generated chunk size + is greater than ``max_len * short_chunk_threshold`` (max_len is the + max sequence length of current batch) then it will use + full context in training (i.e. with chunk size equals to max_len). + This will be used only when dynamic_chunk_training is True. + short_chunk_size (int): see docs above, if the randomly generated chunk + size equals to or less than ``max_len * short_chunk_threshold``, the + chunk size will be sampled uniformly from 1 to short_chunk_size. 
+ This also will be used only when dynamic_chunk_training is True. + num_left_chunks (int): the left context (in chunks) attention can see, the + chunk size is decided by short_chunk_threshold and short_chunk_size. + A minus value means seeing full left context. + This also will be used only when dynamic_chunk_training is True. + causal (bool): Whether to use causal convolution in conformer encoder + layer. This MUST be True when using dynamic_chunk_training. + """ + + def __init__( + self, + num_features: int, + output_dim: int, + subsampling_factor: int = 4, + d_model: int = 256, + nhead: int = 4, + dim_feedforward: int = 2048, + num_encoder_layers: int = 12, + dropout: float = 0.1, + cnn_module_kernel: int = 31, + normalize_before: bool = True, + vgg_frontend: bool = False, + dynamic_chunk_training: bool = False, + short_chunk_threshold: float = 0.75, + short_chunk_size: int = 25, + num_left_chunks: int = -1, + causal: bool = False, + ) -> None: + super(Conformer, self).__init__( + num_features=num_features, + output_dim=output_dim, + subsampling_factor=subsampling_factor, + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + num_encoder_layers=num_encoder_layers, + dropout=dropout, + normalize_before=normalize_before, + vgg_frontend=vgg_frontend, + ) + + self.encoder_layers = num_encoder_layers + self.d_model = d_model + self.cnn_module_kernel = cnn_module_kernel + self.causal = causal + + self.dynamic_chunk_training = dynamic_chunk_training + self.short_chunk_threshold = short_chunk_threshold + self.short_chunk_size = short_chunk_size + self.num_left_chunks = num_left_chunks + + self.encoder_pos = RelPositionalEncoding(d_model, dropout) + + encoder_layer = ConformerEncoderLayer( + d_model, + nhead, + dim_feedforward, + dropout, + cnn_module_kernel, + normalize_before, + causal, + ) + self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers) + self.normalize_before = normalize_before + if self.normalize_before: + self.after_norm = nn.LayerNorm(d_model) + else: + # Note: TorchScript detects that self.after_norm could be used inside forward() + # and throws an error without this change. + self.after_norm = identity + + self._init_state: List[torch.Tensor] = [torch.empty(0)] + + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + Returns: + Return a tuple containing 2 tensors: + - logits, its shape is (batch_size, output_seq_len, output_dim) + - logit_lens, a tensor of shape (batch_size,) containing the number + of frames in `logits` before padding. + """ + x = self.encoder_embed(x) + x, pos_emb = self.encoder_pos(x) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + # Caution: We assume the subsampling factor is 4! + + # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning + # + # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0 + lengths = (((x_lens - 1) >> 1) - 1) >> 1 + + assert x.size(0) == lengths.max().item() + + src_key_padding_mask = make_pad_mask(lengths) + + if self.dynamic_chunk_training: + assert ( + self.causal + ), "Causal convolution is required for streaming conformer." 
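# Dynamic chunk-size sampling, with illustrative numbers (sketch only): for
# max_len = 200, short_chunk_threshold = 0.75 and short_chunk_size = 25, a
# draw of chunk_size = 180 (> 150) keeps full context for this batch, while
# a draw of 60 becomes 60 % 25 + 1 = 11 frames of chunked context.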
+ max_len = x.size(0) + chunk_size = torch.randint(1, max_len, (1,)).item() + if chunk_size > (max_len * self.short_chunk_threshold): + chunk_size = max_len + else: + chunk_size = chunk_size % self.short_chunk_size + 1 + + mask = ~subsequent_chunk_mask( + size=x.size(0), + chunk_size=chunk_size, + num_left_chunks=self.num_left_chunks, + device=x.device, + ) + x = self.encoder( + x, pos_emb, mask=mask, src_key_padding_mask=src_key_padding_mask + ) # (T, N, C) + else: + x = self.encoder( + x, pos_emb, mask=None, src_key_padding_mask=src_key_padding_mask + ) # (T, N, C) + + if self.normalize_before: + x = self.after_norm(x) + + logits = self.encoder_output_layer(x) + logits = logits.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + return logits, lengths + + @torch.jit.export + def get_init_state( + self, left_context: int, device: torch.device + ) -> List[torch.Tensor]: + """Return the initial cache state of the model. + + Args: + left_context: The left context size (in frames after subsampling). + + Returns: + Return the initial state of the model, it is a list containing two + tensors, the first one is the cache for attentions which has a shape + of (num_encoder_layers, left_context, encoder_dim), the second one + is the cache of conv_modules which has a shape of + (num_encoder_layers, cnn_module_kernel - 1, encoder_dim). + + NOTE: the returned tensors are on the given device. + """ + if ( + len(self._init_state) == 2 + and self._init_state[0].size(1) == left_context + ): + # Note: It is OK to share the init state as it is + # not going to be modified by the model + return self._init_state + + init_states: List[torch.Tensor] = [ + torch.zeros( + ( + self.encoder_layers, + left_context, + self.d_model, + ), + device=device, + ), + torch.zeros( + ( + self.encoder_layers, + self.cnn_module_kernel - 1, + self.d_model, + ), + device=device, + ), + ] + + self._init_state = init_states + + return init_states + + @torch.jit.export + def streaming_forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + states: Optional[List[torch.Tensor]] = None, + processed_lens: Optional[Tensor] = None, + left_context: int = 64, + right_context: int = 0, + chunk_size: int = 16, + simulate_streaming: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + states: + The decode states for previous frames which contains the cached data. + It has two elements, the first element is the attn_cache which has + a shape of (encoder_layers, left_context, batch, attention_dim), + the second element is the conv_cache which has a shape of + (encoder_layers, cnn_module_kernel-1, batch, conv_dim). + Note: states will be modified in this function. + processed_lens: + How many frames (after subsampling) have been processed for each sequence. + left_context: + How many previous frames the attention can see in current chunk. + Note: It's not that each individual frame has `left_context` frames + of left context, some have more. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + chunk_size: + The chunk size for decoding, this will be used to simulate streaming + decoding using masking. 
+ simulate_streaming: + If setting True, it will use a masking strategy to simulate streaming + fashion (i.e. every chunk data only see limited left context and + right context). The whole sequence is supposed to be send at a time + When using simulate_streaming. + Returns: + Return a tuple containing 2 tensors: + - logits, its shape is (batch_size, output_seq_len, output_dim) + - logit_lens, a tensor of shape (batch_size,) containing the number + of frames in `logits` before padding. + - states, the updated states(i.e. caches) including the information + of current chunk. + """ + + # x: [N, T, C] + # Caution: We assume the subsampling factor is 4! + + # lengths = ((x_lens - 1) // 2 - 1) // 2 # issue an warning + # + # Note: rounding_mode in torch.div() is available only in torch >= 1.8.0 + lengths = (((x_lens - 1) >> 1) - 1) >> 1 + + if not simulate_streaming: + assert states is not None + assert processed_lens is not None + assert ( + len(states) == 2 + and states[0].shape + == (self.encoder_layers, left_context, x.size(0), self.d_model) + and states[1].shape + == ( + self.encoder_layers, + self.cnn_module_kernel - 1, + x.size(0), + self.d_model, + ) + ), f"""The length of states MUST be equal to 2, and the shape of + first element should be {(self.encoder_layers, left_context, x.size(0), self.d_model)}, + given {states[0].shape}. the shape of second element should be + {(self.encoder_layers, self.cnn_module_kernel - 1, x.size(0), self.d_model)}, + given {states[1].shape}.""" + + lengths -= 2 # we will cut off 1 frame on each side of encoder_embed output + src_key_padding_mask = make_pad_mask(lengths) + + processed_mask = torch.arange(left_context, device=x.device).expand( + x.size(0), left_context + ) + processed_lens = processed_lens.view(x.size(0), 1) + processed_mask = (processed_lens <= processed_mask).flip(1) + + src_key_padding_mask = torch.cat( + [processed_mask, src_key_padding_mask], dim=1 + ) + + embed = self.encoder_embed(x) + + # cut off 1 frame on each size of embed as they see the padding + # value which causes a training and decoding mismatch. + embed = embed[:, 1:-1, :] + + embed, pos_enc = self.encoder_pos(embed, left_context) + embed = embed.permute(1, 0, 2) # (B, T, F) -> (T, B, F) + + x, states = self.encoder.chunk_forward( + embed, + pos_enc, + src_key_padding_mask=src_key_padding_mask, + states=states, + left_context=left_context, + right_context=right_context, + ) # (T, B, F) + else: + assert states is None + states = [] # just to make torch.script.jit happy + src_key_padding_mask = make_pad_mask(lengths) + x = self.encoder_embed(x) + x, pos_emb = self.encoder_pos(x) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + assert x.size(0) == lengths.max().item() + + num_left_chunks = -1 + if left_context >= 0: + assert left_context % chunk_size == 0 + num_left_chunks = left_context // chunk_size + + mask = ~subsequent_chunk_mask( + size=x.size(0), + chunk_size=chunk_size, + num_left_chunks=num_left_chunks, + device=x.device, + ) + x = self.encoder( + x, + pos_emb, + mask=mask, + src_key_padding_mask=src_key_padding_mask, + ) # (T, N, C) + + if self.normalize_before: + x = self.after_norm(x) + + logits = self.encoder_output_layer(x) + logits = logits.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + return logits, lengths, states + + +class ConformerEncoderLayer(nn.Module): + """ + ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks. 
+ See: "Conformer: Convolution-augmented Transformer for Speech Recognition" + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + cnn_module_kernel (int): Kernel size of convolution module. + normalize_before (bool): whether to use layer_norm before the first block. + causal (bool): Whether to use causal convolution in conformer encoder + layer. This MUST be True when using dynamic_chunk_training and streaming decoding. + + Examples:: + >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8) + >>> src = torch.rand(10, 32, 512) + >>> pos_emb = torch.rand(32, 19, 512) + >>> out = encoder_layer(src, pos_emb) + """ + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + cnn_module_kernel: int = 31, + normalize_before: bool = True, + causal: bool = False, + ) -> None: + super(ConformerEncoderLayer, self).__init__() + self.self_attn = RelPositionMultiheadAttention( + d_model, nhead, dropout=0.0 + ) + + self.feed_forward = nn.Sequential( + nn.Linear(d_model, dim_feedforward), + Swish(), + nn.Dropout(dropout), + nn.Linear(dim_feedforward, d_model), + ) + + self.feed_forward_macaron = nn.Sequential( + nn.Linear(d_model, dim_feedforward), + Swish(), + nn.Dropout(dropout), + nn.Linear(dim_feedforward, d_model), + ) + + self.conv_module = ConvolutionModule( + d_model, cnn_module_kernel, causal=causal + ) + + self.norm_ff_macaron = nn.LayerNorm( + d_model + ) # for the macaron style FNN module + self.norm_ff = nn.LayerNorm(d_model) # for the FNN module + self.norm_mha = nn.LayerNorm(d_model) # for the MHA module + + self.ff_scale = 0.5 + + self.norm_conv = nn.LayerNorm(d_model) # for the CNN module + self.norm_final = nn.LayerNorm( + d_model + ) # for the final output of the block + + self.dropout = nn.Dropout(dropout) + + self.normalize_before = normalize_before + + def forward( + self, + src: Tensor, + pos_emb: Tensor, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + """ + Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + pos_emb: Positional embedding tensor (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + Shape: + src: (S, N, E). + pos_emb: (N, 2*S-1, E). + src_mask: (S, S). + src_key_padding_mask: (N, S). 
+ S is the source sequence length, N is the batch size, E is the feature number + """ + # macaron style feed forward module + residual = src + if self.normalize_before: + src = self.norm_ff_macaron(src) + src = residual + self.ff_scale * self.dropout( + self.feed_forward_macaron(src) + ) + if not self.normalize_before: + src = self.norm_ff_macaron(src) + + # multi-headed self-attention module + residual = src + if self.normalize_before: + src = self.norm_mha(src) + + src_att = self.self_attn( + src, + src, + src, + pos_emb=pos_emb, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask, + )[0] + src = residual + self.dropout(src_att) + if not self.normalize_before: + src = self.norm_mha(src) + + # convolution module + residual = src + if self.normalize_before: + src = self.norm_conv(src) + + src, _ = self.conv_module(src) + src = residual + self.dropout(src) + + if not self.normalize_before: + src = self.norm_conv(src) + + # feed forward module + residual = src + if self.normalize_before: + src = self.norm_ff(src) + src = residual + self.ff_scale * self.dropout(self.feed_forward(src)) + if not self.normalize_before: + src = self.norm_ff(src) + + if self.normalize_before: + src = self.norm_final(src) + + return src + + @torch.jit.export + def chunk_forward( + self, + src: Tensor, + pos_emb: Tensor, + states: List[Tensor], + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + left_context: int = 0, + right_context: int = 0, + ) -> Tuple[Tensor, List[Tensor]]: + """ + Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + pos_emb: Positional embedding tensor (required). + states: + The decode states for previous frames which contains the cached data. + It has two elements, the first element is the attn_cache which has + a shape of (left_context, batch, attention_dim), + the second element is the conv_cache which has a shape of + (cnn_module_kernel-1, batch, conv_dim). + Note: states will be modified in this function. + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + left_context: + How many previous frames the attention can see in current chunk. + Note: It's not that each individual frame has `left_context` frames + of left context, some have more. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + Shape: + src: (S, N, E). + pos_emb: (N, 2*(S+left_context)-1, E). + src_mask: (S, S). + src_key_padding_mask: (N, S). + S is the source sequence length, N is the batch size, E is the feature number + """ + + # macaron style feed forward module + residual = src + if self.normalize_before: + src = self.norm_ff_macaron(src) + src = residual + self.ff_scale * self.dropout( + self.feed_forward_macaron(src) + ) + if not self.normalize_before: + src = self.norm_ff_macaron(src) + + # multi-headed self-attention module + residual = src + if self.normalize_before: + src = self.norm_mha(src) + + # We put the attention cache this level (i.e. before linear transformation) + # to save memory consumption, when decoding in streaming fashion, the + # batch size would be thousands (for 32GB machine), if we cache key & val + # separately, it needs extra several GB memory. + # TODO(WeiKang): Move cache to self_attn level (i.e. cache key & val + # separately) if needed. 
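# Cache shapes at this point, per the docstring above (sketch): states[0] is
# (left_context, batch, d_model) of past frames and src is (chunk_len, batch,
# d_model), so the concatenated `key` below is (left_context + chunk_len,
# batch, d_model); the slice that follows keeps the most recent left_context
# frames (skipping any right-context lookahead) as the new cache.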
+ key = torch.cat([states[0], src], dim=0) + val = key + if right_context > 0: + states[0] = key[ + -(left_context + right_context) : -right_context, ... # noqa + ] + else: + states[0] = key[-left_context:, ...] + + src_att = self.self_attn( + src, + key, + val, + pos_emb=pos_emb, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask, + left_context=left_context, + )[0] + src = residual + self.dropout(src_att) + if not self.normalize_before: + src = self.norm_mha(src) + + # convolution module + residual = src + if self.normalize_before: + src = self.norm_conv(src) + + src, conv_cache = self.conv_module( + src, states[1], right_context=right_context + ) + states[1] = conv_cache + src = residual + self.dropout(src) + + if not self.normalize_before: + src = self.norm_conv(src) + + # feed forward module + residual = src + if self.normalize_before: + src = self.norm_ff(src) + src = residual + self.ff_scale * self.dropout(self.feed_forward(src)) + if not self.normalize_before: + src = self.norm_ff(src) + + if self.normalize_before: + src = self.norm_final(src) + + return src, states + + +class ConformerEncoder(nn.Module): + r"""ConformerEncoder is a stack of N encoder layers + + Args: + encoder_layer: an instance of the ConformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + + Examples:: + >>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8) + >>> conformer_encoder = ConformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> pos_emb = torch.rand(32, 19, 512) + >>> out = conformer_encoder(src, pos_emb) + """ + + def __init__(self, encoder_layer: nn.Module, num_layers: int) -> None: + super().__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for i in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: Tensor, + pos_emb: Tensor, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + r"""Pass the input through the encoder layers in turn. + + Args: + src: the sequence to the encoder (required). + pos_emb: Positional embedding tensor (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + Shape: + + Shape: + src: (S, N, E). + pos_emb: (N, 2*S-1, E). + mask: (S, S). + src_key_padding_mask: (N, S). + S is the source sequence length, T is the target sequence length, N is the batch size, E is the feature number + + """ + output = src + + for layer_index, mod in enumerate(self.layers): + output = mod( + output, + pos_emb, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + ) + return output + + @torch.jit.export + def chunk_forward( + self, + src: Tensor, + pos_emb: Tensor, + states: List[Tensor], + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + left_context: int = 0, + right_context: int = 0, + ) -> Tuple[Tensor, List[Tensor]]: + r"""Pass the input through the encoder layers in turn. + + Args: + src: the sequence to the encoder (required). + pos_emb: Positional embedding tensor (required). + states: + The decode states for previous frames which contains the cached data. + It has two elements, the first element is the attn_cache which has + a shape of (encoder_layers, left_context, batch, attention_dim), + the second element is the conv_cache which has a shape of + (encoder_layers, cnn_module_kernel-1, batch, conv_dim). + Note: states will be modified in this function. 
+ mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + left_context: + How many previous frames the attention can see in current chunk. + Note: It's not that each individual frame has `left_context` frames + of left context, some have more. + right_context: + How many future frames the attention can see in current chunk. + Note: It's not that each individual frame has `right_context` frames + of right context, some have more. + Shape: + src: (S, N, E). + pos_emb: (N, 2*(S+left_context)-1, E). + mask: (S, S). + src_key_padding_mask: (N, S). + S is the source sequence length, T is the target sequence length, N is the batch size, E is the feature number + + """ + assert not self.training + output = src + + for layer_index, mod in enumerate(self.layers): + cache = [states[0][layer_index], states[1][layer_index]] + output, cache = mod.chunk_forward( + output, + pos_emb, + states=cache, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + left_context=left_context, + right_context=right_context, + ) + states[0][layer_index] = cache[0] + states[1][layer_index] = cache[1] + + return output, states + + +class RelPositionalEncoding(torch.nn.Module): + """Relative positional encoding module. + + See : Appendix B in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" + Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/embedding.py + + Args: + d_model: Embedding dimension. + dropout_rate: Dropout rate. + max_len: Maximum input length. + + """ + + def __init__( + self, d_model: int, dropout_rate: float, max_len: int = 5000 + ) -> None: + """Construct an PositionalEncoding object.""" + super(RelPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: Tensor, left_context: int = 0) -> None: + """Reset the positional encodings.""" + x_size_1 = x.size(1) + left_context + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x_size_1 * 2 - 1: + # Note: TorchScript doesn't implement operator== for torch.Device + if self.pe.dtype != x.dtype or str(self.pe.device) != str( + x.device + ): + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vector and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i Tuple[Tensor, Tensor]: + """Add positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). + left_context (int): left context (in frames) used during streaming decoding. + this is used only in real streaming decoding, in other circumstances, + it MUST be 0. + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + torch.Tensor: Encoded tensor (batch, 2*time-1, `*`). 
+ + """ + self.extend_pe(x, left_context) + x = x * self.xscale + x_size_1 = x.size(1) + left_context + pos_emb = self.pe[ + :, + self.pe.size(1) // 2 + - x_size_1 + + 1 : self.pe.size(1) // 2 # noqa E203 + + x.size(1), + ] + return self.dropout(x), self.dropout(pos_emb) + + +class RelPositionMultiheadAttention(nn.Module): + r"""Multi-Head Attention layer with relative position encoding + + See reference: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" + + Args: + embed_dim: total dimension of the model. + num_heads: parallel attention heads. + dropout: a Dropout layer on attn_output_weights. Default: 0.0. + + Examples:: + + >>> rel_pos_multihead_attn = RelPositionMultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value, pos_emb) + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + ) -> None: + super(RelPositionMultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + + self.in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=True) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True) + + # linear transformation for positional encoding. + self.linear_pos = nn.Linear(embed_dim, embed_dim, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(num_heads, self.head_dim)) + self.pos_bias_v = nn.Parameter(torch.Tensor(num_heads, self.head_dim)) + + self._reset_parameters() + + def _reset_parameters(self) -> None: + nn.init.xavier_uniform_(self.in_proj.weight) + nn.init.constant_(self.in_proj.bias, 0.0) + nn.init.constant_(self.out_proj.bias, 0.0) + + nn.init.xavier_uniform_(self.pos_bias_u) + nn.init.xavier_uniform_(self.pos_bias_v) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + pos_emb: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + left_context: int = 0, + ) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + pos_emb: Positional embedding tensor + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. When given a binary mask and a value is True, + the corresponding value on the attention layer will be ignored. When given + a byte mask and a value is non-zero, the corresponding value on the attention + layer will be ignored + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + left_context (int): left context (in frames) used during streaming decoding. + this is used only in real streaming decoding, in other circumstances, + it MUST be 0. + + Shape: + - Inputs: + - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. 
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - pos_emb: :math:`(N, 2*L-1, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a ByteTensor is provided, the non-zero positions will be ignored while the position + with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` + is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + + - Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: :math:`(N, L, S)` where N is the batch size, + L is the target sequence length, S is the source sequence length. + """ + return self.multi_head_attention_forward( + query, + key, + value, + pos_emb, + self.embed_dim, + self.num_heads, + self.in_proj.weight, + self.in_proj.bias, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + left_context=left_context, + ) + + def rel_shift(self, x: Tensor, left_context: int = 0) -> Tensor: + """Compute relative positional encoding. + + Args: + x: Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + left_context (int): left context (in frames) used during streaming decoding. + this is used only in real streaming decoding, in other circumstances, + it MUST be 0. + + Returns: + Tensor: tensor of shape (batch, head, time1, time2) + (note: time2 has the same value as time1, but it is for + the key, while time1 is for the query). 
+ """ + (batch_size, num_heads, time1, n) = x.shape + time2 = time1 + left_context + + assert ( + n == left_context + 2 * time1 - 1 + ), f"{n} == {left_context} + 2 * {time1} - 1" + + # Note: TorchScript requires explicit arg for stride() + batch_stride = x.stride(0) + head_stride = x.stride(1) + time1_stride = x.stride(2) + n_stride = x.stride(3) + return x.as_strided( + (batch_size, num_heads, time1, time2), + (batch_stride, head_stride, time1_stride - n_stride, n_stride), + storage_offset=n_stride * (time1 - 1), + ) + + def multi_head_attention_forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + pos_emb: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Tensor, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Tensor, + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + left_context: int = 0, + ) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + pos_emb: Positional embedding tensor + embed_dim_to_check: total dimension of the model. + num_heads: parallel attention heads. + in_proj_weight, in_proj_bias: input projection weight and bias. + dropout_p: probability of an element to be zeroed. + out_proj_weight, out_proj_bias: the output projection weight and bias. + training: apply dropout if is ``True``. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + left_context (int): left context (in frames) used during streaming decoding. + this is used only in real streaming decoding, in other circumstances, + it MUST be 0. + + Shape: + Inputs: + - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - pos_emb: :math:`(N, 2*L-1, E)` or :math:`(1, 2*L-1, E)` where L is the target sequence + length, N is the batch size, E is the embedding dimension. + - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions + will be unchanged. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. 
If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + + Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: :math:`(N, L, S)` where N is the batch size, + L is the target sequence length, S is the source sequence length. + """ + + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == embed_dim_to_check + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = embed_dim // num_heads + assert ( + head_dim * num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" + scaling = float(head_dim) ** -0.5 + + if torch.equal(query, key) and torch.equal(key, value): + # self-attention + q, k, v = nn.functional.linear( + query, in_proj_weight, in_proj_bias + ).chunk(3, dim=-1) + + elif torch.equal(key, value): + # encoder-decoder attention + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = nn.functional.linear(query, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + k, v = nn.functional.linear(key, _w, _b).chunk(2, dim=-1) + + else: + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = nn.functional.linear(query, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = embed_dim * 2 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k = nn.functional.linear(key, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 2 + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + v = nn.functional.linear(value, _w, _b) + + if attn_mask is not None: + assert ( + attn_mask.dtype == torch.float32 + or attn_mask.dtype == torch.float64 + or attn_mask.dtype == torch.float16 + or attn_mask.dtype == torch.uint8 + or attn_mask.dtype == torch.bool + ), "Only float, byte, and bool types are supported for attn_mask, not {}".format( + attn_mask.dtype + ) + if attn_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for attn_mask is deprecated. Use bool tensor instead." + ) + attn_mask = attn_mask.to(torch.bool) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError( + "The size of the 2D attn_mask is not correct." + ) + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bsz * num_heads, + query.size(0), + key.size(0), + ]: + raise RuntimeError( + "The size of the 3D attn_mask is not correct." + ) + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format( + attn_mask.dim() + ) + ) + # attn_mask's dim is 3 now. 
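A quick, self-contained sketch of what the fused input projection above computes; module and tensor names here are illustrative, not part of this patch. In the self-attention fast path, one matmul is chunked into q, k, v, which gives the same result as the row-sliced branches used for cross-attention:

    import torch
    import torch.nn as nn

    embed_dim = 8  # toy size, for illustration only
    in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=True)  # fused W_q|W_k|W_v
    x = torch.randn(5, 2, embed_dim)  # (L, N, E); query == key == value
    q, k, v = in_proj(x).chunk(3, dim=-1)
    # Equivalent to projecting with in_proj.weight rows [0:E], [E:2E], [2E:3E],
    # which is what the cross-attention branches above do slice by slice.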
+ + # convert ByteTensor key_padding_mask to bool + if ( + key_padding_mask is not None + and key_padding_mask.dtype == torch.uint8 + ): + warnings.warn( + "Byte tensor for key_padding_mask is deprecated. Use bool tensor instead." + ) + key_padding_mask = key_padding_mask.to(torch.bool) + + q = q.contiguous().view(tgt_len, bsz, num_heads, head_dim) + k = k.contiguous().view(-1, bsz, num_heads, head_dim) + v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + + src_len = k.size(0) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz, "{} == {}".format( + key_padding_mask.size(0), bsz + ) + assert key_padding_mask.size(1) == src_len, "{} == {}".format( + key_padding_mask.size(1), src_len + ) + + q = q.transpose(0, 1) # (batch, time1, head, d_k) + + pos_emb_bsz = pos_emb.size(0) + assert pos_emb_bsz in (1, bsz) # actually it is 1 + p = self.linear_pos(pos_emb).view(pos_emb_bsz, -1, num_heads, head_dim) + + # (batch, 2*time1, head, d_k) --> (batch, head, d_k, 2*time -1) + p = p.permute(0, 2, 3, 1) + + q_with_bias_u = (q + self.pos_bias_u).transpose( + 1, 2 + ) # (batch, head, time1, d_k) + + q_with_bias_v = (q + self.pos_bias_v).transpose( + 1, 2 + ) # (batch, head, time1, d_k) + + # compute attention score + # first compute matrix a and matrix c + # as described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" Section 3.3 + k = k.permute(1, 2, 3, 0) # (batch, head, d_k, time2) + matrix_ac = torch.matmul( + q_with_bias_u, k + ) # (batch, head, time1, time2) + + # compute matrix b and matrix d + matrix_bd = torch.matmul( + q_with_bias_v, p + ) # (batch, head, time1, 2*time1-1) + + matrix_bd = self.rel_shift(matrix_bd, left_context=left_context) + + attn_output_weights = ( + matrix_ac + matrix_bd + ) * scaling # (batch, head, time1, time2) + + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, -1 + ) + + assert list(attn_output_weights.size()) == [ + bsz * num_heads, + tgt_len, + src_len, + ] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float("-inf")) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len + ) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float("-inf"), + ) + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, src_len + ) + + attn_output_weights = nn.functional.softmax(attn_output_weights, dim=-1) + + # If we are using dynamic_chunk_training and setting a limited + # num_left_chunks, the attention may only see the padding values which + # will also be masked out by `key_padding_mask`, at this circumstances, + # the whole column of `attn_output_weights` will be `-inf` + # (i.e. be `nan` after softmax), so, we fill `0.0` at the masking + # positions to avoid invalid loss value below. 
+        if (
+            attn_mask is not None
+            and attn_mask.dtype == torch.bool
+            and key_padding_mask is not None
+        ):
+            combined_mask = attn_mask.unsqueeze(0) | key_padding_mask.unsqueeze(
+                1
+            ).unsqueeze(2)
+            attn_output_weights = attn_output_weights.view(
+                bsz, num_heads, tgt_len, src_len
+            )
+            attn_output_weights = attn_output_weights.masked_fill(
+                combined_mask, 0.0
+            )
+            attn_output_weights = attn_output_weights.view(
+                bsz * num_heads, tgt_len, src_len
+            )
+
+        attn_output_weights = nn.functional.dropout(
+            attn_output_weights, p=dropout_p, training=training
+        )
+
+        attn_output = torch.bmm(attn_output_weights, v)
+        assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
+        attn_output = (
+            attn_output.transpose(0, 1)
+            .contiguous()
+            .view(tgt_len, bsz, embed_dim)
+        )
+        attn_output = nn.functional.linear(
+            attn_output, out_proj_weight, out_proj_bias
+        )
+
+        if need_weights:
+            # average attention weights over heads
+            attn_output_weights = attn_output_weights.view(
+                bsz, num_heads, tgt_len, src_len
+            )
+            return attn_output, attn_output_weights.sum(dim=1) / num_heads
+        else:
+            return attn_output, None
+
+
+class ConvolutionModule(nn.Module):
+    """ConvolutionModule in Conformer model.
+    Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/conformer/convolution.py
+
+    Args:
+        channels (int): The number of channels of conv layers.
+        kernel_size (int): Kernel size of conv layers.
+        bias (bool): Whether to use bias in conv layers (default=True).
+        causal (bool): Whether to use causal convolution.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        kernel_size: int,
+        bias: bool = True,
+        causal: bool = False,
+    ) -> None:
+        """Construct a ConvolutionModule object."""
+        super(ConvolutionModule, self).__init__()
+        # kernel_size should be an odd number for 'SAME' padding
+        assert (kernel_size - 1) % 2 == 0
+        self.causal = causal
+
+        self.pointwise_conv1 = nn.Conv1d(
+            channels,
+            2 * channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+
+        self.lorder = kernel_size - 1
+        padding = (kernel_size - 1) // 2
+        if self.causal:
+            padding = 0
+
+        self.depthwise_conv = nn.Conv1d(
+            channels,
+            channels,
+            kernel_size,
+            stride=1,
+            padding=padding,
+            groups=channels,
+            bias=bias,
+        )
+        self.norm = nn.LayerNorm(channels)
+        self.pointwise_conv2 = nn.Conv1d(
+            channels,
+            channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+        self.activation = Swish()
+
+    def forward(
+        self,
+        x: Tensor,
+        cache: Optional[Tensor] = None,
+        right_context: int = 0,
+    ) -> Tuple[Tensor, Tensor]:
+        """Compute convolution module.
+
+        Args:
+            x: Input tensor (#time, batch, channels).
+
+        Returns:
+            Tensor: Output tensor (#time, batch, channels).
+
+        """
+        # exchange the temporal dimension and the feature dimension
+        x = x.permute(1, 2, 0)  # (#batch, channels, time).
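For reference, the GLU gating performed just below by pointwise_conv1 plus nn.functional.glu can be written out by hand; a minimal sketch with made-up shapes, not part of the module:

    import torch
    import torch.nn as nn

    x = torch.randn(2, 4, 10)            # (batch, channels, time), toy shapes
    pw = nn.Conv1d(4, 8, kernel_size=1)  # pointwise conv doubles the channels
    a, b = pw(x).chunk(2, dim=1)         # split back into two halves
    y = a * torch.sigmoid(b)             # equals nn.functional.glu(pw(x), dim=1)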
+
+        # GLU mechanism
+        x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
+        x = nn.functional.glu(x, dim=1)  # (batch, channels, time)
+
+        # 1D Depthwise Conv
+        if self.causal and self.lorder > 0:
+            if cache is None:
+                # Make depthwise_conv causal by
+                # manually padding self.lorder zeros to the left
+                x = nn.functional.pad(x, (self.lorder, 0), "constant", 0.0)
+            else:
+                assert (
+                    not self.training
+                ), "Cache should be None at training time"
+                assert cache.size(0) == self.lorder
+                x = torch.cat([cache.permute(1, 2, 0), x], dim=2)
+                if right_context > 0:
+                    cache = x.permute(2, 0, 1)[
+                        -(self.lorder + right_context) : (  # noqa
+                            -right_context
+                        ),
+                        ...,
+                    ]
+                else:
+                    cache = x.permute(2, 0, 1)[-self.lorder :, ...]  # noqa
+
+        x = self.depthwise_conv(x)
+        # x is (batch, channels, time)
+        x = x.permute(0, 2, 1)
+        x = self.norm(x)
+        x = x.permute(0, 2, 1)
+
+        x = self.activation(x)
+
+        x = self.pointwise_conv2(x)  # (batch, channel, time)
+
+        if cache is None:
+            cache = torch.empty(0)
+
+        return x.permute(2, 0, 1), cache
+
+
+class Swish(torch.nn.Module):
+    """Construct a Swish object."""
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return Swish activation function."""
+        return x * torch.sigmoid(x)
+
+
+def identity(x):
+    return x
diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py
new file mode 100644
index 00000000..333fff30
--- /dev/null
+++ b/hyperion/torch/models/transducer/decoder.py
@@ -0,0 +1,97 @@
+# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+
+# TODO(fangjun): Support switching between LSTM and GRU
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        vocab_size: int,
+        embedding_dim: int,
+        blank_id: int,
+        num_layers: int,
+        hidden_dim: int,
+        output_dim: int,
+        embedding_dropout: float = 0.0,
+        rnn_dropout: float = 0.0,
+    ):
+        """
+        Args:
+          vocab_size:
+            Number of tokens of the modeling unit including blank.
+          embedding_dim:
+            Dimension of the input embedding.
+          blank_id:
+            The ID of the blank symbol.
+          num_layers:
+            Number of LSTM layers.
+          hidden_dim:
+            Hidden dimension of LSTM layers.
+          output_dim:
+            Output dimension of the decoder.
+          embedding_dropout:
+            Dropout rate for the embedding layer.
+          rnn_dropout:
+            Dropout for LSTM layers.
+ """ + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embedding_dim, + padding_idx=blank_id, + ) + self.embedding_dropout = nn.Dropout(embedding_dropout) + # TODO(fangjun): Use layer normalized LSTM + self.rnn = nn.LSTM( + input_size=embedding_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + dropout=rnn_dropout, + ) + self.blank_id = blank_id + self.output_linear = nn.Linear(hidden_dim, output_dim) + + def forward( + self, + y: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Args: + y: + A 2-D tensor of shape (N, U) with BOS prepended. + states: + A tuple of two tensors containing the states information of + LSTM layers in this decoder. + Returns: + Return a tuple containing: + + - rnn_output, a tensor of shape (N, U, C) + - (h, c), containing the state information for LSTM layers. + Both are of shape (num_layers, N, C) + """ + embedding_out = self.embedding(y) + embedding_out = self.embedding_dropout(embedding_out) + rnn_out, (h, c) = self.rnn(embedding_out, states) + out = self.output_linear(rnn_out) + + return out, (h, c) diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py new file mode 100644 index 00000000..2ef3f1de --- /dev/null +++ b/hyperion/torch/models/transducer/joiner.py @@ -0,0 +1,54 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn + + +class Joiner(nn.Module): + def __init__(self, input_dim: int, output_dim: int): + super().__init__() + + self.output_linear = nn.Linear(input_dim, output_dim) + + def forward( + self, encoder_out: torch.Tensor, decoder_out: torch.Tensor + ) -> torch.Tensor: + """ + Args: + encoder_out: + Output from the encoder. Its shape is (N, T, C). + decoder_out: + Output from the decoder. Its shape is (N, U, C). + Returns: + Return a tensor of shape (N, T, U, C). + """ + assert encoder_out.ndim == decoder_out.ndim == 3 + assert encoder_out.size(0) == decoder_out.size(0) + assert encoder_out.size(2) == decoder_out.size(2) + + encoder_out = encoder_out.unsqueeze(2) + # Now encoder_out is (N, T, 1, C) + + decoder_out = decoder_out.unsqueeze(1) + # Now decoder_out is (N, 1, U, C) + + logit = encoder_out + decoder_out + logit = torch.tanh(logit) + + output = self.output_linear(logit) + + return output diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py index e2c6e1be..3a55ac83 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py @@ -21,7 +21,7 @@ class HFWav2Vec2Transducer(HFWav2Transducer): Attributes: hf_feats: hugging face model wrapper object. - xvector: x-vector model object. 
+        transducer: transducer model object.
         feat_fusion_start: the input to x-vector model will fuse the wav2vec
             layers from "feat_fusion_start" to the wav2vec "num_layers".
         feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more
@@ -29,12 +29,12 @@ class HFWav2Vec2Transducer(HFWav2Transducer):
     """

     def __init__(
-        self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg"
+        self, hf_feats, transducer, feat_fusion_start=0, feat_fusion_method="weighted-avg"
     ):
         super().__init__()
         self.hf_feats = hf_feats
-        self.xvector = xvector
+        self.transducer = transducer
         self.feat_fusion_start = feat_fusion_start
         self.feat_fusion_method = feat_fusion_method
         self._hf_context = contextlib.nullcontext()
@@ -85,7 +85,7 @@ def _fuse_hid_feats(self, hid_feats):
         return feats

     def compute_prototype_affinity(self):
-        return self.xvector.compute_prototype_affinity()
+        return self.transducer.compute_prototype_affinity()

     def update_loss_margin(self, epoch):
         """Updates the value of the margin in AAM/AM-softmax losses
@@ -94,7 +94,7 @@ def update_loss_margin(self, epoch):
         Args:
             epoch: epoch which is about to start
         """
-        self.xvector.update_loss_margin(epoch)
+        self.transducer.update_loss_margin(epoch)

     def rebuild_output_layer(
         self,
@@ -107,7 +107,7 @@ def rebuild_output_layer(
         intertop_margin=0.0,
         num_subcenters=2,
     ):
-        self.xvector.rebuild_output_layer(
+        self.transducer.rebuild_output_layer(
             num_classes=num_classes,
             loss_type=loss_type,
             cos_scale=cos_scale,
@@ -190,7 +190,7 @@ def forward(
         feats, hid_feats, feat_lengths = self.forward_feats(
             x, x_lengths, return_feat_layers
         )
-        output = self.xvector(
+        output = self.transducer(
             feats,
             feat_lengths,
             y,
@@ -203,7 +203,7 @@
             return output

         if not isinstance(output, dict):
-            # if the xvector just returned the logits we put then into a dictionary
+            # if the transducer just returned the logits we put them into a dictionary
             # to append the hid feats later.
output["logits"] = output @@ -233,7 +233,7 @@ def extract_embed( * feats.size(-1) // x.size(-1) ) - return self.xvector.extract_embed( + return self.transducer.extract_embed( feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks ) @@ -266,8 +266,8 @@ def set_train_mode(self, mode): self.unfreeze() self.freeze_feat_fuser() self.freeze_hf_feats() - self.xvector.freeze_preembed_layers() - elif mode in ["ft-xvector", "ft-xvector-nograd"]: + self.transducer.freeze_preembed_layers() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() self.freeze_feat_fuser() @@ -296,16 +296,16 @@ def _train(self, train_mode: str): super()._train(train_mode) elif train_mode == "ft-embed-affine": self.hf_feats.train() - self.xvector._train("ft-embed_affine") + self.transducer._train("ft-embed_affine") elif train_mode in [ - "ft-xvector", + "ft-transducer", "hf-feats-frozen", - "ft-xvector-nograd", + "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", ]: self.hf_feats.train() - self.xvector._train("full") + self.transducer._train("full") else: raise ValueError(f"invalid train_mode={train_mode}") @@ -315,9 +315,9 @@ def valid_train_modes(): "full", "frozen", "ft-embed-affine", - "ft-xvector", + "ft-transducer", "hf-feats-frozen", - "ft-xvector-nograd", + "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", ] @@ -326,7 +326,7 @@ def valid_train_modes(): def filter_args(**kwargs): valid_args = ( "hf_feats", - "xvector", + "transducer", "feat_fusion_start", "feat_fusion_method", ) @@ -336,12 +336,12 @@ def filter_args(**kwargs): def get_config(self): hf_cfg = self.hf_feats.get_config() - xvec_cfg = self.xvector.get_config() + xvec_cfg = self.transducer.get_config() del hf_cfg["class_name"] del xvec_cfg["class_name"] config = { "hf_feats": hf_cfg, - "xvector": xvec_cfg, + "transducer": xvec_cfg, "feat_fusion_start": self.feat_fusion_start, "feat_fusion_method": self.feat_fusion_method, } @@ -349,10 +349,10 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def change_config(self, hf_feats, xvector): - logging.info("changing hf wav2xvector config") + def change_config(self, hf_feats, transducer): + logging.info("changing hf wav2transducer config") self.hf_feats.change_config(**hf_feats) - self.xvector.change_config(**xvector) + self.transducer.change_config(**transducer) @staticmethod def add_class_args(parser, prefix=None, skip=set()): @@ -384,5 +384,5 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser.add_argument( "--" + prefix, action=ActionParser(parser=parser), - help="xvector options", + help="transducer options", ) diff --git a/hyperion/utils/text_info.py b/hyperion/utils/text_info.py deleted file mode 100644 index 4d7e2c3a..00000000 --- a/hyperion/utils/text_info.py +++ /dev/null @@ -1,104 +0,0 @@ -""" - Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -from pathlib import Path - -import numpy as np -import pandas as pd - -from .info_table import InfoTable - - -def read_2column_text(path: Union[Path, str]) -> Dict[str, str]: - """Read a text file having 2 column as dict object. 
- - Examples: - wav.scp: - key1 /some/path/a.wav - key2 /some/path/b.wav - - >>> read_2column_text('wav.scp') - {'key1': '/some/path/a.wav', 'key2': '/some/path/b.wav'} - - """ - assert check_argument_types() - - data = {} - with Path(path).open("r", encoding="utf-8") as f: - for linenum, line in enumerate(f, 1): - sps = line.rstrip().split(maxsplit=1) - if len(sps) == 1: - k, v = sps[0], "" - else: - k, v = sps - if k in data: - raise RuntimeError(f"{k} is duplicated ({path}:{linenum})") - data[k] = v - return data - - - -class TextInfo(InfoTable): - def __init__(self, df): - super().__init__(df) - if "weights" not in self.df: - self.set_uniform_weights() - else: - self.df["weights"] /= self.df["weigths"].sum() - - def set_uniform_weights(self): - self.df["weights"] = 1 / len(self.df) - - def set_weights(self, weights): - self.df["weights"] = weights / weights.sum() - - def exp_weights(self, x): - weights = self.df["weights"] ** x - self.set_weights(weights) - - def set_zero_weight(self, id): - self.df.loc[id, "weights"] = 0 - self.df["weights"] /= self.df["weights"].sum() - - @property - def weights(self, id): - return self.df.loc[id, "weights"] - - @property - def num_classes(self): - return self.df["class_idx"].values.max() + 1 - - @classmethod - def load(cls, file_path, sp, sep=None): - """Loads utt2info list from text file. - - Args: - file_path: File to read the list. - sp: SentencePieceProcessor from the BPE model - sep: Separator between the key and file_path in the text file. - dtype: Dictionary with the dtypes of each column. - Returns: - Utt2Info object - """ - #TODO: load text information - """Loads utt2info list from text file. - - Args: - file_path: File to read the list. - sp: SentencePieceProcessor for bpe. - Returns: - Utt2Info object - """ - # # y: k2.RaggedTensor, - # # A ragged tensor with 2 axes [utt][label]. It contains labels of each utterance. 
- # y = sp.encode(texts, out_type=int) - # y = k2.RaggedTensor(y).to(device) - file_path = Path(file_path) - text_df = super().load(file_path, sep, name="text_label") - # for i, text in enumerate(text_df["text_label"]): - # y = sp.encode(text, out_type=int) - # y = k2.RaggedTensor(y).to(device) - # text_df["text_label"][i] = y - - return text_df From 30420e8adad694e797c04aa936fecc7586f3a0c6 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Mon, 31 Oct 2022 16:33:35 -0400 Subject: [PATCH 041/154] Data preparation and implemented Wav2vec2Transducer Models --- ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 22 +- .../v1/conf/wav2vec2xlsr300m_transducer.yaml | 51 +-- hyperion/bin/train_wav2vec2transducer.py | 212 +++------ hyperion/torch/data/audio_dataset.py | 41 +- hyperion/torch/data/bucketing_seg_sampler.py | 6 +- hyperion/torch/models/transducer/conformer.py | 63 ++- hyperion/torch/models/transducer/decoder.py | 38 ++ hyperion/torch/models/transducer/joiner.py | 16 + .../torch/models/transducer/transducer.py | 155 ++++++- .../wav2transducer/hf_wav2transducer.py | 77 ++-- .../wav2transducer/hf_wav2vec2_transducer.py | 404 +++--------------- hyperion/torch/trainers/transducer_trainer.py | 13 +- 12 files changed, 456 insertions(+), 642 deletions(-) diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index e7cfc8ef..91adaa35 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -1,22 +1,32 @@ data: train: dataset: - # max_chunk_length: 3.0 - # min_chunk_length: 3.0 - aug_cfgs: [conf/reverb_noise_aug.yaml] wav_scale: 1 + # class_names: + # - text + # aug_cfgs: + # - conf/reverb_noise_aug.yaml + return_segment_info: + - text sampler: + # sampler_type: 'seg_sampler' sampler_type: 'bucketing_seg_sampler' - batch_size: 32 + batch_size: 4 iters_per_epoch: 6 data_loader: - num_workers: 8 + num_workers: 1 val: dataset: # max_chunk_length: 4.0 # min_chunk_length: 4.0 - aug_cfgs: [conf/reverb_noise_aug.yaml] + # aug_cfgs: [conf/reverb_noise_aug.yaml] wav_scale: 1 + # class_names: + # - text + # aug_cfgs: + # - conf/reverb_noise_aug.yaml + return_segment_info: + - text sampler: sampler_type: 'bucketing_seg_sampler' batch_size: 32 diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml index fe71a40c..3f97feb7 100644 --- a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -1,40 +1,21 @@ hf_feats: pretrained_model_path: microsoft/wavlm-base-plus + # test_param: xyz transducer: - resnet_enc: - in_feats: 765 - in_conv_channels: 512 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - resb_channels: - - 512 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - resb_strides: - - 1 - res2net_width_factor: 1 - res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 1536 - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 32.0 - margin: 0.2 - margin_warmup_epochs: 5 - intertop_margin: 0.1 - dropout_rate: 0.0 + encoder_out_dim: 768 + conformer_enc: + num_features: 80 + subsampling_factor: 4 + d_model: 512 + nhead: 8 + dim_feedforward: 2048 + num_encoder_layers: 12 + vgg_frontend: False + decoder: + # vocab_size: 
1000 + # blank_id: 0 + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 7e87c180..8c539cd1 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -4,7 +4,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import sys -import pdb import os from pathlib import Path from jsonargparse import ( @@ -13,6 +12,7 @@ ActionParser, namespace_to_dict, ) +import k2 import time import logging import multiprocessing @@ -29,10 +29,7 @@ from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import HFWav2Vec2Transducer -from hyperion.torch.models.transducer import Conformer -from hyperion.torch.models.transducer import Decoder -from hyperion.torch.models.transducer import Joiner - +from torch.nn.utils.rnn import pad_sequence model_dict = { @@ -40,10 +37,28 @@ } +def my_collate(batch): + audio = [] + audio_length = [] + target = [] + for i, record in enumerate(batch): + wav = torch.as_tensor(record[0]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record[1]) + if i==4: + break + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + target = k2.RaggedTensor(target) + return torch.transpose(audio,0,1), audio_length, target + + + def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] - ad_args = AD.filter_args(**kwargs["dataset"]) - sampler_args = kwargs["sampler"] + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] if rank == 0: logging.info("{} audio dataset args={}".format(partition, ad_args)) logging.info("{} sampler args={}".format(partition, sampler_args)) @@ -52,32 +67,31 @@ def init_data(partition, rank, num_gpus, **kwargs): is_val = partition == "val" ad_args["is_val"] = is_val sampler_args["shuffle"] = not is_val - print("ad_args", ad_args) dataset = AD(**ad_args) if rank == 0: logging.info("init %s samplers", partition) - print("sampler_args", sampler_args) sampler = SegSamplerFactory.create(dataset, **sampler_args) if rank == 0: logging.info("init %s dataloader", partition) - num_workers = kwargs["data_loader"]["num_workers"] + num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) largs = ( {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} ) - data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=my_collate) return data_loader -def init_model(num_classes, rank, model_class, **kwargs): +def init_model(blank_id, vocab_size, rank, model_class, **kwargs): model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network args={}".format(model_args)) # TODO: check model_args - model_args["num_classes"] = num_classes + model_args["transducer"]["blank_id"] = blank_id + model_args["transducer"]["vocab_size"] = vocab_size model = model_class(**model_args) if rank == 0: logging.info("model={}".format(model)) @@ -86,130 +100,6 @@ def init_model(num_classes, rank, model_class, **kwargs): -def get_params() -> AttributeDict: - """Return a dict containing training parameters. 
- - All training related parameters that are not passed from the commandline - are saved in the variable `params`. - - Commandline options are merged into `params` after they are parsed, so - you can also access them via `params`. - - Explanation of options saved in `params`: - - - best_train_loss: Best training loss so far. It is used to select - the model that has the lowest training loss. It is - updated during the training. - - - best_valid_loss: Best validation loss so far. It is used to select - the model that has the lowest validation loss. It is - updated during the training. - - - best_train_epoch: It is the epoch that has the best training loss. - - - best_valid_epoch: It is the epoch that has the best validation loss. - - - batch_idx_train: Used to writing statistics to tensorboard. It - contains number of batches trained so far across - epochs. - - - log_interval: Print training loss if batch_idx % log_interval` is 0 - - - reset_interval: Reset statistics if batch_idx % reset_interval is 0 - - - valid_interval: Run validation if batch_idx % valid_interval is 0 - - - feature_dim: The model input dim. It has to match the one used - in computing features. - - - subsampling_factor: The subsampling factor for the model. - - - attention_dim: Hidden dim for multi-head attention model. - - - num_decoder_layers: Number of decoder layer of transformer decoder. - - - warm_step: The warm_step for Noam optimizer. - """ - params = AttributeDict( - { - "best_train_loss": float("inf"), - "best_valid_loss": float("inf"), - "best_train_epoch": -1, - "best_valid_epoch": -1, - "batch_idx_train": 0, - "log_interval": 50, - "reset_interval": 200, - "valid_interval": 3000, # For the 100h subset, use 800 - # parameters for conformer - "feature_dim": 80, - "encoder_out_dim": 512, - "subsampling_factor": 4, - "attention_dim": 512, - "nhead": 8, - "dim_feedforward": 2048, - "num_encoder_layers": 12, - "vgg_frontend": False, - # decoder params - "decoder_embedding_dim": 1024, - "num_decoder_layers": 2, - "decoder_hidden_dim": 512, - # parameters for Noam - "warm_step": 80000, # For the 100h subset, use 8k - "env_info": get_env_info(), - } - ) - - return params - - -def get_encoder_model(params: AttributeDict): - # TODO: We can add an option to switch between Conformer and Transformer - encoder = Conformer( - num_features=params.feature_dim, - output_dim=params.encoder_out_dim, - subsampling_factor=params.subsampling_factor, - d_model=params.attention_dim, - nhead=params.nhead, - dim_feedforward=params.dim_feedforward, - num_encoder_layers=params.num_encoder_layers, - vgg_frontend=params.vgg_frontend, - ) - return encoder - - -def get_decoder_model(params: AttributeDict): - decoder = Decoder( - vocab_size=params.vocab_size, - embedding_dim=params.decoder_embedding_dim, - blank_id=params.blank_id, - num_layers=params.num_decoder_layers, - hidden_dim=params.decoder_hidden_dim, - output_dim=params.encoder_out_dim, - ) - return decoder - - -def get_joiner_model(params: AttributeDict): - joiner = Joiner( - input_dim=params.encoder_out_dim, - output_dim=params.vocab_size, - ) - return joiner - - -def get_transducer_model(params: AttributeDict): - encoder = get_encoder_model(params) - decoder = get_decoder_model(params) - joiner = get_joiner_model(params) - - model = Transducer( - encoder=encoder, - decoder=decoder, - joiner=joiner, - ) - return model - - def train_model(gpu_id, args): @@ -221,20 +111,20 @@ def train_model(gpu_id, args): torch.manual_seed(args.seed) set_float_cpu("float32") - ddp_args = 
ddp.filter_ddp_args(**kwargs) - device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs["rank"] = rank + # ddp_args = ddp.filter_ddp_args(**kwargs) + # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + # kwargs["rank"] = rank + + # for Debug + rank = 0 + kwargs["rank"] = 0 + device = "cpu" + world_size=1 - # # for Debug - # rank = 0 - # kwargs["rank"] = 0 - # device = "cpu" - # world_size=1 - train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - # model = init_model(train_loader.dataset.num_classes.values())[0], **kwargs) - model = init_model(train_loader.dataset.num_classes, **kwargs) + # model = init_model(train_loader.dataset.num_classes, **kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), train_loader.dataset.sp.get_piece_size(), **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -255,12 +145,11 @@ def train_model(gpu_id, args): def make_parser(model_class): parser = ArgumentParser() - + parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") AD.add_class_args(train_parser, prefix="dataset", skip={}) SegSamplerFactory.add_class_args(train_parser, prefix="sampler") - # Sampler.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", type=int, @@ -271,7 +160,6 @@ def make_parser(model_class): val_parser = ArgumentParser(prog="") AD.add_class_args(val_parser, prefix="dataset", skip={}) SegSamplerFactory.add_class_args(val_parser, prefix="sampler") - # Sampler.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", type=int, @@ -288,13 +176,20 @@ def make_parser(model_class): "--data.train.dataset.text_file", type=str, ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + # parser.add_argument( + # "--data.train.dataset.class_files", + # type=str, + # ) parser.add_argument( "--data.train.dataset.bpe_model", type=str, ) - parser.add_argument("--data.val.dataset.text_file", type=str) + # parser.add_argument("--data.val.dataset.class_files", type=str) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" @@ -304,16 +199,6 @@ def make_parser(model_class): "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" ) - # parser.link_arguments( - # "data.train.dataset.class_file", "data.val.dataset.class_file" - # ) - # parser.link_arguments( - # "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" - # ) - # parser.link_arguments( - # "data.train.sampler.batch_size", "data.val.sampler.batch_size" - # ) - model_class.add_class_args(parser, prefix="model") Trainer.add_class_args( parser, prefix="trainer", train_modes=model_class.valid_train_modes() @@ -328,7 +213,6 @@ def make_parser(model_class): if __name__ == "__main__": - parser = ArgumentParser(description="Train Wav2Vec2Transducer model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 11e8cede..8929868f 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -21,12 +21,12 @@ import k2 import sentencepiece as spm +from torch.nn.utils.rnn import pad_sequence from torch.utils.data import Dataset import torch.distributed as dist from hyperion.np import augment -import pdb class AudioDataset1(Dataset): def __init__( @@ -539,7 +539,6 @@ def _load_bpe_model(self, 
bpe_model, is_val): vocab_size = self.sp.get_piece_size() def _load_text_infos(self, text_file, is_val): - #TODO: load bpe and text into data structure if text_file is None: return if self.rank == 0: @@ -547,6 +546,7 @@ def _load_text_infos(self, text_file, is_val): text = read_text(text_file) self.seg_set["text"] = text.loc[self.seg_set["id"]].text + self.text_info = ClassInfo(text) @@ -653,6 +653,7 @@ def _read_audio(self, seg_id, start, duration): def _apply_augs(self, x, num_samples, reverb_context_samples): x_augs = [] + # for each type of augmentation for i, augmenter in enumerate(self.augmenters): # we do n_augs per augmentation type @@ -677,34 +678,16 @@ def _get_segment_info(self, seg_id): class_info = self.class_info[info_name] idx = class_info.loc[seg_info, "class_idx"] seg_info = idx + if info_name == "text": + text = self.text_info.loc[seg_id, "text"] + seg_info = self.sp.encode(text, out_type=int) r.append(seg_info) return r - def _get_text_info(self, seg_id): - #TODO: bpe labels from data structure for getitem - r = [] - # converts the class_ids to integers - for info_name in self.return_segment_info: - seg_info = self.seg_set.loc[seg_id, info_name] - if info_name in self.text_info: - # if the type of information is a text - # we use the text information table to - # convert from id to text labels - text_info = self.text_info[info_name] - idx = text_info.loc[seg_info, "class_idx"] - seg_info = idx - y = sp.encode(text, out_type=int) - y = k2.RaggedTensor(y).to(device) - - r.append(y) - - return r - def __getitem__(self, segment): - #TODO: check the start/end time for Recognition seg_id, start, duration = self._parse_segment_item(segment) x, fs = self._read_audio(seg_id, start, duration) if self.augmenters: @@ -712,6 +695,7 @@ def __getitem__(self, segment): num_samples = int(duration * fs) reverb_context_samples = len(x) - num_samples x_augs = self._apply_augs(x, num_samples, reverb_context_samples) + r = x_augs # add original non augmented audio @@ -722,14 +706,9 @@ def __getitem__(self, segment): else: r = [x] - #TODO: Add it back for both case - # # adds the segment labels - # seg_info = self._get_segment_info(seg_id) - # r.extend(seg_info) - - # adds the text labels - text_info = self._get_text_info(seg_id) - r.extend(text_info) + # adds the segment labels + seg_info = self._get_segment_info(seg_id) + r.extend(seg_info) return (*r,) diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 37794377..8dbc4e45 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -12,6 +12,7 @@ from .hyp_sampler import HypSampler from .seg_sampler import SegSampler import torch.distributed as dist +from torch.nn.utils.rnn import pad_sequence class BucketingSegSampler(HypSampler): @@ -62,6 +63,9 @@ def _create_bucket_samplers(self): self.bucket_samplers = bucket_samplers + def __len__(self): + return self._len + def _compute_len(self): self._len = 0 for i in range(self.num_buckets): @@ -93,7 +97,7 @@ def __next__(self): if self.depleted_buckets[bucket_idx]: continue - bucket = self.buckets[bucket_idx] + bucket = self.bucket_samplers[bucket_idx] try: batch = next(bucket) break diff --git a/hyperion/torch/models/transducer/conformer.py b/hyperion/torch/models/transducer/conformer.py index 2bf63320..734c28ce 100644 --- a/hyperion/torch/models/transducer/conformer.py +++ b/hyperion/torch/models/transducer/conformer.py @@ -20,11 +20,12 @@ import warnings from typing import List, 
Optional, Tuple +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import torch from torch import Tensor, nn -from transformer import Transformer +from .transformer import Transformer -from icefall.utils import make_pad_mask, subsequent_chunk_mask +from hyperion.utils.utils import make_pad_mask, subsequent_chunk_mask class Conformer(Transformer): @@ -388,6 +389,64 @@ def streaming_forward( return logits, lengths, states + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "num_features", + "encoder_out_dim", + "subsampling_factor", + "d_model", + "nhead", + "dim_feedforward", + "num_encoder_layers", + "vgg_frontend", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + + parser.add_argument( + "--num-features", default=80, type=int, help=("") + ) + + parser.add_argument( + "--subsampling-factor", default=4, type=int, help=("") + ) + + parser.add_argument( + "--d-model", default=512, type=int, help=("") + ) + + parser.add_argument( + "--nhead", default=8, type=int, help=("") + ) + + parser.add_argument( + "--dim-feedforward", default=2048, type=int, help=("") + ) + + parser.add_argument( + "--num-encoder-layers", default=12, type=int, help=("") + ) + + parser.add_argument( + "--vgg-frontend", default=False, type=bool, help=("") + ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + + + + + class ConformerEncoderLayer(nn.Module): """ ConformerEncoderLayer is made up of self-attn, feedforward and convolution networks. diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py index 333fff30..0b47e80c 100644 --- a/hyperion/torch/models/transducer/decoder.py +++ b/hyperion/torch/models/transducer/decoder.py @@ -18,6 +18,7 @@ import torch import torch.nn as nn +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo # TODO(fangjun): Support switching between LSTM and GRU @@ -95,3 +96,40 @@ def forward( out = self.output_linear(rnn_out) return out, (h, c) + + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "embedding_dim", + "num_layers", + "hidden_dim", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--embedding-dim", default=1024, type=int, help=("feature dimension") + ) + + parser.add_argument( + "--num-layers", default=2, type=int, help=("") + ) + + parser.add_argument( + "--hidden-dim", default=512, type=int, help=("") + ) + + + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py index 2ef3f1de..72376b3c 100644 --- a/hyperion/torch/models/transducer/joiner.py +++ b/hyperion/torch/models/transducer/joiner.py @@ -52,3 +52,19 @@ def forward( output = self.output_linear(logit) return output + + # @staticmethod + # def filter_args(**kwargs): + # valid_args = ( + # "encoder_out_dim", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + # return args + + # @staticmethod + # def add_class_args(parser, prefix=None, skip=set()): + + # parser.add_argument( + # 
"--encoder-out-dim", default=512, type=int, help=("") + # ) \ No newline at end of file diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index ff12ef18..b34ff4cc 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -18,6 +18,7 @@ Note we use `rnnt_loss` from torchaudio, which exists only in torchaudio >= v0.10.0. It also means you have to use torch >= v1.10.0 """ +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import k2 import torch import torch.nn as nn @@ -25,19 +26,25 @@ import torchaudio.functional from .encoder_interface import EncoderInterface +from ...torch_model import TorchModel from hyperion.utils.utils import add_sos +from .conformer import Conformer +from .decoder import Decoder +from .joiner import Joiner -class Transducer(nn.Module): +class Transducer(TorchModel): """It implements https://arxiv.org/pdf/1211.3711.pdf "Sequence Transduction with Recurrent Neural Networks" """ def __init__( - self, - encoder: EncoderInterface, - decoder: nn.Module, - joiner: nn.Module, + self, + vocab_size, + blank_id, + encoder_out_dim, + conformer_enc, + decoder, ): """ Args: @@ -56,12 +63,20 @@ def __init__( unnormalized probs, i.e., not processed by log-softmax. """ super().__init__() - assert isinstance(encoder, EncoderInterface) - assert hasattr(decoder, "blank_id") + # assert isinstance(encoder, EncoderInterface) + # assert hasattr(decoder, "blank_id") + conformer_enc["output_dim"] = encoder_out_dim + decoder["blank_id"] = blank_id + decoder["vocab_size"] = vocab_size + decoder["output_dim"] = encoder_out_dim + joiner = {"input_dim":encoder_out_dim, "output_dim":vocab_size} + + self.encoder = Conformer(**conformer_enc) + self.decoder = Decoder(**decoder) + self.joiner = Joiner(**joiner) + + - self.encoder = encoder - self.decoder = decoder - self.joiner = joiner def forward( self, @@ -85,12 +100,14 @@ def forward( assert x.ndim == 3, x.shape assert x_lens.ndim == 1, x_lens.shape assert y.num_axes == 2, y.num_axes - + assert x.size(0) == x_lens.size(0) == y.dim0 - encoder_out, x_lens = self.encoder(x, x_lens) + # wav2vec2 works as encoder + # encoder_out, x_lens = self.encoder(x, x_lens) assert torch.all(x_lens > 0) + encoder_out = x # Now for the decoder, i.e., the prediction network row_splits = y.shape.row_splits(1) y_lens = row_splits[1:] - row_splits[:-1] @@ -113,14 +130,124 @@ def forward( f"Current torchaudio version: {torchaudio.__version__}\n" "Please install a version >= 0.10.0" ) + + x_lens = x_lens.to(torch.int32) + loss = torchaudio.functional.rnnt_loss( logits=logits, - targets=y_padded, + targets=y_padded.to(torch.int32), logit_lengths=x_lens, target_lengths=y_lens, blank=blank_id, reduction="sum", ) - return loss + return logits, loss + + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_preembed_layers() + else: + raise ValueError(f"invalid train_mode={mode}") + + self._train_mode = mode + + @classmethod + def load(cls, file_path=None, cfg=None, state_dict=None): + cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) + encoder_net = TorchNALoader.load_from_cfg(cfg=cfg["encoder_cfg"]) + for k in "encoder_cfg": + del cfg[k] + + model = cls(encoder_net, **cfg) + if state_dict is not None: + model.load_state_dict(state_dict) + + return model + + + 
def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode == "ft-embed-affine": + self.encoder_net.eval() + if self.proj is not None: + self.proj.eval() + + self.pool_net.eval() + self.classif_net.train() + layer_list = [l for l in range(self.embed_layer)] + self.classif_net.put_layers_in_eval_mode(layer_list) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return ["full", "frozen", "ft-embed-affine"] + + def get_config(self): + enc_cfg = self.encoder.get_config() + dec_cfg = self.decoder.get_config() + join_cfg = self.joiner.get_config() + + config = { + "encoder_out_dim" : self.encoder_out_dim, + "conformer_enc": enc_cfg, + "decoder": dec_cfg, + "joiner": join_cfg, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + + # get arguments for pooling + encoder_args = Conformer.filter_args(**kwargs["conformer_enc"]) + decoder_args = Decoder.filter_args(**kwargs["decoder"]) + # joiner_args = Joiner.filter_args(**kwargs["joiner"]) + + valid_args = ( + "encoder_out_dim", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + args["conformer_enc"] = encoder_args + args["decoder"] = decoder_args + # args["joiner"] = joiner_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + + + Conformer.add_class_args( + parser, prefix="conformer_enc", skip=[] + ) + + Decoder.add_class_args( + parser, prefix="decoder", skip=[] + ) + + parser.add_argument( + "--encoder-out-dim", default=512, type=int, help=("") + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py index 1e038f17..7956c9ba 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py @@ -1,5 +1,5 @@ """ - Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging @@ -13,6 +13,8 @@ from ...torch_model import TorchModel from ...utils import remove_silence +# from ..wav2xvectors.hf_wav2xvector import HFWav2XVector + class HFWav2Transducer(TorchModel): @@ -20,7 +22,7 @@ class HFWav2Transducer(TorchModel): Attributes: hf_feats: hugging face model wrapper object. - xvector: x-vector model object. + transducer: transducer model object. feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to the wav2vec "num_layers". 
feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more @@ -28,12 +30,12 @@ class HFWav2Transducer(TorchModel): """ def __init__( - self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + self, hf_feats, transducer, feat_fusion_start=0, feat_fusion_method="weighted-avg" ): super().__init__() self.hf_feats = hf_feats - self.xvector = xvector + self.transducer = transducer self.feat_fusion_start = feat_fusion_start self.feat_fusion_method = feat_fusion_method self._hf_context = contextlib.nullcontext() @@ -84,16 +86,16 @@ def _fuse_hid_feats(self, hid_feats): return feats def compute_prototype_affinity(self): - return self.xvector.compute_prototype_affinity() + return self.transducer.compute_prototype_affinity() - def update_loss_margin(self, epoch): - """Updates the value of the margin in AAM/AM-softmax losses - given the epoch number + # def update_loss_margin(self, epoch): + # """Updates the value of the margin in AAM/AM-softmax losses + # given the epoch number - Args: - epoch: epoch which is about to start - """ - self.xvector.update_loss_margin(epoch) + # Args: + # epoch: epoch which is about to start + # """ + # self.transducer.update_loss_margin(epoch) def rebuild_output_layer( self, @@ -106,7 +108,7 @@ def rebuild_output_layer( intertop_margin=0.0, num_subcenters=2, ): - self.xvector.rebuild_output_layer( + self.transducer.rebuild_output_layer( num_classes=num_classes, loss_type=loss_type, cos_scale=cos_scale, @@ -189,25 +191,25 @@ def forward( feats, hid_feats, feat_lengths = self.forward_feats( x, x_lengths, return_feat_layers ) - output = self.xvector( + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + output, loss = self.transducer( feats, feat_lengths, y, - return_enc_layers=return_enc_layers, - return_classif_layers=return_classif_layers, - return_logits=return_logits, ) if not return_feat_layers: - return output + return output, loss if not isinstance(output, dict): - # if the xvector just returned the logits we put then into a dictionary + # if the transducer just returned the logits we put then into a dictionary # to append the hid feats later. 
output["logits"] = output output["h_feats"] = hid_feats - return output + return output, loss def extract_embed( self, @@ -232,7 +234,7 @@ def extract_embed( * feats.size(-1) // x.size(-1) ) - return self.xvector.extract_embed( + return self.transducer.extract_embed( feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks ) @@ -265,8 +267,8 @@ def set_train_mode(self, mode): self.unfreeze() self.freeze_feat_fuser() self.freeze_hf_feats() - self.xvector.freeze_preembed_layers() - elif mode in ["ft-xvector", "ft-xvector-nograd"]: + self.transducer.freeze_preembed_layers() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() self.freeze_feat_fuser() @@ -295,16 +297,16 @@ def _train(self, train_mode: str): super()._train(train_mode) elif train_mode == "ft-embed-affine": self.hf_feats.train() - self.xvector._train("ft-embed_affine") + self.transducer._train("ft-embed_affine") elif train_mode in [ - "ft-xvector", + "ft-transducer", "hf-feats-frozen", - "ft-xvector-nograd", + "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", ]: self.hf_feats.train() - self.xvector._train("full") + self.transducer._train("full") else: raise ValueError(f"invalid train_mode={train_mode}") @@ -314,9 +316,9 @@ def valid_train_modes(): "full", "frozen", "ft-embed-affine", - "ft-xvector", + "ft-transducer", "hf-feats-frozen", - "ft-xvector-nograd", + "ft-transducer-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", ] @@ -325,7 +327,7 @@ def valid_train_modes(): def filter_args(**kwargs): valid_args = ( "hf_feats", - "xvector", + "transducer", "feat_fusion_start", "feat_fusion_method", ) @@ -333,14 +335,13 @@ def filter_args(**kwargs): return args def get_config(self): - hf_cfg = self.hf_feats.get_config() - xvec_cfg = self.xvector.get_config() + tran_cfg = self.transducer.get_config() del hf_cfg["class_name"] - del xvec_cfg["class_name"] + del tran_cfg["class_name"] config = { "hf_feats": hf_cfg, - "xvector": xvec_cfg, + "transducer": tran_cfg, "feat_fusion_start": self.feat_fusion_start, "feat_fusion_method": self.feat_fusion_method, } @@ -348,10 +349,10 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def change_config(self, hf_feats, xvector): - logging.info("changing hf wav2xvector config") + def change_config(self, hf_feats, transducer): + logging.info("changing hf wav2transducer config") self.hf_feats.change_config(**hf_feats) - self.xvector.change_config(**xvector) + self.transducer.change_config(**transducer) @staticmethod def add_class_args(parser, prefix=None, skip=set()): @@ -385,3 +386,5 @@ def add_class_args(parser, prefix=None, skip=set()): action=ActionParser(parser=parser), help="xvector options", ) + + diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py index 3a55ac83..79c4ca86 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py @@ -1,388 +1,98 @@ """ - Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -import contextlib from jsonargparse import ArgumentParser, ActionParser +from typing import Union, Dict, Optional import torch import torch.nn as nn -# import torch.nn.functional as nnf - -from ...torch_model 
import TorchModel -from ...utils import remove_silence +from ..transducer import Transducer +from ...tpm import HFWav2Vec2 from .hf_wav2transducer import HFWav2Transducer + class HFWav2Vec2Transducer(HFWav2Transducer): - """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + """Class extracting Wav2Vec2 + ResNet1d x-vectors from waveform. Attributes: - hf_feats: hugging face model wrapper object. - transducer: transducer model object. - feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to - the wav2vec "num_layers". - feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more than one layer is used. """ def __init__( - self, hf_feats, transducer, feat_fusion_start=0, feat_fusion_method="weighted-avg" - ): - - super().__init__() - self.hf_feats = hf_feats - self.transducer = transducer - self.feat_fusion_start = feat_fusion_start - self.feat_fusion_method = feat_fusion_method - self._hf_context = contextlib.nullcontext() - self._make_fuser() - - def _make_fuser(self): - if self.feat_fusion_method == "last": - self.feat_fuser = None - return - - num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start - layer_dim = self.hf_feats.hidden_size - if self.feat_fusion_method == "weighted-avg": - self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - elif self.feat_fusion_method == "linear": - self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers - elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) - - def _fuse_hid_feats(self, hid_feats): - """Fuses the hidden features from the Wav2Vec model. - - Args: - hid_feats: list of hidden features Tensors from Wav2Vec model. 
- - Returns: - Tensor of fused features (batch, channels, time) - """ - if len(hid_feats) == 1: - # There is only one layer of features - return hid_feats[0] - - hid_feats = hid_feats[self.feat_fusion_start :] - if self.feat_fusion_method == "weighted-avg": - hid_feats = torch.stack(hid_feats, dim=-1) - norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) - feats = torch.sum(hid_feats * norm_weights, dim=-1) - elif self.feat_fusion_method == "linear": - hid_feats = torch.stack(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats).squeeze(dim=-1) - elif self.feat_fusion_method == "cat": - hid_feats = torch.cat(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats) - elif self.feat_fusion_method == "last": - feats = hid_feats[-1] - - return feats - - def compute_prototype_affinity(self): - return self.transducer.compute_prototype_affinity() - - def update_loss_margin(self, epoch): - """Updates the value of the margin in AAM/AM-softmax losses - given the epoch number - - Args: - epoch: epoch which is about to start - """ - self.transducer.update_loss_margin(epoch) - - def rebuild_output_layer( - self, - num_classes=None, - loss_type="arc-softmax", - cos_scale=64, - margin=0.3, - margin_warmup_epochs=10, - intertop_k=5, - intertop_margin=0.0, - num_subcenters=2, - ): - self.transducer.rebuild_output_layer( - num_classes=num_classes, - loss_type=loss_type, - cos_scale=cos_scale, - margin=margin, - margin_warmup_epochs=margin_warmup_epochs, - intertop_k=intertop_k, - intertop_margin=intertop_margin, - num_subcenters=num_subcenters, - ) - - def forward_feats( - self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False - ): - return_hid_states = ( - False - if return_feat_layers is None and self.feat_fusion_method == "last" - else True - ) - with self._hf_context: - hf_output = self.hf_feats( - x, - x_lengths, - return_hid_states=return_hid_states, - chunk_length=chunk_length, - detach_chunks=detach_chunks, - ) - feat_lengths = hf_output["hidden_states_lengths"] - if return_hid_states: - hid_feats = hf_output["hidden_states"] - feats = self._fuse_hid_feats(hid_feats) - else: - hid_feats = None - feats = hf_output["last_hidden_state"] - - feats = feats.transpose(1, 2) - if return_feat_layers is not None: - # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) - # as the hidden features of the x-vector encoder. - hid_feats = [ - f.transpose(1, 2) - for i, f in enumerate(hid_feats) - if i in return_feat_layers - ] - else: - hid_feats = None - - return feats, hid_feats, feat_lengths - - def forward( - self, - x, - x_lengths=None, - y=None, - return_feat_layers=None, - return_enc_layers=None, - return_classif_layers=None, - return_logits=True, - ): - """Forward function. If returns the logits posteriors of the classes. - It can also returns the hidden representations in the wav2vec feature extractor, - the x-vector encoder and the - classification head. In this case the ouput variable is a dictionary. - - Args: - x: input features tensor with shape=(batch, in_feats, time) - x_lengths: time lengths of the features with shape=(batch,) - y: target classes torch.long tensor with shape=(batch,) - return_feat_layers: list of integers indicating, which wav2vec layers - we should return. If None, no wav2vec layers are returned. - return_enc_layers: list of integers indicating, which encoder layers - we should return. If None, no encoder layers are returned. 
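A self-contained sketch of the "weighted-avg" branch of _fuse_hid_feats above, assuming feat_fusion_start=0 and illustrative sizes (L wav2vec layers, batch B, time T, channels C):

    import torch
    import torch.nn as nn

    L, B, T, C = 13, 2, 100, 768
    hid_feats = [torch.randn(B, T, C) for _ in range(L)]
    feat_fuser = nn.Parameter(torch.zeros(L))        # one logit per layer, uniform at init

    stacked = torch.stack(hid_feats, dim=-1)         # (B, T, C, L)
    norm_weights = nn.functional.softmax(feat_fuser, dim=-1)
    fused = torch.sum(stacked * norm_weights, dim=-1)  # (B, T, C)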
- return_enc_layers: list of integers indicating, which classification head layers - we should return. If None, no head layers are returned. - return_logits: if True, it adds the logits to the output dictionary. - Returns: - Tensor with class logits with shape=(batch, num_classes) or - Dictionary with "logits", "h_enc" (list of hidden encoder layers), - "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) - """ - feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers - ) - output = self.transducer( - feats, - feat_lengths, - y, - return_enc_layers=return_enc_layers, - return_classif_layers=return_classif_layers, - return_logits=return_logits, - ) - - if not return_feat_layers: - return output - - if not isinstance(output, dict): - # if the transducer just returned the logits we put then into a dictionary - # to append the hid feats later. - output["logits"] = output - - output["h_feats"] = hid_feats - return output - - def extract_embed( self, - x, - x_lengths=None, - vad_samples=None, - hf_chunk_length=0, - xvec_chunk_length=0, - embed_layer=None, - detach_chunks=False, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, Transducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - - feats, _, feat_lengths = self.forward_feats( - x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks - ) - xvec_chunk_length = int( - xvec_chunk_length - * self.hf_feats.sample_frequency - * feats.size(-1) - // x.size(-1) - ) - return self.transducer.extract_embed( - feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks - ) - - def freeze_feat_fuser(self): - if self.feat_fuser is None: - return - - if self.feat_fusion_method == "weighted-avg": - self.feat_fuser.requires_grad = False - return - - for param in self.feat_fuser.parameters(): - param.requires_grad = False - - def freeze_hf_feats(self): - self.hf_feats.freeze() - - def freeze_hf_feature_encoder(self): - self.hf_feats.freeze_feature_encoder() - - def set_train_mode(self, mode): - if mode == self._train_mode: - return - - if mode == "full": - self.unfreeze() - elif mode == "frozen": - self.freeze() - elif mode == "ft-embed-affine": - self.unfreeze() - self.freeze_feat_fuser() - self.freeze_hf_feats() - self.transducer.freeze_preembed_layers() - elif mode in ["ft-transducer", "ft-transducer-nograd"]: - self.unfreeze() - self.freeze_hf_feats() - self.freeze_feat_fuser() - elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: - self.unfreeze() - self.freeze_hf_feats() - elif mode == "hf-feat-extractor-frozen": - self.unfreeze() - self.freeze_hf_feature_encoder() + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) else: - raise ValueError(f"invalid train_mode={mode}") + assert isinstance(hf_feats, HFWav2Vec2) - logging.info("train mode set to %s", mode) - - if "nograd" in mode: - logging.info("using torch.no_grad for hf_feats") - self._hf_context = torch.no_grad() + if isinstance(transducer, dict): + if "class_name" in transducer: + del transducer["class_name"] + transducer = Transducer(**transducer) else: - self._hf_context = contextlib.nullcontext() - - self._train_mode = mode + assert isinstance(transducer, Transducer) + # assert transducer.encoder_net.in_feats == hf_feats.hidden_size - def _train(self, train_mode: str): - - if train_mode in ["full", "frozen"]: - 
super()._train(train_mode) - elif train_mode == "ft-embed-affine": - self.hf_feats.train() - self.transducer._train("ft-embed_affine") - elif train_mode in [ - "ft-transducer", - "hf-feats-frozen", - "ft-transducer-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", - ]: - self.hf_feats.train() - self.transducer._train("full") - else: - raise ValueError(f"invalid train_mode={train_mode}") - - @staticmethod - def valid_train_modes(): - return [ - "full", - "frozen", - "ft-embed-affine", - "ft-transducer", - "hf-feats-frozen", - "ft-transducer-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", - ] + super().__init__(hf_feats, transducer, feat_fusion_start, feat_fusion_method) @staticmethod def filter_args(**kwargs): - valid_args = ( - "hf_feats", - "transducer", - "feat_fusion_start", - "feat_fusion_method", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args - - def get_config(self): + base_args = HFWav2Transducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = Transducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args - hf_cfg = self.hf_feats.get_config() - xvec_cfg = self.transducer.get_config() - del hf_cfg["class_name"] - del xvec_cfg["class_name"] - config = { - "hf_feats": hf_cfg, - "transducer": xvec_cfg, - "feat_fusion_start": self.feat_fusion_start, - "feat_fusion_method": self.feat_fusion_method, - } + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + Transducer.add_class_args(parser, prefix="transducer") + HFWav2Transducer.add_class_args(parser) - def change_config(self, hf_feats, transducer): - logging.info("changing hf wav2transducer config") - self.hf_feats.change_config(**hf_feats) - self.transducer.change_config(**transducer) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod - def add_class_args(parser, prefix=None, skip=set()): + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = Transducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + @staticmethod + def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - parser.add_argument( - "--feat-fusion-start", - default=0, - type=int, - help=( - "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers" - ), - ) - parser.add_argument( - "--feat-fusion-method", - default="weighted-avg", - choices=["weighted-avg", "linear", "cat", "last"], - help=( - "method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, cat]" - ), - ) + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + Transducer.add_finetune_args(parser, prefix="transducer") if prefix is not None: - outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), - help="transducer options", - ) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/transducer_trainer.py 
b/hyperion/torch/trainers/transducer_trainer.py index a67da181..82a4f2bd 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -8,6 +8,7 @@ import logging import torch +import torchaudio import torch.nn as nn from ..utils import MetricAcc @@ -117,23 +118,25 @@ def train_epoch(self, data_loader): data_loader: pytorch data loader returning features and class labels. """ - self.model.update_loss_margin(self.cur_epoch) + # self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + self.sp = data_loader.dataset.sp + for batch, (data, audio_length, target) in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() # TODO: Check and Modify data, target - data, target = data.to(self.device), target.to(self.device) + data, audio_length, target = data.to(self.device), audio_length.to(self.device), target.to(self.device) batch_size = data.shape[0] with self.amp_autocast(): - output = self.model(data, y=target) - loss = self.loss(output, target).mean() / self.grad_acc_steps + output, loss = self.model(data, x_lengths=audio_length, y=target) + loss = loss.mean() / self.grad_acc_steps + # loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() From bc85ec756c6715612dc988e446e0dc4f4bd9a766 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 3 Nov 2022 21:48:14 -0400 Subject: [PATCH 042/154] some bug fixes in subcenter loss and class-weighted sampler --- .../data/class_weighted_seg_chunk_sampler.py | 17 ++++- hyperion/torch/layers/margin_losses.py | 71 +++++++++++-------- hyperion/torch/models/xvectors/xvector.py | 18 ++++- hyperion/utils/class_info.py | 10 +-- hyperion/utils/info_table.py | 26 +++---- 5 files changed, 87 insertions(+), 55 deletions(-) diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 05b222c7..07a61b8f 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -182,8 +182,19 @@ def _gather_class_info(self): ) self.map_class_to_segs_idx = {} for class_id in self.class_info["id"].values: - seg_ids = map_class_to_segs.loc[class_id, "id"].values - seg_idx = self.seg_set.get_loc(seg_ids) + if class_id in map_class_to_segs.index: + seg_ids = map_class_to_segs.loc[class_id, "id"] + if isinstance(seg_ids, str): + seg_ids = [seg_ids] + else: + seg_ids = seg_ids.values + + seg_idx = self.seg_set.get_loc(seg_ids) + else: + seg_idx = [] + self.class_info.loc[class_id, "weights"] = 0.0 + self.class_info.renorm_weights() + self.map_class_to_segs_idx[class_id] = seg_idx def _set_class_weights(self): @@ -231,7 +242,7 @@ def set_hard_prototypes(self, affinity_matrix): ).indices def get_hard_prototypes(self, class_idx): - return self.hard_prototypes[class_idx].flatten() + return self.hard_prototypes[class_idx].flatten().numpy() def _sample_chunk_length(self): if self.var_batch_size: diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index 6443ea02..acb7a514 100644 --- a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -77,15 +77,18 @@ def __repr__(self): return self.__str__() def __str__(self): - s = "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, 
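The isinstance(seg_ids, str) branch added to the sampler above guards a pandas behavior worth spelling out: DataFrame.loc[key, col] returns a bare scalar when exactly one row matches and a Series when several do. An illustrative example (data invented):

    import pandas as pd

    map_class_to_segs = pd.DataFrame(
        {"class_id": ["spk1", "spk1", "spk2"], "id": ["seg1", "seg2", "seg3"]}
    ).set_index("class_id")

    map_class_to_segs.loc["spk2", "id"]           # 'seg3', a plain str
    map_class_to_segs.loc["spk1", "id"].values    # array(['seg1', 'seg2'], dtype=object)

Classes with no segments get weight 0.0 and the remaining weights are renormalized, so the sampler never draws a class it cannot serve.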
margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" % ( - self.__class__.__name__, - self.in_feats, - self.num_classes, - self.cos_scale, - self.margin, - self.margin_warmup_epochs, - self.intertop_k, - self.intertop_margin, + s = ( + "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" + % ( + self.__class__.__name__, + self.in_feats, + self.num_classes, + self.cos_scale, + self.margin, + self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, + ) ) return s @@ -224,15 +227,18 @@ def __repr__(self): return self.__str__() def __str__(self): - s = "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" % ( - self.__class__.__name__, - self.in_feats, - self.num_classes, - self.cos_scale, - self.margin, - self.margin_warmup_epochs, - self.intertop_k, - self.intertop_margin, + s = ( + "%s(in_feats=%d, num_classes=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" + % ( + self.__class__.__name__, + self.in_feats, + self.num_classes, + self.cos_scale, + self.margin, + self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, + ) ) return s @@ -361,20 +367,25 @@ def __init__( ) def __str__(self): - s = "%s(in_feats=%d, num_classes=%d, num_subcenters=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" % ( - self.__class__.__name__, - self.in_feats, - self.num_classes, - self.num_subcenters, - self.cos_scale, - self.margin, - self.margin_warmup_epochs, - self.intertop_k, - self.intertop_margin, + s = ( + "%s(in_feats=%d, num_classes=%d, num_subcenters=%d, cos_scale=%.2f, margin=%.2f, margin_warmup_epochs=%d, intertop_k=%d, intertop_margin=%f)" + % ( + self.__class__.__name__, + self.in_feats, + self.num_classes, + self.num_subcenters, + self.cos_scale, + self.margin, + self.margin_warmup_epochs, + self.intertop_k, + self.intertop_margin, + ) ) return s def _update_counts(self, y, proto_idx): + idx1 = torch.arange(y.size(0)) + proto_idx = proto_idx[idx1, y] self.subcenter_counts[y, proto_idx] += 1 # we make counts relative to avoid risk of overflowing the integers min_counts, _ = torch.min(self.subcenter_counts, dim=1, keepdim=True) @@ -445,7 +456,9 @@ def get_main_prototype_kernel(self): self.subcenter_counts, dim=-1 ) # get indices for the main prototype idx1 = torch.arange(self.num_classes) - kernel = kernel.view(-1, self.num_classes, self.num_subcenters)[:, idx1, idx2] + kernel = self.kernel.view(-1, self.num_classes, self.num_subcenters)[ + :, idx1, idx2 + ] return kernel def compute_prototype_affinity(self): diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 197ef5a9..15f0ce86 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -572,14 +572,26 @@ def rebuild_output_layer( intertop_margin=0.0, num_subcenters=2, ): - if (self.num_classes is not None and self.num_classes != num_classes) or ( - self.loss_type != loss_type + if ( + (self.num_classes is not None and self.num_classes != num_classes) + or (self.loss_type != loss_type) + or ( + loss_type == "subcenter-arc-softmax" + and self.classif_net.num_subcenters != num_subcenters + ) ): # if we change the number of classes or the loss-type # we need to reinitiate the last layer logging.info("rebuilding output layer") self.classif_net.rebuild_output_layer( - num_classes, 
loss_type, cos_scale, margin, margin_warmup_epochs + num_classes, + loss_type, + cos_scale, + margin, + margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, ) return diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index f1eaf665..9e158d87 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -30,6 +30,10 @@ def set_uniform_weights(self): def set_weights(self, weights): self.df["weights"] = weights / weights.sum() + def renorm_weights(self): + weights = self.df["weights"] + self.df["weights"] = weights / weights.sum() + def exp_weights(self, x): weights = self.df["weights"] ** x self.set_weights(weights) @@ -62,11 +66,7 @@ def load(cls, file_path, sep=None): if ext == "": # if no extension we load as kaldi utt2spk file df = pd.read_csv( - file_path, - sep=" ", - header=None, - names=["id"], - dtype={"id": np.str}, + file_path, sep=" ", header=None, names=["id"], dtype={"id": np.str}, ) return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 217f1f9a..80199a33 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -45,7 +45,7 @@ def __str__(self): @property def __repr__(self): - return self.df.__repr__ + return self.df.__repr__ @property def iat(self): @@ -75,6 +75,10 @@ def __setitem__(self): def __contains__(self): return self.df.__contains__ + @property + def index(self): + return self.df.index + def save(self, file_path, sep=None): """Saves info table to file @@ -144,9 +148,7 @@ def split(self, idx, num_parts, group_by=None): if group_by is None: _, idx1 = split_list(self.df["id"], idx, num_parts) else: - _, idx1 = split_list_group_by_key( - self.df[group_by], idx, num_parts - ) + _, idx1 = split_list_group_by_key(self.df[group_by], idx, num_parts) df = self.df.iloc[idx1] return self.__class__(df) @@ -166,7 +168,9 @@ def merge(cls, tables): return cls(df) def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): - assert items is None or iindex is None, "items and iindex cannot be not None at the same time" + assert ( + items is None or iindex is None + ), "items and iindex cannot be not None at the same time" df = self.df if not keep: @@ -195,9 +199,8 @@ def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): if columns is not None: df = df[columns] - - return self.__class__(df) + return self.__class__(df) def __eq__(self, other): """Equal operator""" @@ -216,8 +219,6 @@ def __cmp__(self, other): return 0 return 1 - - # def __len__(self): # """Returns the number of elements in the list.""" # return len(self.df) @@ -316,8 +317,6 @@ def __cmp__(self, other): # utt_info = self.utt_info.iloc[idx1] # return Utt2Info(utt_info) - - # def filter(self, filter_key, keep=True): # """Removes elements from Utt2Info object by key @@ -411,13 +410,10 @@ def get_loc(self, keys): loc = self.df.index.get_loc(keys) if isinstance(loc, int): return loc - elif isinstance(loc, np.ndarray) and loc.dtype==np.bool: + elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: return np.nonzero(loc)[0] else: return list(range(loc.start, loc.stop, loc.step)) def get_col_idx(self, keys): return self.df.columns.get_loc(keys) - - - \ No newline at end of file From 450f50d35c1421cf561841862d306cb8a1afcf05 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Sat, 5 Nov 2022 13:53:04 -0400 Subject: [PATCH 043/154] Wav2vec2 Transducer update data augmentation --- ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 15 +- 
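The three branches of InfoTable.get_loc above cover the possible return types of pandas Index.get_loc (keys are illustrative):

    import pandas as pd

    pd.Index(["u1", "u2", "u3"]).get_loc("u2")   # 1, a plain int (unique key)
    pd.Index(["u1", "u2", "u2"]).get_loc("u2")   # slice(1, 3, None) (contiguous duplicates)
    pd.Index(["u2", "u1", "u2"]).get_loc("u2")   # array([ True, False,  True]) boolean mask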
.../v1/conf/wav2vec2xlsr300m_transducer.yaml | 18 +- hyperion/bin/train_wav2vec2transducer.py | 32 +- hyperion/torch/data/audio_dataset.py | 5 +- .../torch/models/transducer/subsampling.py | 161 +++++++ .../torch/models/transducer/transducer.py | 24 +- .../torch/models/transducer/transformer.py | 418 ++++++++++++++++++ hyperion/torch/tpm/hf/hf_wav2vec_base.py | 4 +- hyperion/torch/trainers/transducer_trainer.py | 2 - hyperion/utils/text.py | 29 ++ 10 files changed, 651 insertions(+), 57 deletions(-) create mode 100644 hyperion/torch/models/transducer/subsampling.py create mode 100644 hyperion/torch/models/transducer/transformer.py create mode 100644 hyperion/utils/text.py diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index 91adaa35..2ee0a1aa 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -2,10 +2,8 @@ data: train: dataset: wav_scale: 1 - # class_names: - # - text - # aug_cfgs: - # - conf/reverb_noise_aug.yaml + aug_cfgs: + - conf/reverb_noise_aug.yaml return_segment_info: - text sampler: @@ -17,14 +15,9 @@ data: num_workers: 1 val: dataset: - # max_chunk_length: 4.0 - # min_chunk_length: 4.0 - # aug_cfgs: [conf/reverb_noise_aug.yaml] + aug_cfgs: + - conf/reverb_noise_aug.yaml wav_scale: 1 - # class_names: - # - text - # aug_cfgs: - # - conf/reverb_noise_aug.yaml return_segment_info: - text sampler: diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml index 3f97feb7..d736dc86 100644 --- a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -1,16 +1,16 @@ hf_feats: - pretrained_model_path: microsoft/wavlm-base-plus + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus # test_param: xyz transducer: encoder_out_dim: 768 - conformer_enc: - num_features: 80 - subsampling_factor: 4 - d_model: 512 - nhead: 8 - dim_feedforward: 2048 - num_encoder_layers: 12 - vgg_frontend: False +# conformer_enc: + # num_features: 80 + # subsampling_factor: 4 + # d_model: 512 + # nhead: 8 + # dim_feedforward: 2048 + # num_encoder_layers: 12 + # vgg_frontend: False decoder: # vocab_size: 1000 # blank_id: 0 diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 8c539cd1..bfc6248a 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -37,7 +37,7 @@ } -def my_collate(batch): +def transducer_collate(batch): audio = [] audio_length = [] target = [] @@ -46,8 +46,6 @@ def my_collate(batch): audio.append(wav) audio_length.append(wav.shape[0]) target.append(record[1]) - if i==4: - break audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) @@ -81,7 +79,7 @@ def init_data(partition, rank, num_gpus, **kwargs): largs = ( {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} ) - data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=my_collate) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) return data_loader @@ -111,25 +109,24 @@ def train_model(gpu_id, args): torch.manual_seed(args.seed) 
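Illustrative use of the transducer_collate function defined above (waveform lengths and BPE token ids are invented; requires k2 for the ragged targets). Note that pad_sequence defaults to a time-first layout, so the first dimension of the padded audio is the padded length, not the batch size:

    import torch

    batch = [
        (torch.randn(16000), [5, 9, 2]),   # (waveform, BPE token ids)
        (torch.randn(12000), [7, 3]),
    ]
    audio, audio_length, target = transducer_collate(batch)
    audio.shape      # torch.Size([16000, 2]), i.e. (max_T, N)
    audio_length     # tensor([16000, 12000])
    target           # k2.RaggedTensor [ [ 5 9 2 ] [ 7 3 ] ]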
set_float_cpu("float32") - # ddp_args = ddp.filter_ddp_args(**kwargs) - # device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - # kwargs["rank"] = rank + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank - # for Debug - rank = 0 - kwargs["rank"] = 0 - device = "cpu" - world_size=1 + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - # model = init_model(train_loader.dataset.num_classes, **kwargs) model = init_model(train_loader.dataset.sp.piece_to_id(""), train_loader.dataset.sp.get_piece_size(), **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {"acc": CategoricalAccuracy()} + metrics = {} #{"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -179,18 +176,11 @@ def make_parser(model_class): parser.add_argument("--data.val.dataset.text_file", type=str) - # parser.add_argument( - # "--data.train.dataset.class_files", - # type=str, - # ) - parser.add_argument( "--data.train.dataset.bpe_model", type=str, ) - # parser.add_argument("--data.val.dataset.class_files", type=str) - parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" ) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 8929868f..35b7d85b 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -692,7 +692,10 @@ def __getitem__(self, segment): x, fs = self._read_audio(seg_id, start, duration) if self.augmenters: # augmentations - num_samples = int(duration * fs) + if duration == 0: + num_samples = len(x) + else: + num_samples = int(duration * fs) reverb_context_samples = len(x) - num_samples x_augs = self._apply_augs(x, num_samples, reverb_context_samples) diff --git a/hyperion/torch/models/transducer/subsampling.py b/hyperion/torch/models/transducer/subsampling.py new file mode 100644 index 00000000..542fb036 --- /dev/null +++ b/hyperion/torch/models/transducer/subsampling.py @@ -0,0 +1,161 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.nn as nn + + +class Conv2dSubsampling(nn.Module): + """Convolutional 2D subsampling (to 1/4 length). + + Convert an input of shape (N, T, idim) to an output + with shape (N, T', odim), where + T' = ((T-1)//2 - 1)//2, which approximates T' == T//4 + + It is based on + https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py # noqa + """ + + def __init__(self, idim: int, odim: int) -> None: + """ + Args: + idim: + Input dim. The input shape is (N, T, idim). + Caution: It requires: T >=7, idim >=7 + odim: + Output dim. 
The output shape is (N, ((T-1)//2 - 1)//2, odim) + """ + assert idim >= 7 + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=1, out_channels=odim, kernel_size=3, stride=2 + ), + nn.ReLU(), + nn.Conv2d( + in_channels=odim, out_channels=odim, kernel_size=3, stride=2 + ), + nn.ReLU(), + ) + self.out = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Subsample x. + + Args: + x: + Its shape is (N, T, idim). + + Returns: + Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim) + """ + # On entry, x is (N, T, idim) + x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W) + x = self.conv(x) + # Now x is of shape (N, odim, ((T-1)//2 - 1)//2, ((idim-1)//2 - 1)//2) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + # Now x is of shape (N, ((T-1)//2 - 1))//2, odim) + return x + + +class VggSubsampling(nn.Module): + """Trying to follow the setup described in the following paper: + https://arxiv.org/pdf/1910.09799.pdf + + This paper is not 100% explicit so I am guessing to some extent, + and trying to compare with other VGG implementations. + + Convert an input of shape (N, T, idim) to an output + with shape (N, T', odim), where + T' = ((T-1)//2 - 1)//2, which approximates T' = T//4 + """ + + def __init__(self, idim: int, odim: int) -> None: + """Construct a VggSubsampling object. + + This uses 2 VGG blocks with 2 Conv2d layers each, + subsampling its input by a factor of 4 in the time dimensions. + + Args: + idim: + Input dim. The input shape is (N, T, idim). + Caution: It requires: T >=7, idim >=7 + odim: + Output dim. The output shape is (N, ((T-1)//2 - 1)//2, odim) + """ + super().__init__() + + cur_channels = 1 + layers = [] + block_dims = [32, 64] + + # The decision to use padding=1 for the 1st convolution, then padding=0 + # for the 2nd and for the max-pooling, and ceil_mode=True, was driven by + # a back-compatibility concern so that the number of frames at the + # output would be equal to: + # (((T-1)//2)-1)//2. + # We can consider changing this by using padding=1 on the + # 2nd convolution, so the num-frames at the output would be T//4. + for block_dim in block_dims: + layers.append( + torch.nn.Conv2d( + in_channels=cur_channels, + out_channels=block_dim, + kernel_size=3, + padding=1, + stride=1, + ) + ) + layers.append(torch.nn.ReLU()) + layers.append( + torch.nn.Conv2d( + in_channels=block_dim, + out_channels=block_dim, + kernel_size=3, + padding=0, + stride=1, + ) + ) + layers.append( + torch.nn.MaxPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True + ) + ) + cur_channels = block_dim + + self.layers = nn.Sequential(*layers) + + self.out = nn.Linear( + block_dims[-1] * (((idim - 1) // 2 - 1) // 2), odim + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Subsample x. + + Args: + x: + Its shape is (N, T, idim). 
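A quick shape check (illustrative) of the (N, T, idim) -> (N, ((T-1)//2 - 1)//2, odim) contract stated above, using the Conv2dSubsampling class defined in this file:

    import torch

    sub = Conv2dSubsampling(idim=80, odim=256)
    x = torch.randn(3, 100, 80)        # (N, T, idim)
    y = sub(x)
    assert y.shape == (3, 24, 256)     # ((100 - 1) // 2 - 1) // 2 == 24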
+ + Returns: + Return a tensor of shape (N, ((T-1)//2 - 1)//2, odim) + """ + x = x.unsqueeze(1) + x = self.layers(x) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + return x diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index b34ff4cc..6fcc965d 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -28,7 +28,7 @@ from ...torch_model import TorchModel from hyperion.utils.utils import add_sos -from .conformer import Conformer +# from .conformer import Conformer from .decoder import Decoder from .joiner import Joiner @@ -43,7 +43,7 @@ def __init__( vocab_size, blank_id, encoder_out_dim, - conformer_enc, + # conformer_enc, decoder, ): """ @@ -65,13 +65,13 @@ def __init__( super().__init__() # assert isinstance(encoder, EncoderInterface) # assert hasattr(decoder, "blank_id") - conformer_enc["output_dim"] = encoder_out_dim + # conformer_enc["output_dim"] = encoder_out_dim decoder["blank_id"] = blank_id decoder["vocab_size"] = vocab_size decoder["output_dim"] = encoder_out_dim joiner = {"input_dim":encoder_out_dim, "output_dim":vocab_size} - self.encoder = Conformer(**conformer_enc) + # self.encoder = Conformer(**conformer_enc) self.decoder = Decoder(**decoder) self.joiner = Joiner(**joiner) @@ -196,13 +196,13 @@ def valid_train_modes(): return ["full", "frozen", "ft-embed-affine"] def get_config(self): - enc_cfg = self.encoder.get_config() + # enc_cfg = self.encoder.get_config() dec_cfg = self.decoder.get_config() join_cfg = self.joiner.get_config() config = { - "encoder_out_dim" : self.encoder_out_dim, - "conformer_enc": enc_cfg, + # "encoder_out_dim" : self.encoder_out_dim, + # "conformer_enc": enc_cfg, "decoder": dec_cfg, "joiner": join_cfg, } @@ -214,7 +214,7 @@ def get_config(self): def filter_args(**kwargs): # get arguments for pooling - encoder_args = Conformer.filter_args(**kwargs["conformer_enc"]) + # encoder_args = Conformer.filter_args(**kwargs["conformer_enc"]) decoder_args = Decoder.filter_args(**kwargs["decoder"]) # joiner_args = Joiner.filter_args(**kwargs["joiner"]) @@ -223,7 +223,7 @@ def filter_args(**kwargs): ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - args["conformer_enc"] = encoder_args + # args["conformer_enc"] = encoder_args args["decoder"] = decoder_args # args["joiner"] = joiner_args return args @@ -237,9 +237,9 @@ def add_class_args(parser, prefix=None, skip=set()): - Conformer.add_class_args( - parser, prefix="conformer_enc", skip=[] - ) + # Conformer.add_class_args( + # parser, prefix="conformer_enc", skip=[] + # ) Decoder.add_class_args( parser, prefix="decoder", skip=[] diff --git a/hyperion/torch/models/transducer/transformer.py b/hyperion/torch/models/transducer/transformer.py new file mode 100644 index 00000000..38edbd62 --- /dev/null +++ b/hyperion/torch/models/transducer/transformer.py @@ -0,0 +1,418 @@ +# Copyright 2021 University of Chinese Academy of Sciences (author: Han Zhu) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from .encoder_interface import EncoderInterface +from .subsampling import Conv2dSubsampling, VggSubsampling + +from hyperion.utils.utils import make_pad_mask + + +class Transformer(EncoderInterface): + def __init__( + self, + num_features: int, + output_dim: int, + subsampling_factor: int = 4, + d_model: int = 256, + nhead: int = 4, + dim_feedforward: int = 2048, + num_encoder_layers: int = 12, + dropout: float = 0.1, + normalize_before: bool = True, + vgg_frontend: bool = False, + ) -> None: + """ + Args: + num_features: + The input dimension of the model. + output_dim: + The output dimension of the model. + subsampling_factor: + Number of output frames is num_in_frames // subsampling_factor. + Currently, subsampling_factor MUST be 4. + d_model: + Attention dimension. + nhead: + Number of heads in multi-head attention. + Must satisfy d_model // nhead == 0. + dim_feedforward: + The output dimension of the feedforward layers in encoder. + num_encoder_layers: + Number of encoder layers. + dropout: + Dropout in encoder. + normalize_before: + If True, use pre-layer norm; False to use post-layer norm. + vgg_frontend: + True to use vgg style frontend for subsampling. + """ + super().__init__() + + self.num_features = num_features + self.output_dim = output_dim + self.subsampling_factor = subsampling_factor + if subsampling_factor != 4: + raise NotImplementedError("Support only 'subsampling_factor=4'.") + + # self.encoder_embed converts the input of shape (N, T, num_features) + # to the shape (N, T//subsampling_factor, d_model). + # That is, it does two things simultaneously: + # (1) subsampling: T -> T//subsampling_factor + # (2) embedding: num_features -> d_model + if vgg_frontend: + self.encoder_embed = VggSubsampling(num_features, d_model) + else: + self.encoder_embed = Conv2dSubsampling(num_features, d_model) + + self.encoder_pos = PositionalEncoding(d_model, dropout) + + encoder_layer = TransformerEncoderLayer( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + normalize_before=normalize_before, + ) + + if normalize_before: + encoder_norm = nn.LayerNorm(d_model) + else: + encoder_norm = None + + self.encoder = nn.TransformerEncoder( + encoder_layer=encoder_layer, + num_layers=num_encoder_layers, + norm=encoder_norm, + ) + + # TODO(fangjun): remove dropout + self.encoder_output_layer = nn.Sequential( + nn.Dropout(p=dropout), nn.Linear(d_model, output_dim) + ) + + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + The input tensor. Its shape is (batch_size, seq_len, feature_dim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + `x` before padding. + Returns: + Return a tuple containing 2 tensors: + - logits, its shape is (batch_size, output_seq_len, output_dim) + - logit_lens, a tensor of shape (batch_size,) containing the number + of frames in `logits` before padding. 
+ """ + x = self.encoder_embed(x) + x = self.encoder_pos(x) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + # Caution: We assume the subsampling factor is 4! + lengths = ((x_lens - 1) // 2 - 1) // 2 + assert x.size(0) == lengths.max().item() + + mask = make_pad_mask(lengths) + x = self.encoder(x, src_key_padding_mask=mask) # (T, N, C) + + logits = self.encoder_output_layer(x) + logits = logits.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + return logits, lengths + + +class TransformerEncoderLayer(nn.Module): + """ + Modified from torch.nn.TransformerEncoderLayer. + Add support of normalize_before, + i.e., use layer_norm before the first block. + + Args: + d_model: + the number of expected features in the input (required). + nhead: + the number of heads in the multiheadattention models (required). + dim_feedforward: + the dimension of the feedforward network model (default=2048). + dropout: + the dropout value (default=0.1). + activation: + the activation function of intermediate layer, relu or + gelu (default=relu). + normalize_before: + whether to use layer_norm before the first block. + + Examples:: + >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8) + >>> src = torch.rand(10, 32, 512) + >>> out = encoder_layer(src) + """ + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: str = "relu", + normalize_before: bool = True, + ) -> None: + super(TransformerEncoderLayer, self).__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=0.0) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + self.normalize_before = normalize_before + + def __setstate__(self, state): + if "activation" not in state: + state["activation"] = nn.functional.relu + super(TransformerEncoderLayer, self).__setstate__(state) + + def forward( + self, + src: torch.Tensor, + src_mask: Optional[torch.Tensor] = None, + src_key_padding_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional) + + Shape: + src: (S, N, E). + src_mask: (S, S). + src_key_padding_mask: (N, S). 
+ S is the source sequence length, T is the target sequence length, + N is the batch size, E is the feature number + """ + residual = src + if self.normalize_before: + src = self.norm1(src) + src2 = self.self_attn( + src, + src, + src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask, + )[0] + src = residual + self.dropout1(src2) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src2) + if not self.normalize_before: + src = self.norm2(src) + return src + + +def _get_activation_fn(activation: str): + if activation == "relu": + return nn.functional.relu + elif activation == "gelu": + return nn.functional.gelu + + raise RuntimeError( + "activation should be relu/gelu, not {}".format(activation) + ) + + +class PositionalEncoding(nn.Module): + """This class implements the positional encoding + proposed in the following paper: + + - Attention Is All You Need: https://arxiv.org/pdf/1706.03762.pdf + + PE(pos, 2i) = sin(pos / (10000^(2i/d_modle)) + PE(pos, 2i+1) = cos(pos / (10000^(2i/d_modle)) + + Note:: + + 1 / (10000^(2i/d_model)) = exp(-log(10000^(2i/d_model))) + = exp(-1* 2i / d_model * log(100000)) + = exp(2i * -(log(10000) / d_model)) + """ + + def __init__(self, d_model: int, dropout: float = 0.1) -> None: + """ + Args: + d_model: + Embedding dimension. + dropout: + Dropout probability to be applied to the output of this module. + """ + super().__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = nn.Dropout(p=dropout) + # not doing: self.pe = None because of errors thrown by torchscript + self.pe = torch.zeros(1, 0, self.d_model, dtype=torch.float32) + + def extend_pe(self, x: torch.Tensor) -> None: + """Extend the time t in the positional encoding if required. + + The shape of `self.pe` is (1, T1, d_model). The shape of the input x + is (N, T, d_model). If T > T1, then we change the shape of self.pe + to (N, T, d_model). Otherwise, nothing is done. + + Args: + x: + It is a tensor of shape (N, T, C). + Returns: + Return None. + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model, dtype=torch.float32) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + # Now pe is of shape (1, T, d_model), where T is x.size(1) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Add positional encoding. + + Args: + x: + Its shape is (N, T, C) + + Returns: + Return a tensor of shape (N, T, C) + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1), :] + return self.dropout(x) + + +class Noam(object): + """ + Implements Noam optimizer. 
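A small numeric check (illustrative) of the sinusoidal encoding above, with dropout=0 so the output is deterministic. With a zero input, position t and even channel 2i hold sin(t / 10000**(2i / d_model)):

    import torch

    pe = PositionalEncoding(d_model=4, dropout=0.0)
    x = torch.zeros(1, 3, 4)    # (N, T, C)
    y = pe(x)                   # x * sqrt(d_model) + PE, so here just the PE table
    assert torch.allclose(y[0, 2, 0], torch.sin(torch.tensor(2.0)))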
+ + Proposed in + "Attention Is All You Need", https://arxiv.org/pdf/1706.03762.pdf + + Modified from + https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/optimizer.py # noqa + + Args: + params: + iterable of parameters to optimize or dicts defining parameter groups + model_size: + attention dimension of the transformer model + factor: + learning rate factor + warm_step: + warmup steps + """ + + def __init__( + self, + params, + model_size: int = 256, + factor: float = 10.0, + warm_step: int = 25000, + weight_decay=0, + ) -> None: + """Construct an Noam object.""" + self.optimizer = torch.optim.Adam( + params, lr=0, betas=(0.9, 0.98), eps=1e-9, weight_decay=weight_decay + ) + self._step = 0 + self.warmup = warm_step + self.factor = factor + self.model_size = model_size + self._rate = 0 + + @property + def param_groups(self): + """Return param_groups.""" + return self.optimizer.param_groups + + def step(self): + """Update parameters and rate.""" + self._step += 1 + rate = self.rate() + for p in self.optimizer.param_groups: + p["lr"] = rate + self._rate = rate + self.optimizer.step() + + def rate(self, step=None): + """Implement `lrate` above.""" + if step is None: + step = self._step + return ( + self.factor + * self.model_size ** (-0.5) + * min(step ** (-0.5), step * self.warmup ** (-1.5)) + ) + + def zero_grad(self): + """Reset gradient.""" + self.optimizer.zero_grad() + + def state_dict(self): + """Return state_dict.""" + return { + "_step": self._step, + "warmup": self.warmup, + "factor": self.factor, + "model_size": self.model_size, + "_rate": self._rate, + "optimizer": self.optimizer.state_dict(), + } + + def load_state_dict(self, state_dict): + """Load state_dict.""" + for key, value in state_dict.items(): + if key == "optimizer": + self.optimizer.load_state_dict(state_dict["optimizer"]) + else: + setattr(self, key, value) diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 1dceed1c..ed3fcbb3 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -273,7 +273,7 @@ def _preprocess(self, x, x_lengths=None): """Prepares input audio to be used as input to wav2vec style model.""" x_mask = seq_lengths_to_mask(x_lengths, x.size(-1), dtype=torch.long) if self.normalize_input: - x = self._normalize(x, x_lengths) + x = self._normalize(x, x_mask) if self.use_input_attention_mask: x_mask = None @@ -570,6 +570,8 @@ def add_class_args(parser, prefix=None, skip=set()): default=None, help=("file path or HuggingFace Hub path to pre-trained model"), ) + + parser.add_argument( "--normalize-input", default=True, diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 82a4f2bd..74363066 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -118,8 +118,6 @@ def train_epoch(self, data_loader): data_loader: pytorch data loader returning features and class labels. 
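The rate() method above implements lr = factor * model_size**-0.5 * min(step**-0.5, step * warm_step**-1.5): linear warmup up to a peak at step == warm_step, then inverse square-root decay. Plugging in the defaults (illustrative arithmetic):

    model_size, factor, warmup = 256, 10.0, 25000
    rate = lambda s: factor * model_size ** -0.5 * min(s ** -0.5, s * warmup ** -1.5)
    rate(100)      # ~1.6e-05, warmup region
    rate(25000)    # ~4.0e-03, the peak
    rate(100000)   # ~2.0e-03, decay region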
""" - # self.model.update_loss_margin(self.cur_epoch) - metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() diff --git a/hyperion/utils/text.py b/hyperion/utils/text.py new file mode 100644 index 00000000..5e06ad0c --- /dev/null +++ b/hyperion/utils/text.py @@ -0,0 +1,29 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from pathlib import Path + +import numpy as np +import pandas as pd + + + +def read_text(text_file: str): + # assert check_argument_types() + text_file = Path(text_file) + + data = {"id":[],"text":[]} + with Path(text_file).open("r", encoding="utf-8") as f: + for linenum, line in enumerate(f, 1): + sps = line.rstrip().split(maxsplit=1) + if len(sps) == 1: + k, v = sps[0], "" + else: + k, v = sps + # if k in data: + # raise RuntimeError(f"{k} is duplicated ({path}:{linenum})") + data["id"].append(k) + data["text"].append(v) + return pd.DataFrame(data=data, index=data["id"]) + From 5ac5496400f7dfd6939a186460524dc4aec2a33b Mon Sep 17 00:00:00 2001 From: neillu23 Date: Mon, 7 Nov 2022 10:55:26 -0500 Subject: [PATCH 044/154] gradient clipping and multi-gpu --- ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 12 +++-- .../v1/conf/wav2vec2xlsr300m_transducer.yaml | 1 - .../torch/models/transducer/transducer.py | 24 ++-------- hyperion/torch/trainers/transducer_trainer.py | 46 +++++++++++++++++++ hyperion/torch/utils/ddp.py | 6 ++- 5 files changed, 61 insertions(+), 28 deletions(-) diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index 2ee0a1aa..79eadb07 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -9,10 +9,11 @@ data: sampler: # sampler_type: 'seg_sampler' sampler_type: 'bucketing_seg_sampler' + min_batch_size: 4 batch_size: 4 iters_per_epoch: 6 data_loader: - num_workers: 1 + num_workers: 8 val: dataset: aug_cfgs: @@ -21,8 +22,10 @@ data: return_segment_info: - text sampler: + # sampler_type: 'seg_sampler' sampler_type: 'bucketing_seg_sampler' - batch_size: 32 + min_batch_size: 2 + batch_size: 2 iters_per_epoch: 6 data_loader: num_workers: 8 @@ -30,7 +33,7 @@ model: wav2vec2xlsr300m_transducer.yaml trainer: optim: opt_type: sgd - lr: 0.45 + lr: 0.003 momentum: 0.9 weight_decay: 4e-4 lrsched: @@ -38,9 +41,10 @@ trainer: decay_rate: 0.5 decay_steps: 4200 hold_steps: 1500 - min_lr: 4e-4 + min_lr: 4e-5 warmup_steps: 1500 update_lr_on_opt_step: true + grad_clip: 100 use_amp: true log_interval: 1000 epochs: 60 diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml index d736dc86..57f7272b 100644 --- a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -1,6 +1,5 @@ hf_feats: pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus - # test_param: xyz transducer: encoder_out_dim: 768 # conformer_enc: diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index 6fcc965d..4fa9fc0b 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -142,6 +142,9 @@ def forward( blank=blank_id, reduction="sum", ) + # 
print("loss",loss) + # print("logits",logits) + # print("y_padded",y_padded) return logits, loss @@ -162,32 +165,11 @@ def set_train_mode(self, mode): self._train_mode = mode - @classmethod - def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - encoder_net = TorchNALoader.load_from_cfg(cfg=cfg["encoder_cfg"]) - for k in "encoder_cfg": - del cfg[k] - - model = cls(encoder_net, **cfg) - if state_dict is not None: - model.load_state_dict(state_dict) - - return model def _train(self, train_mode: str): if train_mode in ["full", "frozen"]: super()._train(train_mode) - elif train_mode == "ft-embed-affine": - self.encoder_net.eval() - if self.proj is not None: - self.proj.eval() - - self.pool_net.eval() - self.classif_net.train() - layer_list = [l for l in range(self.embed_layer)] - self.classif_net.put_layers_in_eval_mode(layer_list) else: raise ValueError(f"invalid train_mode={train_mode}") diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 74363066..f3047c7e 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -122,7 +122,13 @@ def train_epoch(self, data_loader): batch_metrics = ODict() self.model.train() self.sp = data_loader.dataset.sp + # for batch, (data, audio_length, target) in enumerate(data_loader): + # print("batch",batch) + # print("data shape",data.shape) + for batch, (data, audio_length, target) in enumerate(data_loader): + # print("batch index", batch) + # print("batch size", data.shape) self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: @@ -159,3 +165,43 @@ def train_epoch(self, data_loader): logs = ODict(("train_" + k, v) for k, v in logs.items()) logs["lr"] = self._get_lr() return logs + + + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader return input/output pairs. + sw_update_bn: wheter or not, update batch-norm layers in SWA. 
+ """ + + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, (data, audio_length, target) in enumerate(data_loader): + data, audio_length, target = data.to(self.device), audio_length.to(self.device), target.to(self.device) + batch_size = data.shape[0] + # data, target = data.to(self.device), target.to(self.device) + # batch_size = data.shape[0] + + with self.amp_autocast(): + output, loss = self.model(data, x_lengths=audio_length, y=target) + # output = self.model(data) + # loss = self.loss(output, target) + + batch_metrics["loss"] = loss.mean().item() + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs \ No newline at end of file diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 7038cff3..038b3685 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -59,7 +59,7 @@ def ddp_init( os.environ["MASTER_PORT"] = master_port logging.info( - f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port}" + f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" ) dist.init_process_group( "nccl", @@ -67,7 +67,9 @@ def ddp_init( world_size=world_size, ) torch.tensor([0]).to(gpu_id) - return gpu_id, rank, world_size + device = torch.device('cuda', gpu_id) + return device, rank, world_size + # return gpu_id, rank, world_size def ddp_cleanup(): From 4be4d87a7b7979cd3c644763910f93e643f05e7d Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 9 Nov 2022 11:11:56 -0500 Subject: [PATCH 045/154] add resampling option to audio dataset --- .../xvectors/extract_xvectors_from_wav.sh | 30 +- hyperion/np/metrics/utils.py | 110 +++ hyperion/torch/data/audio_dataset.py | 897 +++++++++--------- 3 files changed, 594 insertions(+), 443 deletions(-) diff --git a/hyp_utils/xvectors/extract_xvectors_from_wav.sh b/hyp_utils/xvectors/extract_xvectors_from_wav.sh index ef06d94d..0b5227cc 100755 --- a/hyp_utils/xvectors/extract_xvectors_from_wav.sh +++ b/hyp_utils/xvectors/extract_xvectors_from_wav.sh @@ -137,21 +137,27 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - if [ -n "$data_out_dir" ];then - echo "$0: creating data dir $data_out_dir for augmented x-vectors" - mkdir -p $data_out_dir - awk -F "," '$1 != "key_aug" { print $1,$2}' $output_dir/aug_info.csv \ - > $data_out_dir/augm2clean - awk -v u2s=$data_dir/utt2spk 'BEGIN{ + if [ -n "$data_out_dir" ];then + echo "$0: creating data dir $data_out_dir for augmented x-vectors" + mkdir -p $data_out_dir + awk -F "," '$1 != "key_aug" { print $1,$2}' $output_dir/aug_info.csv \ + > $data_out_dir/augm2clean + + for f in utt2spk utt2lang + do + if [ -f $data_dir/utt2spk ];then + awk -v u2s=$data_dir/$f 'BEGIN{ while(getline < u2s) { spk[$1]=$2 } } -{ print $1,spk[$2]}' $data_out_dir/augm2clean > $data_out_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_out_dir/utt2spk > $data_out_dir/spk2utt - cp $output_dir/utt2num_frames $data_out_dir - else - cp $output_dir/utt2num_frames $data_dir - fi +{ print $1,spk[$2]}' $data_out_dir/augm2clean > $data_out_dir/$f + fi + done + utils/utt2spk_to_spk2utt.pl $data_out_dir/utt2spk > $data_out_dir/spk2utt + cp $output_dir/utt2num_frames $data_out_dir + else + cp 
$output_dir/utt2num_frames $data_dir + fi fi diff --git a/hyperion/np/metrics/utils.py b/hyperion/np/metrics/utils.py index 4f06bb18..c5871dfd 100644 --- a/hyperion/np/metrics/utils.py +++ b/hyperion/np/metrics/utils.py @@ -8,6 +8,7 @@ import numpy as np from ...hyp_defs import float_cpu +from ...utils.math import softmax, logsumexp def effective_prior(p_tar, c_miss, c_fa): @@ -27,6 +28,115 @@ def effective_prior(p_tar, c_miss, c_fa): return p_eff +def lre_priors(num_classes, p_tar, p_oos=0.0): + """Returns all prior distributions as needed for LRE language detection task. + + Args: + num_classes: number of target classes. + p_tar: target prior. + p_oos: prior of out-of-set hypothesis. + + Returns + Matrix of priors P with shape (num_classes, num_classes) or (num_classes, num_classes+1) if p_oos > 0, where P(i,:) are the priors for the case that class i is the target class. + """ + I = np.eye(num_classes) + ones = np.ones((num_classes, num_classes)) + priors = (1 - p_tar - p_oos) * (ones - I) / (num_classes - 1) + p_tar * I + if p_oos > 0: + priors_oos = p_oos * np.ones((num_classes, 1)) + priors = np.concatenate((priors, priors_oos), axis=-1) + + return priors + + +def loglk2llr(loglk, priors, target_idx=None): + """Converts log-likelihoods to detection log-likelihood ratios. + + Args: + loglk: log-likelihood matrix P(x_t | class_i) with shape = (num_samples, num_classes) + priors: vector of prior probabilities, positive, sum up to one. + target_idx: index of the target class, the other classes are assumed to be non-target classes, + it can be also a list of indexes to consider multiple target classes. + if None, it returns matrix with LLR w.r.t. all classes. + + Returns: + Matrix of log-likelihood ratios LLR = log P(x_t | class_i) / log P(x_t / non-class_i) with + shape (num_samples, num_target_classes), if None, num_target_classes=num_classes + + """ + + num_classes = loglk.shape[1] + assert num_classes == len(priors), "wrong prior length" + assert np.all(priors >= 0), "negative priors present" + assert np.abs(np.log(np.sum(priors))) > 0.001, "priors does not sum up to one" + assert target_idx is None or target_idx >= 0 and target_idx < num_classes + if target_idx is None: + target_idx = np.arange(num_classes) + elif isinstance(target_idx, int): + target_idx = [target_idx] + + num_target_classes = len(target_idx) + llr = np.zeros((loglk.shape[0], num_target_classes), dtype=loglk.dtype) + for i, target in enumerate(target_idx): + priors_i = np.copy(priors) + priors[target] = 0 + priors /= np.sum(priors) + priors[target] = 1 + llr = llr + np.log(priors) + non_idx = np.concatenate( + (np.arange(target_idx), np.arange(target_idx + 1, num_classes)) + ) + llr[:, i] = loglk[:, target] - logsumexp(llglk[:, non_idx], axis=-1) + + return llr + + +def loglk2posterior(loglk, priors): + """Converts log-likelihoods to posteriors + + Args: + loglk: log-likelihood matrix P(x_t | class_i) with shape = (num_samples, num_classes) + priors: vector of prior probabilities, positive, sum up to one. + + Returns: + Matrix of posteriors with shape = (num_samples, num_classes) + + """ + + num_classes = loglk.shape[1] + assert num_classes == len(priors), "wrong prior length" + assert np.all(priors >= 0), "negative priors present" + assert np.abs(np.log(np.sum(priors))) > 0.001, "priors does not sum up to one" + + log_post = loglk + np.log(priors) + return softmax(log_post, axis=-1) + + +def lre_loglk2llr(loglk, p_tar, p_oos=0): + """Converts log-likelihoods to detection log-likelihood ratios suitable for LRE. 
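To make the prior construction concrete, a small sketch of what lre_priors returns (values computed from the formula above; illustrative only, not part of the patch):

    import numpy as np

    # 3 target classes, p_tar = 0.5, no out-of-set prior:
    # 0.5 on the diagonal, (1 - 0.5) / (3 - 1) = 0.25 elsewhere.
    # Row i is the prior vector used when class i is the target.
    priors = lre_priors(num_classes=3, p_tar=0.5)
    print(priors)
    # [[0.5  0.25 0.25]
    #  [0.25 0.5  0.25]
    #  [0.25 0.25 0.5 ]]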
+
+
+def lre_loglk2llr(loglk, p_tar, p_oos=0):
+    """Converts log-likelihoods to detection log-likelihood ratios suitable for LRE.
+
+    Args:
+      loglk: log-likelihood matrix P(x_t | class_i) with shape = (num_samples, num_classes)
+      p_tar: prior prob that each language is the target language.
+      p_oos: prior prob that the test language is out-of-set.
+
+    Returns:
+      Matrix of log-likelihood ratios LLR_i = log P(x_t | class_i) - log P(x_t | non-class_i)
+      with shape (num_samples, num_tar_classes).
+
+    """
+
+    num_tar_classes = loglk.shape[-1]
+    if p_oos > 0:
+        # the last column of loglk corresponds to the out-of-set model
+        num_tar_classes -= 1
+    priors = lre_priors(num_tar_classes, p_tar, p_oos)
+    llr = np.zeros((loglk.shape[0], num_tar_classes), dtype=loglk.dtype)
+    for i in range(num_tar_classes):
+        llr[:, i] = loglk2llr(loglk, priors[i], target_idx=i)[:, 0]
+
+    return llr
+
+
 def pavx(y):
     """PAV: Pool Adjacent Violators algorithm.
     Non-parametric optimization subject to monotonicity.
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py
index 8c69c3e1..f24ca8c5 100644
--- a/hyperion/torch/data/audio_dataset.py
+++ b/hyperion/torch/data/audio_dataset.py
@@ -12,6 +12,7 @@
 import pandas as pd
 
 import torch
+import torchaudio.transforms as tat
 
 from ..torch_defs import floatstr_torch
 from ...io import RandomAccessAudioReader as AR
@@ -24,427 +25,427 @@
 from hyperion.np import augment
 
 
-class AudioDataset1(Dataset):
-    def __init__(
-        self,
-        audio_file,
-        key_file,
-        class_file=None,
-        time_durs_file=None,
-        min_chunk_length=1,
-        max_chunk_length=None,
-        aug_cfg=None,
-        return_fullseqs=False,
-        return_class=True,
-        return_clean_aug_pair=False,
-        transpose_input=False,
-        wav_scale=2 ** 15 - 1,
-        is_val=False,
-    ):
-
-        try:
-            rank = dist.get_rank()
-            world_size = dist.get_world_size()
-        except:
-            rank = 0
-            world_size = 1
-
-        self.rank = rank
-        self.world_size = world_size
-
-        if rank == 0:
-            logging.info("opening dataset %s", audio_file)
-        self.r = AR(audio_file, wav_scale=wav_scale)
-        if rank == 0:
-            logging.info("loading utt2info file %s" % key_file)
-        self.u2c = Utt2Info.load(key_file, sep=" ")
-        if rank == 0:
-            logging.info("dataset contains %d seqs" % self.num_seqs)
-
-        self.is_val = is_val
-        self._read_time_durs_file(time_durs_file)
-
-        self._prune_short_seqs(min_chunk_length)
-
-        self.short_seq_exist = self._seq_shorter_than_max_length_exists(
-            max_chunk_length
-        )
-
-        self._prepare_class_info(class_file)
-
-        if max_chunk_length is None:
-            max_chunk_length = min_chunk_length
-        self._min_chunk_length = min_chunk_length
-        self._max_chunk_length = max_chunk_length
-
-        self.return_fullseqs = return_fullseqs
-        self.return_class = return_class
-        self.return_clean_aug_pair = return_clean_aug_pair
-
-        self.transpose_input = transpose_input
-
-        self.augmenter = None
-        self.reverb_context = 0
-        if aug_cfg is not None:
-            self.augmenter = SpeechAugment.create(
-                aug_cfg, random_seed=112358 + 1000 * rank
-            )
-            self.reverb_context = self.augmenter.max_reverb_context
-
-    def _read_time_durs_file(self, file_path):
-        if self.rank == 0:
-            logging.info("reading time_durs file %s" % file_path)
-        nf_df = pd.read_csv(file_path, header=None, sep=" ")
-        nf_df.index = nf_df[0]
-        self._seq_lengths = nf_df.loc[self.u2c.key, 1].values
-
-    @property
-    def wav_scale(self):
-        return self.r.wav_scale
-
-    @property
-    def num_seqs(self):
-        return len(self.u2c)
-
-    def __len__(self):
-        return self.num_seqs
-
-    @property
-    def seq_lengths(self):
-        return self._seq_lengths
-
-    @property
-    def total_length(self):
-        return np.sum(self.seq_lengths)
-
-    @property
-    def min_chunk_length(self):
-        if self.return_fullseqs:
-            self._min_chunk_length = np.min(self.seq_lengths)
-        return self._min_chunk_length
-
- @property - def max_chunk_length(self): - if self._max_chunk_length is None: - self._max_chunk_length = np.max(self.seq_lengths) - return self._max_chunk_length - - @property - def min_seq_length(self): - return np.min(self.seq_lengths) - - @property - def max_seq_length(self): - return np.max(self.seq_lengths) - - def _prune_short_seqs(self, min_length): - if self.rank == 0: - logging.info("pruning short seqs") - keep_idx = self.seq_lengths >= min_length - self.u2c = self.u2c.filter_index(keep_idx) - self._seq_lengths = self.seq_lengths[keep_idx] - if self.rank == 0: - logging.info( - "pruned seqs with min_length < %f," - "keep %d/%d seqs" % (min_length, self.num_seqs, len(keep_idx)) - ) - - def _prepare_class_info(self, class_file): - class_weights = None - if class_file is None: - classes, class_idx = np.unique(self.u2c.info, return_inverse=True) - class2idx = {k: i for i, k in enumerate(classes)} - else: - if self.rank == 0: - logging.info("reading class-file %s" % (class_file)) - class_info = pd.read_csv(class_file, header=None, sep=" ") - class2idx = {str(k): i for i, k in enumerate(class_info[0])} - class_idx = np.array([class2idx[k] for k in self.u2c.info], dtype=int) - if class_info.shape[1] == 2: - class_weights = np.array(class_info[1]).astype( - floatstr_torch(), copy=False - ) - - self.num_classes = len(class2idx) - - class2utt_idx = {} - class2num_utt = np.zeros((self.num_classes,), dtype=int) - - for k in range(self.num_classes): - idx = (class_idx == k).nonzero()[0] - class2utt_idx[k] = idx - class2num_utt[k] = len(idx) - if class2num_utt[k] == 0: - if not self.is_val: - logging.warning("class %d doesn't have any samples" % (k)) - if class_weights is None: - class_weights = np.ones((self.num_classes,), dtype=floatstr_torch()) - class_weights[k] = 0 - - count_empty = np.sum(class2num_utt == 0) - if count_empty > 0: - logging.warning("%d classes have 0 samples" % (count_empty)) - - self.utt_idx2class = class_idx - self.class2utt_idx = class2utt_idx - self.class2num_utt = class2num_utt - if class_weights is not None: - class_weights /= np.sum(class_weights) - class_weights = torch.Tensor(class_weights) - self.class_weights = class_weights - - if self.short_seq_exist: - # if there are seq shorter than max_chunk_lenght we need some extra variables - # we will need class_weights to put to 0 classes that have all utts shorter than the batch chunk length - if self.class_weights is None: - self.class_weights = torch.ones((self.num_classes,)) - - # we need the max length of the utterances of each class - class2max_length = torch.zeros((self.num_classes,), dtype=torch.float) - for c in range(self.num_classes): - if class2num_utt[c] > 0: - class2max_length[c] = np.max( - self.seq_lengths[self.class2utt_idx[c]] - ) - - self.class2max_length = class2max_length - - def _seq_shorter_than_max_length_exists(self, max_length): - return np.any(self.seq_lengths < max_length) - - @property - def var_chunk_length(self): - return self.min_chunk_length < self.max_chunk_length - - def get_random_chunk_length(self): - - if self.var_chunk_length: - return ( - torch.rand(size=(1,)).item() - * (self.max_chunk_length - self.min_chunk_length) - + self.min_chunk_length - ) - - return self.max_chunk_length - - def __getitem__(self, index): - # logging.info('{} {} {} get item {}'.format( - # self, os.getpid(), threading.get_ident(), index)) - if self.return_fullseqs: - return self._get_fullseq(index) - else: - return self._get_random_chunk(index) - - def _get_fullseq(self, index): - key = 
self.u2c.key[index] - x, fs = self.r.read([key]) - x = x[0].astype(floatstr_torch(), copy=False) - x_clean = x - if self.augmenter is not None: - x, aug_info = self.augmenter(x) - - if self.transpose_input: - x = x[None, :] - if self.return_clean_aug_pair: - x_clean = x_clean[None, :] - - if self.return_clean_aug_pair: - r = x, x_clean - - if not self.return_class: - return r - - class_idx = self.utt_idx2class[index] - r = *r, class_idx - return r - - def _get_random_chunk(self, index): - - if len(index) == 2: - index, chunk_length = index - else: - chunk_length = self.max_chunk_length - - key = self.u2c.key[index] - - full_seq_length = self.seq_lengths[index] - assert ( - chunk_length <= full_seq_length - ), "chunk_length(%d) <= full_seq_length(%d)" % (chunk_length, full_seq_length) - - time_offset = torch.rand(size=(1,)).item() * (full_seq_length - chunk_length) - reverb_context = min(self.reverb_context, time_offset) - time_offset -= reverb_context - read_chunk_length = chunk_length + reverb_context - - # logging.info('get-random-chunk {} {} {} {} {}'.format(index, key, time_offset, chunk_length, full_seq_length )) - x, fs = self.r.read([key], time_offset=time_offset, time_durs=read_chunk_length) - - # try: - # x, fs = self.r.read([key], time_offset=time_offset, - # time_durs=read_chunk_length) - # except: - # # some files produce error in the fseek after reading the data, - # # this seems an issue from pysoundfile or soundfile lib itself - # # reading from a sligthly different starting position seems to solve the problem in most cases - # try: - # logging.info('error-1 reading at key={} totol_dur={} offset={} read_chunk_length={}, retrying...'.format( - # key, full_seq_length, time_offset, read_chunk_length)) - # time_offset = math.floor(time_offset) - # x, fs = self.r.read([key], time_offset=time_offset, - # time_durs=read_chunk_length) - # except: - # try: - # # if changing the value of time-offset doesn't solve the issue, we try to read from - # # from time-offset to the end of the file, and remove the extra frames later - # logging.info('error-2 reading at key={} totol_dur={} offset={} retrying reading until end-of-file ...'.format( - # key, full_seq_length, time_offset)) - # x, fs = self.r.read([key], time_offset=time_offset) - # x = [x[0][:int(read_chunk_length * fs[0])]] - # except: - # # try to read the full file - # logging.info('error-3 reading at key={} totol_dur={} retrying reading full file ...'.format( - # key, full_seq_length)) - # x, fs = self.r.read([key]) - # x = [x[0][:int(read_chunk_length * fs[0])]] - - x = x[0] - fs = fs[0] - - x_clean = x - logging.info("hola1") - if self.augmenter is not None: - logging.info("hola2") - chunk_length_samples = int(chunk_length * fs) - end_idx = len(x) - reverb_context_samples = end_idx - chunk_length_samples - assert reverb_context_samples >= 0, ( - "key={} time-offset={}, read-chunk={} " - "read-x-samples={}, chunk_samples={}, reverb_context_samples={}" - ).format( - key, - time_offset, - read_chunk_length, - end_idx, - chunk_length_samples, - reverb_context_samples, - ) - # end_idx = reverb_context_samples + chunk_length_samples - x, aug_info = self.augmenter(x) - x = x[reverb_context_samples:end_idx] - if self.return_clean_aug_pair: - x_clean = x_clean[reverb_context_samples:end_idx] - x_clean = x_clean.astype(floatstr_torch(), copy=False) - # x_clean = x_clean[reverb_context_samples:] - # logging.info('augmentation x-clean={}, x={}, aug_info={}'.format( - # x_clean.shape, x.shape, aug_info)) - # if len(x) != 64000: - # 
logging.info('x!=4s, {} {} {} {} {} {} {} {}'.format(len(x),reverb_context, reverb_context_samples, chunk_length, chunk_length_samples, end_idx, fs, read_chunk_length)) - - # if len(x) != 64000: - # logging.info('x!=4s-2, {} {} {} {}'.format(len(x), chunk_length, fs, read_chunk_length)) - - if self.transpose_input: - x = x[None, :] - if self.return_clean_aug_pair: - x_clean = x_clean[None, :] - - x = x.astype(floatstr_torch(), copy=False) - if self.return_clean_aug_pair: - r = x, x_clean - else: - r = (x,) - - if not self.return_class: - return r - - class_idx = self.utt_idx2class[index] - r = *r, class_idx - return r - - @staticmethod - def filter_args(**kwargs): - - ar_args = AR.filter_args(**kwargs) - valid_args = ( - "audio_file", - "key_file", - "aug_cfg", - "path_prefix", - "class_file", - "time_durs_file", - "min_chunk_length", - "max_chunk_length", - "return_fullseqs", - "part_idx", - "num_parts", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - args.update(ar_args) - return args - - @staticmethod - def add_class_args(parser, prefix=None, skip={"audio_file", "key_file"}): - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - if "audio_file" not in skip: - parser.add_argument( - "--audio-file", - required=True, - help=("audio manifest file"), - ) - - if "key_file" not in skip: - parser.add_argument( - "--key-file", - required=True, - help=("key manifest file"), - ) - - parser.add_argument( - "--class-file", - default=None, - help=("ordered list of classes keys, it can contain class weights"), - ) - - parser.add_argument( - "--time-durs-file", default=None, help=("utt to duration in secs file") - ) - - parser.add_argument( - "--aug-cfg", - default=None, - help=("augmentation configuration file."), - ) - - parser.add_argument( - "--min-chunk-length", - type=float, - default=None, - help=("minimum length of sequence chunks"), - ) - parser.add_argument( - "--max-chunk-length", - type=float, - default=None, - help=("maximum length of sequence chunks"), - ) - - parser.add_argument( - "--return-fullseqs", - default=False, - action="store_true", - help=("returns full sequences instead of chunks"), - ) - - AR.add_class_args(parser) - if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='audio dataset options') - - add_argparse_args = add_class_args +# class AudioDataset1(Dataset): +# def __init__( +# self, +# audio_file, +# key_file, +# class_file=None, +# time_durs_file=None, +# min_chunk_length=1, +# max_chunk_length=None, +# aug_cfg=None, +# return_fullseqs=False, +# return_class=True, +# return_clean_aug_pair=False, +# transpose_input=False, +# wav_scale=2 ** 15 - 1, +# is_val=False, +# ): + +# try: +# rank = dist.get_rank() +# world_size = dist.get_world_size() +# except: +# rank = 0 +# world_size = 1 + +# self.rank = rank +# self.world_size = world_size + +# if rank == 0: +# logging.info("opening dataset %s", audio_file) +# self.r = AR(audio_file, wav_scale=wav_scale) +# if rank == 0: +# logging.info("loading utt2info file %s" % key_file) +# self.u2c = Utt2Info.load(key_file, sep=" ") +# if rank == 0: +# logging.info("dataset contains %d seqs" % self.num_seqs) + +# self.is_val = is_val +# self._read_time_durs_file(time_durs_file) + +# self._prune_short_seqs(min_chunk_length) + +# self.short_seq_exist = self._seq_shorter_than_max_length_exists( +# max_chunk_length +# ) + +# self._prepare_class_info(class_file) + +# if max_chunk_length is None: +# max_chunk_length = 
min_chunk_length +# self._min_chunk_length = min_chunk_length +# self._max_chunk_length = max_chunk_length + +# self.return_fullseqs = return_fullseqs +# self.return_class = return_class +# self.return_clean_aug_pair = return_clean_aug_pair + +# self.transpose_input = transpose_input + +# self.augmenter = None +# self.reverb_context = 0 +# if aug_cfg is not None: +# self.augmenter = SpeechAugment.create( +# aug_cfg, random_seed=112358 + 1000 * rank +# ) +# self.reverb_context = self.augmenter.max_reverb_context + +# def _read_time_durs_file(self, file_path): +# if self.rank == 0: +# logging.info("reading time_durs file %s" % file_path) +# nf_df = pd.read_csv(file_path, header=None, sep=" ") +# nf_df.index = nf_df[0] +# self._seq_lengths = nf_df.loc[self.u2c.key, 1].values + +# @property +# def wav_scale(self): +# return self.r.wav_scale + +# @property +# def num_seqs(self): +# return len(self.u2c) + +# def __len__(self): +# return self.num_seqs + +# @property +# def seq_lengths(self): +# return self._seq_lengths + +# @property +# def total_length(self): +# return np.sum(self.seq_lengths) + +# @property +# def min_chunk_length(self): +# if self.return_fullseqs: +# self._min_chunk_length = np.min(self.seq_lengths) +# return self._min_chunk_length + +# @property +# def max_chunk_length(self): +# if self._max_chunk_length is None: +# self._max_chunk_length = np.max(self.seq_lengths) +# return self._max_chunk_length + +# @property +# def min_seq_length(self): +# return np.min(self.seq_lengths) + +# @property +# def max_seq_length(self): +# return np.max(self.seq_lengths) + +# def _prune_short_seqs(self, min_length): +# if self.rank == 0: +# logging.info("pruning short seqs") +# keep_idx = self.seq_lengths >= min_length +# self.u2c = self.u2c.filter_index(keep_idx) +# self._seq_lengths = self.seq_lengths[keep_idx] +# if self.rank == 0: +# logging.info( +# "pruned seqs with min_length < %f," +# "keep %d/%d seqs" % (min_length, self.num_seqs, len(keep_idx)) +# ) + +# def _prepare_class_info(self, class_file): +# class_weights = None +# if class_file is None: +# classes, class_idx = np.unique(self.u2c.info, return_inverse=True) +# class2idx = {k: i for i, k in enumerate(classes)} +# else: +# if self.rank == 0: +# logging.info("reading class-file %s" % (class_file)) +# class_info = pd.read_csv(class_file, header=None, sep=" ") +# class2idx = {str(k): i for i, k in enumerate(class_info[0])} +# class_idx = np.array([class2idx[k] for k in self.u2c.info], dtype=int) +# if class_info.shape[1] == 2: +# class_weights = np.array(class_info[1]).astype( +# floatstr_torch(), copy=False +# ) + +# self.num_classes = len(class2idx) + +# class2utt_idx = {} +# class2num_utt = np.zeros((self.num_classes,), dtype=int) + +# for k in range(self.num_classes): +# idx = (class_idx == k).nonzero()[0] +# class2utt_idx[k] = idx +# class2num_utt[k] = len(idx) +# if class2num_utt[k] == 0: +# if not self.is_val: +# logging.warning("class %d doesn't have any samples" % (k)) +# if class_weights is None: +# class_weights = np.ones((self.num_classes,), dtype=floatstr_torch()) +# class_weights[k] = 0 + +# count_empty = np.sum(class2num_utt == 0) +# if count_empty > 0: +# logging.warning("%d classes have 0 samples" % (count_empty)) + +# self.utt_idx2class = class_idx +# self.class2utt_idx = class2utt_idx +# self.class2num_utt = class2num_utt +# if class_weights is not None: +# class_weights /= np.sum(class_weights) +# class_weights = torch.Tensor(class_weights) +# self.class_weights = class_weights + +# if self.short_seq_exist: +# # 
if there are seq shorter than max_chunk_lenght we need some extra variables +# # we will need class_weights to put to 0 classes that have all utts shorter than the batch chunk length +# if self.class_weights is None: +# self.class_weights = torch.ones((self.num_classes,)) + +# # we need the max length of the utterances of each class +# class2max_length = torch.zeros((self.num_classes,), dtype=torch.float) +# for c in range(self.num_classes): +# if class2num_utt[c] > 0: +# class2max_length[c] = np.max( +# self.seq_lengths[self.class2utt_idx[c]] +# ) + +# self.class2max_length = class2max_length + +# def _seq_shorter_than_max_length_exists(self, max_length): +# return np.any(self.seq_lengths < max_length) + +# @property +# def var_chunk_length(self): +# return self.min_chunk_length < self.max_chunk_length + +# def get_random_chunk_length(self): + +# if self.var_chunk_length: +# return ( +# torch.rand(size=(1,)).item() +# * (self.max_chunk_length - self.min_chunk_length) +# + self.min_chunk_length +# ) + +# return self.max_chunk_length + +# def __getitem__(self, index): +# # logging.info('{} {} {} get item {}'.format( +# # self, os.getpid(), threading.get_ident(), index)) +# if self.return_fullseqs: +# return self._get_fullseq(index) +# else: +# return self._get_random_chunk(index) + +# def _get_fullseq(self, index): +# key = self.u2c.key[index] +# x, fs = self.r.read([key]) +# x = x[0].astype(floatstr_torch(), copy=False) +# x_clean = x +# if self.augmenter is not None: +# x, aug_info = self.augmenter(x) + +# if self.transpose_input: +# x = x[None, :] +# if self.return_clean_aug_pair: +# x_clean = x_clean[None, :] + +# if self.return_clean_aug_pair: +# r = x, x_clean + +# if not self.return_class: +# return r + +# class_idx = self.utt_idx2class[index] +# r = *r, class_idx +# return r + +# def _get_random_chunk(self, index): + +# if len(index) == 2: +# index, chunk_length = index +# else: +# chunk_length = self.max_chunk_length + +# key = self.u2c.key[index] + +# full_seq_length = self.seq_lengths[index] +# assert ( +# chunk_length <= full_seq_length +# ), "chunk_length(%d) <= full_seq_length(%d)" % (chunk_length, full_seq_length) + +# time_offset = torch.rand(size=(1,)).item() * (full_seq_length - chunk_length) +# reverb_context = min(self.reverb_context, time_offset) +# time_offset -= reverb_context +# read_chunk_length = chunk_length + reverb_context + +# # logging.info('get-random-chunk {} {} {} {} {}'.format(index, key, time_offset, chunk_length, full_seq_length )) +# x, fs = self.r.read([key], time_offset=time_offset, time_durs=read_chunk_length) + +# # try: +# # x, fs = self.r.read([key], time_offset=time_offset, +# # time_durs=read_chunk_length) +# # except: +# # # some files produce error in the fseek after reading the data, +# # # this seems an issue from pysoundfile or soundfile lib itself +# # # reading from a sligthly different starting position seems to solve the problem in most cases +# # try: +# # logging.info('error-1 reading at key={} totol_dur={} offset={} read_chunk_length={}, retrying...'.format( +# # key, full_seq_length, time_offset, read_chunk_length)) +# # time_offset = math.floor(time_offset) +# # x, fs = self.r.read([key], time_offset=time_offset, +# # time_durs=read_chunk_length) +# # except: +# # try: +# # # if changing the value of time-offset doesn't solve the issue, we try to read from +# # # from time-offset to the end of the file, and remove the extra frames later +# # logging.info('error-2 reading at key={} totol_dur={} offset={} retrying reading until 
end-of-file ...'.format( +# # key, full_seq_length, time_offset)) +# # x, fs = self.r.read([key], time_offset=time_offset) +# # x = [x[0][:int(read_chunk_length * fs[0])]] +# # except: +# # # try to read the full file +# # logging.info('error-3 reading at key={} totol_dur={} retrying reading full file ...'.format( +# # key, full_seq_length)) +# # x, fs = self.r.read([key]) +# # x = [x[0][:int(read_chunk_length * fs[0])]] + +# x = x[0] +# fs = fs[0] + +# x_clean = x +# logging.info("hola1") +# if self.augmenter is not None: +# logging.info("hola2") +# chunk_length_samples = int(chunk_length * fs) +# end_idx = len(x) +# reverb_context_samples = end_idx - chunk_length_samples +# assert reverb_context_samples >= 0, ( +# "key={} time-offset={}, read-chunk={} " +# "read-x-samples={}, chunk_samples={}, reverb_context_samples={}" +# ).format( +# key, +# time_offset, +# read_chunk_length, +# end_idx, +# chunk_length_samples, +# reverb_context_samples, +# ) +# # end_idx = reverb_context_samples + chunk_length_samples +# x, aug_info = self.augmenter(x) +# x = x[reverb_context_samples:end_idx] +# if self.return_clean_aug_pair: +# x_clean = x_clean[reverb_context_samples:end_idx] +# x_clean = x_clean.astype(floatstr_torch(), copy=False) +# # x_clean = x_clean[reverb_context_samples:] +# # logging.info('augmentation x-clean={}, x={}, aug_info={}'.format( +# # x_clean.shape, x.shape, aug_info)) +# # if len(x) != 64000: +# # logging.info('x!=4s, {} {} {} {} {} {} {} {}'.format(len(x),reverb_context, reverb_context_samples, chunk_length, chunk_length_samples, end_idx, fs, read_chunk_length)) + +# # if len(x) != 64000: +# # logging.info('x!=4s-2, {} {} {} {}'.format(len(x), chunk_length, fs, read_chunk_length)) + +# if self.transpose_input: +# x = x[None, :] +# if self.return_clean_aug_pair: +# x_clean = x_clean[None, :] + +# x = x.astype(floatstr_torch(), copy=False) +# if self.return_clean_aug_pair: +# r = x, x_clean +# else: +# r = (x,) + +# if not self.return_class: +# return r + +# class_idx = self.utt_idx2class[index] +# r = *r, class_idx +# return r + +# @staticmethod +# def filter_args(**kwargs): + +# ar_args = AR.filter_args(**kwargs) +# valid_args = ( +# "audio_file", +# "key_file", +# "aug_cfg", +# "path_prefix", +# "class_file", +# "time_durs_file", +# "min_chunk_length", +# "max_chunk_length", +# "return_fullseqs", +# "part_idx", +# "num_parts", +# ) +# args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) +# args.update(ar_args) +# return args + +# @staticmethod +# def add_class_args(parser, prefix=None, skip={"audio_file", "key_file"}): +# if prefix is not None: +# outer_parser = parser +# parser = ArgumentParser(prog="") + +# if "audio_file" not in skip: +# parser.add_argument( +# "--audio-file", +# required=True, +# help=("audio manifest file"), +# ) + +# if "key_file" not in skip: +# parser.add_argument( +# "--key-file", +# required=True, +# help=("key manifest file"), +# ) + +# parser.add_argument( +# "--class-file", +# default=None, +# help=("ordered list of classes keys, it can contain class weights"), +# ) + +# parser.add_argument( +# "--time-durs-file", default=None, help=("utt to duration in secs file") +# ) + +# parser.add_argument( +# "--aug-cfg", +# default=None, +# help=("augmentation configuration file."), +# ) + +# parser.add_argument( +# "--min-chunk-length", +# type=float, +# default=None, +# help=("minimum length of sequence chunks"), +# ) +# parser.add_argument( +# "--max-chunk-length", +# type=float, +# default=None, +# help=("maximum length of sequence chunks"), 
+#         )
+
+#         parser.add_argument(
+#             "--return-fullseqs",
+#             default=False,
+#             action="store_true",
+#             help=("returns full sequences instead of chunks"),
+#         )
+
+#         AR.add_class_args(parser)
+#         if prefix is not None:
+#             outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+#             # help='audio dataset options')
+
+#     add_argparse_args = add_class_args
 
 
 from ...utils.class_info import ClassInfo
@@ -463,6 +464,7 @@ def __init__(
         num_augs=1,
         return_segment_info=None,
         return_orig=False,
+        target_sample_freq=None,
         wav_scale=2 ** 15 - 1,
         is_val=False,
     ):
@@ -514,6 +516,9 @@ def __init__(
         self.num_augs = num_augs
         self._create_augmenters(aug_cfgs)
 
+        self.target_sample_freq = target_sample_freq
+        self.resamplers = {}
+
     def _load_class_infos(self, class_names, class_files, is_val):
         self.class_info = {}
         if class_names is None:
@@ -646,10 +651,37 @@ def _get_segment_info(self, seg_id):
 
         return r
 
+    def _get_resampler(self, fs):
+        if fs in self.resamplers:
+            return self.resamplers[fs]
+
+        resampler = tat.Resample(
+            int(fs),
+            int(self.target_sample_freq),
+            lowpass_filter_width=64,
+            rolloff=0.9475937167399596,
+            resampling_method="kaiser_window",
+            beta=14.769656459379492,
+        )
+        self.resamplers[fs] = resampler
+        return resampler
+
+    def _resample(self, x, fs):
+        try:
+            if self.target_sample_freq is None or fs == self.target_sample_freq:
+                return x, fs
+
+            resampler = self._get_resampler(fs)
+            return resampler(x), self.target_sample_freq
+        except:
+            return x, fs
+
     def __getitem__(self, segment):
         seg_id, start, duration = self._parse_segment_item(segment)
         x, fs = self._read_audio(seg_id, start, duration)
+        x, fs = self._resample(x, fs)
+
         if self.augmenters:
             # augmentations
             num_samples = int(duration * fs)
@@ -685,6 +717,7 @@ def filter_args(**kwargs):
             "return_segment_info",
             "return_orig",
             "time_durs_file",
+            "target_sample_freq",
         )
         args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
         args.update(ar_args)
@@ -698,16 +731,12 @@ def add_class_args(parser, prefix=None, skip={}):
 
         if "audio_file" not in skip:
             parser.add_argument(
-                "--audio-file",
-                required=True,
-                help=("audio manifest file"),
+                "--audio-file", required=True, help=("audio manifest file"),
             )
 
         if "segments_file" not in skip:
             parser.add_argument(
-                "--segments-file",
-                required=True,
-                help=("segments manifest file"),
+                "--segments-file", required=True, help=("segments manifest file"),
            )
 
         parser.add_argument(
@@ -720,10 +749,7 @@
         )
 
         parser.add_argument(
-            "--class-files",
-            default=None,
-            nargs="+",
-            help=("list of class info files"),
+            "--class-files", default=None, nargs="+", help=("list of class info files"),
         )
 
         parser.add_argument(
@@ -763,6 +789,15 @@
             ),
         )
 
+        parser.add_argument(
+            "--target-sample-freq",
+            default=None,
+            type=int,
+            help=(
+                "target sampling frequency, if not None all audios are converted to this sample freq"
+            ),
+        )
+
         AR.add_class_args(parser)
         if prefix is not None:
             outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
From 83b7d9c5ef539dea177049e700faf4d8c14b41d2 Mon Sep 17 00:00:00 2001
From: neillu23
Date: Wed, 9 Nov 2022 21:19:19 -0500
Subject: [PATCH 046/154] fix issue in multi-gpus training and update model
 parameters

---
 ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml |  2 +
 .../v1/conf/wav2vec2xlsr300m_transducer.yaml  |  5 +-
 hyperion/bin/train_wav2vec2transducer.py      |  2 +-
 hyperion/torch/data/bucketing_seg_sampler.py  |  6 +-
 hyperion/torch/models/transducer/decoder.py   | 46
+++++++++++--- hyperion/torch/models/transducer/joiner.py | 60 ++++++++++++++++--- .../torch/models/transducer/transducer.py | 36 +++-------- .../wav2transducer/hf_wav2vec2_transducer.py | 5 +- hyperion/torch/trainers/transducer_trainer.py | 5 -- 9 files changed, 109 insertions(+), 58 deletions(-) diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index 79eadb07..6ac61b76 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -12,6 +12,7 @@ data: min_batch_size: 4 batch_size: 4 iters_per_epoch: 6 + drop_last: true data_loader: num_workers: 8 val: @@ -27,6 +28,7 @@ data: min_batch_size: 2 batch_size: 2 iters_per_epoch: 6 + drop_last: true data_loader: num_workers: 8 model: wav2vec2xlsr300m_transducer.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml index 57f7272b..3d9d768a 100644 --- a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -1,7 +1,6 @@ hf_feats: pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: - encoder_out_dim: 768 # conformer_enc: # num_features: 80 # subsampling_factor: 4 @@ -11,10 +10,10 @@ transducer: # num_encoder_layers: 12 # vgg_frontend: False decoder: - # vocab_size: 1000 - # blank_id: 0 embedding_dim: 1024 num_layers: 2 hidden_dim: 512 + joiner: + num_layers: 1 feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index bfc6248a..3e4ccb84 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -41,7 +41,7 @@ def transducer_collate(batch): audio = [] audio_length = [] target = [] - for i, record in enumerate(batch): + for record in batch: wav = torch.as_tensor(record[0]) audio.append(wav) audio_length.append(wav.shape[0]) diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 8dbc4e45..83e6425c 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -12,7 +12,6 @@ from .hyp_sampler import HypSampler from .seg_sampler import SegSampler import torch.distributed as dist -from torch.nn.utils.rnn import pad_sequence class BucketingSegSampler(HypSampler): @@ -44,10 +43,7 @@ def create_buckets(self): buckets = [] for i in range(self.num_buckets): bucket_bool = (cum_lengths <= bucket_length) & (cum_lengths > 0) - bucket_idx = [] - for i, bo in enumerate(bucket_bool): - if bo: - bucket_idx.append(i) + bucket_idx = np.arange(len(bucket_bool))[bucket_bool] bucket_i = sorted_seg_set.iloc[bucket_idx] buckets.append(bucket_i) cum_lengths -= bucket_length diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py index 0b47e80c..56caaef6 100644 --- a/hyperion/torch/models/transducer/decoder.py +++ b/hyperion/torch/models/transducer/decoder.py @@ -14,11 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo from typing import Optional, Tuple import torch import torch.nn as nn -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo # TODO(fangjun): Support switching between LSTM and GRU @@ -30,7 +30,7 @@ def __init__( blank_id: int, num_layers: int, hidden_dim: int, - output_dim: int, + in_feats: int, embedding_dropout: float = 0.0, rnn_dropout: float = 0.0, ): @@ -68,8 +68,14 @@ def __init__( batch_first=True, dropout=rnn_dropout, ) + + self.in_feats = in_feats self.blank_id = blank_id - self.output_linear = nn.Linear(hidden_dim, output_dim) + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.num_layers = num_layers + self.hidden_dim = hidden_dim + self.output_linear = nn.Linear(hidden_dim, in_feats) def forward( self, @@ -97,10 +103,26 @@ def forward( return out, (h, c) + def get_config(self): + config = { + "in_feats" : self.in_feats, + "blank_id" : self.blank_id, + "vocab_size" : self.vocab_size, + "embedding_dim" :self.embedding_dim, + "num_layers" : self.num_layers, + "hidden_dim" : self.hidden_dim, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + @staticmethod def filter_args(**kwargs): valid_args = ( + "in_feats", + "blank_id", + "vocab_size", "embedding_dim", "num_layers", "hidden_dim", @@ -110,12 +132,24 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None, skip=set()): + def add_class_args(parser, prefix=None, skip=set(["in_feats", "blank_id", "vocab_size" ])): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - + + if "in_feats" not in skip: + parser.add_argument( + "--in-feats", type=int, required=True, help=("input feature dimension") + ) + if "blank_id" not in skip: + parser.add_argument( + "--blank-id", type=int, required=True, help=("blank id from sp model") + ) + if "vocab_size" not in skip: + parser.add_argument( + "--vocab-size", type=int, required=True, help=("output prediction dimension") + ) parser.add_argument( "--embedding-dim", default=1024, type=int, help=("feature dimension") ) @@ -128,8 +162,6 @@ def add_class_args(parser, prefix=None, skip=set()): "--hidden-dim", default=512, type=int, help=("") ) - - if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py index 72376b3c..e05e0f50 100644 --- a/hyperion/torch/models/transducer/joiner.py +++ b/hyperion/torch/models/transducer/joiner.py @@ -14,15 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import torch import torch.nn as nn class Joiner(nn.Module): - def __init__(self, input_dim: int, output_dim: int): + def __init__(self, in_feats: int, out_dims: int, num_layers: int): super().__init__() + self.in_feats = in_feats + self.out_dims = out_dims + self.num_layers = num_layers - self.output_linear = nn.Linear(input_dim, output_dim) + self.output_linear = nn.Linear(in_feats, out_dims) def forward( self, encoder_out: torch.Tensor, decoder_out: torch.Tensor @@ -53,14 +57,52 @@ def forward( return output - # @staticmethod - # def filter_args(**kwargs): - # valid_args = ( - # "encoder_out_dim", - # ) - # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - # return args + def get_config(self): + config = { + "in_feats" : self.in_feats, + "out_dims": self.out_dims, + "num_layers": self.num_layers, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "in_feats", + "out_dims", + "num_layers", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + + @staticmethod + def add_class_args(parser, prefix=None, skip=set(["in_feats", "out_dims"])): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument( + "--in-feats", type=int, required=True, help=("input feature dimension") + ) + + if "out_dims" not in skip: + parser.add_argument( + "--out-dims", type=int, required=True, help=("output feature dimension (vocab size)") + ) + parser.add_argument( + "--num-layers", default=1, type=int, help=("layers of the joiner") + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # @staticmethod # def add_class_args(parser, prefix=None, skip=set()): diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index 4fa9fc0b..5daace99 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -42,9 +42,9 @@ def __init__( self, vocab_size, blank_id, - encoder_out_dim, # conformer_enc, decoder, + joiner, ): """ Args: @@ -65,13 +65,11 @@ def __init__( super().__init__() # assert isinstance(encoder, EncoderInterface) # assert hasattr(decoder, "blank_id") - # conformer_enc["output_dim"] = encoder_out_dim + decoder["blank_id"] = blank_id decoder["vocab_size"] = vocab_size - decoder["output_dim"] = encoder_out_dim - joiner = {"input_dim":encoder_out_dim, "output_dim":vocab_size} + joiner["out_dims"] = vocab_size - # self.encoder = Conformer(**conformer_enc) self.decoder = Decoder(**decoder) self.joiner = Joiner(**joiner) @@ -142,9 +140,6 @@ def forward( blank=blank_id, reduction="sum", ) - # print("loss",loss) - # print("logits",logits) - # print("y_padded",y_padded) return logits, loss @@ -178,13 +173,10 @@ def valid_train_modes(): return ["full", "frozen", "ft-embed-affine"] def get_config(self): - # enc_cfg = self.encoder.get_config() dec_cfg = self.decoder.get_config() join_cfg = self.joiner.get_config() config = { - # "encoder_out_dim" : self.encoder_out_dim, - # "conformer_enc": enc_cfg, "decoder": dec_cfg, "joiner": join_cfg, } @@ -196,18 +188,14 @@ def get_config(self): def filter_args(**kwargs): # get arguments for pooling - # encoder_args = Conformer.filter_args(**kwargs["conformer_enc"]) decoder_args = 
Decoder.filter_args(**kwargs["decoder"]) - # joiner_args = Joiner.filter_args(**kwargs["joiner"]) + joiner_args = Joiner.filter_args(**kwargs["joiner"]) - valid_args = ( - "encoder_out_dim", - ) + valid_args = () args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - # args["conformer_enc"] = encoder_args args["decoder"] = decoder_args - # args["joiner"] = joiner_args + args["joiner"] = joiner_args return args @staticmethod @@ -217,18 +205,12 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - - - # Conformer.add_class_args( - # parser, prefix="conformer_enc", skip=[] - # ) - Decoder.add_class_args( - parser, prefix="decoder", skip=[] + parser, prefix="decoder" ) - parser.add_argument( - "--encoder-out-dim", default=512, type=int, help=("") + Joiner.add_class_args( + parser, prefix="joiner" ) if prefix is not None: diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py index 79c4ca86..242a5ca1 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py @@ -45,12 +45,15 @@ def __init__( assert isinstance(hf_feats, HFWav2Vec2) if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + transducer["joiner"]["in_feats"] = hf_feats.hidden_size if "class_name" in transducer: del transducer["class_name"] transducer = Transducer(**transducer) else: assert isinstance(transducer, Transducer) - # assert transducer.encoder_net.in_feats == hf_feats.hidden_size + assert transducer.decoder.in_feats == hf_feats.hidden_size + assert transducer.joiner.in_feats == hf_feats.hidden_size super().__init__(hf_feats, transducer, feat_fusion_start, feat_fusion_method) diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index f3047c7e..bbe847d0 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -122,13 +122,8 @@ def train_epoch(self, data_loader): batch_metrics = ODict() self.model.train() self.sp = data_loader.dataset.sp - # for batch, (data, audio_length, target) in enumerate(data_loader): - # print("batch",batch) - # print("data shape",data.shape) for batch, (data, audio_length, target) in enumerate(data_loader): - # print("batch index", batch) - # print("batch size", data.shape) self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: From ff172b1f90bd953e4c6d82c74bf35224b77bf9e0 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Sat, 12 Nov 2022 19:27:06 -0500 Subject: [PATCH 047/154] update save and load for transducer model --- ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 8 +-- .../v1/global_conf/config_transducer_v1.sh | 2 + egs/librispeech/v1/run_011_train_asr.sh | 13 +---- hyperion/torch/models/transducer/decoder.py | 4 +- hyperion/torch/models/transducer/joiner.py | 4 +- .../torch/models/transducer/transducer.py | 58 ++++++++++++++++++- .../wav2transducer/hf_wav2transducer.py | 2 +- 7 files changed, 69 insertions(+), 22 deletions(-) diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index 6ac61b76..50750cd8 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -7,8 +7,8 @@ data: 
return_segment_info: - text sampler: - # sampler_type: 'seg_sampler' - sampler_type: 'bucketing_seg_sampler' + sampler_type: 'seg_sampler' + # sampler_type: 'bucketing_seg_sampler' min_batch_size: 4 batch_size: 4 iters_per_epoch: 6 @@ -23,8 +23,8 @@ data: return_segment_info: - text sampler: - # sampler_type: 'seg_sampler' - sampler_type: 'bucketing_seg_sampler' + sampler_type: 'seg_sampler' + # sampler_type: 'bucketing_seg_sampler' min_batch_size: 2 batch_size: 2 iters_per_epoch: 6 diff --git a/egs/librispeech/v1/global_conf/config_transducer_v1.sh b/egs/librispeech/v1/global_conf/config_transducer_v1.sh index 00b34870..9f28f551 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v1.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v1.sh @@ -8,6 +8,8 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small bpe_model=data/lang_bpe_1000/bpe.model # x-vector cfg diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh index 61f00be4..7c2c0f70 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -8,7 +8,7 @@ set -e stage=1 -ngpu=1 +ngpu=2 config_file=default_config.sh interactive=false num_workers="" @@ -20,7 +20,7 @@ use_wandb=false . datapath.sh train_dir=data/${nnet_data}/ -val_dir=data/dev_clean/ +val_dir=data/${dev_data}/ #add extra args from the command line arguments if [ -n "$num_workers" ];then @@ -61,15 +61,6 @@ if [ $stage -le 1 ]; then --data.val.dataset.time-durs-file $val_dir/utt2dur \ --num-gpus $ngpu -# --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ -# --data.train.dataset.audio-file $list_dir/wav.scp \ -# --data.train.dataset.time-durs-file $list_dir/utt2dur \ -# --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ -# --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ -# --data.val.dataset.audio-file $list_dir/wav.scp \ -# --data.val.dataset.time-durs-file $list_dir/utt2dur \ -# --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ -# --trainer.exp-path $nnet_dir $args \ fi if [ $stage -le 2 ]; then diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py index 56caaef6..bd7bd202 100644 --- a/hyperion/torch/models/transducer/decoder.py +++ b/hyperion/torch/models/transducer/decoder.py @@ -113,8 +113,8 @@ def get_config(self): "hidden_dim" : self.hidden_dim, } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + # base_config = super().get_config() + return dict(list(config.items())) @staticmethod diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py index e05e0f50..0fc1fe51 100644 --- a/hyperion/torch/models/transducer/joiner.py +++ b/hyperion/torch/models/transducer/joiner.py @@ -65,8 +65,8 @@ def get_config(self): "num_layers": self.num_layers, } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + # base_config = super().get_config() + return dict(list(config.items())) @staticmethod diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index 5daace99..80bf9891 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -70,6 +70,8 @@ def __init__( decoder["vocab_size"] = vocab_size joiner["out_dims"] = vocab_size + self.vocab_size = vocab_size + 
self.blank_id = blank_id self.decoder = Decoder(**decoder) self.joiner = Joiner(**joiner) @@ -177,12 +179,14 @@ def get_config(self): join_cfg = self.joiner.get_config() config = { + "blank_id" : self.blank_id, + "vocab_size" : self.vocab_size, "decoder": dec_cfg, "joiner": join_cfg, } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) + # base_config = super().get_config() + return dict(list(config.items())) @staticmethod def filter_args(**kwargs): @@ -215,3 +219,53 @@ def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + + # def change_config( + # self, + # override_dropouts=False, + # dropout_rate=0, + # num_classes=None, + # loss_type="arc-softmax", + # cos_scale=64, + # margin=0.3, + # margin_warmup_epochs=10, + # intertop_k=5, + # intertop_margin=0.0, + # num_subcenters=2, + # ): + # logging.info("changing x-vector config") + # self.rebuild_output_layer( + # num_classes=num_classes, + # loss_type=loss_type, + # cos_scale=cos_scale, + # margin=margin, + # margin_warmup_epochs=margin_warmup_epochs, + # intertop_k=intertop_k, + # intertop_margin=intertop_margin, + # num_subcenters=num_subcenters, + # ) + + # if override_dropouts: + # logging.info("overriding x-vector dropouts") + # self.encoder_net.change_dropouts(dropout_rate) + # self.classif_net.change_dropouts(dropout_rate) + + # @staticmethod + # def filter_finetune_args(**kwargs): + # valid_args = ( + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args + + # @staticmethod + # def add_finetune_args(parser, prefix=None): + # if prefix is not None: + # outer_parser = parser + # parser = ArgumentParser(prog="") + + # if prefix is not None: + # outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + # add_argparse_args = add_class_args + # add_argparse_finetune_args = add_finetune_args diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py index 7956c9ba..b5bd220f 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py @@ -338,7 +338,7 @@ def get_config(self): hf_cfg = self.hf_feats.get_config() tran_cfg = self.transducer.get_config() del hf_cfg["class_name"] - del tran_cfg["class_name"] + # del tran_cfg["class_name"] config = { "hf_feats": hf_cfg, "transducer": tran_cfg, From d1a2419c5bb94b182f3ea9649aa8ea2d0bb0a792 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 16 Nov 2022 13:37:18 -0500 Subject: [PATCH 048/154] fix nan in plot tsne --- hyperion/bin/plot_embedding_tsne_per_class.py | 1 + hyperion/np/classifiers/__init__.py | 1 + hyperion/np/classifiers/svmc.py | 356 ++++++++++++++++++ 3 files changed, 358 insertions(+) create mode 100644 hyperion/np/classifiers/svmc.py diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 5730cc06..5e832bff 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -97,6 +97,7 @@ def plot_embedding_tsne( # in the low dim space, we cannot use cosine scoring x2 = np.sum(x_tsne ** 2, axis=1)[:, None] d2 = x2 - 2 * np.dot(x_tsne, x_tsne.T) + x2.T + d2 = np.clip(d2, a_min=0, a_max=None) scores = -np.sqrt(d2) else: scores = cosine_scoring(x_c, x_c) diff --git a/hyperion/np/classifiers/__init__.py 
b/hyperion/np/classifiers/__init__.py
index 07da0af8..92a9305d 100644
--- a/hyperion/np/classifiers/__init__.py
+++ b/hyperion/np/classifiers/__init__.py
@@ -10,3 +10,4 @@
 from .greedy_fusion import GreedyFusionBinaryLR
 from .linear_svmc import LinearSVMC
 from .q_scoring_homo_gbe import QScoringHomoGBE
+from .svmc import GaussianSVMC
diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py
new file mode 100644
index 00000000..764c6101
--- /dev/null
+++ b/hyperion/np/classifiers/svmc.py
@@ -0,0 +1,356 @@
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import os
+import logging
+import pickle
+import numpy as np
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+
+from sklearn.svm import SVC
+
+from ...hyp_defs import float_cpu
+from ..np_model import NPModel
+from ...utils.math import softmax
+
+
+class GaussianSVMC(NPModel):
+    """Gaussian Support Vector Machine for Classification."""
+
+    def __init__(
+        self,
+        C=1.0,
+        gamma="scale",
+        shrinking=True,
+        probability=True,
+        tol=0.0001,
+        cache_size=600,
+        multi_class="ovr",
+        break_ties=True,
+        class_weight=None,
+        random_state=None,
+        max_iter=100,
+        model=None,
+        verbose=0,
+        balance_class_weight=True,
+        lr_seed=1024,
+        labels=None,
+        **kwargs,
+    ):
+
+        super().__init__(**kwargs)
+
+        if class_weight is None and balance_class_weight:
+            class_weight = "balanced"
+
+        if random_state is None:
+            random_state = np.random.RandomState(seed=lr_seed)
+
+        self.balance_class_weight = balance_class_weight
+        if model is None:
+            self.svm = SVC(
+                C=C,
+                kernel="rbf",
+                gamma=gamma,
+                shrinking=shrinking,
+                probability=probability,
+                tol=tol,
+                cache_size=cache_size,
+                class_weight=class_weight,
+                verbose=verbose,
+                max_iter=max_iter,
+                decision_function_shape=multi_class,
+                break_ties=break_ties,
+                random_state=random_state,
+            )
+        else:
+            self.svm = model
+        self.set_labels(labels)
+
+    @property
+    def model_params(self):
+        return self.svm.get_params()
+
+    def set_labels(self, labels):
+        if isinstance(labels, np.ndarray):
+            labels = list(labels)
+        self.labels = labels
+
+    def get_config(self):
+        """Gets configuration hyperparams.
+        Returns:
+          Dictionary with config hyperparams.
+        """
+        config = {
+            "balance_class_weight": self.balance_class_weight,
+            "labels": self.labels,
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def predict(self, x, eval_type="cat-post"):
+        """Evaluates the SVM
+
+        Args:
+          x: input features (num_samples, feat_dim),
+             it can be (num_samples,) if feat_dim=1.
+          eval_type: evaluation method: logit (log-likelihood ratio),
+                     bin-log-post (binary log-posteriors),
+                     bin-post (binary posteriors)
+                     cat-log-post (categorical log-posteriors),
+                     cat-post (categorical posteriors)
+        Returns:
+          Output scores (num_samples, num_classes)
+        """
+        if eval_type == "cat-post":
+            return self.svm.predict_proba(x)
+        if eval_type == "cat-log-post":
+            return self.svm.predict_log_proba(x)
+
+        return self.svm.predict_proba(x)
+
+    def __call__(self, x, eval_type="logit"):
+        """Evaluates the SVM
+
+        Args:
+          x: input features (num_samples, feat_dim),
+             it can be (num_samples,) if feat_dim=1.
+          eval_type: evaluation method: logit (log-likelihood ratio),
+                     bin-log-post (binary log-posteriors),
+                     bin-post (binary posteriors)
+                     cat-log-post (categorical log-posteriors),
+                     cat-post (categorical posteriors)
+        Returns:
+          Output scores (num_samples, num_classes)
+        """
+        return self.predict(x, eval_type)
+
+    def fit(self, x, class_ids, sample_weight=None):
+        """Estimates the parameters of the model.
+
+        Args:
+          x: input features (num_samples, feat_dim), it can be (num_samples,) if feat_dim=1.
+          class_ids: class integer [0, num_classes-1] identifier (num_samples,)
+          sample_weight: weight of each sample in the estimation (num_samples,)
+        """
+        print("--------------", type(x[3, 2]), type(class_ids[20]), "--------------")
+        self.svm.fit(x, class_ids)
+        if self.svm.fit_status_:
+            print("SVM did not converge")
+
+    def save(self, file_path):
+        """Saves the model to file.
+
+        Args:
+          file_path: filename path.
+        """
+        file_dir = os.path.dirname(file_path)
+        if not (os.path.isdir(file_dir)):
+            os.makedirs(file_dir, exist_ok=True)
+        split_path = os.path.splitext(file_path)
+        if not split_path[-1] == ".sav":
+            file_path = split_path[0] + ".sav"
+        with open(file_path, "wb") as f:
+            # with h5py.File(file_path, "w") as f:
+            # config = self.to_json()
+            # f.create_dataset("config", data=np.array(config, dtype="S"))
+            self.save_params(f)
+
+    @classmethod
+    def load(cls, file_path):
+        """Loads the model from file.
+
+        Args:
+          file_path: path to the file where the model is stored.
+
+        Returns:
+          Model object.
+        """
+        split_path = os.path.splitext(file_path)
+        if not split_path[-1] == ".sav":
+            file_path = split_path[0] + ".sav"
+
+        # with h5py.File(file_path, "r") as f:
+        with open(file_path, "rb") as f:
+            # json_str = str(np.asarray(f["config"]).astype("U"))
+            # config = cls.load_config_from_json(json_str)
+            config = None
+            return cls.load_params(f, config)
+
+    def save_params(self, f):
+        # params = {"A": self.A, "b": self.b}
+        # self._save_params_from_dict(f, params)
+        pickle.dump(self, f)
+
+    @classmethod
+    def load_params(cls, f, config):
+        # param_list = ["A", "b"]
+        # params = cls._load_params_to_dict(f, config["name"], param_list)
+        # kwargs = dict(list(config.items()) + list(params.items()))
+        # return cls(**kwargs)
+        svmc = pickle.load(f)
+        return svmc
+
+    @staticmethod
+    def filter_class_args(**kwargs):
+        """Extracts the hyperparams of the class from a dictionary.
+
+        Returns:
+          Hyperparameter dictionary to initialize the class.
+        """
+        valid_args = (
+            "C",
+            "gamma",
+            "shrinking",
+            "probability",
+            "tol",
+            "cache_size",
+            "multi_class",
+            "break_ties",
+            "class_weight",
+            "random_state",
+            "max_iter",
+            "verbose",
+            "balance_class_weight",
+            "lr_seed",
+            "model",
+            "labels",
+        )
+        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+
+    filter_train_args = filter_class_args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        """It adds the arguments corresponding to the class to jsonargparse.
+        Args:
+          parser: jsonargparse object
+          prefix: argument prefix.
+        """
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--c",
+            dest="C",
+            default=1.0,
+            type=float,
+            help="inverse of regularization strength",
+        )
+        # parser.add_argument(
+        #     "--class_weight",
+        #     default=None,
+        #     help="Class weights",
+        # )
+        parser.add_argument(
+            "--gamma",
+            default="scale",
+            choices=["scale", "auto"],
+            help="Kernel coefficient for ‘rbf’",
+        )
+        parser.add_argument(
+            "--shrinking",
+            default=True,
+            type=bool,
+            help="Whether to use the shrinking heuristic",
+        )
+        parser.add_argument(
+            "--probability",
+            default=True,
+            type=bool,
+            help="Whether to enable probability estimates",
+        )
+        parser.add_argument(
+            "--break_ties",
+            default=True,
+            type=bool,
+            help="If true, predict will break ties according to the confidence values of decision_function; otherwise \
+                the first class among the tied classes is returned",
+        )
+        parser.add_argument(
+            "--lr-seed", default=1024, type=int, help="random number generator seed"
+        )
+        parser.add_argument(
+            "--max-iter",
+            dest="max_iter",
+            default=100,
+            type=int,
+            help="hard limit on the number of solver iterations (-1 for no limit)",
+        )
+        parser.add_argument(
+            "--tol", default=1e-4, type=float, help="tolerance for stopping criteria"
+        )
+        parser.add_argument(
+            "--multi-class",
+            default="ovr",
+            choices=["ovr", "ovo"],
+            help=(
+                "ovr fits a binary problem for each class else "
+                "it minimizes the multinomial loss."
+            ),
+        )
+        parser.add_argument(
+            "--cache_size",
+            default=600,
+            type=int,
+            help="Specify the size of the kernel cache (in MB)",
+        )
+        parser.add_argument(
+            "--verbose",
+            default=0,
+            type=int,
+            help="verbosity level passed to libsvm",
+        )
+        parser.add_argument(
+            "--balance-class-weight",
+            default=False,
+            action=ActionYesNo,
+            help="Balances the weight of each class when computing W",
+        )
+        parser.add_argument("--name", default="svc", help="model name")
+        if prefix is not None:
+            outer_parser.add_argument(
+                "--" + prefix, action=ActionParser(parser=parser),
+            )
+
+    @staticmethod
+    def filter_eval_args(**kwargs):
+        """Extracts the evaluation time hyperparams of the class from a dictionary.
+
+        Returns:
+          Hyperparameters to evaluate the class.
+        """
+        valid_args = ("eval_type",)
+        return dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+
+    @staticmethod
+    def add_eval_args(parser, prefix=None):
+        """It adds the arguments needed to evaluate the class to jsonargparse.
+        Args:
+          parser: jsonargparse object
+          prefix: argument prefix.
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--eval-type", + default="cat-post", + choices=["cat-logpost", "cat-post"], + help=("type of evaluation"), + ) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + # for backward compatibility + filter_train_args = filter_class_args + add_argparse_args = add_class_args + add_argparse_train_args = add_class_args + add_argparse_eval_args = add_eval_args From 4182374bc6fa2e8e7ee908896256274c8e7dcc7f Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 21 Nov 2022 10:08:17 -0500 Subject: [PATCH 049/154] fix train w2v --- hyperion/bin/train_wav2vec2xvector.py | 62 ++++++++++++++++------ hyperion/np/classifiers/linear_svmc.py | 2 +- hyperion/np/classifiers/svmc.py | 12 ++--- hyperion/torch/data/audio_dataset.py | 14 ++--- hyperion/torch/trainers/xvector_trainer.py | 1 - 5 files changed, 59 insertions(+), 32 deletions(-) diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index e92b9a1a..f1281904 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -3,7 +3,7 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +# import sys import os from pathlib import Path from jsonargparse import ( @@ -25,7 +25,9 @@ from hyperion.torch.utils import ddp from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.data import AudioDataset as AD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SegSamplerFactory + +# from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import ( HFWav2Vec2ResNet1dXVector, @@ -44,19 +46,21 @@ def init_data(partition, rank, num_gpus, **kwargs): kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) - sampler_args = Sampler.filter_args(**kwargs["sampler"]) + sampler_args = kwargs["sampler"] if rank == 0: logging.info("{} audio dataset args={}".format(partition, ad_args)) logging.info("{} sampler args={}".format(partition, sampler_args)) logging.info("init %s dataset", partition) - ad_args["is_val"] = partition == "val" + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val dataset = AD(**ad_args) if rank == 0: logging.info("init %s samplers", partition) - sampler = Sampler(dataset, **sampler_args) + sampler = SegSamplerFactory.create(dataset, **sampler_args) if rank == 0: logging.info("init %s dataloader", partition) @@ -70,6 +74,36 @@ def init_data(partition, rank, num_gpus, **kwargs): return data_loader +# def init_data(partition, rank, num_gpus, **kwargs): + +# kwargs = kwargs["data"][partition] +# ad_args = AD.filter_args(**kwargs["dataset"]) +# sampler_args = Sampler.filter_args(**kwargs["sampler"]) +# if rank == 0: +# logging.info("{} audio dataset args={}".format(partition, ad_args)) +# logging.info("{} sampler args={}".format(partition, sampler_args)) +# logging.info("init %s dataset", partition) + +# ad_args["is_val"] = partition == "val" +# dataset = AD(**ad_args) + +# if rank == 0: +# logging.info("init %s samplers", partition) + +# sampler = Sampler(dataset, **sampler_args) + +# if rank == 0: +# logging.info("init %s dataloader", partition) + +# num_workers = 
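As a usage illustration for the GaussianSVMC class added above: a toy end-to-end run on synthetic data. Only names defined by the patch are used, but this is a sketch, not a tested recipe, and the output path is made up:

import numpy as np
from hyperion.np.classifiers import GaussianSVMC

# synthetic 2-class problem
x = np.random.randn(200, 10).astype(np.float32)
y = (x[:, 0] > 0).astype(int)

svm = GaussianSVMC(C=1.0, gamma="scale", labels=["neg", "pos"])
svm.fit(x, y)                            # reports if libsvm did not converge
post = svm(x, eval_type="cat-post")      # (200, 2) class posteriors
svm.save("exp/svm/model.sav")            # extension is forced to .sav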
kwargs["data_loader"]["num_workers"] +# num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) +# largs = ( +# {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} +# ) +# data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) +# return data_loader + + def init_model(num_classes, rank, model_class, **kwargs): model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: @@ -97,18 +131,14 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.num_classes, **kwargs) + model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, - device=device, - metrics=metrics, - ddp=world_size > 1, - **trn_args, + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) @@ -120,9 +150,10 @@ def make_parser(model_class): parser = ArgumentParser() parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") AD.add_class_args(train_parser, prefix="dataset", skip={}) - Sampler.add_class_args(train_parser, prefix="sampler") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", type=int, @@ -132,7 +163,7 @@ def make_parser(model_class): val_parser = ArgumentParser(prog="") AD.add_class_args(val_parser, prefix="dataset", skip={}) - Sampler.add_class_args(val_parser, prefix="sampler") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", type=int, @@ -144,14 +175,11 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) parser.link_arguments( - "data.train.dataset.class_file", "data.val.dataset.class_file" + "data.train.dataset.class_files", "data.val.dataset.class_files" ) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" ) - parser.link_arguments( - "data.train.sampler.batch_size", "data.val.sampler.batch_size" - ) model_class.add_class_args(parser, prefix="model") Trainer.add_class_args( diff --git a/hyperion/np/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py index cb95e903..607d83de 100644 --- a/hyperion/np/classifiers/linear_svmc.py +++ b/hyperion/np/classifiers/linear_svmc.py @@ -359,7 +359,7 @@ def add_eval_args(parser, prefix=None): parser.add_argument( "--eval-type", default="logit", - choices=["logit", "bin-logpost", "bin-post", "cat-logpost", "cat-post"], + choices=["logit", "bin-log-post", "bin-post", "cat-log-post", "cat-post"], help=("type of evaluation"), ) diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 764c6101..77a05ff9 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -90,7 +90,7 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def predict(self, x, eval_type="cat-post"): + def predict(self, x, eval_type="decision-func"): """Evaluates the SVM Args: @@ -109,9 +109,9 @@ def predict(self, x, eval_type="cat-post"): if 
eval_type == "cat-log-post": return self.svm.predict_log_proba(x) - return self.svm.predict_proba(x) + return self.svm.decision_function(x) - def __call__(self, x, eval_type="logit"): + def __call__(self, x, eval_type="decision-func"): """Evaluates the SVM Args: @@ -138,7 +138,7 @@ def fit(self, x, class_ids, sample_weight=None): print("--------------", type(x[3, 2]), type(class_ids[20]), "--------------") self.svm.fit(x, class_ids) if self.svm.fit_status_: - print("SVM did not converge") + logging.warning("SVM did not converge") def save(self, file_path): """Saves the model to file. @@ -339,8 +339,8 @@ def add_eval_args(parser, prefix=None): parser.add_argument( "--eval-type", - default="cat-post", - choices=["cat-logpost", "cat-post"], + default="decision-func", + choices=["cat-log-post", "cat-post", "decision-func"], help=("type of evaluation"), ) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index f24ca8c5..439c00ba 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -16,7 +16,8 @@ from ..torch_defs import floatstr_torch from ...io import RandomAccessAudioReader as AR -from ...utils.utt2info import Utt2Info + +# from ...utils.utt2info import Utt2Info from ...np.augment import SpeechAugment from torch.utils.data import Dataset @@ -618,7 +619,7 @@ def _read_audio(self, seg_id, start, duration): # read audio recording_id = self.seg_set.recording_ids(seg_id) x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration) - return x[0], fs[0] + return x[0].astype(floatstr_torch(), copy=False), fs[0] def _apply_augs(self, x, num_samples, reverb_context_samples): x_augs = [] @@ -630,7 +631,7 @@ def _apply_augs(self, x, num_samples, reverb_context_samples): x_aug, aug_info = augmenter(x) # remove the extra left context used to compute the reverberation. 
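A side note on the eval_type="decision-func" default adopted above: for an sklearn SVC, decision_function returns raw margins and is always available, while predict_proba requires probability=True and an extra Platt-scaling fit. A small sketch with plain sklearn (GaussianSVMC wraps an SVC, as shown earlier) to make the difference concrete:

import numpy as np
from sklearn.svm import SVC

x = np.random.randn(100, 5)
y = (x[:, 0] > 0).astype(int)

svm = SVC(kernel="rbf", probability=True).fit(x, y)
margins = svm.decision_function(x)  # (100,) for binary, (100, n_classes) for ovr
post = svm.predict_proba(x)         # Platt-scaled posteriors in [0, 1]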
x_aug = x_aug[reverb_context_samples : len(x)] - x_augs.append(x_aug) + x_augs.append(x_aug.astype(floatstr_torch(), copy=False)) return x_augs @@ -663,14 +664,14 @@ def _get_resampler(self, fs): resampling_method="kaiser_window", beta=14.769656459379492, ) - self.resampler[fs] = resampler - return resampler + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + self.resamplers[fs] = resampler_f + return resampler_f def _resample(self, x, fs): try: if self.target_sample_freq is None or fs == self.target_sample_freq: return x, fs - resampler = self._get_resampler(fs) return resampler(x), self.target_sample_freq except: @@ -681,7 +682,6 @@ def __getitem__(self, segment): seg_id, start, duration = self._parse_segment_item(segment) x, fs = self._read_audio(seg_id, start, duration) x, fs = self._resample(x, fs) - if self.augmenters: # augmentations num_samples = int(duration * fs) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 4cc4bc8c..9b04fdd0 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -129,7 +129,6 @@ def train_epoch(self, data_loader): data, target = data.to(self.device), target.to(self.device) batch_size = data.shape[0] - with self.amp_autocast(): output = self.model(data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps From 2f6547acdef9779ba1aef90fb49f974ef7e18a39 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Mon, 21 Nov 2022 22:40:22 -0500 Subject: [PATCH 050/154] Address comments for transducer --- .../v1/conf/wav2vec2xlsr300m_transducer.yaml | 8 - .../v1/global_conf/config_transducer_v1.sh | 14 - hyperion/torch/data/__init__.py | 2 +- hyperion/torch/data/audio_dataset.py | 34 +- hyperion/torch/data/bucketing_seg_sampler.py | 5 +- hyperion/torch/models/transducer/conformer.py | 2 +- .../torch/models/transducer/transducer.py | 2 +- .../torch/models/transducer/transformer.py | 2 +- hyperion/utils/text.py | 143 +++ hyperion/utils/utils.py | 979 ------------------ 10 files changed, 164 insertions(+), 1027 deletions(-) delete mode 100644 hyperion/utils/utils.py diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml index 3d9d768a..b8a1cdbb 100644 --- a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -1,14 +1,6 @@ hf_feats: pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus transducer: -# conformer_enc: - # num_features: 80 - # subsampling_factor: 4 - # d_model: 512 - # nhead: 8 - # dim_feedforward: 2048 - # num_encoder_layers: 12 - # vgg_frontend: False decoder: embedding_dim: 1024 num_layers: 2 diff --git a/egs/librispeech/v1/global_conf/config_transducer_v1.sh b/egs/librispeech/v1/global_conf/config_transducer_v1.sh index 9f28f551..ca1ca29c 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v1.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v1.sh @@ -37,17 +37,3 @@ nnet_s3_name=${nnet_name}.s3 nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name nnet_s3=$nnet_s3_dir/model_ep0002.pth nnet_s3=$nnet_s3_dir/model_ep0005.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git 
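The _get_resampler change above caches one callable per source sample rate and hides the numpy-to-torch round trip from callers. The idea in isolation, assuming torchaudio is available (names and rates here are illustrative, not the patch's exact code):

import torch
import torchaudio.transforms as T

_resamplers = {}

def get_resampler(fs, target_fs=16000):
    # build (and cache) a resampler the first time each source rate is seen
    if fs not in _resamplers:
        r = T.Resample(int(fs), int(target_fs))
        # wrap it so it accepts and returns numpy arrays
        _resamplers[fs] = lambda x: r(torch.from_numpy(x)).numpy()
    return _resamplers[fs]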
a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index e289acbf..16162da8 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -11,5 +11,5 @@ # samplers from .bucketing_seg_sampler import BucketingSegSampler -from .weighted_seq_sampler import ClassWeightedSeqSampler +# from .weighted_seq_sampler import ClassWeightedSeqSampler from .seg_sampler_factory import SegSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 35b7d85b..403e0d1d 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -459,7 +459,7 @@ class AudioDataset(Dataset): def __init__( self, audio_file, - segments_file=None, + segments_file, class_names=None, class_files=None, bpe_model=None, @@ -493,22 +493,21 @@ def __init__( if rank == 0: logging.info("loading segments file %s" % segments_file) - if segments_file is not None: - self.seg_set = SegmentSet.load(segments_file) - if rank == 0: - logging.info("dataset contains %d seqs" % len(self.seg_set)) + self.seg_set = SegmentSet.load(segments_file) + if rank == 0: + logging.info("dataset contains %d seqs" % len(self.seg_set)) - self.is_val = is_val - if time_durs_file is not None: - if rank == 0: - logging.info("loading durations file %s" % time_durs_file) + self.is_val = is_val + if time_durs_file is not None: + if rank == 0: + logging.info("loading durations file %s" % time_durs_file) - time_durs = SegmentSet.load(time_durs_file) - self.seg_set["duration"] = time_durs.loc[ - self.seg_set["id"] - ].class_id.values.astype(np.float, copy=False) - else: - assert "duration" in self.seg_set + time_durs = SegmentSet.load(time_durs_file) + self.seg_set["duration"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(np.float, copy=False) + else: + assert "duration" in self.seg_set logging.info("loading class-info files") self._load_class_infos(class_names, class_files, is_val) @@ -546,7 +545,6 @@ def _load_text_infos(self, text_file, is_val): text = read_text(text_file) self.seg_set["text"] = text.loc[self.seg_set["id"]].text - self.text_info = ClassInfo(text) @@ -679,8 +677,7 @@ def _get_segment_info(self, seg_id): idx = class_info.loc[seg_info, "class_idx"] seg_info = idx if info_name == "text": - text = self.text_info.loc[seg_id, "text"] - seg_info = self.sp.encode(text, out_type=int) + seg_info = self.sp.encode(seg_info, out_type=int) r.append(seg_info) @@ -713,7 +710,6 @@ def __getitem__(self, segment): seg_info = self._get_segment_info(seg_id) r.extend(seg_info) - return (*r,) @staticmethod diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 83e6425c..8b0e855a 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -42,9 +42,8 @@ def create_buckets(self): bucket_length = cum_lengths[-1] / self.num_buckets buckets = [] for i in range(self.num_buckets): - bucket_bool = (cum_lengths <= bucket_length) & (cum_lengths > 0) - bucket_idx = np.arange(len(bucket_bool))[bucket_bool] - bucket_i = sorted_seg_set.iloc[bucket_idx] + bucket_idx = (cum_lengths <= bucket_length) & (cum_lengths > 0) + bucket_i = sorted_seg_set.loc[bucket_idx] buckets.append(bucket_i) cum_lengths -= bucket_length diff --git a/hyperion/torch/models/transducer/conformer.py b/hyperion/torch/models/transducer/conformer.py index 734c28ce..a350d579 100644 --- a/hyperion/torch/models/transducer/conformer.py +++ 
b/hyperion/torch/models/transducer/conformer.py
@@ -25,7 +25,7 @@
 from torch import Tensor, nn

 from .transformer import Transformer
-from hyperion.utils.utils import make_pad_mask, subsequent_chunk_mask
+from hyperion.utils.text import make_pad_mask, subsequent_chunk_mask


 class Conformer(Transformer):
diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py
index 80bf9891..52547954 100644
--- a/hyperion/torch/models/transducer/transducer.py
+++ b/hyperion/torch/models/transducer/transducer.py
@@ -27,7 +27,7 @@
 from .encoder_interface import EncoderInterface

 from ...torch_model import TorchModel
-from hyperion.utils.utils import add_sos
+from hyperion.utils.text import add_sos

 # from .conformer import Conformer
 from .decoder import Decoder
 from .joiner import Joiner
diff --git a/hyperion/torch/models/transducer/transformer.py b/hyperion/torch/models/transducer/transformer.py
index 38edbd62..d9d5d6fb 100644
--- a/hyperion/torch/models/transducer/transformer.py
+++ b/hyperion/torch/models/transducer/transformer.py
@@ -23,7 +23,7 @@
 from .encoder_interface import EncoderInterface
 from .subsampling import Conv2dSubsampling, VggSubsampling
-from hyperion.utils.utils import make_pad_mask
+from hyperion.utils.text import make_pad_mask


 class Transformer(EncoderInterface):
diff --git a/hyperion/utils/text.py b/hyperion/utils/text.py
index 5e06ad0c..48be92e3 100644
--- a/hyperion/utils/text.py
+++ b/hyperion/utils/text.py
@@ -4,11 +4,154 @@
 """
 from pathlib import Path

+import k2
+import k2.version
 import numpy as np
 import pandas as pd
+import torch


+# Copied and modified from https://github.com/wenet-e2e/wenet/blob/main/wenet/utils/mask.py
+def subsequent_chunk_mask(
+    size: int,
+    chunk_size: int,
+    num_left_chunks: int = -1,
+    device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    """Create mask for subsequent steps (size, size) with chunk size,
+       this is for streaming encoder
+    Args:
+        size (int): size of mask
+        chunk_size (int): size of chunk
+        num_left_chunks (int): number of left chunks
+            <0: use full chunk
+            >=0: use num_left_chunks
+        device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+    Returns:
+        torch.Tensor: mask
+    Examples:
+        >>> subsequent_chunk_mask(4, 2)
+        [[1, 1, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 1],
+         [1, 1, 1, 1]]
+    """
+    ret = torch.zeros(size, size, device=device, dtype=torch.bool)
+    for i in range(size):
+        if num_left_chunks < 0:
+            start = 0
+        else:
+            start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
+        ending = min((i // chunk_size + 1) * chunk_size, size)
+        ret[i, start:ending] = True
+    return ret
+
+
+def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor:
+    """
+    Args:
+      lengths:
+        A 1-D tensor containing sentence lengths.
+    Returns:
+      Return a 2-D bool tensor, where masked positions
+      are filled with `True` and non-masked positions are
+      filled with `False`.
+
+    >>> lengths = torch.tensor([1, 3, 2, 5])
+    >>> make_pad_mask(lengths)
+    tensor([[False, True, True, True, True],
+            [False, False, False, True, True],
+            [False, False, True, True, True],
+            [False, False, False, False, False]])
+    """
+    assert lengths.ndim == 1, lengths.ndim
+
+    max_len = lengths.max()
+    n = lengths.size(0)
+
+    expanded_lengths = torch.arange(max_len).expand(n, max_len).to(lengths)
+
+    return expanded_lengths >= lengths.unsqueeze(1)
+
+
+def concat(
+    ragged: k2.RaggedTensor, value: int, direction: str
+) -> k2.RaggedTensor:
+    """Prepend a value to the beginning of each sublist or append a value
+ to the end of each sublist. + + Args: + ragged: + A ragged tensor with two axes. + value: + The value to prepend or append. + direction: + It can be either "left" or "right". If it is "left", we + prepend the value to the beginning of each sublist; + if it is "right", we append the value to the end of each + sublist. + + Returns: + Return a new ragged tensor, whose sublists either start with + or end with the given value. + + >>> a = k2.RaggedTensor([[1, 3], [5]]) + >>> a + [ [ 1 3 ] [ 5 ] ] + >>> concat(a, value=0, direction="left") + [ [ 0 1 3 ] [ 0 5 ] ] + >>> concat(a, value=0, direction="right") + [ [ 1 3 0 ] [ 5 0 ] ] + + """ + dtype = ragged.dtype + device = ragged.device + + assert ragged.num_axes == 2, f"num_axes: {ragged.num_axes}" + pad_values = torch.full( + size=(ragged.tot_size(0), 1), + fill_value=value, + device=device, + dtype=dtype, + ) + pad = k2.RaggedTensor(pad_values) + + if direction == "left": + ans = k2.ragged.cat([pad, ragged], axis=1) + elif direction == "right": + ans = k2.ragged.cat([ragged, pad], axis=1) + else: + raise ValueError( + f'Unsupported direction: {direction}. " \ + "Expect either "left" or "right"' + ) + return ans + + +def add_sos(ragged: k2.RaggedTensor, sos_id: int) -> k2.RaggedTensor: + """Add SOS to each sublist. + + Args: + ragged: + A ragged tensor with two axes. + sos_id: + The ID of the SOS symbol. + + Returns: + Return a new ragged tensor, where each sublist starts with SOS. + + >>> a = k2.RaggedTensor([[1, 3], [5]]) + >>> a + [ [ 1 3 ] [ 5 ] ] + >>> add_sos(a, sos_id=0) + [ [ 0 1 3 ] [ 0 5 ] ] + + """ + return concat(ragged, sos_id, direction="left") + + def read_text(text_file: str): # assert check_argument_types() text_file = Path(text_file) diff --git a/hyperion/utils/utils.py b/hyperion/utils/utils.py deleted file mode 100644 index 2da78581..00000000 --- a/hyperion/utils/utils.py +++ /dev/null @@ -1,979 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang -# Mingshuang Luo) -# -# See ../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import argparse -import collections -import logging -import os -import re -import subprocess -from collections import defaultdict -from contextlib import contextmanager -from datetime import datetime -from pathlib import Path -from typing import Dict, Iterable, List, TextIO, Tuple, Union - -import k2 -import k2.version -import kaldialign -import sentencepiece as spm -import torch -import torch.distributed as dist -import torch.nn as nn -from torch.utils.tensorboard import SummaryWriter - -from hyperion.utils.checkpoint import average_checkpoints - -Pathlike = Union[str, Path] - - -# Pytorch issue: https://github.com/pytorch/pytorch/issues/47379 -# Fixed: https://github.com/pytorch/pytorch/pull/49853 -# The fix was included in v1.9.0 -# https://github.com/pytorch/pytorch/releases/tag/v1.9.0 -def is_jit_tracing(): - if torch.jit.is_scripting(): - return False - elif torch.jit.is_tracing(): - return True - return False - - -@contextmanager -def get_executor(): - # We'll either return a process pool or a distributed worker pool. - # Note that this has to be a context manager because we might use multiple - # context manager ("with" clauses) inside, and this way everything will - # free up the resources at the right time. - try: - # If this is executed on the CLSP grid, we will try to use the - # Grid Engine to distribute the tasks. - # Other clusters can also benefit from that, provided a - # cluster-specific wrapper. - # (see https://github.com/pzelasko/plz for reference) - # - # The following must be installed: - # $ pip install dask distributed - # $ pip install git+https://github.com/pzelasko/plz - name = subprocess.check_output("hostname -f", shell=True, text=True) - if name.strip().endswith(".clsp.jhu.edu"): - import plz - from distributed import Client - - with plz.setup_cluster() as cluster: - cluster.scale(80) - yield Client(cluster) - return - except Exception: - pass - # No need to return anything - compute_and_store_features - # will just instantiate the pool itself. - yield None - - -def str2bool(v): - """Used in argparse.ArgumentParser.add_argument to indicate - that a type is a bool type and user can enter - - - yes, true, t, y, 1, to represent True - - no, false, f, n, 0, to represent False - - See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa - """ - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise argparse.ArgumentTypeError("Boolean value expected.") - - -def setup_logger( - log_filename: Pathlike, - log_level: str = "info", - use_console: bool = True, -) -> None: - """Setup log level. - - Args: - log_filename: - The filename to save the log. - log_level: - The log level to use, e.g., "debug", "info", "warning", "error", - "critical" - use_console: - True to also print logs to console. 
- """ - now = datetime.now() - date_time = now.strftime("%Y-%m-%d-%H-%M-%S") - if dist.is_available() and dist.is_initialized(): - world_size = dist.get_world_size() - rank = dist.get_rank() - formatter = f"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] ({rank}/{world_size}) %(message)s" # noqa - log_filename = f"{log_filename}-{date_time}-{rank}" - else: - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - log_filename = f"{log_filename}-{date_time}" - - os.makedirs(os.path.dirname(log_filename), exist_ok=True) - - level = logging.ERROR - if log_level == "debug": - level = logging.DEBUG - elif log_level == "info": - level = logging.INFO - elif log_level == "warning": - level = logging.WARNING - elif log_level == "critical": - level = logging.CRITICAL - - logging.basicConfig( - filename=log_filename, - format=formatter, - level=level, - filemode="w", - ) - if use_console: - console = logging.StreamHandler() - console.setLevel(level) - console.setFormatter(logging.Formatter(formatter)) - logging.getLogger("").addHandler(console) - - -class AttributeDict(dict): - def __getattr__(self, key): - if key in self: - return self[key] - raise AttributeError(f"No such attribute '{key}'") - - def __setattr__(self, key, value): - self[key] = value - - def __delattr__(self, key): - if key in self: - del self[key] - return - raise AttributeError(f"No such attribute '{key}'") - - -def encode_supervisions( - supervisions: dict, subsampling_factor: int -) -> Tuple[torch.Tensor, List[str]]: - """ - Encodes Lhotse's ``batch["supervisions"]`` dict into - a pair of torch Tensor, and a list of transcription strings. - - The supervision tensor has shape ``(batch_size, 3)``. - Its second dimension contains information about sequence index [0], - start frames [1] and num frames [2]. - - The batch items might become re-ordered during this operation -- the - returned tensor and list of strings are guaranteed to be consistent with - each other. - """ - supervision_segments = torch.stack( - ( - supervisions["sequence_idx"], - supervisions["start_frame"] // subsampling_factor, - supervisions["num_frames"] // subsampling_factor, - ), - 1, - ).to(torch.int32) - - indices = torch.argsort(supervision_segments[:, 2], descending=True) - supervision_segments = supervision_segments[indices] - texts = supervisions["text"] - texts = [texts[idx] for idx in indices] - - return supervision_segments, texts - - -def get_texts( - best_paths: k2.Fsa, return_ragged: bool = False -) -> Union[List[List[int]], k2.RaggedTensor]: - """Extract the texts (as word IDs) from the best-path FSAs. - Args: - best_paths: - A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e. - containing multiple FSAs, which is expected to be the result - of k2.shortest_path (otherwise the returned values won't - be meaningful). - return_ragged: - True to return a ragged tensor with two axes [utt][word_id]. - False to return a list-of-list word IDs. - Returns: - Returns a list of lists of int, containing the label sequences we - decoded. - """ - if isinstance(best_paths.aux_labels, k2.RaggedTensor): - # remove 0's and -1's. - aux_labels = best_paths.aux_labels.remove_values_leq(0) - # TODO: change arcs.shape() to arcs.shape - aux_shape = best_paths.arcs.shape().compose(aux_labels.shape) - - # remove the states and arcs axes. - aux_shape = aux_shape.remove_axis(1) - aux_shape = aux_shape.remove_axis(1) - aux_labels = k2.RaggedTensor(aux_shape, aux_labels.values) - else: - # remove axis corresponding to states. 
- aux_shape = best_paths.arcs.shape().remove_axis(1) - aux_labels = k2.RaggedTensor(aux_shape, best_paths.aux_labels) - # remove 0's and -1's. - aux_labels = aux_labels.remove_values_leq(0) - - assert aux_labels.num_axes == 2 - if return_ragged: - return aux_labels - else: - return aux_labels.tolist() - - -def get_alignments(best_paths: k2.Fsa, kind: str) -> List[List[int]]: - """Extract labels or aux_labels from the best-path FSAs. - - Args: - best_paths: - A k2.Fsa with best_paths.arcs.num_axes() == 3, i.e. - containing multiple FSAs, which is expected to be the result - of k2.shortest_path (otherwise the returned values won't - be meaningful). - kind: - Possible values are: "labels" and "aux_labels". Caution: When it is - "labels", the resulting alignments contain repeats. - Returns: - Returns a list of lists of int, containing the token sequences we - decoded. For `ans[i]`, its length equals to the number of frames - after subsampling of the i-th utterance in the batch. - - Example: - When `kind` is `labels`, one possible alignment example is (with - repeats):: - - c c c blk a a blk blk t t t blk blk - - If `kind` is `aux_labels`, the above example changes to:: - - c blk blk blk a blk blk blk t blk blk blk blk - - """ - assert kind in ("labels", "aux_labels") - # arc.shape() has axes [fsa][state][arc], we remove "state"-axis here - token_shape = best_paths.arcs.shape().remove_axis(1) - # token_shape has axes [fsa][arc] - tokens = k2.RaggedTensor( - token_shape, getattr(best_paths, kind).contiguous() - ) - tokens = tokens.remove_values_eq(-1) - return tokens.tolist() - - -def save_alignments( - alignments: Dict[str, List[int]], - subsampling_factor: int, - filename: str, -) -> None: - """Save alignments to a file. - - Args: - alignments: - A dict containing alignments. Keys of the dict are utterances and - values are the corresponding framewise alignments after subsampling. - subsampling_factor: - The subsampling factor of the model. - filename: - Path to save the alignments. - Returns: - Return None. - """ - ali_dict = { - "subsampling_factor": subsampling_factor, - "alignments": alignments, - } - torch.save(ali_dict, filename) - - -def load_alignments(filename: str) -> Tuple[int, Dict[str, List[int]]]: - """Load alignments from a file. - - Args: - filename: - Path to the file containing alignment information. - The file should be saved by :func:`save_alignments`. - Returns: - Return a tuple containing: - - subsampling_factor: The subsampling_factor used to compute - the alignments. - - alignments: A dict containing utterances and their corresponding - framewise alignment, after subsampling. - """ - ali_dict = torch.load(filename) - subsampling_factor = ali_dict["subsampling_factor"] - alignments = ali_dict["alignments"] - return subsampling_factor, alignments - - -def store_transcripts( - filename: Pathlike, texts: Iterable[Tuple[str, str, str]] -) -> None: - """Save predicted results and reference transcripts to a file. - - Args: - filename: - File to save the results to. - texts: - An iterable of tuples. The first element is the cur_id, the second is - the reference transcript and the third element is the predicted result. - Returns: - Return None. 
- """ - with open(filename, "w") as f: - for cut_id, ref, hyp in texts: - print(f"{cut_id}:\tref={ref}", file=f) - print(f"{cut_id}:\thyp={hyp}", file=f) - - - -def write_error_stats( - f: TextIO, - test_set_name: str, - results: List[Tuple[str, str]], - enable_log: bool = True, -) -> float: - """Write statistics based on predicted results and reference transcripts. - - It will write the following to the given file: - - - WER - - number of insertions, deletions, substitutions, corrects and total - reference words. For example:: - - Errors: 23 insertions, 57 deletions, 212 substitutions, over 2606 - reference words (2337 correct) - - - The difference between the reference transcript and predicted result. - An instance is given below:: - - THE ASSOCIATION OF (EDISON->ADDISON) ILLUMINATING COMPANIES - - The above example shows that the reference word is `EDISON`, - but it is predicted to `ADDISON` (a substitution error). - - Another example is:: - - FOR THE FIRST DAY (SIR->*) I THINK - - The reference word `SIR` is missing in the predicted - results (a deletion error). - results: - An iterable of tuples. The first element is the cur_id, the second is - the reference transcript and the third element is the predicted result. - enable_log: - If True, also print detailed WER to the console. - Otherwise, it is written only to the given file. - Returns: - Return None. - """ - subs: Dict[Tuple[str, str], int] = defaultdict(int) - ins: Dict[str, int] = defaultdict(int) - dels: Dict[str, int] = defaultdict(int) - - # `words` stores counts per word, as follows: - # corr, ref_sub, hyp_sub, ins, dels - words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0]) - num_corr = 0 - ERR = "*" - for cut_id, ref, hyp in results: - ali = kaldialign.align(ref, hyp, ERR) - for ref_word, hyp_word in ali: - if ref_word == ERR: - ins[hyp_word] += 1 - words[hyp_word][3] += 1 - elif hyp_word == ERR: - dels[ref_word] += 1 - words[ref_word][4] += 1 - elif hyp_word != ref_word: - subs[(ref_word, hyp_word)] += 1 - words[ref_word][1] += 1 - words[hyp_word][2] += 1 - else: - words[ref_word][0] += 1 - num_corr += 1 - ref_len = sum([len(r) for _, r, _ in results]) - sub_errs = sum(subs.values()) - ins_errs = sum(ins.values()) - del_errs = sum(dels.values()) - tot_errs = sub_errs + ins_errs + del_errs - tot_err_rate = "%.2f" % (100.0 * tot_errs / ref_len) - - if enable_log: - logging.info( - f"[{test_set_name}] %WER {tot_errs / ref_len:.2%} " - f"[{tot_errs} / {ref_len}, {ins_errs} ins, " - f"{del_errs} del, {sub_errs} sub ]" - ) - - print(f"%WER = {tot_err_rate}", file=f) - print( - f"Errors: {ins_errs} insertions, {del_errs} deletions, " - f"{sub_errs} substitutions, over {ref_len} reference " - f"words ({num_corr} correct)", - file=f, - ) - print( - "Search below for sections starting with PER-UTT DETAILS:, " - "SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:", - file=f, - ) - - print("", file=f) - print("PER-UTT DETAILS: corr or (ref->hyp) ", file=f) - for cut_id, ref, hyp in results: - ali = kaldialign.align(ref, hyp, ERR) - combine_successive_errors = True - if combine_successive_errors: - ali = [[[x], [y]] for x, y in ali] - for i in range(len(ali) - 1): - if ali[i][0] != ali[i][1] and ali[i + 1][0] != ali[i + 1][1]: - ali[i + 1][0] = ali[i][0] + ali[i + 1][0] - ali[i + 1][1] = ali[i][1] + ali[i + 1][1] - ali[i] = [[], []] - ali = [ - [ - list(filter(lambda a: a != ERR, x)), - list(filter(lambda a: a != ERR, y)), - ] - for x, y in ali - ] - ali = list(filter(lambda x: x != [[], []], ali)) - ali = [ - [ - 
ERR if x == [] else " ".join(x), - ERR if y == [] else " ".join(y), - ] - for x, y in ali - ] - - print( - f"{cut_id}:\t" - + " ".join( - ( - ref_word - if ref_word == hyp_word - else f"({ref_word}->{hyp_word})" - for ref_word, hyp_word in ali - ) - ), - file=f, - ) - - print("", file=f) - print("SUBSTITUTIONS: count ref -> hyp", file=f) - - for count, (ref, hyp) in sorted( - [(v, k) for k, v in subs.items()], reverse=True - ): - print(f"{count} {ref} -> {hyp}", file=f) - - print("", file=f) - print("DELETIONS: count ref", file=f) - for count, ref in sorted([(v, k) for k, v in dels.items()], reverse=True): - print(f"{count} {ref}", file=f) - - print("", file=f) - print("INSERTIONS: count hyp", file=f) - for count, hyp in sorted([(v, k) for k, v in ins.items()], reverse=True): - print(f"{count} {hyp}", file=f) - - print("", file=f) - print( - "PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp", file=f - ) - for _, word, counts in sorted( - [(sum(v[1:]), k, v) for k, v in words.items()], reverse=True - ): - (corr, ref_sub, hyp_sub, ins, dels) = counts - tot_errs = ref_sub + hyp_sub + ins + dels - ref_count = corr + ref_sub + dels - hyp_count = corr + hyp_sub + ins - - print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f) - return float(tot_err_rate) - - -class MetricsTracker(collections.defaultdict): - def __init__(self): - # Passing the type 'int' to the base-class constructor - # makes undefined items default to int() which is zero. - # This class will play a role as metrics tracker. - # It can record many metrics, including but not limited to loss. - super(MetricsTracker, self).__init__(int) - - def __add__(self, other: "MetricsTracker") -> "MetricsTracker": - ans = MetricsTracker() - for k, v in self.items(): - ans[k] = v - for k, v in other.items(): - ans[k] = ans[k] + v - return ans - - def __mul__(self, alpha: float) -> "MetricsTracker": - ans = MetricsTracker() - for k, v in self.items(): - ans[k] = v * alpha - return ans - - def __str__(self) -> str: - ans_frames = "" - ans_utterances = "" - for k, v in self.norm_items(): - norm_value = "%.4g" % v - if "utt_" not in k: - ans_frames += str(k) + "=" + str(norm_value) + ", " - else: - ans_utterances += str(k) + "=" + str(norm_value) - if k == "utt_duration": - ans_utterances += " frames, " - elif k == "utt_pad_proportion": - ans_utterances += ", " - else: - raise ValueError(f"Unexpected key: {k}") - frames = "%.2f" % self["frames"] - ans_frames += "over " + str(frames) + " frames. " - if ans_utterances != "": - utterances = "%.2f" % self["utterances"] - ans_utterances += "over " + str(utterances) + " utterances." - - return ans_frames + ans_utterances - - def norm_items(self) -> List[Tuple[str, float]]: - """ - Returns a list of pairs, like: - [('ctc_loss', 0.1), ('att_loss', 0.07)] - """ - num_frames = self["frames"] if "frames" in self else 1 - num_utterances = self["utterances"] if "utterances" in self else 1 - ans = [] - for k, v in self.items(): - if k == "frames" or k == "utterances": - continue - norm_value = ( - float(v) / num_frames - if "utt_" not in k - else float(v) / num_utterances - ) - ans.append((k, norm_value)) - return ans - - def reduce(self, device): - """ - Reduce using torch.distributed, which I believe ensures that - all processes get the total. 
- """ - keys = sorted(self.keys()) - s = torch.tensor([float(self[k]) for k in keys], device=device) - dist.all_reduce(s, op=dist.ReduceOp.SUM) - for k, v in zip(keys, s.cpu().tolist()): - self[k] = v - - def write_summary( - self, - tb_writer: SummaryWriter, - prefix: str, - batch_idx: int, - ) -> None: - """Add logging information to a TensorBoard writer. - - Args: - tb_writer: a TensorBoard writer - prefix: a prefix for the name of the loss, e.g. "train/valid_", - or "train/current_" - batch_idx: The current batch index, used as the x-axis of the plot. - """ - for k, v in self.norm_items(): - tb_writer.add_scalar(prefix + k, v, batch_idx) - - -def concat( - ragged: k2.RaggedTensor, value: int, direction: str -) -> k2.RaggedTensor: - """Prepend a value to the beginning of each sublist or append a value. - to the end of each sublist. - - Args: - ragged: - A ragged tensor with two axes. - value: - The value to prepend or append. - direction: - It can be either "left" or "right". If it is "left", we - prepend the value to the beginning of each sublist; - if it is "right", we append the value to the end of each - sublist. - - Returns: - Return a new ragged tensor, whose sublists either start with - or end with the given value. - - >>> a = k2.RaggedTensor([[1, 3], [5]]) - >>> a - [ [ 1 3 ] [ 5 ] ] - >>> concat(a, value=0, direction="left") - [ [ 0 1 3 ] [ 0 5 ] ] - >>> concat(a, value=0, direction="right") - [ [ 1 3 0 ] [ 5 0 ] ] - - """ - dtype = ragged.dtype - device = ragged.device - - assert ragged.num_axes == 2, f"num_axes: {ragged.num_axes}" - pad_values = torch.full( - size=(ragged.tot_size(0), 1), - fill_value=value, - device=device, - dtype=dtype, - ) - pad = k2.RaggedTensor(pad_values) - - if direction == "left": - ans = k2.ragged.cat([pad, ragged], axis=1) - elif direction == "right": - ans = k2.ragged.cat([ragged, pad], axis=1) - else: - raise ValueError( - f'Unsupported direction: {direction}. " \ - "Expect either "left" or "right"' - ) - return ans - - -def add_sos(ragged: k2.RaggedTensor, sos_id: int) -> k2.RaggedTensor: - """Add SOS to each sublist. - - Args: - ragged: - A ragged tensor with two axes. - sos_id: - The ID of the SOS symbol. - - Returns: - Return a new ragged tensor, where each sublist starts with SOS. - - >>> a = k2.RaggedTensor([[1, 3], [5]]) - >>> a - [ [ 1 3 ] [ 5 ] ] - >>> add_sos(a, sos_id=0) - [ [ 0 1 3 ] [ 0 5 ] ] - - """ - return concat(ragged, sos_id, direction="left") - - -def add_eos(ragged: k2.RaggedTensor, eos_id: int) -> k2.RaggedTensor: - """Add EOS to each sublist. - - Args: - ragged: - A ragged tensor with two axes. - eos_id: - The ID of the EOS symbol. - - Returns: - Return a new ragged tensor, where each sublist ends with EOS. - - >>> a = k2.RaggedTensor([[1, 3], [5]]) - >>> a - [ [ 1 3 ] [ 5 ] ] - >>> add_eos(a, eos_id=0) - [ [ 1 3 0 ] [ 5 0 ] ] - - """ - return concat(ragged, eos_id, direction="right") - - -def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: - """ - Args: - lengths: - A 1-D tensor containing sentence lengths. - Returns: - Return a 2-D bool tensor, where masked positions - are filled with `True` and non-masked positions are - filled with `False`. 
- - >>> lengths = torch.tensor([1, 3, 2, 5]) - >>> make_pad_mask(lengths) - tensor([[False, True, True, True, True], - [False, False, False, True, True], - [False, False, True, True, True], - [False, False, False, False, False]]) - """ - assert lengths.ndim == 1, lengths.ndim - - max_len = lengths.max() - n = lengths.size(0) - - expaned_lengths = torch.arange(max_len).expand(n, max_len).to(lengths) - - return expaned_lengths >= lengths.unsqueeze(1) - - -# Copied and modified from https://github.com/wenet-e2e/wenet/blob/main/wenet/utils/mask.py -def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), -) -> torch.Tensor: - """Create mask for subsequent steps (size, size) with chunk size, - this is for streaming encoder - Args: - size (int): size of mask - chunk_size (int): size of chunk - num_left_chunks (int): number of left chunks - <0: use full chunk - >=0: use num_left_chunks - device (torch.device): "cpu" or "cuda" or torch.Tensor.device - Returns: - torch.Tensor: mask - Examples: - >>> subsequent_chunk_mask(4, 2) - [[1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]] - """ - ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) - ret[i, start:ending] = True - return ret - - -def l1_norm(x): - return torch.sum(torch.abs(x)) - - -def l2_norm(x): - return torch.sum(torch.pow(x, 2)) - - -def linf_norm(x): - return torch.max(torch.abs(x)) - - -def measure_weight_norms( - model: nn.Module, norm: str = "l2" -) -> Dict[str, float]: - """ - Compute the norms of the model's parameters. - - :param model: a torch.nn.Module instance - :param norm: how to compute the norm. Available values: 'l1', 'l2', 'linf' - :return: a dict mapping from parameter's name to its norm. - """ - with torch.no_grad(): - norms = {} - for name, param in model.named_parameters(): - if norm == "l1": - val = l1_norm(param) - elif norm == "l2": - val = l2_norm(param) - elif norm == "linf": - val = linf_norm(param) - else: - raise ValueError(f"Unknown norm type: {norm}") - norms[name] = val.item() - return norms - - -def measure_gradient_norms( - model: nn.Module, norm: str = "l1" -) -> Dict[str, float]: - """ - Compute the norms of the gradients for each of model's parameters. - - :param model: a torch.nn.Module instance - :param norm: how to compute the norm. Available values: 'l1', 'l2', 'linf' - :return: a dict mapping from parameter's name to its gradient's norm. - """ - with torch.no_grad(): - norms = {} - for name, param in model.named_parameters(): - if norm == "l1": - val = l1_norm(param.grad) - elif norm == "l2": - val = l2_norm(param.grad) - elif norm == "linf": - val = linf_norm(param.grad) - else: - raise ValueError(f"Unknown norm type: {norm}") - norms[name] = val.item() - return norms - - -def optim_step_and_measure_param_change( - model: nn.Module, - old_parameters: Dict[str, nn.parameter.Parameter], -) -> Dict[str, float]: - """ - Measure the "relative change in parameters per minibatch." - It is understood as a ratio between the L2 norm of the difference between original and updates parameters, - and the L2 norm of the original parameter. It is given by the formula: - - .. 
math:: - \begin{aligned} - \delta = \frac{\Vert\theta - \theta_{new}\Vert^2}{\Vert\theta\Vert^2} - \end{aligned} - - This function is supposed to be used as follows: - - .. code-block:: python - - old_parameters = { - n: p.detach().clone() for n, p in model.named_parameters() - } - - optimizer.step() - - deltas = optim_step_and_measure_param_change(old_parameters) - - Args: - model: A torch.nn.Module instance. - old_parameters: - A Dict of named_parameters before optimizer.step(). - - Return: - A Dict containing the relative change for each parameter. - """ - relative_change = {} - with torch.no_grad(): - for n, p_new in model.named_parameters(): - p_orig = old_parameters[n] - delta = l2_norm(p_orig - p_new) / l2_norm(p_orig) - relative_change[n] = delta.item() - return relative_change - - -def load_averaged_model( - model_dir: str, - model: torch.nn.Module, - epoch: int, - avg: int, - device: torch.device, -): - """ - Load a model which is the average of all checkpoints - - :param model_dir: a str of the experiment directory - :param model: a torch.nn.Module instance - - :param epoch: the last epoch to load from - :param avg: how many models to average from - :param device: move model to this device - - :return: A model averaged - """ - - # start cannot be negative - start = max(epoch - avg + 1, 0) - filenames = [f"{model_dir}/epoch-{i}.pt" for i in range(start, epoch + 1)] - - logging.info(f"averaging {filenames}") - model.to(device) - model.load_state_dict(average_checkpoints(filenames, device=device)) - - return model - - -def tokenize_by_bpe_model( - sp: spm.SentencePieceProcessor, - txt: str, -) -> str: - """ - Tokenize text with bpe model. This function is from - https://github1s.com/wenet-e2e/wenet/blob/main/wenet/dataset/processor.py#L322-L342. - Args: - sp: spm.SentencePieceProcessor. - txt: str - - Return: - A new string which includes chars and bpes. - """ - tokens = [] - # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - pattern = re.compile(r"([\u4e00-\u9fff])") - # Example: - # txt = "你好 ITS'S OKAY 的" - # chars = ["你", "好", " ITS'S OKAY ", "的"] - chars = pattern.split(txt.upper()) - mix_chars = [w for w in chars if len(w.strip()) > 0] - for ch_or_w in mix_chars: - # ch_or_w is a single CJK charater(i.e., "你"), do nothing. - if pattern.fullmatch(ch_or_w) is not None: - tokens.append(ch_or_w) - # ch_or_w contains non-CJK charaters(i.e., " IT'S OKAY "), - # encode ch_or_w using bpe_model. - else: - for p in sp.encode_as_pieces(ch_or_w): - tokens.append(p) - txt_with_bpe = "/".join(tokens) - - return txt_with_bpe - - -def display_and_save_batch( - batch: dict, - params: AttributeDict, - sp: spm.SentencePieceProcessor, -) -> None: - """Display the batch statistics and save the batch into disk. - - Args: - batch: - A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` - for the content in it. - params: - Parameters for training. See :func:`get_params`. - sp: - The BPE model. 
- """ - from lhotse.utils import uuid4 - - filename = f"{params.exp_dir}/batch-{uuid4()}.pt" - logging.info(f"Saving batch to {filename}") - torch.save(batch, filename) - - supervisions = batch["supervisions"] - features = batch["inputs"] - - logging.info(f"features shape: {features.shape}") - - y = sp.encode(supervisions["text"], out_type=int) - num_tokens = sum(len(i) for i in y) - logging.info(f"num tokens: {num_tokens}") From 2cbefda1d3a541ff68fc4bc926621fd08d784ff4 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 29 Nov 2022 20:31:14 -0500 Subject: [PATCH 051/154] audio to duration python script --- egs/librispeech/v1/local/data_prep.sh | 6 +- egs/librispeech/v1/run_001_prepare_data.sh | 5 +- .../v1/run_003_prepare_noises_rirs.sh | 6 +- hyp_utils/xvectors/audio_to_duration.sh | 45 ++++++ .../preprocess_audios_for_nnet_train.sh | 6 +- hyperion/bin/audio_to_duration.py | 66 +++++++++ hyperion/io/audio_reader.py | 122 ++++++++-------- .../torch/models/transducer/transducer.py | 42 +++--- hyperion/torch/utils/dummy_k2.py | 10 ++ hyperion/utils/info_table.py | 130 +++++++++++------- hyperion/utils/text.py | 32 ++--- 11 files changed, 307 insertions(+), 163 deletions(-) create mode 100755 hyp_utils/xvectors/audio_to_duration.sh create mode 100755 hyperion/bin/audio_to_duration.py create mode 100644 hyperion/torch/utils/dummy_k2.py diff --git a/egs/librispeech/v1/local/data_prep.sh b/egs/librispeech/v1/local/data_prep.sh index 0923aeab..d1ec7f52 100755 --- a/egs/librispeech/v1/local/data_prep.sh +++ b/egs/librispeech/v1/local/data_prep.sh @@ -72,9 +72,9 @@ done spk2utt=$dst/spk2utt utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 -utils/data/get_utt2dur.sh $dst -awk 'sub(/ *$/, "", $0)' $dst/utt2dur > $dst/utt2dur2 -mv $dst/utt2dur2 $dst/utt2dur +# utils/data/get_utt2dur.sh $dst +# awk 'sub(/ *$/, "", $0)' $dst/utt2dur > $dst/utt2dur2 +# mv $dst/utt2dur2 $dst/utt2dur ntrans=$(wc -l <$trans) nutt2spk=$(wc -l <$utt2spk) diff --git a/egs/librispeech/v1/run_001_prepare_data.sh b/egs/librispeech/v1/run_001_prepare_data.sh index c6c15692..ba24f5d6 100755 --- a/egs/librispeech/v1/run_001_prepare_data.sh +++ b/egs/librispeech/v1/run_001_prepare_data.sh @@ -24,8 +24,9 @@ if [ ${stage} -le 1 ]; then ### But you can utilize Kaldi recipes in most cases echo "stage 0: Data preparation" for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do - # use underscore-separated names in data directories. - local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_} + # use underscore-separated names in data directories. + #local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_} + steps_xvec/audio_to_duration.sh data/${part//-/_} done fi diff --git a/egs/librispeech/v1/run_003_prepare_noises_rirs.sh b/egs/librispeech/v1/run_003_prepare_noises_rirs.sh index a448af9a..6bdcb4f2 100755 --- a/egs/librispeech/v1/run_003_prepare_noises_rirs.sh +++ b/egs/librispeech/v1/run_003_prepare_noises_rirs.sh @@ -20,11 +20,11 @@ if [ $stage -le 1 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. 
local/make_musan.sh $musan_root 16 data
-    
+
   for name in musan_noise musan_music
   do
     steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \
-      --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \
+      --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \
       data/${name} data/${name}_proc_audio exp/${name}_proc_audio
     utils/fix_data_dir.sh data/${name}_proc_audio
   done
@@ -37,7 +37,7 @@ if [ $stage -le 2 ]; then
   for name in musan_speech
   do
     steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \
-      --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \
+      --storage_name librispeech-v1-$(date +'%m_%d_%H_%M') \
       data/${name} data/${name}_babble exp/${name}_babble
     # utils/fix_data_dir.sh data/${name}_babble
   done
diff --git a/hyp_utils/xvectors/audio_to_duration.sh b/hyp_utils/xvectors/audio_to_duration.sh
new file mode 100755
index 00000000..56a8ffe2
--- /dev/null
+++ b/hyp_utils/xvectors/audio_to_duration.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#
+# 2022 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+set -e
+nj=40
+cmd="run.pl"
+stage=0
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 <data-dir>"
+  echo "e.g.: $0 data/train"
+  echo "Options: "
+  echo "  --nj <nj>  # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  exit 1;
+fi
+
+data_in=$1
+output_dir=$data_in/durations
+
+name=`basename $data_in`
+
+for f in $data_in/wav.scp ; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+mkdir -p $output_dir/log
+
+$cmd JOB=1:$nj $output_dir/log/audio_to_duration.JOB.log \
+    hyp_utils/conda_env.sh \
+    audio_to_duration.py \
+    --audio-file $data_in/wav.scp \
+    --output-file $output_dir/utt2dur.JOB
+
+for n in $(seq $nj); do
+  cat $output_dir/utt2dur.$n || exit 1;
+done > ${data_in}/utt2dur || exit 1
+
+echo "$0: Succeeded computing durations for $name"
diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh
index 7c35b234..8321169f 100755
--- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh
+++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh
@@ -56,9 +56,12 @@ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $output_dir/storage ]; then
   elif [ "$nodes" == "s01" ];then
     utils/create_split_dir.pl \
       /export/s01/$dir_name $output_dir/storage
-  else
+  elif [ "$nodes" == "c0" ];then
     utils/create_split_dir.pl \
       /export/c{01,06,07,08,09}/$dir_name $output_dir/storage
+  elif [ "$nodes" == "fs05" ];then
+    utils/create_split_dir.pl \
+      /export/fs05/$dir_name $output_dir/storage
   fi
 
   for f in $(awk '{ print $1}' $data_in/wav.scp); do
diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py
new file mode 100755
index 00000000..04acb76c
--- /dev/null
+++ b/hyperion/bin/audio_to_duration.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+"""
+ Copyright 2022 Jesus Villalba (Johns Hopkins University)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import sys
+import os
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import time
+import logging
+
+import numpy as np
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import SegmentSet
+from hyperion.io import SequentialAudioReader as AR
+
+
+def audio_to_duration(audio_file, output_file, **kwargs):
+
+    input_args = AR.filter_args(**kwargs)
+    logging.info(f"input_args={input_args}")
+
+    keys = []
+    durations = []
+    with AR(audio_file, **input_args) as reader:
+        for data in reader:
+            key, x, fs = data
+            duration = x.shape[0] / fs
+            keys.append(key)
+            durations.append(duration)
+            logging.info("read audio %s duration=%.3f", key, duration)
+
+    logging.info("read %d audios, saving durations table", len(keys))
+    seg_set = SegmentSet.from_lists(keys, ["duration"], [durations])
+    seg_set.save(output_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Writes audio file durations to table")
+
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument("--audio-file", required=True)
+    parser.add_argument("--output-file", required=True)
+    AR.add_class_args(parser)
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        default=1,
+        choices=[0, 1, 2, 3],
+        type=int,
+        help="Verbose level",
+    )
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    audio_to_duration(**namespace_to_dict(args))
diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py
index 043ae778..e1bdaca8 100644
--- a/hyperion/io/audio_reader.py
+++ b/hyperion/io/audio_reader.py
@@ -5,6 +5,7 @@
 
 import os
 import logging
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
 import io
 import math
 import subprocess
@@ -47,8 +48,7 @@ class AudioReader(object):
       segments_path: segments file with format: segment_id file_id tbeg tend
       wav_scale: multiplies signal by scale factor
     """
-
-    def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1):
+    def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1):
         self.file_path = file_path
         if isinstance(file_path, SCPList):
             self.scp = file_path
@@ -64,9 +64,9 @@ def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1):
         if isinstance(file_path, SegmentList):
             self.segments = segments_path
         else:
-            self.segments = SegmentList.load(
-                segments_path, sep=" ", index_by_file=False
-            )
+            self.segments = SegmentList.load(segments_path,
+                                             sep=" ",
+                                             index_by_file=False)
 
         self.wav_scale = wav_scale
@@ -93,7 +93,10 @@ def __exit__(self, exc_type, exc_value, traceback):
         pass
 
     @staticmethod
-    def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0):
+    def read_wavspecifier(wavspecifier,
+ 
scale=2**15, + time_offset=0, + time_dur=0): """Reads an audiospecifier (audio_file/pipe) It reads from pipe or from all the files that can be read by `libsndfile ` @@ -143,20 +146,20 @@ def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): raise Exception("Unknown format for %s" % (wavspecifier)) @staticmethod - def read_pipe(wavspecifier, scale=2 ** 15): + def read_pipe(wavspecifier, scale=2**15): """Reads wave file from a pipe Args: wavspecifier: Shell command with pipe output scale: Multiplies signal by scale factor """ # proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) + proc = subprocess.Popen(wavspecifier, + shell=True, + stdout=subprocess.PIPE) pipe = proc.communicate()[0] if proc.returncode != 0: - raise Exception( - "Wave read pipe command %s returned code %d" - % (wavspecifier, proc.returncode) - ) + raise Exception("Wave read pipe command %s returned code %d" % + (wavspecifier, proc.returncode)) x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) x *= scale return x, fs @@ -184,8 +187,7 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): if s_beg >= num_samples_i: raise Exception( "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" - % (file_id, t_beg, s_beg, file_id, num_samples_i) - ) + % (file_id, t_beg, s_beg, file_id, num_samples_i)) s_end = int(t_end * fs_i) if s_end > num_samples_i or t_end < 0: @@ -203,7 +205,7 @@ def __init__( self, file_path, segments_path=None, - wav_scale=2 ** 15 - 1, + wav_scale=2**15 - 1, part_idx=1, num_parts=1, ): @@ -213,11 +215,12 @@ def __init__( self.num_parts = num_parts if self.num_parts > 1: if self.with_segments: - self.segments = self.segments.split(self.part_idx, self.num_parts) + self.segments = self.segments.split(self.part_idx, + self.num_parts) else: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=False - ) + self.scp = self.scp.split(self.part_idx, + self.num_parts, + group_by_key=False) def __iter__(self): """Needed to build an iterator, e.g.: @@ -297,9 +300,8 @@ def read(self, num_records=0, time_offset=0, time_durs=0): x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: key, file_path, _, _ = self.scp[self.cur_item] - x_i, fs_i = self.read_wavspecifier( - file_path, self.wav_scale, offset_i, dur_i - ) + x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, + offset_i, dur_i) keys.append(key) data.append(x_i) @@ -315,42 +317,45 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "wav-scale", - default=2 ** 15 - 1, + "--wav-scale", + default=2**15 - 1, type=float, help=("multiplicative factor for waveform"), ) try: parser.add_argument( - p1 + "part-idx", + "--part-idx", type=int, default=1, - help=( - "splits the list of files into num-parts and " "processes part-idx" - ), + help=("splits the list of files into num-parts and " + "processes part-idx"), ) parser.add_argument( - p1 + "num-parts", + "--num-parts", type=int, default=1, - help=( - "splits the list of files into num-parts and " "processes part-idx" - ), + help=("splits the list of files into num-parts and " + "processes part-idx"), ) except: pass + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + add_argparse_args = add_class_args class RandomAccessAudioReader(AudioReader): - def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): super().__init__(file_path, segments_path, wav_scale) def _read(self, keys, time_offset=0, time_durs=0): @@ -386,9 +391,8 @@ def _read(self, keys, time_offset=0, time_durs=0): raise Exception("Key %s not found" % key) file_path, _, _ = self.scp[key] - x_i, fs_i = self.read_wavspecifier( - file_path, self.wav_scale, offset_i, dur_i - ) + x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, + offset_i, dur_i) data.append(x_i) fs.append(fs_i) @@ -406,7 +410,9 @@ def read(self, keys, time_offset=0, time_durs=0): fs: List of sampling freq. """ try: - x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + x, fs = self._read(keys, + time_offset=time_offset, + time_durs=time_durs) except: if isinstance(keys, str): keys = [keys] @@ -422,23 +428,17 @@ def read(self, keys, time_offset=0, time_durs=0): # we try to read from # time-offset to the end of the file, and remove the extra frames later, # this solves the problem in most cases - logging.info( - ( - "error-1 reading at keys={} offset={} " - "retrying reading until end-of-file ..." - ).format(keys, time_offset) - ) + logging.info(("error-1 reading at keys={} offset={} " + "retrying reading until end-of-file ...").format( + keys, time_offset)) x, fs = self._read(keys, time_offset=time_offset) for i in range(len(x)): end_sample = int(time_durs[i] * fs[i]) x[i] = x[i][:end_sample] except: # try to read the full file - logging.info( - ( - "error-2 reading at key={}, " "retrying reading full file ..." - ).format(keys) - ) + logging.info(("error-2 reading at key={}, " + "retrying reading full file ...").format(keys)) x, fs = self._read(keys) for i in range(len(x)): start_sample = int(time_offset[i] * fs[i]) @@ -449,21 +449,25 @@ def read(self, keys, time_offset=0, time_durs=0): @staticmethod def filter_args(**kwargs): - valid_args = ("wav_scale",) + valid_args = ("wav_scale", ) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "wav-scale", - default=2 ** 15 - 1, + "--wav-scale", + default=2**15 - 1, type=float, help=("multiplicative factor for waveform"), ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index 52547954..b2a90f4b 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -13,13 +13,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Note we use `rnnt_loss` from torchaudio, which exists only in torchaudio >= v0.10.0. It also means you have to use torch >= v1.10.0 """ from jsonargparse import ArgumentParser, ActionParser, ActionYesNo -import k2 +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + import torch import torch.nn as nn import torchaudio @@ -37,9 +40,8 @@ class Transducer(TorchModel): """It implements https://arxiv.org/pdf/1211.3711.pdf "Sequence Transduction with Recurrent Neural Networks" """ - def __init__( - self, + self, vocab_size, blank_id, # conformer_enc, @@ -75,9 +77,6 @@ def __init__( self.decoder = Decoder(**decoder) self.joiner = Joiner(**joiner) - - - def forward( self, x: torch.Tensor, @@ -100,7 +99,7 @@ def forward( assert x.ndim == 3, x.shape assert x_lens.ndim == 1, x_lens.shape assert y.num_axes == 2, y.num_axes - + assert x.size(0) == x_lens.size(0) == y.dim0 # wav2vec2 works as encoder @@ -128,11 +127,9 @@ def forward( assert hasattr(torchaudio.functional, "rnnt_loss"), ( f"Current torchaudio version: {torchaudio.__version__}\n" - "Please install a version >= 0.10.0" - ) - - x_lens = x_lens.to(torch.int32) + "Please install a version >= 0.10.0") + x_lens = x_lens.to(torch.int32) loss = torchaudio.functional.rnnt_loss( logits=logits, @@ -145,7 +142,6 @@ def forward( return logits, loss - def set_train_mode(self, mode): if mode == self._train_mode: return @@ -162,8 +158,6 @@ def set_train_mode(self, mode): self._train_mode = mode - - def _train(self, train_mode: str): if train_mode in ["full", "frozen"]: super()._train(train_mode) @@ -179,8 +173,8 @@ def get_config(self): join_cfg = self.joiner.get_config() config = { - "blank_id" : self.blank_id, - "vocab_size" : self.vocab_size, + "blank_id": self.blank_id, + "vocab_size": self.vocab_size, "decoder": dec_cfg, "joiner": join_cfg, } @@ -199,7 +193,7 @@ def filter_args(**kwargs): args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) args["decoder"] = decoder_args - args["joiner"] = joiner_args + args["joiner"] = joiner_args return args @staticmethod @@ -209,17 +203,13 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - Decoder.add_class_args( - parser, prefix="decoder" - ) + Decoder.add_class_args(parser, prefix="decoder") - Joiner.add_class_args( - parser, prefix="joiner" - ) + Joiner.add_class_args(parser, prefix="joiner") if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) # def change_config( # self, diff --git a/hyperion/torch/utils/dummy_k2.py 
b/hyperion/torch/utils/dummy_k2.py new file mode 100644 index 00000000..27d387de --- /dev/null +++ b/hyperion/torch/utils/dummy_k2.py @@ -0,0 +1,10 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +version = 0.0 + + +class RaggedTensor: + pass diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index ad676e72..d6ff7602 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -7,6 +7,7 @@ import logging from collections import OrderedDict from copy import deepcopy +import re import numpy as np import pandas as pd @@ -21,7 +22,6 @@ class InfoTable(object): Attributes: df: pandas dataframe. """ - def __init__(self, df): self.df = df assert "id" in df, f"info_table={df}" @@ -89,8 +89,9 @@ def save(self, file_path, sep=None): file_path = Path(file_path) file_path.parent.mkdir(parents=True, exist_ok=True) ext = file_path.suffix - if ext in ["", ".scp"]: + if ext in ["", ".scp"] or re.match(r"\.[0-9]+$", ext): # if no extension we save as kaldi utt2spk file + assert len(self.df.columns) == 2 self.df.to_csv(file_path, sep=" ", header=False, index=False) return @@ -99,6 +100,22 @@ def save(self, file_path, sep=None): self.df.to_csv(file_path, sep=sep, index=False) + @classmethod + def from_lists(cls, ids, column_names, column_data): + df_dict = {"id": ids} + assert len(column_names) == len(column_data) + for name, data in zip(column_names, column_data): + assert len(ids) == len(data) + df_dict[name] = data + df = pd.DataFrame(df_dict) + return cls(df) + + @classmethod + def from_dict(cls, df_dict): + assert "id" in df_dict + df = pd.DataFrame(df_dict) + return cls(df) + @classmethod def load(cls, file_path, sep=None, name="class_id"): """Loads utt2info list from text file. @@ -120,7 +137,10 @@ def load(cls, file_path, sep=None, name="class_id"): sep=" ", header=None, names=["id", name], - dtype={"id": np.str, name: np.str}, + dtype={ + "id": np.str, + name: np.str + }, ) else: if sep is None: @@ -149,7 +169,8 @@ def split(self, idx, num_parts, group_by=None): if group_by is None: _, idx1 = split_list(self.df["id"], idx, num_parts) else: - _, idx1 = split_list_group_by_key(self.df[group_by], idx, num_parts) + _, idx1 = split_list_group_by_key(self.df[group_by], idx, + num_parts) df = self.df.iloc[idx1] return self.__class__(df) @@ -168,10 +189,14 @@ def merge(cls, tables): df = pd.concat(df_list) return cls(df) - def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): - assert ( - items is None or iindex is None - ), "items and iindex cannot be not None at the same time" + def filter(self, + items=None, + iindex=None, + columns=None, + by="id", + keep=True): + assert (items is None or iindex is None + ), "items and iindex cannot be not None at the same time" df = self.df if not keep: @@ -220,7 +245,51 @@ def __cmp__(self, other): return 0 return 1 - # def __len__(self): + def shuffle(self, seed=1024, rng=None): + """Shuffles the elements of the list. + + Args: + seed: Seed for random number generator. + rng: numpy random number generator object. + + Returns: + Index used to shuffle the list. 
+ """ + if rng is None: + rng = np.random.RandomState(seed=seed) + index = np.arange(len(self.df)) + rng.shuffle(index) + self.df = self.df.iloc[index] + return index + + def set_index(self, keys, inplace=True): + if inplace: + self.df.set_index(keys, drop=False, inplace=True) + return + + df = self.df.set_index(keys, drop=False, inplace=False) + return type(self)(df) + + def reset_index(self): + self.df.set_index("id", drop=False, inplace=True) + + def get_loc(self, keys): + if isinstance(keys, (list, np.ndarray)): + return self.df.index.get_indexer(keys) + + loc = self.df.index.get_loc(keys) + if isinstance(loc, int): + return loc + elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: + return np.nonzero(loc)[0] + else: + return list(range(loc.start, loc.stop, loc.step)) + + def get_col_idx(self, keys): + return self.df.columns.get_loc(keys) + + # def __len__(self): + # """Returns the number of elements in the list.""" # return len(self.df) @@ -375,46 +444,3 @@ def __cmp__(self, other): # utt_info = self.utt_info.iloc[index] # return Utt2Info(utt_info) - - def shuffle(self, seed=1024, rng=None): - """Shuffles the elements of the list. - - Args: - seed: Seed for random number generator. - rng: numpy random number generator object. - - Returns: - Index used to shuffle the list. - """ - if rng is None: - rng = np.random.RandomState(seed=seed) - index = np.arange(len(self.df)) - rng.shuffle(index) - self.df = self.df.iloc[index] - return index - - def set_index(self, keys, inplace=True): - if inplace: - self.df.set_index(keys, drop=False, inplace=True) - return - - df = self.df.set_index(keys, drop=False, inplace=False) - return type(self)(df) - - def reset_index(self): - self.df.set_index("id", drop=False, inplace=True) - - def get_loc(self, keys): - if isinstance(keys, (list, np.ndarray)): - return self.df.index.get_indexer(keys) - - loc = self.df.index.get_loc(keys) - if isinstance(loc, int): - return loc - elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: - return np.nonzero(loc)[0] - else: - return list(range(loc.start, loc.stop, loc.step)) - - def get_col_idx(self, keys): - return self.df.columns.get_loc(keys) diff --git a/hyperion/utils/text.py b/hyperion/utils/text.py index 48be92e3..be70f638 100644 --- a/hyperion/utils/text.py +++ b/hyperion/utils/text.py @@ -4,20 +4,23 @@ """ from pathlib import Path -import k2 -import k2.version +try: + import k2 + import k2.version +except ModuleNotFoundError: + from ..torch.utils import dummy_k2 as k2 + import numpy as np import pandas as pd import torch - # Copied and modified from https://github.com/wenet-e2e/wenet/blob/main/wenet/utils/mask.py def subsequent_chunk_mask( - size: int, - chunk_size: int, - num_left_chunks: int = -1, - device: torch.device = torch.device("cpu"), + size: int, + chunk_size: int, + num_left_chunks: int = -1, + device: torch.device = torch.device("cpu"), ) -> torch.Tensor: """Create mask for subsequent steps (size, size) with chunk size, this is for streaming encoder @@ -48,7 +51,6 @@ def subsequent_chunk_mask( return ret - def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: """ Args: @@ -76,9 +78,8 @@ def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: return expaned_lengths >= lengths.unsqueeze(1) -def concat( - ragged: k2.RaggedTensor, value: int, direction: str -) -> k2.RaggedTensor: +def concat(ragged: k2.RaggedTensor, value: int, + direction: str) -> k2.RaggedTensor: """Prepend a value to the beginning of each sublist or append a value. to the end of each sublist. 
@@ -123,10 +124,8 @@ def concat( elif direction == "right": ans = k2.ragged.cat([ragged, pad], axis=1) else: - raise ValueError( - f'Unsupported direction: {direction}. " \ - "Expect either "left" or "right"' - ) + raise ValueError(f'Unsupported direction: {direction}. " \ + "Expect either "left" or "right"') return ans @@ -156,7 +155,7 @@ def read_text(text_file: str): # assert check_argument_types() text_file = Path(text_file) - data = {"id":[],"text":[]} + data = {"id": [], "text": []} with Path(text_file).open("r", encoding="utf-8") as f: for linenum, line in enumerate(f, 1): sps = line.rstrip().split(maxsplit=1) @@ -169,4 +168,3 @@ def read_text(text_file: str): data["id"].append(k) data["text"].append(v) return pd.DataFrame(data=data, index=data["id"]) - From 898d4e1b553455a4f13f148c68b80bd4a015cfce Mon Sep 17 00:00:00 2001 From: neillu23 Date: Wed, 30 Nov 2022 14:38:13 -0500 Subject: [PATCH 052/154] upload missing file: download_lm.py --- egs/librispeech/v1/local/download_lm.py | 99 +++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100755 egs/librispeech/v1/local/download_lm.py diff --git a/egs/librispeech/v1/local/download_lm.py b/egs/librispeech/v1/local/download_lm.py new file mode 100755 index 00000000..030122aa --- /dev/null +++ b/egs/librispeech/v1/local/download_lm.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file downloads the following LibriSpeech LM files: + + - 3-gram.pruned.1e-7.arpa.gz + - 4-gram.arpa.gz + - librispeech-vocab.txt + - librispeech-lexicon.txt + - librispeech-lm-norm.txt.gz + +from http://www.openslr.org/resources/11 +and save them in the user provided directory. + +Files are not re-downloaded if they already exist. 
+ +Usage: + ./local/download_lm.py --out-dir ./download/lm +""" + +import argparse +import gzip +import logging +import os +import shutil +from pathlib import Path + +from lhotse.utils import urlretrieve_progress +from tqdm.auto import tqdm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--out-dir", type=str, help="Output directory.") + + args = parser.parse_args() + return args + + +def main(out_dir: str): + url = "http://www.openslr.org/resources/11" + out_dir = Path(out_dir) + + files_to_download = ( + "3-gram.pruned.1e-7.arpa.gz", + "4-gram.arpa.gz", + "librispeech-vocab.txt", + "librispeech-lexicon.txt", + "librispeech-lm-norm.txt.gz", + ) + + for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"): + filename = out_dir / f + if filename.is_file() is False: + urlretrieve_progress( + f"{url}/{f}", + filename=filename, + desc=f"Downloading {filename}", + ) + else: + logging.info(f"{filename} already exists - skipping") + + if ".gz" in str(filename): + unzipped = Path(os.path.splitext(filename)[0]) + if unzipped.is_file() is False: + with gzip.open(filename, "rb") as f_in: + with open(unzipped, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + else: + logging.info(f"{unzipped} already exist - skipping") + + +if __name__ == "__main__": + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + + logging.basicConfig(format=formatter, level=logging.INFO) + + args = get_args() + logging.info(f"out_dir: {args.out_dir}") + + main(out_dir=args.out_dir) From 7f9ee742da9e4544185c827994531b0da2aec79c Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 30 Nov 2022 16:33:10 -0500 Subject: [PATCH 053/154] added script to install k2 --- egs/librispeech/v1/run_004_compute_bpe.sh | 9 +-- tools/install_k2_from_src.sh | 68 +++++++++++++++++++++++ 2 files changed, 70 insertions(+), 7 deletions(-) create mode 100755 tools/install_k2_from_src.sh diff --git a/egs/librispeech/v1/run_004_compute_bpe.sh b/egs/librispeech/v1/run_004_compute_bpe.sh index f1fa36b1..0bfeacb9 100755 --- a/egs/librispeech/v1/run_004_compute_bpe.sh +++ b/egs/librispeech/v1/run_004_compute_bpe.sh @@ -5,12 +5,7 @@ # . ./cmd.sh . ./path.sh - - set -e -nodes=fs01 -storage_name=$(date +'%m_%d_%H_%M') -. ./datapath.sh vocab_sizes=( # 5000 @@ -19,13 +14,13 @@ vocab_sizes=( 500 ) - dl_dir=$PWD/download -stage=2 +stage=1 config_file=default_config.sh . parse_options.sh || exit 1; +. ./datapath.sh . 
$config_file

diff --git a/tools/install_k2_from_src.sh b/tools/install_k2_from_src.sh
new file mode 100755
index 00000000..0bd2e972
--- /dev/null
+++ b/tools/install_k2_from_src.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+if [ $# -ne 2 ];then
+    echo "Usage: $0 <env-name> <cuda-root>"
+    echo " e.g.: $0 hyperion /usr/local/cuda"
+fi
+
+env_name=$1
+CUDA_ROOT=$2
+
+eval "$(conda shell.bash hook)"
+conda activate $env_name
+
+#module load cuda10.2/toolkit
+#module load gcc
+
+#conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch
+
+CUDA_VERSION=$(echo "import torch; print(torch.version.cuda)" | python)
+CUDNN_VERSION=$(echo "import torch; print(torch.__config__.show())" | python | awk '/CuDNN/ { print $NF}')
+
+# Install cmake
+echo "Installing CMAKE"
+conda install -c anaconda cmake
+echo "Installing NVIDIA CUDA=$CUDA_VERSION CUDNN=$CUDNN_VERSION"
+conda install -c nvidia cudnn=$CUDNN_VERSION cudatoolkit=$CUDA_VERSION
+
+#conda install -c k2-fsa -c conda-forge kaldilm
+
+echo "Download k2"
+git clone https://github.com/k2-fsa/k2.git
+cd k2
+
+ENV_PATH=$(which python | sed 's@/bin/python$@@')
+NVCC=$CUDA_ROOT/bin/nvcc
+CUDNN_LIBRARY_PATH=${ENV_PATH}/lib
+CUDNN_INCLUDE_PATH=${ENV_PATH}/include
+CUDA_TOOLKIT_DIR=$ENV_PATH
+export PATH=$CUDA_ROOT/bin:$PATH
+
+export K2_CMAKE_ARGS="\
+-DCMAKE_BUILD_TYPE=Release \
+-DCMAKE_CUDA_COMPILER=$NVCC \
+-DPYTHON_EXECUTABLE=$(which python) \
+-DCUDNN_LIBRARY_PATH=$CUDNN_LIBRARY_PATH/libcudnn.so \
+-DCUDNN_INCLUDE_PATH=$CUDNN_INCLUDE_PATH \
+-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_ROOT"
+
+export K2_MAKE_ARGS="-j6"
+
+echo "Compile k2 with CMAKE_ARGS=$K2_CMAKE_ARGS"
+python setup.py install
+cd -
+
+
+# pip install lhotse
+
+# export OT_CMAKE_ARGS=$K2_CMAKE_ARGS
+# git clone https://github.com/csukuangfj/optimized_transducer
+# cd optimized_transducer
+# python setup.py install
+# cd -
+
+
+# git clone https://github.com/k2-fsa/icefall
+# cd icefall
+# pip install -r requirements.txt
+# export PYTHONPATH=./icefall:$PYTHONPATH

From 72b8b3af323d83a4d727eece8f43866b23aa7a2d Mon Sep 17 00:00:00 2001
From: Jesus Villalba 
Date: Mon, 5 Dec 2022 15:44:01 -0500
Subject: [PATCH 054/154] fixed duration extraction in libri recipe

---
 egs/librispeech/v1/cmd.sh                    |  4 ++--
 egs/librispeech/v1/local/data_prep.sh        |  2 +-
 egs/librispeech/v1/local/prepare_lang.py     | 13 +++++--------
 egs/librispeech/v1/local/prepare_lang_bpe.py | 18 ++++++++----------
 egs/librispeech/v1/run_001_prepare_data.sh   |  7 ++++---
 hyp_utils/conda_env.sh                       |  3 ++-
 hyp_utils/xvectors/audio_to_duration.sh      |  4 +++-
 7 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/egs/librispeech/v1/cmd.sh b/egs/librispeech/v1/cmd.sh
index 71f3bae0..89dbb7d8 100755
--- a/egs/librispeech/v1/cmd.sh
+++ b/egs/librispeech/v1/cmd.sh
@@ -19,8 +19,8 @@ if [ "$(hostname -d)" == "cm.gemini" ];then
     #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G"
     #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G"
 else
-    export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V"
-    export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V"
+    export train_cmd="queue.pl --config conf/clsp.conf --mem 4G "
+    export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G"
     export cuda_eval_cmd="$train_cmd"
 fi
 
diff --git a/egs/librispeech/v1/local/data_prep.sh b/egs/librispeech/v1/local/data_prep.sh
index d1ec7f52..cb446a12 100755
--- a/egs/librispeech/v1/local/data_prep.sh
+++ b/egs/librispeech/v1/local/data_prep.sh
@@ -53,7 +53,7 @@ for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
     fi
find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ - awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 + awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >>$wav_scp|| exit 1 chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 diff --git a/egs/librispeech/v1/local/prepare_lang.py b/egs/librispeech/v1/local/prepare_lang.py index 74e09629..39d76146 100755 --- a/egs/librispeech/v1/local/prepare_lang.py +++ b/egs/librispeech/v1/local/prepare_lang.py @@ -14,8 +14,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - """ This script takes as input a lexicon file "data/lang_phone/lexicon.txt" consisting of words and tokens (i.e., phones) and does the following: @@ -43,7 +41,6 @@ import torch from hyperion.utils.lexicon import read_lexicon, write_lexicon -from hyperion.utils.utils import str2bool Lexicon = List[Tuple[str, List[str]]] @@ -61,8 +58,8 @@ def get_args(): parser.add_argument( "--debug", - type=str2bool, default=False, + action="store_true", help="""True for debugging, which will generate a visualization of the lexicon FST. @@ -205,9 +202,8 @@ def generate_id_map(symbols: List[str]) -> Dict[str, int]: return {sym: i for i, sym in enumerate(symbols)} -def add_self_loops( - arcs: List[List[Any]], disambig_token: int, disambig_word: int -) -> List[List[Any]]: +def add_self_loops(arcs: List[List[Any]], disambig_token: int, + disambig_word: int) -> List[List[Any]]: """Adds self-loops to states of an FST to propagate disambiguation symbols through it. They are added on each state with non-epsilon output symbols on at least one arc out of the state. @@ -406,7 +402,8 @@ def main(): L_disambig.labels_sym = labels_sym L_disambig.aux_labels_sym = aux_labels_sym - L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt") + L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", + title="L_disambig.pt") if __name__ == "__main__": diff --git a/egs/librispeech/v1/local/prepare_lang_bpe.py b/egs/librispeech/v1/local/prepare_lang_bpe.py index d8cee8ed..7838b6a0 100755 --- a/egs/librispeech/v1/local/prepare_lang_bpe.py +++ b/egs/librispeech/v1/local/prepare_lang_bpe.py @@ -15,9 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - # Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) - """ This script takes as input `lang_dir`, which should contain:: @@ -49,8 +47,6 @@ write_mapping, ) -from hyperion.utils.utils import str2bool - def lexicon_to_fst_no_sil( lexicon: Lexicon, @@ -126,9 +122,8 @@ def lexicon_to_fst_no_sil( return fsa -def generate_lexicon( - model_file: str, words: List[str] -) -> Tuple[Lexicon, Dict[str, int]]: +def generate_lexicon(model_file: str, + words: List[str]) -> Tuple[Lexicon, Dict[str, int]]: """Generate a lexicon from a BPE model. Args: @@ -180,8 +175,8 @@ def get_args(): parser.add_argument( "--debug", - type=str2bool, default=False, + action="store_true", help="""True for debugging, which will generate a visualization of the lexicon FST. 
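The --debug option above moves from a str2bool type to argparse's built-in store_true action, so it becomes a bare flag rather than an option that parses an explicit true/false value. A short standalone sketch of the resulting behavior, using plain argparse and independent of these scripts:

import argparse

parser = argparse.ArgumentParser()
# With action="store_true" the option takes no argument: passing --debug
# yields True, omitting it yields the default False.
parser.add_argument("--debug", default=False, action="store_true")

assert parser.parse_args([]).debug is False
assert parser.parse_args(["--debug"]).debug is True

This is also what lets the diff drop the str2bool import from hyperion.utils.utils.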
@@ -204,7 +199,9 @@ def main():
 
     words = word_sym_table.symbols
 
-    excluded = ["<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"]
+    excluded = [
+        "<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"
+    ]
     for w in excluded:
         if w in words:
             words.remove(w)
@@ -254,7 +251,8 @@ def main():
 
     L_disambig.labels_sym = labels_sym
     L_disambig.aux_labels_sym = aux_labels_sym
-    L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}", title="L_disambig.pt")
+    L_disambig.draw(f"{lang_dir / 'L_disambig.svg'}",
+                    title="L_disambig.pt")
 
 if __name__ == "__main__":
diff --git a/egs/librispeech/v1/run_001_prepare_data.sh b/egs/librispeech/v1/run_001_prepare_data.sh
index ba24f5d6..8502b724 100755
--- a/egs/librispeech/v1/run_001_prepare_data.sh
+++ b/egs/librispeech/v1/run_001_prepare_data.sh
@@ -23,10 +23,11 @@ if [ ${stage} -le 1 ]; then
   ### Task dependent. You have to make data the following preparation part by yourself.
   ### But you can utilize Kaldi recipes in most cases
   echo "stage 0: Data preparation"
-  for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
+  for part in train-clean-360 train-other-500 #dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500
+  do
     # use underscore-separated names in data directories.
-    #local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_}
-    steps_xvec/audio_to_duration.sh data/${part//-/_}
+    local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_}
+    steps_xvec/audio_to_duration.sh --cmd "$train_cmd" data/${part//-/_}
   done
 fi
 
diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh
index 0a8f7a41..905567ee 100755
--- a/hyp_utils/conda_env.sh
+++ b/hyp_utils/conda_env.sh
@@ -66,7 +66,8 @@ if [ $num_gpus -gt 0 ];then
       fi
     fi
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-    export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters
+    #export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters
+    #export $LD_LIBRARY_PATH=/opt/NVIDIA/cuda-10/targets/x86_64-linux/lib/stubs:$LD_LIBRARY_PATH
     if [ $num_gpus -gt 1 ];then
diff --git a/hyp_utils/xvectors/audio_to_duration.sh b/hyp_utils/xvectors/audio_to_duration.sh
index 56a8ffe2..f4187919 100755
--- a/hyp_utils/xvectors/audio_to_duration.sh
+++ b/hyp_utils/xvectors/audio_to_duration.sh
@@ -36,7 +36,9 @@ $cmd JOB=1:$nj $output_dir/log/audio_to_duration.JOB.log \
     hyp_utils/conda_env.sh \
     audio_to_duration.py \
     --audio-file $data_in/wav.scp \
-    --output-file $output_dir/utt2dur.JOB
+    --output-file $output_dir/utt2dur.JOB \
+    --part-idx JOB --num-parts $nj
+
 
 for n in $(seq $nj); do
   cat $output_dir/utt2dur.$n || exit 1;

From 9460cc340b0d1919f8578c8bda64629dd293e3fb Mon Sep 17 00:00:00 2001
From: Jesus Villalba 
Date: Mon, 5 Dec 2022 17:34:19 -0500
Subject: [PATCH 055/154] fix dataprep

---
 egs/librispeech/v1/run_001_prepare_data.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/librispeech/v1/run_001_prepare_data.sh b/egs/librispeech/v1/run_001_prepare_data.sh
index 8502b724..0708e667 100755
--- a/egs/librispeech/v1/run_001_prepare_data.sh
+++ b/egs/librispeech/v1/run_001_prepare_data.sh
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ]; then
   ### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases echo "stage 0: Data preparation" - for part in train-clean-360 train-other-500 #dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500 + for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500 do # use underscore-separated names in data directories. local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_} From 118d1d4f0de5b4d1b8412353a7174fa7066aa692 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 7 Dec 2022 08:59:37 -0500 Subject: [PATCH 056/154] fixed bucketing sampler --- ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 18 +- .../v1/conf/wav2vec2xlsr300m_transducer.yaml | 3 + hyp_utils/conda_env.sh | 3 +- hyperion/bin/train_wav2vec2transducer.py | 78 +++++---- hyperion/torch/data/bucketing_seg_sampler.py | 43 +++-- hyperion/torch/data/seg_sampler.py | 76 ++++---- hyperion/torch/data/seg_sampler_factory.py | 33 ++-- hyperion/torch/models/transducer/decoder.py | 58 ++++--- .../torch/models/transducer/transducer.py | 1 - .../wav2transducer/hf_wav2transducer.py | 90 +++++----- hyperion/torch/trainers/torch_trainer.py | 162 ++++++++++-------- hyperion/torch/trainers/transducer_trainer.py | 24 ++- 12 files changed, 320 insertions(+), 269 deletions(-) diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index 50750cd8..f89108ea 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -7,11 +7,10 @@ data: return_segment_info: - text sampler: - sampler_type: 'seg_sampler' - # sampler_type: 'bucketing_seg_sampler' - min_batch_size: 4 - batch_size: 4 - iters_per_epoch: 6 + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 drop_last: true data_loader: num_workers: 8 @@ -23,11 +22,10 @@ data: return_segment_info: - text sampler: - sampler_type: 'seg_sampler' - # sampler_type: 'bucketing_seg_sampler' - min_batch_size: 2 - batch_size: 2 - iters_per_epoch: 6 + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 drop_last: true data_loader: num_workers: 8 diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml index b8a1cdbb..a7071b8c 100644 --- a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml @@ -5,6 +5,9 @@ transducer: embedding_dim: 1024 num_layers: 2 hidden_dim: 512 + #embedding_dim: 128 + #num_layers: 1 + #hidden_dim: 64 joiner: num_layers: 1 feat_fusion_method: weighted-avg diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 905567ee..35c14680 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -67,9 +67,8 @@ if [ $num_gpus -gt 0 ];then fi echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" #export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters - #export $LD_LIBRARY_PATH=/opt/NVIDIA/cuda-10/targets/x86_64-linux/lib/stubs:$LD_LIBRARY_PATH if [ $num_gpus -gt 1 ];then - # export CUDA_LAUNCH_BLOCKING=1 + #export CUDA_LAUNCH_BLOCKING=1 [[ $(type -P "$torchrun") ]] && command="torchrun" \ || command="python -m torch.distributed.run" command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 3e4ccb84..8156f9b1 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -31,7 +31,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer from torch.nn.utils.rnn import pad_sequence - model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, } @@ -49,8 +48,7 @@ def transducer_collate(batch): audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) - return torch.transpose(audio,0,1), audio_length, target - + return torch.transpose(audio, 0, 1), audio_length, target def init_data(partition, rank, num_gpus, **kwargs): @@ -76,10 +74,14 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ( - {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} - ) - data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_collate) return data_loader @@ -87,7 +89,7 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network args={}".format(model_args)) - # TODO: check model_args + # TODO: check model_args model_args["transducer"]["blank_id"] = blank_id model_args["transducer"]["vocab_size"] = vocab_size model = model_class(**model_args) @@ -96,9 +98,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): return model - - - def train_model(gpu_id, args): config_logger(args.verbose) @@ -108,6 +107,9 @@ def 
train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) @@ -121,12 +123,13 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), **kwargs) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} #{"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -142,7 +145,7 @@ def train_model(gpu_id, args): def make_parser(model_class): parser = ArgumentParser() - + parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") AD.add_class_args(train_parser, prefix="dataset", skip={}) @@ -164,46 +167,51 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument( "--data.train.dataset.text_file", - type=str, + type=str, ) - - parser.add_argument("--data.val.dataset.text_file", type=str) - + + parser.add_argument("--data.val.dataset.text_file", type=str) + parser.add_argument( "--data.train.dataset.bpe_model", - type=str, + type=str, ) - parser.link_arguments( - "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" - ) + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") - parser.link_arguments( - "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" - ) + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args( - parser, prefix="trainer", train_modes=model_class.valid_train_modes() - ) + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) ddp.add_ddp_args(parser) - parser.add_argument("--seed", type=int, default=1123581321, help="random seed") - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) return parser if __name__ == "__main__": - parser = ArgumentParser(description="Train Wav2Vec2Transducer model from audio files") + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 8b0e855a..9c8384bf 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ 
b/hyperion/torch/data/bucketing_seg_sampler.py @@ -9,21 +9,20 @@ import numpy as np import torch +import torch.distributed as dist + from .hyp_sampler import HypSampler from .seg_sampler import SegSampler -import torch.distributed as dist class BucketingSegSampler(HypSampler): - def __init__( - self, - seg_set, - base_sampler=SegSampler, - num_buckets=10, - length_column="duration", - seed=1234, - **base_kwargs - ): + def __init__(self, + seg_set, + base_sampler=SegSampler, + num_buckets=10, + length_column="duration", + seed=1234, + **base_kwargs): super().__init__(shuffle=False, seed=seed) self.seg_set = seg_set self.base_sampler = base_sampler @@ -33,12 +32,13 @@ def __init__( self.length_column = length_column self._create_bucket_samplers() self._compute_len() - self.depleted_buckets = torch.zeros((num_buckets,), dtype=torch.bool) + self.depleted_buckets = torch.zeros((num_buckets, ), dtype=torch.bool) def create_buckets(self): - sort_idx = torch.argsort(torch.from_numpy(self.seg_set[self.length_column].values)) + sort_idx = np.argsort(self.seg_set[self.length_column].values) sorted_seg_set = self.seg_set.iloc[sort_idx] - cum_lengths = torch.cumsum(torch.from_numpy(sorted_seg_set[self.length_column].values),dim=0) + cum_lengths = np.cumsum(sorted_seg_set[self.length_column].values, + axis=0) bucket_length = cum_lengths[-1] / self.num_buckets buckets = [] for i in range(self.num_buckets): @@ -72,6 +72,7 @@ def set_epoch(self, epoch): def __iter__(self): super().__iter__() + self.depleted_buckets[:] = False for i in range(self.num_buckets): self.bucket_samplers[i].__iter__() @@ -86,9 +87,10 @@ def __next__(self): raise StopIteration while True: - bucket_idx = torch.randint( - low=0, high=self.num_buckets, size=(1,), generator=self.rng - ).item() + bucket_idx = torch.randint(low=0, + high=self.num_buckets, + size=(1, ), + generator=self.rng).item() if self.depleted_buckets[bucket_idx]: continue @@ -107,6 +109,15 @@ def __next__(self): self.batch += 1 return batch + @property + def avg_batch_size(self): + avg_batch_size = 0 + for sampler in self.bucket_samplers: + avg_batch_size += sampler.avg_batch_size + + avg_batch_size /= self.num_buckets + return avg_batch_size + @staticmethod def filter_args(**kwargs): diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 73319dca..8c5ad306 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -33,19 +33,20 @@ def __init__( self.var_batch_size = max_batch_length is not None self.length_name = length_name if self.var_batch_size: - avg_batch_size = max_batch_length / torch.mean( - self.seg_set[self.length_name] - ) + avg_batch_size = max_batch_length / np.mean( + self.seg_set[self.length_name]) else: avg_batch_size = min_batch_size self.avg_batch_size = avg_batch_size - num_batches = len(self.seg_set) / avg_batch_size / self.world_size if drop_last: - self._len = int(num_batches) + self._len = int( + len(self.seg_set) / (avg_batch_size * self.world_size)) else: - self._len = int(math.ceil(num_batches)) + self._len = int( + math.ceil( + (len(self.seg_set) // self.world_size) / avg_batch_size)) self._permutation = None @@ -53,9 +54,8 @@ def __len__(self): return self._len def _shuffle_segs(self): - self._permutation = torch.randperm( - len(self.seg_set), generator=self.rng - ).numpy() + self._permutation = torch.randperm(len(self.seg_set), + generator=self.rng).numpy() def __iter__(self): super().__iter__() @@ -71,50 +71,47 @@ def __next__(self): raise StopIteration if 
self.var_batch_size: + column_idx = self.seg_set.columns.get_loc(self.length_name) idxs = [] max_length = 0 batch_size = 0 while True: - if self._shuffle: + if self.shuffle: idx = self._permutation[self.start] else: idx = self.start - max_length = max(max_length, self.seg_set.iloc[idx].duration.values) + max_length = max(max_length, self.seg_set.iloc[idx, + column_idx]) if max_length * (batch_size + 1) > self.max_batch_length: break idxs.append(idx) self.start = (self.start + self.world_size) % len(self.seg_set) batch_size += 1 - if ( - self.max_batch_size is not None - and batch_size >= self.max_batch_size - ): + if (self.max_batch_size is not None + and batch_size >= self.max_batch_size): break - assert len(idxs) > self.min_batch_size + assert len( + idxs + ) >= 1, f"increase max_batch_length {self.max_batch_length} >= {max_length}" else: - stop = min( - self.start + self.world_size * self.min_batch_size, len(self.seg_set) - ) + stop = min(self.start + self.world_size * self.min_batch_size, + len(self.seg_set)) if self.shuffle: - idx = self._permutation[self.start : stop : self.world_size] + idxs = self._permutation[self.start:stop:self.world_size] else: - idx = slice(self.start, stop, self.world_size) + idxs = slice(self.start, stop, self.world_size) self.start += self.world_size * self.min_batch_size if "chunk_start" in self.seg_set: - chunks = self.seg_set.iloc[idx] - seg_ids = [ - (id, s, d) - for id, s, d in zip( - chunks.seg_id, chunks.chunk_start, chunks[self.length_name] - ) - ] + chunks = self.seg_set.iloc[idxs] + seg_ids = [(id, s, d) for id, s, d in zip( + chunks.seg_id, chunks.chunk_start, chunks[self.length_name])] else: - seg_ids = self.seg_set.iloc[idx].id + seg_ids = self.seg_set.iloc[idxs].id if self.batch == 0: logging.info("batch 0 chunks=%s", str(seg_ids[:10])) @@ -153,18 +150,18 @@ def add_class_args(parser, prefix=None): "--max-batch-size", type=int, default=None, - help=( - "maximum batch size per gpu, if None, estimated from max_batch_length" - ), + help= + ("maximum batch size per gpu, if None, estimated from max_batch_length" + ), ) parser.add_argument( "--max-batch-duration", type=float, default=None, - help=( - "maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" - ), + help= + ("maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), ) parser.add_argument( @@ -176,7 +173,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--shuffle", action=ActionYesNo, - help="shuffles the segments or chunks at the beginning of the epoch", + help= + "shuffles the segments or chunks at the beginning of the epoch", ) parser.add_argument( @@ -189,7 +187,9 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--length-name", default="duration", - help="which column in the segment table indicates the duration of the file", + help= + "which column in the segment table indicates the duration of the file", ) if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index 251d937b..512f2f64 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -15,7 +15,8 @@ from .bucketing_seg_sampler import BucketingSegSampler sampler_dict = { - 
"class_weighted_random_seg_chunk_sampler": ClassWeightedRandomSegChunkSampler, + "class_weighted_random_seg_chunk_sampler": + ClassWeightedRandomSegChunkSampler, "seg_sampler": SegSampler, "seg_chunk_sampler": SegChunkSampler, "bucketing_seg_sampler": BucketingSegSampler, @@ -26,7 +27,6 @@ class SegSamplerFactory(object): """Factory class to create different types of samplers for sequencial data like audio or acoustic features. """ - @staticmethod def create( dataset: Union[AudioDataset, FeatSeqDataset], @@ -112,7 +112,8 @@ def add_class_args(parser, prefix=None): "--base-sampler-type", choices=["seg_sampler", "bucketing_seg_sampler"], default="seg_sampler", - help="base sampler used for seg_chunk_sampler or bucketing_seg_sampler", + help= + "base sampler used for seg_chunk_sampler or bucketing_seg_sampler", ) parser.add_argument( @@ -139,9 +140,9 @@ def add_class_args(parser, prefix=None): "--max-batch-size", type=int, default=None, - help=( - "maximum batch size per gpu, if None, estimated from max_batch_length" - ), + help= + ("maximum batch size per gpu, if None, estimated from max_batch_length" + ), ) parser.add_argument( @@ -152,12 +153,12 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--max-batch-duration", + "--max-batch-length", type=float, default=None, - help=( - "maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" - ), + help= + ("maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), ) parser.add_argument( @@ -223,7 +224,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--shuffle", action=ActionYesNo, - help="shuffles the segments or chunks at the beginning of the epoch", + help= + "shuffles the segments or chunks at the beginning of the epoch", ) parser.add_argument( "--seed", @@ -235,13 +237,16 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--length-name", default="duration", - help="which column in the segment table indicates the duration of the segment", + help= + "which column in the segment table indicates the duration of the segment", ) parser.add_argument( "--class-name", default="class_id", - help="which column in the segment table indicates the class of the segment", + help= + "which column in the segment table indicates the class of the segment", ) if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py index bd7bd202..3effea9e 100644 --- a/hyperion/torch/models/transducer/decoder.py +++ b/hyperion/torch/models/transducer/decoder.py @@ -98,6 +98,7 @@ def forward( """ embedding_out = self.embedding(y) embedding_out = self.embedding_dropout(embedding_out) + #print("yy", y.shape, embedding_out.shape, y) rnn_out, (h, c) = self.rnn(embedding_out, states) out = self.output_linear(rnn_out) @@ -105,18 +106,17 @@ def forward( def get_config(self): config = { - "in_feats" : self.in_feats, - "blank_id" : self.blank_id, - "vocab_size" : self.vocab_size, - "embedding_dim" :self.embedding_dim, - "num_layers" : self.num_layers, - "hidden_dim" : self.hidden_dim, + "in_feats": self.in_feats, + "blank_id": self.blank_id, + "vocab_size": self.vocab_size, + "embedding_dim": self.embedding_dim, + "num_layers": self.num_layers, + "hidden_dim": self.hidden_dim, } # base_config = 
super().get_config() return dict(list(config.items())) - @staticmethod def filter_args(**kwargs): valid_args = ( @@ -132,36 +132,38 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None, skip=set(["in_feats", "blank_id", "vocab_size" ])): + def add_class_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") if "in_feats" not in skip: - parser.add_argument( - "--in-feats", type=int, required=True, help=("input feature dimension") - ) + parser.add_argument("--in-feats", + type=int, + required=True, + help=("input feature dimension")) if "blank_id" not in skip: - parser.add_argument( - "--blank-id", type=int, required=True, help=("blank id from sp model") - ) + parser.add_argument("--blank-id", + type=int, + required=True, + help=("blank id from sp model")) if "vocab_size" not in skip: - parser.add_argument( - "--vocab-size", type=int, required=True, help=("output prediction dimension") - ) - parser.add_argument( - "--embedding-dim", default=1024, type=int, help=("feature dimension") - ) + parser.add_argument("--vocab-size", + type=int, + required=True, + help=("output prediction dimension")) + parser.add_argument("--embedding-dim", + default=1024, + type=int, + help=("feature dimension")) - parser.add_argument( - "--num-layers", default=2, type=int, help=("") - ) + parser.add_argument("--num-layers", default=2, type=int, help=("")) - parser.add_argument( - "--hidden-dim", default=512, type=int, help=("") - ) + parser.add_argument("--hidden-dim", default=512, type=int, help=("")) if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index b2a90f4b..2d523b7c 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -204,7 +204,6 @@ def add_class_args(parser, prefix=None, skip=set()): parser = ArgumentParser(prog="") Decoder.add_class_args(parser, prefix="decoder") - Joiner.add_class_args(parser, prefix="joiner") if prefix is not None: diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py index b5bd220f..d21bb777 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py @@ -16,7 +16,6 @@ # from ..wav2xvectors.hf_wav2xvector import HFWav2XVector - class HFWav2Transducer(TorchModel): """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. @@ -28,10 +27,11 @@ class HFWav2Transducer(TorchModel): feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more than one layer is used. 
""" - - def __init__( - self, hf_feats, transducer, feat_fusion_start=0, feat_fusion_method="weighted-avg" - ): + def __init__(self, + hf_feats, + transducer, + feat_fusion_start=0, + feat_fusion_method="weighted-avg"): super().__init__() self.hf_feats = hf_feats @@ -52,9 +52,12 @@ def _make_fuser(self): self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + self.feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + self.feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. @@ -69,7 +72,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start :] + hid_feats = hid_feats[self.feat_fusion_start:] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) @@ -119,14 +122,14 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def forward_feats( - self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False - ): - return_hid_states = ( - False - if return_feat_layers is None and self.feat_fusion_method == "last" - else True - ) + def forward_feats(self, + x, + x_lengths, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method == "last" else True) with self._hf_context: hf_output = self.hf_feats( x, @@ -148,8 +151,7 @@ def forward_feats( # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. 
hid_feats = [ - f.transpose(1, 2) - for i, f in enumerate(hid_feats) + f.transpose(1, 2) for i, f in enumerate(hid_feats) if i in return_feat_layers ] else: @@ -189,11 +191,10 @@ def forward( "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) """ feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers - ) + x, x_lengths, return_feat_layers) feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) - + output, loss = self.transducer( feats, feat_lengths, @@ -226,17 +227,16 @@ def extract_embed( x, x_lengths = remove_silence(x, x_lengths) feats, _, feat_lengths = self.forward_feats( - x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks - ) - xvec_chunk_length = int( - xvec_chunk_length - * self.hf_feats.sample_frequency - * feats.size(-1) - // x.size(-1) - ) - return self.transducer.extract_embed( - feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks - ) + x, + x_lengths, + chunk_length=hf_chunk_length, + detach_chunks=detach_chunks) + xvec_chunk_length = int(xvec_chunk_length * + self.hf_feats.sample_frequency * + feats.size(-1) // x.size(-1)) + return self.transducer.extract_embed(feats, feat_lengths, + xvec_chunk_length, embed_layer, + detach_chunks) def freeze_feat_fuser(self): if self.feat_fuser is None: @@ -299,11 +299,11 @@ def _train(self, train_mode: str): self.hf_feats.train() self.transducer._train("ft-embed_affine") elif train_mode in [ - "ft-transducer", - "hf-feats-frozen", - "ft-transducer-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", ]: self.hf_feats.train() self.transducer._train("full") @@ -365,26 +365,20 @@ def add_class_args(parser, prefix=None, skip=set()): "--feat-fusion-start", default=0, type=int, - help=( - "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers" - ), + help= + ("the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers"), ) parser.add_argument( "--feat-fusion-method", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], - help=( - "method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, cat]" - ), + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]"), ) if prefix is not None: outer_parser.add_argument( "--" + prefix, action=ActionParser(parser=parser), - help="xvector options", ) - - diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 5f573904..8d396719 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -67,7 +67,6 @@ class TorchTrainer(object): swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__( self, model, @@ -107,9 +106,8 @@ def __init__( self.exp_path = Path(exp_path) if loggers is None: - self.loggers = self._default_loggers( - log_interval, use_tensorboard, use_wandb, wandb - ) + self.loggers = self._default_loggers(log_interval, use_tensorboard, + use_wandb, wandb) elif isinstance(loggers, list): self.loggers = LoggerList(loggers) else: @@ -142,25 +140,31 @@ def __init__( self.rank = dist.get_rank() self.world_size = dist.get_world_size() if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: - self.model = 
nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm( + self.model) if self.rank == 0: logging.info( "training in multiple gpus with distributed-data-parallel" ) oss = False if ddp_type == DDPType.DDP else True - self.optimizer = self._make_optimizer(optim, self.model, oss=oss) + self.optimizer = self._make_optimizer(optim, + self.model, + oss=oss) self.model = TorchDDP( self.model, device_ids=[device], output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm( + self.model) if self.rank == 0: logging.info( "training in multiple gpus with fair sharded-distributed-data-parallel" ) - self.optimizer = self._make_optimizer(optim, self.model, oss=True) + self.optimizer = self._make_optimizer(optim, + self.model, + oss=True) self.model = FairShardedDDP(self.model, self.optimizer) else: if self.rank == 0: @@ -173,7 +177,9 @@ def __init__( mixed_precision=self.use_amp, move_params_to_cpu=cpu_offload, ) - self.optimizer = self._make_optimizer(optim, self.model, oss=False) + self.optimizer = self._make_optimizer(optim, + self.model, + oss=False) else: self.optimizer = self._make_optimizer(optim, self.model) @@ -203,9 +209,9 @@ def __init__( if self.rank == 0: logging.info("init SWA model") self.swa_model = AveragedModel(self.model) - self.swa_scheduler = SWALR( - self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs - ) + self.swa_scheduler = SWALR(self.optimizer, + swa_lr=self.swa_lr, + anneal_epochs=self.swa_anneal_epochs) def set_epoch(self, data_loader): try: @@ -239,7 +245,8 @@ def fit(self, train_data, val_data=None): if self.lr_scheduler is not None: # this is needed by cosine scheduler epoch_updates = int(len(train_data) / self.grad_acc_steps) - self.lr_scheduler.on_epoch_begin(epoch, epoch_updates=epoch_updates) + self.lr_scheduler.on_epoch_begin(epoch, + epoch_updates=epoch_updates) logs = self.train_epoch(train_data) if val_data is not None: @@ -261,7 +268,8 @@ def fit(self, train_data, val_data=None): self.save_checkpoint(logs) if self.in_swa: - self.loggers.on_epoch_begin(self.cur_epoch, batches=len(train_data)) + self.loggers.on_epoch_begin(self.cur_epoch, + batches=len(train_data)) self.model = self.swa_model.module logs = self.bn_update_epoch(train_data) @@ -366,9 +374,9 @@ def bn_update_epoch(self, data_loader): def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): if self.ddp: if self.ddp_type == DDPType.DDP: - nn.utils.clip_grad_norm_( - model.parameters(), grad_clip, norm_type=grad_clip_norm - ) + nn.utils.clip_grad_norm_(model.parameters(), + grad_clip, + norm_type=grad_clip_norm) return if self.ddp_type == DDPType.FULLY_SHARDED_DDP: # we have to use the member function in FullyShardedDDP class @@ -380,26 +388,24 @@ def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): optim.clip_grad_norm(grad_clip, norm_type=grad_clip_norm) # if no DDP clip normally - nn.utils.clip_grad_norm_( - model.parameters(), grad_clip, norm_type=grad_clip_norm - ) + nn.utils.clip_grad_norm_(model.parameters(), + grad_clip, + norm_type=grad_clip_norm) def update_model(self): """Updates the model and does gradding clipping.""" if self.use_amp: if self.grad_clip > 0: self.grad_scaler.unscale_(self.optimizer) - self._clip_grad_norm( - self.model, self.optimizer, self.grad_clip, self.grad_clip_norm - ) + self._clip_grad_norm(self.model, self.optimizer, + self.grad_clip, 
self.grad_clip_norm) self.grad_scaler.step(self.optimizer) self.grad_scaler.update() else: if self.grad_clip > 0: - self._clip_grad_norm( - self.model, self.optimizer, self.grad_clip, self.grad_clip_norm - ) + self._clip_grad_norm(self.model, self.optimizer, + self.grad_clip, self.grad_clip_norm) self.optimizer.step() @@ -428,21 +434,20 @@ def _make_lr_sched(self, lr_sched, optim): lr_sched = LRSF.create(optim, **args) return lr_sched - def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): + def _default_loggers(self, log_interval, use_tensorboard, use_wandb, + wandb): """Creates the default data loaders""" prog_log = ProgLogger(interval=log_interval) csv_log = CSVLogger(self.exp_path / "train.log", append=True) loggers = [prog_log, csv_log] if use_tensorboard: loggers.append( - TensorBoardLogger(self.exp_path / "tb", interval=log_interval) - ) + TensorBoardLogger(self.exp_path / "tb", interval=log_interval)) if use_wandb: loggers.append( - WAndBLogger( - **wandb, path=self.exp_path / "wandb", interval=log_interval - ) - ) + WAndBLogger(**wandb, + path=self.exp_path / "wandb", + interval=log_interval)) return LoggerList(loggers) def _get_lr(self): @@ -458,7 +463,7 @@ def _compute_grad_acc_steps(self, data_loader): try: batch_size = data_loader.batch_sampler.avg_batch_size except: - logging.warn( + logging.warning( "batch sampler doesn't have avg_batch_size property, " "we cannot estimate grad_acc_steps, using grad_acc_steps=%d", self.grad_acc_steps, @@ -466,8 +471,7 @@ def _compute_grad_acc_steps(self, data_loader): return self.grad_acc_steps = int( - math.ceil(self.eff_batch_size / batch_size / self.world_size) - ) + math.ceil(self.eff_batch_size / batch_size / self.world_size)) logging.info( "Setting grad_acc_steps=%d for " "eff_batch_size=%d, avg_batch_size=%d, world_size=%d", @@ -478,7 +482,7 @@ def _compute_grad_acc_steps(self, data_loader): ) return - logging.warn( + logging.warning( "We cannot determine the batch_size, " "we cannot estimate grad_acc_steps, using grad_acc_steps=%d", self.grad_acc_steps, @@ -491,24 +495,30 @@ def checkpoint(self, logs=None): logs: logs containing the current value of the metrics. """ checkpoint = { - "epoch": self.cur_epoch, - "rng_state": torch.get_rng_state(), - "model_cfg": self.model.get_config(), - "model_state_dict": self.model.state_dict(), - "optimizer_state_dict": self.optimizer.state_dict(), - "loss_state_dict": self.loss.state_dict() - if self.loss is not None - else None, + "epoch": + self.cur_epoch, + "rng_state": + torch.get_rng_state(), + "model_cfg": + self.model.get_config(), + "model_state_dict": + self.model.state_dict(), + "optimizer_state_dict": + self.optimizer.state_dict(), + "loss_state_dict": + self.loss.state_dict() if self.loss is not None else None, } if self.lr_scheduler is not None: - checkpoint["lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() + checkpoint[ + "lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() if logs is not None: checkpoint["logs"] = logs if self.in_swa: checkpoint["swa_model_state_dict"] = self.swa_model.state_dict() - checkpoint["swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() + checkpoint[ + "swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() return checkpoint @@ -518,9 +528,8 @@ def save_checkpoint(self, logs=None): Args: logs: logs containing the current value of the metrics. 
""" - if self.ddp and ( - self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP - ): + if self.ddp and (self.ddp_type == DDPType.OSS_DDP + or self.ddp_type == DDPType.OSS_SHARDED_DDP): # Not sure what this does, just copying from the example in # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py # Check the checkpointing in the case of the OSS optimizer @@ -575,16 +584,17 @@ def load_checkpoint(self, file_path): if self.loss is not None: self.loss.load_state_dict(checkpoint["loss_state_dict"]) if self.lr_scheduler is not None: - self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"]) + self.lr_scheduler.load_state_dict( + checkpoint["lr_scheduler_state_dict"]) # if self.use_amp: # amp.load_state_dict(checkpoint['amp']) if self.do_swa: if "swa_model_state_dict" in checkpoint: - self.swa_model.load_state_dict(checkpoint["swa_model_state_dict"]) + self.swa_model.load_state_dict( + checkpoint["swa_model_state_dict"]) self.swa_scheduler.load_state_dict( - checkpoint["swa_scheduler_state_dict"] - ) + checkpoint["swa_scheduler_state_dict"]) else: self.swa_scheduler = SWALR( self.optimizer, @@ -662,9 +672,13 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=[]): "--eff-batch-size", type=int, default=None, - help="effective total batch size, if given, it overrides grad_acc_steps", + help= + "effective total batch size, if given, it overrides grad_acc_steps", ) - parser.add_argument("--epochs", type=int, default=200, help="number of epochs") + parser.add_argument("--epochs", + type=int, + default=200, + help="number of epochs") if train_modes is not None: parser.add_argument( "--train-mode", @@ -684,12 +698,19 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=[]): default=False, help="use tensorboard logger", ) - parser.add_argument( - "--use-wandb", action="store_true", default=False, help="use wandb logger" - ) - parser.add_argument("--wandb.project", default=None, help="wandb project name") - parser.add_argument("--wandb.group", default=None, help="wandb group name") - parser.add_argument("--wandb.name", default=None, help="wandb display name") + parser.add_argument("--use-wandb", + action="store_true", + default=False, + help="use wandb logger") + parser.add_argument("--wandb.project", + default=None, + help="wandb project name") + parser.add_argument("--wandb.group", + default=None, + help="wandb group name") + parser.add_argument("--wandb.name", + default=None, + help="wandb display name") # parser.add_argument( # '--wandb.path', default=None, # help='wandb directory') @@ -718,9 +739,10 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=[]): default=False, help="CPU offload of gradients when using fully_sharded_ddp", ) - parser.add_argument( - "--grad-clip", type=float, default=0, help="gradient clipping norm value" - ) + parser.add_argument("--grad-clip", + type=float, + default=0, + help="gradient clipping norm value") parser.add_argument( "--grad-clip-norm", default=2, @@ -733,9 +755,10 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=[]): default=0, help="start epoch for SWA, if 0 it does not use SWA", ) - parser.add_argument( - "--swa-lr", type=float, default=1e-3, help="learning rate for SWA phase" - ) + parser.add_argument("--swa-lr", + type=float, + default=1e-3, + help="learning rate for SWA phase") parser.add_argument( "--swa-anneal-epochs", type=int, @@ -746,6 +769,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=[]): 
parser.add_argument("--exp-path", help="experiment path") if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index bbe847d0..932c3ed4 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -47,7 +47,6 @@ class TransducerTrainer(TorchTrainer): swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__( self, model, @@ -129,13 +128,19 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() # TODO: Check and Modify data, target - data, audio_length, target = data.to(self.device), audio_length.to(self.device), target.to(self.device) + data, audio_length, target = data.to(self.device), audio_length.to( + self.device), target.to(self.device) batch_size = data.shape[0] with self.amp_autocast(): - output, loss = self.model(data, x_lengths=audio_length, y=target) + # print("xx", data.shape, data.shape[0] * data.shape[1] / 16000, + # torch.sum(audio_length).item() / 16000, + # torch.min(audio_length).item() / 16000, + # torch.max(audio_length).item() / 16000) + output, loss = self.model(data, + x_lengths=audio_length, + y=target) loss = loss.mean() / self.grad_acc_steps - # loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -161,7 +166,6 @@ def train_epoch(self, data_loader): logs["lr"] = self._get_lr() return logs - def validation_epoch(self, data_loader, swa_update_bn=False): """Validation epoch loop @@ -181,13 +185,17 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, (data, audio_length, target) in enumerate(data_loader): - data, audio_length, target = data.to(self.device), audio_length.to(self.device), target.to(self.device) + data, audio_length, target = data.to( + self.device), audio_length.to(self.device), target.to( + self.device) batch_size = data.shape[0] # data, target = data.to(self.device), target.to(self.device) # batch_size = data.shape[0] with self.amp_autocast(): - output, loss = self.model(data, x_lengths=audio_length, y=target) + output, loss = self.model(data, + x_lengths=audio_length, + y=target) # output = self.model(data) # loss = self.loss(output, target) @@ -199,4 +207,4 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) - return logs \ No newline at end of file + return logs From 49b829b0a3fb8c3a8f28dd61775f877619aff00d Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 8 Dec 2022 09:41:27 -0500 Subject: [PATCH 057/154] transducer training, working in clsp grid, cudnn=False --- ...in_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml | 9 +++++---- egs/librispeech/v1/run_011_train_asr.sh | 2 +- hyp_utils/conda_env.sh | 14 ++++++++++++-- hyperion/torch/data/seg_sampler.py | 4 ++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index f89108ea..9349efa7 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ 
b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -11,9 +11,9 @@ data: sampler_type: 'bucketing_seg_sampler' max_batch_length: 85. min_batch_size: 1 - drop_last: true + drop_last: false data_loader: - num_workers: 8 + num_workers: 4 val: dataset: aug_cfgs: @@ -28,7 +28,7 @@ data: min_batch_size: 1 drop_last: true data_loader: - num_workers: 8 + num_workers: 4 model: wav2vec2xlsr300m_transducer.yaml trainer: optim: @@ -48,7 +48,8 @@ trainer: use_amp: true log_interval: 1000 epochs: 60 - eff_batch_size: 1024 + # eff_batch_size: 1024 + eff_batch_size: 128 train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh index 7c2c0f70..868cf4d1 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -46,7 +46,7 @@ if [ $stage -le 1 ]; then mkdir -p $nnet_s1_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 35c14680..ceee4e93 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -14,6 +14,7 @@ if [ -n "$HYP_ENV" ];then else conda_env=base fi +max_split_size_mb="" while true do @@ -25,6 +26,10 @@ do shift; conda_env=$1 shift; + elif [ "$1" == "--max-split-size-mb" ];then + shift; + max_split_size_mb=$1 + shift; else break fi @@ -49,7 +54,7 @@ fi conda activate $conda_env command="python" if [ $num_gpus -gt 0 ];then - # set CUDA_VISIBLE_DEVICES + # set CUDA_VISIBLE_DEVICES if [ ! 
-z "$SGE_HGR_gpu" ]; then echo "SGE_HGR_gpu=$SGE_HGR_gpu" export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') @@ -66,9 +71,14 @@ if [ $num_gpus -gt 0 ];then fi fi echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" + if [ -n "$max_split_size_mb" ];then + export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:${max_split_size_mb}" + echo "PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF} + fi + #export CUDA_LAUNCH_BLOCKING=1 #export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters if [ $num_gpus -gt 1 ];then - #export CUDA_LAUNCH_BLOCKING=1 + [[ $(type -P "$torchrun") ]] && command="torchrun" \ || command="python -m torch.distributed.run" command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 8c5ad306..cd976d11 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -111,10 +111,10 @@ def __next__(self): seg_ids = [(id, s, d) for id, s, d in zip( chunks.seg_id, chunks.chunk_start, chunks[self.length_name])] else: - seg_ids = self.seg_set.iloc[idxs].id + seg_ids = self.seg_set.iloc[idxs].id.values if self.batch == 0: - logging.info("batch 0 chunks=%s", str(seg_ids[:10])) + logging.info("batch 0 seg_ids=%s", str(seg_ids[:10])) self.batch += 1 return seg_ids From 829aa7de1aff8e8ae654923be41b06b97392ce88 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 13 Dec 2022 12:42:29 -0500 Subject: [PATCH 058/154] w2v2 transducer with do --- ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 2 +- ...v2vec2xlsr300m_transducer_stage1_v3.0.yaml | 55 +++++++++++++++++++ ...v2vec2xlsr300m_transducer_stage1_v3.1.yaml | 55 +++++++++++++++++++ .../conf/wav2vec2xlsr300m_transducer_do.yaml | 13 +++++ .../wav2vec2xlsr300m_transducer_do0.2.yaml | 13 +++++ .../v1/global_conf/config_transducer_v3.1.sh | 39 +++++++++++++ .../v1/global_conf/config_transducer_v3.sh | 39 +++++++++++++ hyperion/torch/lr_schedulers/noam_lr.py | 3 +- hyperion/torch/models/transducer/decoder.py | 22 ++++++-- 9 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml create mode 100644 egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do.yaml create mode 100644 egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v3.1.sh create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v3.sh diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml index 9349efa7..edc0af5e 100644 --- a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml @@ -47,7 +47,7 @@ trainer: grad_clip: 100 use_amp: true log_interval: 1000 - epochs: 60 + epochs: 120 # eff_batch_size: 1024 eff_batch_size: 128 train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml new file mode 100644 index 00000000..49077fd6 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml new file mode 100644 index 00000000..9f070bbe --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.2.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do.yaml new file mode 100644 index 00000000..c7fc2df7 --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.1 + rnn_dropout_rate: 0.1 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml new file mode 100644 index 00000000..1ee4ec72 --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.2 + rnn_dropout_rate: 0.2 + joiner: 
+ num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.1.sh b/egs/librispeech/v1/global_conf/config_transducer_v3.1.sh new file mode 100644 index 00000000..0aa4d949 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v3.1.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.1 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.sh b/egs/librispeech/v1/global_conf/config_transducer_v3.sh new file mode 100644 index 00000000..3871ee55 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/torch/lr_schedulers/noam_lr.py b/hyperion/torch/lr_schedulers/noam_lr.py index 4acdc3b9..8e4f2f1c 100644 --- a/hyperion/torch/lr_schedulers/noam_lr.py +++ b/hyperion/torch/lr_schedulers/noam_lr.py @@ -28,7 +28,6 @@ class NoamLR(InvPowLR): step: initial training step, this is needed to restart the model training. 
""" - def __init__( self, optimizer, @@ -47,7 +46,7 @@ def __init__( # different modules of the model max_lr = 0 for group in optimizer.param_groups: - max_lr = max(lr, max_lr) + max_lr = max(group["lr"], max_lr) for group in optimizer.param_groups: group["lr"] = lr * group["lr"] / max_lr super().__init__( diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py index 3effea9e..833394d0 100644 --- a/hyperion/torch/models/transducer/decoder.py +++ b/hyperion/torch/models/transducer/decoder.py @@ -31,8 +31,8 @@ def __init__( num_layers: int, hidden_dim: int, in_feats: int, - embedding_dropout: float = 0.0, - rnn_dropout: float = 0.0, + embedding_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, ): """ Args: @@ -59,14 +59,14 @@ def __init__( embedding_dim=embedding_dim, padding_idx=blank_id, ) - self.embedding_dropout = nn.Dropout(embedding_dropout) + self.embedding_dropout = nn.Dropout(embedding_dropout_rate) # TODO(fangjun): Use layer normalized LSTM self.rnn = nn.LSTM( input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True, - dropout=rnn_dropout, + dropout=rnn_dropout_rate, ) self.in_feats = in_feats @@ -75,6 +75,8 @@ def __init__( self.embedding_dim = embedding_dim self.num_layers = num_layers self.hidden_dim = hidden_dim + self.embedding_dropout_rate = embedding_dropout_rate + self.rnn_dropout_rate = rnn_dropout_rate self.output_linear = nn.Linear(hidden_dim, in_feats) def forward( @@ -112,6 +114,8 @@ def get_config(self): "embedding_dim": self.embedding_dim, "num_layers": self.num_layers, "hidden_dim": self.hidden_dim, + "embedding_dropout_rate": self.embedding_dropout_rate, + "rnn_dropout_rate": self.rnn_dropout_rate, } # base_config = super().get_config() @@ -126,6 +130,8 @@ def filter_args(**kwargs): "embedding_dim", "num_layers", "hidden_dim", + "embedding_dropout_rate", + "rnn_dropout_rate", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -159,6 +165,14 @@ def add_class_args(parser, default=1024, type=int, help=("feature dimension")) + parser.add_argument("--embedding-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN ")) parser.add_argument("--num-layers", default=2, type=int, help=("")) From 43f6c4cdaa035c6a626f46c599c183c9cb5efc2f Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 15 Dec 2022 10:15:23 -0500 Subject: [PATCH 059/154] more transducer configs --- ...v2vec2xlsr300m_transducer_stage1_v2.0.yaml | 55 +++++++++++++++++++ ...v2vec2xlsr300m_transducer_stage1_v3.2.yaml | 55 +++++++++++++++++++ .../wav2vec2xlsr300m_transducer_do0.3.yaml | 13 +++++ .../wav2vec2xlsr300m_transducer_enclast.yaml | 11 ++++ .../v1/global_conf/config_transducer_v2.sh | 39 +++++++++++++ .../v1/global_conf/config_transducer_v3.2.sh | 39 +++++++++++++ 6 files changed, 212 insertions(+) create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml create mode 100644 egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml create mode 100644 egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v2.sh create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v3.2.sh diff --git 
a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml new file mode 100644 index 00000000..aefddc7e --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_enclast.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml new file mode 100644 index 00000000..d787a373 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.3.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml new file mode 100644 index 00000000..ca7c1995 --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml new file mode 100644 index 00000000..1d46c33c --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml @@ -0,0 +1,11 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + joiner: + num_layers: 1 +feat_fusion_method: last + diff --git a/egs/librispeech/v1/global_conf/config_transducer_v2.sh b/egs/librispeech/v1/global_conf/config_transducer_v2.sh new file mode 100644 index 00000000..f663e2dd --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v2.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.2.sh b/egs/librispeech/v1/global_conf/config_transducer_v3.2.sh new file mode 100644 index 
00000000..2ff8d3c9 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v3.2.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 63a1216b2778565e840068f178ee2463d74fc550 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Tue, 20 Dec 2022 21:45:57 -0500 Subject: [PATCH 060/154] Merge and add decode for ASR --- ...et50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 68 ---- .../v1/global_conf/config_transducer_v1.sh | 5 +- egs/librispeech/v1/run_030_inference.sh | 52 +-- .../decode_wav2vec2transducer.sh | 80 ++++ egs/voxceleb/v2/path.sh | 2 +- hyperion/bin/decode_wav2transducer.py | 360 ++++++++++++++++++ hyperion/bin/finetune_wav2vec2xvector.py | 2 +- hyperion/bin/train_wav2vec2transducer.py | 3 +- hyperion/torch/models/transducer/joiner.py | 2 + .../models/wav2transducer/beam_search.py | 216 +++++++++++ hyperion/torch/trainers/transducer_trainer.py | 3 + hyperion/torch/utils/__init__.py | 1 + 12 files changed, 682 insertions(+), 112 deletions(-) delete mode 100644 egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh create mode 100755 egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh create mode 100755 hyperion/bin/decode_wav2transducer.py create mode 100644 hyperion/torch/models/wav2transducer/beam_search.py diff --git a/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index bf6c2fb8..00000000 --- a/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,68 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -# x-vector training -nnet_data=voxcelebcat -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -ipe=$nnet_num_augs -min_chunk=4 -max_chunk=4 -lr=0.05 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 
0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxcelebcat -else - plda_data=voxcelebcat_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - -diar_plda_num_augs=0 -if [ $diar_plda_num_augs -eq 0 ]; then - diar_plda_data=voxcelebcat -else - diar_plda_data=voxcelebcat_augx${plda_num_augs} -fi -diar_plda_type=splda -diar_lda_dim=150 -diar_plda_y_dim=150 -diar_plda_z_dim=150 - -diar_plda_name=lda${diar_lda_dim}_${diar_plda_type}y${diar_plda_y_dim}_v1_${diar_plda_data} -diar_thr=-7 -diar_dir=exp/diarization/$nnet_name/${diar_plda_name}/ahc_pcar1_thr${diar_thr} -diar_name=diar_res2net50w26s4_thr${diar_thr} diff --git a/egs/librispeech/v1/global_conf/config_transducer_v1.sh b/egs/librispeech/v1/global_conf/config_transducer_v1.sh index ca1ca29c..7fc1508f 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v1.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v1.sh @@ -9,6 +9,7 @@ hf_model_name=wav2vec2xlsr300m # x-vector training nnet_data=train_clean_100 dev_data=dev_clean +test_data=test_clean # nnet_data=train_clean_small bpe_model=data/lang_bpe_1000/bpe.model @@ -23,9 +24,9 @@ nnet_name=${hf_model_name}_transducer_v1.0 nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0060.pth +nnet_s1=${nnet_s1_dir}_pre/model_ep0060.pth -nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name diff --git a/egs/librispeech/v1/run_030_inference.sh b/egs/librispeech/v1/run_030_inference.sh index 67122f85..fb76088b 100755 --- a/egs/librispeech/v1/run_030_inference.sh +++ b/egs/librispeech/v1/run_030_inference.sh @@ -7,20 +7,17 @@ . ./path.sh set -e -stage=2 config_file=default_config.sh use_gpu=false -nnet_stage=3 -hf_chunk_length=120 #seconds -xvec_chunk_length=120 #seconds +nnet_stage=1 . parse_options.sh || exit 1; . 
$config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" - xvec_cmd="$cuda_eval_cmd --mem 6G" + transducer_args="--use-gpu true" + transducer_cmd="$cuda_eval_cmd --mem 6G" else - xvec_cmd="$train_cmd --mem 12G" + transducer_cmd="$train_cmd --mem 12G" fi if [ $nnet_stage -eq 1 ];then @@ -34,41 +31,18 @@ elif [ $nnet_stage -eq 3 ];then nnet_name=$nnet_s3_name fi -xvector_dir=exp/xvectors/$nnet_name +transducer_dir=exp/transducer/$nnet_name -if [ $stage -le 1 ]; then - # Extract xvectors for training LDA/PLDA - for name in voxceleb2cat_train - do - if [ $plda_num_augs -eq 0 ]; then - steps_xvec/extract_wav2vec2xvectors.sh \ - --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ - $nnet data/${name} \ - $xvector_dir/${name} - else - steps_xvec/extract_wav2vec2xvectors.sh \ - --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ - --aug-config $plda_aug_config --num-augs $plda_num_augs \ - $nnet data/${name} \ - $xvector_dir/${name}_augx${plda_num_augs} \ - data/${name}_augx${plda_num_augs} - fi - done -fi -if [ $stage -le 2 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test +test_data=test_clean + + +# Extracts x-vectors for evaluation +for name in $test_data do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_wav2vec2xvectors.sh \ - --cmd "$xvec_cmd" --nj $nj ${xvec_args} \ + nj=16 + steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ - $xvector_dir/$name + $transducer_dir/$name $bpe_model done -fi - exit diff --git a/egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh b/egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh new file mode 100755 index 00000000..143087a5 --- /dev/null +++ b/egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" + +use_gpu=false +write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +num_augs=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] []" + echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + echo " --use-bin-vad # If true, uses binary VAD from vad.scp" + echo " --write-utt2num-frames # If true, write utt2num_frames file." 
+ echo " --chunk-length # If provided, applies encoder with specified chunk-length and " + echo " # concatenates the chunks outputs before pooling" + echo " --feat-config # feature/mvn config file" + echo " --aug-config # augmentation config file" + echo " --random-utt-length # If true, extracts a random chunk from the utterance between " + echo " # min_utt_length and max_utt_length" + echo " --min-utt-length # " + echo " --max-utt-length # " + + +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 + +for f in $data_dir/wav.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +args="" +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + args="--use-gpu" +fi + +if [ "$write_utt2num_frames" == "true" ];then + write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +fi + +if [ $stage -le 0 ];then + set +e + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2transducer.py \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --output $output_dir/transducer.JOB.text + set -e +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text +fi diff --git a/egs/voxceleb/v2/path.sh b/egs/voxceleb/v2/path.sh index 6994fdab..0dc5a9d5 100755 --- a/egs/voxceleb/v2/path.sh +++ b/egs/voxceleb/v2/path.sh @@ -2,4 +2,4 @@ export HYP_ROOT=$(readlink -f `pwd -P`/../../..) export TOOLS_ROOT=$HYP_ROOT/tools -. $TOOLS_ROOT/path.sh + . $TOOLS_ROOT/path.sh diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py new file mode 100755 index 00000000..c71df79e --- /dev/null +++ b/hyperion/bin/decode_wav2transducer.py @@ -0,0 +1,360 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + + +from typing import Dict, List, Tuple + +import sentencepiece as spm +import torch.nn as nn + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np +import pandas as pd + +import torch + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment + +from hyperion.torch.utils import open_device +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch import TorchModelLoader as TML + +from hyperion.torch.models.wav2transducer.beam_search import greedy_search, beam_search + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("transducer-model={}".format(model)) + model.to(device) + model.eval() + return model + + + +# def decode_dataset( +# dl: torch.utils.data.DataLoader, +# params: AttributeDict, +# model: nn.Module, +# sp: spm.SentencePieceProcessor, +# ) -> Dict[str, 
List[Tuple[str, List[str], List[str]]]]: +# """Decode dataset. +# Args: +# dl: +# PyTorch's dataloader containing the dataset to decode. +# params: +# It is returned by :func:`get_params`. +# model: +# The neural model. +# sp: +# The BPE model. +# Returns: +# Return a dict, whose key may be "greedy_search" if greedy search +# is used, or it may be "beam_7" if beam size of 7 is used. +# Its value is a list of tuples. Each tuple contains two elements: +# The first is the reference transcript, and the second is the +# predicted result. +# """ +# num_cuts = 0 + +# try: +# num_batches = len(dl) +# except TypeError: +# num_batches = "?" + +# if decoding_method == "greedy_search": +# log_interval = 100 +# else: +# log_interval = 2 + +# results = defaultdict(list) +# for batch_idx, batch in enumerate(dl): +# texts = batch["supervisions"]["text"] +# cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] + +# hyps_dict = decode_one_batch( +# params=params, +# model=model, +# sp=sp, +# batch=batch, +# ) + +# for name, hyps in hyps_dict.items(): +# this_batch = [] +# assert len(hyps) == len(texts) +# for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts): +# ref_words = ref_text.split() +# this_batch.append((cut_id, ref_words, hyp_words)) + +# results[name].extend(this_batch) + +# num_cuts += len(texts) + +# if batch_idx % log_interval == 0: +# batch_str = f"{batch_idx}/{num_batches}" + +# logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") +# return results + + +def decode_one_batch( + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method = "beam_search" +) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + Returns: + Return the decoding result. See above description for the format of + the returned dict. 
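+    Note:
+        Only batch size 1 is supported: ``x`` must hold a single waveform,
+        i.e., x.shape[0] == 1.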
+ """ + device = model.device + feature = x #batch["inputs"] + assert x.shape[0] == 1 + assert feature.ndim == 2 + + feature = feature.to(device) + # at entry, feature is (N, T, C) + + feature_lens = torch.Tensor([x.shape[1]]).int() #batch["supervisions"] + # feature_lens = supervisions["num_frames"].to(device) + + # encoder_out, encoder_out_lens = model.encoder(x=feature, x_lens=feature_lens) + + # print("feature",feature.shape) + # print("feature_lens",feature_lens) + encoder_out, hid_feats, encoder_out_lens = model.forward_feats(x=feature, x_lengths=feature_lens) + + hyps = [] + batch_size = encoder_out.size(0) + + encoder_out = encoder_out.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] + # fmt: on + if decoding_method == "greedy_search": + hyp = greedy_search(model=model, encoder_out=encoder_out_i) + elif decoding_method == "beam_search": + hyp = beam_search( + model=model, encoder_out=encoder_out_i, beam=5 + ) + else: + raise ValueError(f"Unsupported decoding method: {decoding_method}") + hyps.append(sp.decode(hyp).split()) + + logging.info("hyps:{}".format(" ".join(hyps[0]))) + + if decoding_method == "greedy_search": + return hyps[0] #{"greedy_search": hyps} + else: + return hyps[0] #{f"beam_{params.beam_size}": hyps} + + +def decode_transducer( + input_spec, + output_spec, + scp_sep, + model_path, + bpe_model, + use_gpu, + **kwargs +): + + device = init_device(use_gpu) + model = load_model(model_path, device) + + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) + # blank_id = self.sp.piece_to_id("") + # vocab_size = self.sp.get_piece_size() + + # if write_num_frames_spec is not None: + # keys = [] + # info = [] + + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s" % (output_spec)) + # with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with open(output_spec,"w") as writer: + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s" % (key0)) + for aug_id in range(num_augs): + t3 = time.time() + key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + + t5 = time.time() + tot_frames = x.shape[1] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + ) + + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + # x = x.transpose(1, 2).contiguous() + # x = torch.unsqueeze(x,2) + # writer.write(key + ' ' + "abc") + y = decode_one_batch(model=model, sp=sp, x=x) + writer.write(key + ' ' + ' '.join(y) + "\n") + + # y = ( + # model.extract_embed( + # x, + # chunk_length=chunk_length, + # embed_layer=embed_layer, + # ) + # .cpu() + # .numpy()[0] + # ) + + t7 = time.time() + # writer.write([key], [y]) + # if write_num_frames_spec is not None: + # keys.append(key) + # info.append(str(x.shape[-1])) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + 
"vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + ) + + # if write_num_frames_spec is not None: + # logging.info("writing num-frames to %s" % (write_num_frames_spec)) + # u2nf = Utt2Info.create(keys, info) + # u2nf.save(write_num_frames_spec) + + # if aug_info_path is not None: + # aug_df = pd.concat(aug_df, ignore_index=True) + # aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + "Extracts x-vectors from waveform computing " "acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + + AR.add_class_args(parser) + + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + + parser.add_argument("--bpe-model", required=True) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_transducer(**namespace_to_dict(args)) diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index 25722b35..adde6ed5 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -25,7 +25,7 @@ from hyperion.torch.utils import ddp from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.data import AudioDataset as AD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +# from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import ( HFWav2Vec2ResNet1dXVector, diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 8156f9b1..7313c661 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -18,7 +18,7 @@ import multiprocessing import numpy as np - +import soundfile as sf import torch import torch.nn as nn @@ -41,6 +41,7 @@ def transducer_collate(batch): audio_length = [] target = [] for record in batch: + # sf.write('/export/c06/ylu125/GSP/hyperion/egs/librispeech/v1/wavs/mix_{}.wav'.format(np.random.randn()), record[0], 16000) wav = torch.as_tensor(record[0]) audio.append(wav) audio_length.append(wav.shape[0]) diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py index 0fc1fe51..57587992 100644 --- a/hyperion/torch/models/transducer/joiner.py +++ b/hyperion/torch/models/transducer/joiner.py @@ -40,6 +40,8 @@ def forward( Returns: Return a tensor of shape (N, T, U, C). 
""" + # print("encoder_out",encoder_out.shape) + # print("decoder_out",decoder_out.shape) assert encoder_out.ndim == decoder_out.ndim == 3 assert encoder_out.size(0) == decoder_out.size(0) assert encoder_out.size(2) == decoder_out.size(2) diff --git a/hyperion/torch/models/wav2transducer/beam_search.py b/hyperion/torch/models/wav2transducer/beam_search.py new file mode 100644 index 00000000..95f6fadb --- /dev/null +++ b/hyperion/torch/models/wav2transducer/beam_search.py @@ -0,0 +1,216 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from .hf_wav2transducer import HFWav2Transducer + + +def greedy_search(model: HFWav2Transducer, encoder_out: torch.Tensor) -> List[int]: + """ + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + Returns: + Return the decoded result. + """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.transducer.decoder.blank_id + device = model.device + + sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) + decoder_out, (h, c) = model.transducer.decoder(sos) + T = encoder_out.size(1) + t = 0 + hyp = [] + + sym_per_frame = 0 + sym_per_utt = 0 + + max_sym_per_utt = 1000 + max_sym_per_frame = 3 + + while t < T and sym_per_utt < max_sym_per_utt: + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :] + # fmt: on + logits = model.transducer.joiner(current_encoder_out, decoder_out) + # logits is (1, 1, 1, vocab_size) + + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + # TODO: Use logits.argmax() + y = log_prob.argmax() + if y != blank_id: + hyp.append(y.item()) + y = y.reshape(1, 1) + decoder_out, (h, c) = model.transducer.decoder(y, (h, c)) + + sym_per_utt += 1 + sym_per_frame += 1 + + if y == blank_id or sym_per_frame > max_sym_per_frame: + sym_per_frame = 0 + t += 1 + + return hyp + + +@dataclass +class Hypothesis: + ys: List[int] # the predicted sequences so far + log_prob: float # The log prob of ys + + # Optional decoder state. We assume it is LSTM for now, + # so the state is a tuple (h, c) + decoder_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + + +def beam_search( + model: HFWav2Transducer, + encoder_out: torch.Tensor, + beam: int = 5, +) -> List[int]: + """ + It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf + espnet/nets/beam_search_transducer.py#L247 is used as a reference. + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + beam: + Beam size. + Returns: + Return the decoded result. 
+ """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.transducer.decoder.blank_id + device = model.device + + sos = torch.tensor([blank_id], device=device).reshape(1, 1) + decoder_out, (h, c) = model.transducer.decoder(sos) + T = encoder_out.size(1) + t = 0 + B = [Hypothesis(ys=[blank_id], log_prob=0.0, decoder_state=None)] + max_u = 20000 # terminate after this number of steps + u = 0 + + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} + + while t < T and u < max_u: + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :] + # fmt: on + A = B + B = [] + # for hyp in A: + # for h in A: + # if h.ys == hyp.ys[:-1]: + # # update the score of hyp + # decoder_input = torch.tensor( + # [h.ys[-1]], device=device + # ).reshape(1, 1) + # decoder_out, _ = model.decoder( + # decoder_input, h.decoder_state + # ) + # logits = model.joiner(current_encoder_out, decoder_out) + # log_prob = logits.log_softmax(dim=-1) + # log_prob = log_prob.squeeze() + # hyp.log_prob += h.log_prob + log_prob[hyp.ys[-1]].item() + + while u < max_u: + y_star = max(A, key=lambda hyp: hyp.log_prob) + A.remove(y_star) + + # Note: y_star.ys is unhashable, i.e., cannot be used + # as a key into a dict + cached_key = "_".join(map(str, y_star.ys)) + + if cached_key not in cache: + decoder_input = torch.tensor([y_star.ys[-1]], device=device).reshape( + 1, 1 + ) + + decoder_out, decoder_state = model.transducer.decoder( + decoder_input, + y_star.decoder_state, + ) + cache[cached_key] = (decoder_out, decoder_state) + else: + decoder_out, decoder_state = cache[cached_key] + + logits = model.transducer.joiner(current_encoder_out, decoder_out) + log_prob = logits.log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + + # If we choose blank here, add the new hypothesis to B. 
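+            # (emitting blank ends the expansion for encoder frame t, so the
+            #  extended hypothesis becomes a finished candidate for this step)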
+ # Otherwise, add the new hypothesis to A + + # First, choose blank + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob.item() + + # ys[:] returns a copy of ys + new_y_star = Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + # Caution: Use y_star.decoder_state here + decoder_state=y_star.decoder_state, + ) + B.append(new_y_star) + + # Second, choose other labels + for i, v in enumerate(log_prob.tolist()): + if i == blank_id: + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + decoder_state=decoder_state, + ) + A.append(new_hyp) + u += 1 + # check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + B = sorted( + [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob], + key=lambda hyp: hyp.log_prob, + reverse=True, + ) + if len(B) >= beam: + B = B[:beam] + break + t += 1 + best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:])) + ys = best_hyp.ys[1:] # [1:] to remove the blank + return ys \ No newline at end of file diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 932c3ed4..1e36f9af 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -109,6 +109,7 @@ def __init__( cpu_offload=cpu_offload, ) + @record def train_epoch(self, data_loader): """Training epoch loop @@ -124,6 +125,8 @@ def train_epoch(self, data_loader): for batch, (data, audio_length, target) in enumerate(data_loader): self.loggers.on_batch_begin(batch) + # print("data",data.shape) + # print("audio_length",audio_length) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index 3a4692dc..eec8a36a 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -5,6 +5,7 @@ from .devices import open_device from .metric_acc import MetricAcc +# from .recognition_acc import RecogAcc from .masking import seq_lengths_to_mask, scale_seq_lengths from .collation import collate_seq_1d, collate_seq_2d, collate_seq_nd from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add From 300d82b2f06e1946739d05dbd28fee4d91bf6720 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Tue, 20 Dec 2022 22:09:35 -0500 Subject: [PATCH 061/154] recover mistakenly deleted/changed files --- ...et50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 68 +++++++++++++++++++ .../v1/global_conf/config_transducer_v1.sh | 9 ++- egs/voxceleb/v2/path.sh | 2 +- hyperion/bin/finetune_wav2vec2xvector.py | 2 +- hyperion/bin/train_wav2vec2transducer.py | 2 - hyperion/torch/models/transducer/joiner.py | 2 - hyperion/torch/trainers/transducer_trainer.py | 3 - hyperion/torch/utils/__init__.py | 1 - 8 files changed, 74 insertions(+), 15 deletions(-) create mode 100644 egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh diff --git a/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh new file mode 100644 index 00000000..213380da --- /dev/null +++ b/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -0,0 +1,68 @@ +# LResNet34 x-vector with mixed 
precision training + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +# x-vector training +nnet_data=voxcelebcat +nnet_num_augs=6 +aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" + +batch_size_1gpu=32 +eff_batch_size=512 # effective batch size +ipe=$nnet_num_augs +min_chunk=4 +max_chunk=4 +lr=0.05 + +nnet_type=res2net50 +dropout=0 +embed_dim=256 +width_factor=1.625 +scale=4 +ws_tag=w26s4 + +s=30 +margin_warmup=20 +margin=0.3 + +nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" + +opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" +lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" + +nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_num_epochs=70 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=6 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxcelebcat +else + plda_data=voxcelebcat_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + +diar_plda_num_augs=0 +if [ $diar_plda_num_augs -eq 0 ]; then + diar_plda_data=voxcelebcat +else + diar_plda_data=voxcelebcat_augx${plda_num_augs} +fi +diar_plda_type=splda +diar_lda_dim=150 +diar_plda_y_dim=150 +diar_plda_z_dim=150 + +diar_plda_name=lda${diar_lda_dim}_${diar_plda_type}y${diar_plda_y_dim}_v1_${diar_plda_data} +diar_thr=-7 +diar_dir=exp/diarization/$nnet_name/${diar_plda_name}/ahc_pcar1_thr${diar_thr} +diar_name=diar_res2net50w26s4_thr${diar_thr} \ No newline at end of file diff --git a/egs/librispeech/v1/global_conf/config_transducer_v1.sh b/egs/librispeech/v1/global_conf/config_transducer_v1.sh index 7fc1508f..b88fe2a7 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v1.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v1.sh @@ -1,4 +1,4 @@ -# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 # hugging face model hf_model_name=wav2vec2xlsr300m @@ -10,7 +10,6 @@ hf_model_name=wav2vec2xlsr300m nnet_data=train_clean_100 dev_data=dev_clean test_data=test_clean -# nnet_data=train_clean_small bpe_model=data/lang_bpe_1000/bpe.model # x-vector cfg @@ -24,9 +23,9 @@ nnet_name=${hf_model_name}_transducer_v1.0 nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=${nnet_s1_dir}_pre/model_ep0060.pth +nnet_s1=$nnet_s1_dir/model_ep0060.pth -nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name @@ -37,4 +36,4 @@ nnet_s3_args="" nnet_s3_name=${nnet_name}.s3 nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0005.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth \ No newline at end of file diff --git a/egs/voxceleb/v2/path.sh b/egs/voxceleb/v2/path.sh index 0dc5a9d5..6994fdab 100755 --- 
a/egs/voxceleb/v2/path.sh +++ b/egs/voxceleb/v2/path.sh @@ -2,4 +2,4 @@ export HYP_ROOT=$(readlink -f `pwd -P`/../../..) export TOOLS_ROOT=$HYP_ROOT/tools - . $TOOLS_ROOT/path.sh +. $TOOLS_ROOT/path.sh diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index adde6ed5..25722b35 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -25,7 +25,7 @@ from hyperion.torch.utils import ddp from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.data import AudioDataset as AD -# from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import ( HFWav2Vec2ResNet1dXVector, diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 7313c661..0f1e8a3d 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -18,7 +18,6 @@ import multiprocessing import numpy as np -import soundfile as sf import torch import torch.nn as nn @@ -41,7 +40,6 @@ def transducer_collate(batch): audio_length = [] target = [] for record in batch: - # sf.write('/export/c06/ylu125/GSP/hyperion/egs/librispeech/v1/wavs/mix_{}.wav'.format(np.random.randn()), record[0], 16000) wav = torch.as_tensor(record[0]) audio.append(wav) audio_length.append(wav.shape[0]) diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py index 57587992..0fc1fe51 100644 --- a/hyperion/torch/models/transducer/joiner.py +++ b/hyperion/torch/models/transducer/joiner.py @@ -40,8 +40,6 @@ def forward( Returns: Return a tensor of shape (N, T, U, C). 
""" - # print("encoder_out",encoder_out.shape) - # print("decoder_out",decoder_out.shape) assert encoder_out.ndim == decoder_out.ndim == 3 assert encoder_out.size(0) == decoder_out.size(0) assert encoder_out.size(2) == decoder_out.size(2) diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 1e36f9af..932c3ed4 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -109,7 +109,6 @@ def __init__( cpu_offload=cpu_offload, ) - @record def train_epoch(self, data_loader): """Training epoch loop @@ -125,8 +124,6 @@ def train_epoch(self, data_loader): for batch, (data, audio_length, target) in enumerate(data_loader): self.loggers.on_batch_begin(batch) - # print("data",data.shape) - # print("audio_length",audio_length) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index eec8a36a..3a4692dc 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -5,7 +5,6 @@ from .devices import open_device from .metric_acc import MetricAcc -# from .recognition_acc import RecogAcc from .masking import seq_lengths_to_mask, scale_seq_lengths from .collation import collate_seq_1d, collate_seq_2d, collate_seq_nd from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add From 8984b349e56d4b6e6ae846658515d6c28b83422f Mon Sep 17 00:00:00 2001 From: neillu23 Date: Tue, 20 Dec 2022 22:18:11 -0500 Subject: [PATCH 062/154] fix typo --- egs/librispeech/v1/global_conf/config_transducer_v1.sh | 4 ++-- hyperion/bin/train_wav2vec2transducer.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/v1/global_conf/config_transducer_v1.sh b/egs/librispeech/v1/global_conf/config_transducer_v1.sh index b88fe2a7..39c4d90f 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v1.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v1.sh @@ -1,4 +1,4 @@ - WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 # hugging face model hf_model_name=wav2vec2xlsr300m @@ -36,4 +36,4 @@ nnet_s3_args="" nnet_s3_name=${nnet_name}.s3 nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0005.pth \ No newline at end of file +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 0f1e8a3d..8156f9b1 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -18,6 +18,7 @@ import multiprocessing import numpy as np + import torch import torch.nn as nn From e14a840e6754405732bfb98f3ccc422d93f89273 Mon Sep 17 00:00:00 2001 From: neillu23 Date: Wed, 21 Dec 2022 00:09:37 -0500 Subject: [PATCH 063/154] Add fine-tune function for transducer ASR --- ...et50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 2 +- ...v2vec2xlsr300m_transducer_stage2_v3.2.yaml | 61 +++++ .../v1/global_conf/config_transducer_v3.2.sh | 4 +- egs/librispeech/v1/run_011_train_asr.sh | 50 ++-- egs/librispeech/v1/run_030_inference.sh | 2 +- hyperion/bin/decode_wav2transducer.py | 114 +-------- hyperion/bin/finetune_wav2vec2transducer.py | 238 ++++++++++++++++++ hyperion/bin/train_wav2vec2transducer.py | 2 +- hyperion/torch/models/transducer/decoder.py | 66 +++++ 
.../torch/models/transducer/transducer.py | 87 +++---- .../wav2transducer/hf_wav2transducer.py | 16 +- 11 files changed, 449 insertions(+), 193 deletions(-) create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml create mode 100755 hyperion/bin/finetune_wav2vec2transducer.py diff --git a/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 213380da..bf6c2fb8 100644 --- a/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/chime5_spkdet/v1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -65,4 +65,4 @@ diar_plda_z_dim=150 diar_plda_name=lda${diar_lda_dim}_${diar_plda_type}y${diar_plda_y_dim}_v1_${diar_plda_data} diar_thr=-7 diar_dir=exp/diarization/$nnet_name/${diar_plda_name}/ahc_pcar1_thr${diar_thr} -diar_name=diar_res2net50w26s4_thr${diar_thr} \ No newline at end of file +diar_name=diar_res2net50w26s4_thr${diar_thr} diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml new file mode 100644 index 00000000..69c489b0 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml @@ -0,0 +1,61 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + transducer: + decoder: + override_dropouts: true + embedding_dropout_rate: 0.3 + rnn_dropout_rate: 0.3 + +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.2.sh b/egs/librispeech/v1/global_conf/config_transducer_v3.2.sh index 2ff8d3c9..9185cc3f 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v3.2.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v3.2.sh @@ -23,9 +23,9 @@ nnet_name=${hf_model_name}_transducer_v3.2 nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0060.pth +nnet_s1=$nnet_s1_dir/model_ep0120.pth -nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml nnet_s2_args="" nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh index 868cf4d1..3d0e6eb1 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -1,6 +1,6 @@ #!/bin/bash # Copyright 
-# 2019 Johns Hopkins University (Author: Jesus Villalba) +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) # Apache 2.0. # . ./cmd.sh @@ -68,23 +68,25 @@ if [ $stage -le 2 ]; then if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" fi - + mkdir -p $nnet_s2_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2xvector.py $nnet_type \ + finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ - --data.train.dataset.time-durs-file $list_dir/utt2dur \ - --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ - --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ - --data.val.dataset.time-durs-file $list_dir/utt2dur \ - --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ - --in-model-file $nnet_s1 \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ - --num-gpus $ngpu \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu fi @@ -94,22 +96,24 @@ if [ $stage -le 3 ]; then extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" fi + mkdir -p $nnet_s3_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s3_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2vec2xvector.py $nnet_type \ + finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ - --data.train.dataset.time-durs-file $list_dir/utt2dur \ - --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ - --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ - --data.val.dataset.time-durs-file $list_dir/utt2dur \ - --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ - --in-model-file $nnet_s2 \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ - --num-gpus $ngpu \ - + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu fi diff --git a/egs/librispeech/v1/run_030_inference.sh b/egs/librispeech/v1/run_030_inference.sh index fb76088b..73ac2b8f 100755 --- a/egs/librispeech/v1/run_030_inference.sh +++ b/egs/librispeech/v1/run_030_inference.sh @@ -1,6 +1,6 @@ #!/bin/bash # Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) # Apache 2.0. # . 
./cmd.sh diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index c71df79e..265a3536 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ - Copyright 2019 Jesus Villalba (Johns Hopkins University) + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ @@ -55,73 +55,6 @@ def load_model(model_path, device): model.eval() return model - - -# def decode_dataset( -# dl: torch.utils.data.DataLoader, -# params: AttributeDict, -# model: nn.Module, -# sp: spm.SentencePieceProcessor, -# ) -> Dict[str, List[Tuple[str, List[str], List[str]]]]: -# """Decode dataset. -# Args: -# dl: -# PyTorch's dataloader containing the dataset to decode. -# params: -# It is returned by :func:`get_params`. -# model: -# The neural model. -# sp: -# The BPE model. -# Returns: -# Return a dict, whose key may be "greedy_search" if greedy search -# is used, or it may be "beam_7" if beam size of 7 is used. -# Its value is a list of tuples. Each tuple contains two elements: -# The first is the reference transcript, and the second is the -# predicted result. -# """ -# num_cuts = 0 - -# try: -# num_batches = len(dl) -# except TypeError: -# num_batches = "?" - -# if decoding_method == "greedy_search": -# log_interval = 100 -# else: -# log_interval = 2 - -# results = defaultdict(list) -# for batch_idx, batch in enumerate(dl): -# texts = batch["supervisions"]["text"] -# cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] - -# hyps_dict = decode_one_batch( -# params=params, -# model=model, -# sp=sp, -# batch=batch, -# ) - -# for name, hyps in hyps_dict.items(): -# this_batch = [] -# assert len(hyps) == len(texts) -# for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts): -# ref_words = ref_text.split() -# this_batch.append((cut_id, ref_words, hyp_words)) - -# results[name].extend(this_batch) - -# num_cuts += len(texts) - -# if batch_idx % log_interval == 0: -# batch_str = f"{batch_idx}/{num_batches}" - -# logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") -# return results - - def decode_one_batch( model: nn.Module, sp: spm.SentencePieceProcessor, @@ -160,13 +93,8 @@ def decode_one_batch( feature = feature.to(device) # at entry, feature is (N, T, C) - feature_lens = torch.Tensor([x.shape[1]]).int() #batch["supervisions"] - # feature_lens = supervisions["num_frames"].to(device) + feature_lens = torch.Tensor([x.shape[1]]).int() - # encoder_out, encoder_out_lens = model.encoder(x=feature, x_lens=feature_lens) - - # print("feature",feature.shape) - # print("feature_lens",feature_lens) encoder_out, hid_feats, encoder_out_lens = model.forward_feats(x=feature, x_lengths=feature_lens) hyps = [] @@ -191,9 +119,9 @@ def decode_one_batch( logging.info("hyps:{}".format(" ".join(hyps[0]))) if decoding_method == "greedy_search": - return hyps[0] #{"greedy_search": hyps} + return hyps[0] else: - return hyps[0] #{f"beam_{params.beam_size}": hyps} + return hyps[0] def decode_transducer( @@ -211,12 +139,6 @@ def decode_transducer( sp = spm.SentencePieceProcessor() sp.load(bpe_model) - # blank_id = self.sp.piece_to_id("") - # vocab_size = self.sp.get_piece_size() - - # if write_num_frames_spec is not None: - # keys = [] - # info = [] augmenter = None aug_df = None @@ -268,27 +190,10 @@ def decode_transducer( if x.shape[1] == 0: y = np.zeros((model.embed_dim,), dtype=float_cpu()) else: - # x = x.transpose(1, 
2).contiguous() - # x = torch.unsqueeze(x,2) - # writer.write(key + ' ' + "abc") y = decode_one_batch(model=model, sp=sp, x=x) - writer.write(key + ' ' + ' '.join(y) + "\n") - - # y = ( - # model.extract_embed( - # x, - # chunk_length=chunk_length, - # embed_layer=embed_layer, - # ) - # .cpu() - # .numpy()[0] - # ) t7 = time.time() - # writer.write([key], [y]) - # if write_num_frames_spec is not None: - # keys.append(key) - # info.append(str(x.shape[-1])) + writer.write(key + ' ' + ' '.join(y) + "\n") t8 = time.time() read_time = t2 - t1 @@ -313,15 +218,6 @@ def decode_transducer( ) ) - # if write_num_frames_spec is not None: - # logging.info("writing num-frames to %s" % (write_num_frames_spec)) - # u2nf = Utt2Info.create(keys, info) - # u2nf.save(write_num_frames_spec) - - # if aug_info_path is not None: - # aug_df = pd.concat(aug_df, ignore_index=True) - # aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") - if __name__ == "__main__": diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py new file mode 100755 index 00000000..b940c024 --- /dev/null +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import k2 +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD + +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import HFWav2Vec2Transducer +from torch.nn.utils.rnn import pad_sequence + + +model_dict = { + "hf_wav2vec2transducer": HFWav2Vec2Transducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record[0]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record[1]) + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + target = k2.RaggedTensor(target) + return torch.transpose(audio,0,1), audio_length, target + + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = 
torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + return data_loader + + +def init_model(in_model_file, rank, model_class, **kwargs): + model_args = model_class.filter_finetune_args(**kwargs["model"]) + # model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network ft args={}".format(model_args)) + model = TML.load(in_model_file) + model.change_config(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + + + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + parser.link_arguments( + "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" + ) + + + parser.add_argument("--in-model-file", required=True) + model_class.add_finetune_args(parser, prefix="model") + # model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = 
parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + # multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 8156f9b1..ee60080a 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ - Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import sys diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py index 833394d0..e7a40ec0 100644 --- a/hyperion/torch/models/transducer/decoder.py +++ b/hyperion/torch/models/transducer/decoder.py @@ -15,8 +15,10 @@ # limitations under the License. from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import logging from typing import Optional, Tuple +import logging import torch import torch.nn as nn @@ -137,6 +139,17 @@ def filter_args(**kwargs): return args + + @staticmethod + def filter_finetune_args(**kwargs): + valid_args = ( + "embedding_dropout_rate", + "rnn_dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + @staticmethod def add_class_args(parser, prefix=None, @@ -181,3 +194,56 @@ def add_class_args(parser, if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + + def change_config( + self, + override_dropouts=False, + embedding_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + ): + logging.info("changing decoder config") + + if override_dropouts: + logging.info("overriding decoder dropouts") + + # for module in self.modules(): + # if isinstance(module, DropConnect1d): + # module.p *= drop_connect_rate / self.drop_connect_rate + + self.rnn_dropout_rate = rnn_dropout_rate + self.rnn.p = self.rnn_dropout_rate + + self.embedding_dropout_rate = embedding_dropout_rate + self.embedding_dropout = nn.Dropout(self.embedding_dropout_rate) + + + + @staticmethod + def add_finetune_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument("--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model." 
+ )) + parser.add_argument("--embedding-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN ")) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index 2d523b7c..8d2a09e8 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -23,6 +23,7 @@ except ModuleNotFoundError: from ...utils import dummy_k2 as k2 +import logging import torch import torch.nn as nn import torchaudio @@ -210,51 +211,41 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # def change_config( - # self, - # override_dropouts=False, - # dropout_rate=0, - # num_classes=None, - # loss_type="arc-softmax", - # cos_scale=64, - # margin=0.3, - # margin_warmup_epochs=10, - # intertop_k=5, - # intertop_margin=0.0, - # num_subcenters=2, - # ): - # logging.info("changing x-vector config") - # self.rebuild_output_layer( - # num_classes=num_classes, - # loss_type=loss_type, - # cos_scale=cos_scale, - # margin=margin, - # margin_warmup_epochs=margin_warmup_epochs, - # intertop_k=intertop_k, - # intertop_margin=intertop_margin, - # num_subcenters=num_subcenters, - # ) - - # if override_dropouts: - # logging.info("overriding x-vector dropouts") - # self.encoder_net.change_dropouts(dropout_rate) - # self.classif_net.change_dropouts(dropout_rate) - - # @staticmethod - # def filter_finetune_args(**kwargs): - # valid_args = ( - # ) - # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - # return args - - # @staticmethod - # def add_finetune_args(parser, prefix=None): - # if prefix is not None: - # outer_parser = parser - # parser = ArgumentParser(prog="") - - # if prefix is not None: - # outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - - # add_argparse_args = add_class_args - # add_argparse_finetune_args = add_finetune_args + def change_config(self, + decoder, + # joiner, + ): + logging.info("changing transducer config") + self.decoder.change_config(**decoder) + # self.joiner.change_config(**joiner) + + @staticmethod + def filter_finetune_args(**kwargs): + # get arguments for pooling + decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) + # joiner_args = Joiner.filter_finetune_args(**kwargs["joiner"]) + + valid_args = ( + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + args["decoder"] = decoder_args + # args["joiner"] = joiner_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Decoder.add_finetune_args(parser, prefix="decoder") + # Joiner.add_finetune_args(parser, prefix="joiner") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + add_argparse_args = add_class_args + add_argparse_finetune_args = add_finetune_args + + diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py index d21bb777..ec4c83b0 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py @@ -263,11 +263,11 @@ def 
set_train_mode(self, mode): self.unfreeze() elif mode == "frozen": self.freeze() - elif mode == "ft-embed-affine": - self.unfreeze() - self.freeze_feat_fuser() - self.freeze_hf_feats() - self.transducer.freeze_preembed_layers() + # elif mode == "ft-embed-affine": + # self.unfreeze() + # self.freeze_feat_fuser() + # self.freeze_hf_feats() + # self.transducer.freeze_preembed_layers() elif mode in ["ft-transducer", "ft-transducer-nograd"]: self.unfreeze() self.freeze_hf_feats() @@ -295,9 +295,9 @@ def _train(self, train_mode: str): if train_mode in ["full", "frozen"]: super()._train(train_mode) - elif train_mode == "ft-embed-affine": - self.hf_feats.train() - self.transducer._train("ft-embed_affine") + # elif train_mode == "ft-embed-affine": + # self.hf_feats.train() + # self.transducer._train("ft-embed_affine") elif train_mode in [ "ft-transducer", "hf-feats-frozen", From 90c97af3a1543eb39c2b46eeed7b2bbbaa9ada0b Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 22 Dec 2022 06:09:37 -0500 Subject: [PATCH 064/154] more transducer configs --- ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 55 +++++++++++++++++++ ...v2vec2xlsr300m_transducer_stage1_v4.3.yaml | 55 +++++++++++++++++++ ...v2vec2xlsr300m_transducer_stage1_v4.4.yaml | 55 +++++++++++++++++++ .../wav2vec2xlsr300m_transducer_do0.4.yaml | 13 +++++ .../v1/global_conf/config_transducer_v3.3.sh | 39 +++++++++++++ .../v1/global_conf/config_transducer_v4.3.sh | 39 +++++++++++++ .../v1/global_conf/config_transducer_v4.4.sh | 39 +++++++++++++ 7 files changed, 295 insertions(+) create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml create mode 100644 egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v3.3.sh create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v4.3.sh create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v4.4.sh diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml new file mode 100644 index 00000000..76d676f2 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 85. 
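+      # bucketing groups segments of similar duration; max_batch_length caps
+      # the total audio per batch (assumed to be in seconds)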
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 30 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml new file mode 100644 index 00000000..35b2b47c --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml new file mode 100644 index 00000000..855bfc98 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml @@ -0,0 +1,55 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80. 
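+      # batching matches v4.3; this config differs below in a ~10x longer
+      # LR schedule (decay/hold/warmup steps and total epochs)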
+ min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + #sampler_type: 'seg_sampler' + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 80 + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2xlsr300m_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 42000 + hold_steps: 15000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml new file mode 100644 index 00000000..9fed09e7 --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh b/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh new file mode 100644 index 00000000..41f9e21f --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v3.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v1/global_conf/config_transducer_v4.3.sh b/egs/librispeech/v1/global_conf/config_transducer_v4.3.sh new file mode 100644 index 00000000..de00c55a --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v4.3.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_all +dev_data=dev_all +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml 
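+# v4.3 differs from v3.3 mainly in the training data: train_all/dev_all
+# instead of train_clean_100/dev_clean (see nnet_data/dev_data above)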
+nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v4.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/librispeech/v1/global_conf/config_transducer_v4.4.sh b/egs/librispeech/v1/global_conf/config_transducer_v4.4.sh new file mode 100644 index 00000000..3114af61 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v4.4.sh @@ -0,0 +1,39 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_all +dev_data=dev_all +# nnet_data=train_clean_small + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v4.4 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0060.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth From 02bb457d5034dd2243107ccd0f0f578a45eca243 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sat, 24 Dec 2022 14:05:53 -0500 Subject: [PATCH 065/154] fix beam search --- egs/librispeech/v1/conf/clsp.conf | 2 +- .../v1/global_conf/config_transducer_v3.3.sh | 2 +- hyperion/bin/decode_wav2transducer.py | 125 ++++++++---------- .../models/wav2transducer/beam_search.py | 38 ++++-- 4 files changed, 85 insertions(+), 82 deletions(-) diff --git a/egs/librispeech/v1/conf/clsp.conf b/egs/librispeech/v1/conf/clsp.conf index 4ed38246..959c62a7 100644 --- a/egs/librispeech/v1/conf/clsp.conf +++ b/egs/librispeech/v1/conf/clsp.conf @@ -7,5 +7,5 @@ option num_threads=* -pe smp $0 option num_threads=1 # Do not add anything to qsub_opts option max_jobs_run=* -tc $0 default gpu=0 -option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh b/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh index 41f9e21f..490baba7 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh @@ -23,7 +23,7 @@ nnet_name=${hf_model_name}_transducer_v3.3 nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0060.pth +nnet_s1=$nnet_s1_dir/model_ep0120.pth 
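[Editor's note] All four transducer recipes above share the same sgd + exp_lr trainer block. A minimal sketch of how those scheduler fields plausibly combine; the parameter names suggest linear warmup, a hold phase, then exponential decay floored at min_lr, but hyperion's exact scheduler code may differ:

# Hypothetical reading of the exp_lr fields used in the configs above.
def exp_lr(step, lr=0.003, warmup_steps=1500, hold_steps=1500,
           decay_rate=0.5, decay_steps=4200, min_lr=4e-5):
    if step < warmup_steps:  # linear warmup from 0 to the base lr
        return lr * step / warmup_steps
    if step < warmup_steps + hold_steps:  # hold at the base lr
        return lr
    n = step - warmup_steps - hold_steps
    return max(min_lr, lr * decay_rate ** (n / decay_steps))

print(exp_lr(1000), exp_lr(2500), exp_lr(20000))

Note that v4.4 scales warmup_steps, hold_steps, and decay_steps by 10x (and epochs to 1200), stretching the same schedule shape over a much longer run.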
From 02bb457d5034dd2243107ccd0f0f578a45eca243 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Sat, 24 Dec 2022 14:05:53 -0500
Subject: [PATCH 065/154] fix beam search

---
 egs/librispeech/v1/conf/clsp.conf             |   2 +-
 .../v1/global_conf/config_transducer_v3.3.sh  |   2 +-
 hyperion/bin/decode_wav2transducer.py         | 125 ++++++++----------
 .../models/wav2transducer/beam_search.py      |  38 ++++--
 4 files changed, 85 insertions(+), 82 deletions(-)

diff --git a/egs/librispeech/v1/conf/clsp.conf b/egs/librispeech/v1/conf/clsp.conf
index 4ed38246..959c62a7 100644
--- a/egs/librispeech/v1/conf/clsp.conf
+++ b/egs/librispeech/v1/conf/clsp.conf
@@ -7,5 +7,5 @@ option num_threads=* -pe smp $0
 option num_threads=1  # Do not add anything to qsub_opts
 option max_jobs_run=* -tc $0
 default gpu=0
-option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*'
+option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*'
 option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0'
diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh b/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh
index 41f9e21f..490baba7 100644
--- a/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh
+++ b/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh
@@ -23,7 +23,7 @@ nnet_name=${hf_model_name}_transducer_v3.3
 nnet_s1_name=$nnet_name.s1
 
 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name
-nnet_s1=$nnet_s1_dir/model_ep0060.pth
+nnet_s1=$nnet_s1_dir/model_ep0120.pth
 
 nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml
 nnet_s2_args=""
diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py
index 265a3536..bbcd0dc7 100755
--- a/hyperion/bin/decode_wav2transducer.py
+++ b/hyperion/bin/decode_wav2transducer.py
@@ -4,8 +4,6 @@
  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-
-
 from typing import Dict, List, Tuple
 
 import sentencepiece as spm
@@ -39,6 +37,7 @@ from hyperion.torch.models.wav2transducer.beam_search import greedy_search, beam_search
 
+
 def init_device(use_gpu):
     set_float_cpu("float32")
     num_gpus = 1 if use_gpu else 0
@@ -55,12 +54,12 @@ def load_model(model_path, device):
     model.eval()
     return model
 
+
 def decode_one_batch(
-    model: nn.Module,
-    sp: spm.SentencePieceProcessor,
-    x: torch.Tensor,
-    decoding_method = "beam_search"
-) -> Dict[str, List[List[str]]]:
+        model: nn.Module,
+        sp: spm.SentencePieceProcessor,
+        x: torch.Tensor,
+        decoding_method="beam_search") -> Dict[str, List[List[str]]]:
     """Decode one batch and return the result in a dict.
     The dict has the following format:
 
         - key: It indicates the setting used for decoding. For example,
@@ -86,7 +85,7 @@ def decode_one_batch(
       the returned dict.
     """
     device = model.device
-    feature = x #batch["inputs"]
+    feature = x  #batch["inputs"]
 
     assert x.shape[0] == 1
     assert feature.ndim == 2
@@ -95,8 +94,9 @@ def decode_one_batch(
 
     feature_lens = torch.Tensor([x.shape[1]]).int()
 
-    encoder_out, hid_feats, encoder_out_lens = model.forward_feats(x=feature, x_lengths=feature_lens)
-
+    encoder_out, hid_feats, encoder_out_lens = model.forward_feats(
+        x=feature, x_lengths=feature_lens)
+
     hyps = []
     batch_size = encoder_out.size(0)
 
@@ -104,40 +104,31 @@ def decode_one_batch(
     for i in range(batch_size):
         # fmt: off
-        encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+        encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]]
         # fmt: on
         if decoding_method == "greedy_search":
             hyp = greedy_search(model=model, encoder_out=encoder_out_i)
         elif decoding_method == "beam_search":
-            hyp = beam_search(
-                model=model, encoder_out=encoder_out_i, beam=5
-            )
+            hyp = beam_search(model=model, encoder_out=encoder_out_i, beam=5)
         else:
             raise ValueError(f"Unsupported decoding method: {decoding_method}")
         hyps.append(sp.decode(hyp).split())
-
+
     logging.info("hyps:{}".format(" ".join(hyps[0])))
-
+
     if decoding_method == "greedy_search":
         return hyps[0]
     else:
         return hyps[0]
 
 
-def decode_transducer(
-    input_spec,
-    output_spec,
-    scp_sep,
-    model_path,
-    bpe_model,
-    use_gpu,
-    **kwargs
-):
+def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model,
+                      use_gpu, **kwargs):
 
     device = init_device(use_gpu)
     model = load_model(model_path, device)
-    sp = spm.SentencePieceProcessor()
+    sp = spm.SentencePieceProcessor()
     sp.load(bpe_model)
 
     augmenter = None
@@ -147,10 +138,9 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model,
     ar_args = AR.filter_args(**kwargs)
     logging.info("opening output: %s" % (output_spec))
     # with DWF.create(output_spec, scp_sep=scp_sep) as writer:
-    with open(output_spec,"w") as writer:
-        logging.info(
-            "opening input stream: {} with args={}".format(input_spec, ar_args)
-        )
+    with open(output_spec, "w") as writer:
+        logging.info("opening input stream: {} with args={}".format(
+            input_spec, ar_args))
         with AR(input_spec, **ar_args) as reader:
             while not reader.eof():
                 t1 = time.time()
@@ -165,30 +155,28 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model,
                 logging.info("processing utt %s" % (key0))
                 for aug_id in range(num_augs):
                     t3 = time.time()
-                    key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id)
+                    key, x = key0, x0  #augment(key0, x0, augmenter, aug_df, aug_id)
                     t4 = time.time()
                     with torch.no_grad():
                         x = torch.tensor(
-                            x[None, :], dtype=torch.get_default_dtype()
-                        ).to(device)
+                            x[None, :],
+                            dtype=torch.get_default_dtype()).to(device)
 
                         t5 = time.time()
                         tot_frames = x.shape[1]
                         logging.info(
-                            "utt %s detected %d/%d (%.2f %%) speech frames"
-                            % (
+                            "utt %s detected %d/%d (%.2f %%) speech frames" % (
                                 key,
                                 x.shape[1],
                                 tot_frames,
                                 x.shape[1] / tot_frames * 100,
-                            )
-                        )
-
+                            ))
 
                         t6 = time.time()
                         if x.shape[1] == 0:
-                            y = np.zeros((model.embed_dim,), dtype=float_cpu())
+                            y = np.zeros((model.embed_dim, ),
+                                         dtype=float_cpu())
                         else:
                             y = decode_one_batch(model=model, sp=sp, x=x)
@@ -199,41 +187,36 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model,
                     read_time = t2 - t1
                     tot_time = read_time + t8 - t3
                     logging.info(
-                        (
-                            "utt %s total-time=%.3f read-time=%.3f "
-                            "aug-time=%.3f feat-time=%.3f "
-                            "vad-time=%.3f embed-time=%.3f write-time=%.3f "
-                            "rt-factor=%.2f"
-                        )
-                        % (
-                            key,
-                            tot_time,
-                            read_time,
-                            t4 - t3,
-                            t5 - t4,
-                            t6 - t5,
-                            t7 - t6,
-                            t8 - t7,
-                            x0.shape[0] / fs[0] / tot_time,
-                        )
-                    )
+                        ("utt %s total-time=%.3f read-time=%.3f "
+                         "aug-time=%.3f feat-time=%.3f "
+                         "vad-time=%.3f embed-time=%.3f write-time=%.3f "
+                         "rt-factor=%.2f") % (
+                             key,
+                             tot_time,
+                             read_time,
+                             t4 - t3,
+                             t5 - t4,
+                             t6 - t5,
+                             t7 - t6,
+                             t8 - t7,
+                             x0.shape[0] / fs[0] / tot_time,
+                         ))
 
 
 if __name__ == "__main__":
 
     parser = ArgumentParser(
-        description=(
-            "Extracts x-vectors from waveform computing " "acoustic features on the fly"
-        )
-    )
+        description=("Extracts x-vectors from waveform computing "
+                     "acoustic features on the fly"))
 
     parser.add_argument("--cfg", action=ActionConfigFile)
     parser.add_argument("--input", dest="input_spec", required=True)
-    parser.add_argument("--scp-sep", default=" ", help=("scp file field separator"))
+    parser.add_argument("--scp-sep",
+                        default=" ",
+                        help=("scp file field separator"))
 
     AR.add_class_args(parser)
-    AF.add_class_args(parser, prefix="feats")
 
     parser.add_argument("--model-path", required=True)
@@ -241,12 +224,16 @@ if __name__ == "__main__":
     parser.add_argument("--bpe-model", required=True)
 
     parser.add_argument("--output", dest="output_spec", required=True)
-    parser.add_argument(
-        "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu"
-    )
-    parser.add_argument(
-        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
-    )
+    parser.add_argument("--use-gpu",
+                        default=False,
+                        action="store_true",
+                        help="extract xvectors in gpu")
+    parser.add_argument("-v",
+                        "--verbose",
+                        dest="verbose",
+                        default=1,
+                        choices=[0, 1, 2, 3],
+                        type=int)
 
     args = parser.parse_args()
     config_logger(args.verbose)
diff --git a/hyperion/torch/models/wav2transducer/beam_search.py b/hyperion/torch/models/wav2transducer/beam_search.py
index 95f6fadb..b23a0769 100644
--- a/hyperion/torch/models/wav2transducer/beam_search.py
+++ b/hyperion/torch/models/wav2transducer/beam_search.py
@@ -22,7 +22,8 @@ from .hf_wav2transducer import HFWav2Transducer
 
 
-def greedy_search(model: HFWav2Transducer, encoder_out: torch.Tensor) -> List[int]:
+def greedy_search(model: HFWav2Transducer,
+                  encoder_out: torch.Tensor) -> List[int]:
     """
     Args:
       model:
@@ -39,7 +40,8 @@ def greedy_search(model: HFWav2Transducer,
     blank_id = model.transducer.decoder.blank_id
     device = model.device
 
-    sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1)
+    sos = torch.tensor([blank_id], device=device,
+                       dtype=torch.int64).reshape(1, 1)
     decoder_out, (h, c) = model.transducer.decoder(sos)
     T = encoder_out.size(1)
     t = 0
@@ -53,7 +55,7 @@ def greedy_search(model: HFWav2Transducer,
     while t < T and sym_per_utt < max_sym_per_utt:
         # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :]
+        current_encoder_out = encoder_out[:, t:t + 1, :]
         # fmt: on
         logits = model.transducer.joiner(current_encoder_out, decoder_out)
         # logits is (1, 1, 1, vocab_size)
@@ -120,11 +122,12 @@ def beam_search(
     max_u = 20000  # terminate after this number of steps
     u = 0
 
-    cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {}
+    cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor,
+                                               torch.Tensor]]] = {}
 
     while t < T and u < max_u:
         # fmt: off
-        current_encoder_out = encoder_out[:, t:t+1, :]
+        current_encoder_out = encoder_out[:, t:t + 1, :]
         # fmt: on
         A = B
         B = []
@@ -152,9 +155,8 @@ def beam_search(
             cached_key = "_".join(map(str, y_star.ys))
 
             if cached_key not in cache:
-                decoder_input = torch.tensor([y_star.ys[-1]], device=device).reshape(
-                    1, 1
-                )
+                decoder_input = torch.tensor([y_star.ys[-1]],
+                                             device=device).reshape(1, 1)
 
                 decoder_out, decoder_state = model.transducer.decoder(
                     decoder_input,
@@ -176,7 +178,8 @@ def beam_search(
             # First, choose blank
             skip_log_prob = log_prob[blank_id]
             new_y_star_log_prob = y_star.log_prob + skip_log_prob.item()
-
+            # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob,
+            #       skip_log_prob.item(), new_y_star_log_prob)
             # ys[:] returns a copy of ys
             new_y_star = Hypothesis(
                 ys=y_star.ys[:],
@@ -186,8 +189,13 @@ def beam_search(
             )
             B.append(new_y_star)
 
+            topk_log_prob = log_prob.topk(beam, dim=-1)
+
             # Second, choose other labels
-            for i, v in enumerate(log_prob.tolist()):
+            #for i, v in enumerate(log_prob.tolist()):
+            for v, i in zip(*topk_log_prob):
+                v = v.item()
+                i = i.item()
                 if i == blank_id:
                     continue
                 new_ys = y_star.ys + [i]
@@ -202,15 +210,23 @@ def beam_search(
         # check whether B contains more than "beam" elements more probable
         # than the most probable in A
         A_most_probable = max(A, key=lambda hyp: hyp.log_prob)
+        #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B))
         B = sorted(
             [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob],
             key=lambda hyp: hyp.log_prob,
            reverse=True,
         )
+        # print("tuAB2",
+        #       t,
+        #       u,
+        #       len(A),
+        #       A_most_probable.log_prob,
+        #       len(B),
+        #       flush=True)
         if len(B) >= beam:
             B = B[:beam]
             break
 
         t += 1
 
     best_hyp = max(B, key=lambda hyp: hyp.log_prob / len(hyp.ys[1:]))
     ys = best_hyp.ys[1:]  # [1:] to remove the blank
-    return ys
\ No newline at end of file
+    return ys
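[Editor's note] The substantive change in beam_search above replaces a loop over the entire vocabulary with log_prob.topk(beam): since at most `beam` hypotheses survive the pruning against A_most_probable, only the `beam` highest-scoring non-blank expansions of y_star can ever matter, so the two loops keep the same candidates. A standalone toy check of that equivalence (not hyperion code):

import torch

log_prob = torch.log_softmax(torch.randn(100), dim=-1)
beam = 5

# old style: score every token, then keep the beam best
full = sorted(enumerate(log_prob.tolist()), key=lambda kv: kv[1], reverse=True)[:beam]
# new style: ask for the beam best directly; topk returns (values, indices),
# which is why the patched loop iterates "for v, i in zip(*topk_log_prob)"
pruned = [(i.item(), v.item()) for v, i in zip(*log_prob.topk(beam, dim=-1))]

assert [i for i, _ in full] == [i for i, _ in pruned]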
From 3bfb7f08a672ffdbae6511e965227992ba96c09e Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Mon, 26 Dec 2022 05:46:48 -0500
Subject: [PATCH 066/154] updated binaries to train w2v2 x-vectors

---
 hyperion/bin/extract_wav2vec2xvectors.py      | 32 +++++++++++++--
 hyperion/bin/finetune_wav2vec2xvector.py      | 40 ++++++-------------
 hyperion/bin/finetune_xvector_from_wav.py     |  2 -
 hyperion/bin/train_wav2vec2xvector.py         | 31 --------------
 hyperion/torch/data/audio_dataset.py          |  8 ++++
 .../data/class_weighted_seg_chunk_sampler.py  |  1 -
 .../models/wav2xvectors/hf_wav2xvector.py     |  4 ++
 hyperion/torch/models/xvectors/xvector.py     |  2 +-
 8 files changed, 53 insertions(+), 67 deletions(-)

diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py
index cfa28f0a..a09e5c11 100755
--- a/hyperion/bin/extract_wav2vec2xvectors.py
+++ b/hyperion/bin/extract_wav2vec2xvectors.py
@@ -19,6 +19,7 @@
 import pandas as pd
 
 import torch
+import torchaudio.transforms as tat
 
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.utils import Utt2Info
@@ -30,6 +31,25 @@ from hyperion.torch.utils import open_device
 from hyperion.torch import TorchModelLoader as TML
 
+resamplers = {}
+
+
+def get_resampler(source_fs, target_fs):
+    if source_fs in resamplers:
+        return resamplers[source_fs]
+
+    resampler = tat.Resample(
+        int(source_fs),
+        int(target_fs),
+        lowpass_filter_width=64,
+        rolloff=0.9475937167399596,
+        resampling_method="kaiser_window",
+        beta=14.769656459379492,
+    )
+    resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy()
+    resamplers[source_fs] = resampler_f
+    return resampler_f
+
 
 def init_device(use_gpu):
     set_float_cpu("float32")
@@ -102,7 +122,7 @@ def extract_xvectors(
     num_augs,
     aug_info_path,
     use_gpu,
-    **kwargs
+    **kwargs,
 ):
 
     rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"])
@@ -122,12 +142,11 @@ def extract_xvectors(
         num_augs = 1
 
     ar_args = AR.filter_args(**kwargs)
+    ar_args["wav_scale"] = 1.0
     logging.info("opening output stream: %s", output_spec)
     with DWF.create(output_spec, scp_sep=scp_sep) as writer:
 
-        logging.info(
-            "opening input stream: {} with args={}".format(input_spec, ar_args)
-        )
+        logging.info(f"opening input stream: {input_spec} with args={ar_args}")
         with AR(input_spec, **ar_args) as reader:
 
             if vad_spec is not None:
@@ -146,6 +165,11 @@ def extract_xvectors(
                 key0 = key[0]
                 fs = fs[0]
                 t2 = time.time()
+                if fs != model.sample_frequency:
+                    resampler = get_resampler(fs, model.sample_frequency)
+                    print(f"x01 {x0.shape} {np.max(x0)}")
+                    x0 = resampler(x0)
+                    print(f"x01 {x0.shape} {np.max(x0)}")
 
                 logging.info("processing utt %s", key0)
                 for aug_id in range(num_augs):
diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py
index 25722b35..718aeeb9 100755
--- a/hyperion/bin/finetune_wav2vec2xvector.py
+++ b/hyperion/bin/finetune_wav2vec2xvector.py
@@ -25,7 +25,8 @@
 from hyperion.torch.utils import ddp
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.data import AudioDataset as AD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SegSamplerFactory
+
 from hyperion.torch.metrics import CategoricalAccuracy
 from hyperion.torch.models import (
     HFWav2Vec2ResNet1dXVector,
@@ -45,19 +46,21 @@ def init_data(partition, rank, num_gpus, **kwargs):
 
     kwargs = kwargs["data"][partition]
     ad_args = AD.filter_args(**kwargs["dataset"])
-    sampler_args = Sampler.filter_args(**kwargs["sampler"])
+    sampler_args = kwargs["sampler"]
     if rank == 0:
         logging.info("{} audio dataset args={}".format(partition, ad_args))
         logging.info("{} sampler args={}".format(partition, sampler_args))
         logging.info("init %s dataset", partition)
 
-    ad_args["is_val"] = partition == "val"
+    is_val = partition == "val"
+    ad_args["is_val"] = is_val
+    sampler_args["shuffle"] = not is_val
     dataset = AD(**ad_args)
 
     if rank == 0:
         logging.info("init %s samplers", partition)
 
-    sampler = Sampler(dataset, **sampler_args)
+    sampler = SegSamplerFactory.create(dataset, **sampler_args)
 
     if rank == 0:
         logging.info("init %s dataloader", partition)
@@ -71,18 +74,6 @@ def init_data(partition, rank, num_gpus, **kwargs):
     return data_loader
 
 
-# def init_model(num_classes, in_model_file, rank, **kwargs):
-#     xvec_args = kwargs["model"]["xvector"]
-#     if rank == 0:
-#         logging.info("xvector network ft args={}".format(xvec_args))
-#     xvec_args["num_classes"] = num_classes
-#     model = TML.load(in_model_file)
-#     model.rebuild_output_layer(**xvec_args)
-#     if rank == 0:
-#         logging.info("model={}".format(model))
-#     return model
-
-
 def init_model(num_classes, in_model_file, rank, **kwargs):
     model_args = kwargs["model"]
     if rank == 0:
@@ -127,7 +118,7 @@ def train_model(gpu_id, args):
     train_loader = init_data(partition="train", **kwargs)
     val_loader = init_data(partition="val", **kwargs)
 
-    model = init_model(train_loader.dataset.num_classes, **kwargs)
+    model = init_model(list(train_loader.dataset.num_classes.values())[0], **kwargs)
 
     init_hard_prototype_mining(model, train_loader, val_loader, rank)
     trn_args = Trainer.filter_args(**kwargs["trainer"])
@@ -135,11 +126,7 @@ def train_model(gpu_id, args):
         logging.info("trainer args={}".format(trn_args))
     metrics = {"acc": CategoricalAccuracy()}
     trainer = Trainer(
-        model,
-        device=device,
-        metrics=metrics,
-        ddp=world_size > 1,
-        **trn_args,
+        model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args,
     )
     trainer.load_last_checkpoint()
     trainer.fit(train_loader, val_loader)
@@ -153,7 +140,7 @@ def make_parser(model_class):
     parser.add_argument("--cfg", action=ActionConfigFile)
     train_parser = ArgumentParser(prog="")
     AD.add_class_args(train_parser, prefix="dataset", skip={})
-    Sampler.add_class_args(train_parser, prefix="sampler")
+    SegSamplerFactory.add_class_args(train_parser, prefix="sampler")
     train_parser.add_argument(
         "--data_loader.num-workers",
         type=int,
@@ -163,7 +150,7 @@ def make_parser(model_class):
 
     val_parser = ArgumentParser(prog="")
     AD.add_class_args(val_parser, prefix="dataset", skip={})
-    Sampler.add_class_args(val_parser, prefix="sampler")
+    SegSamplerFactory.add_class_args(val_parser, prefix="sampler")
     val_parser.add_argument(
         "--data_loader.num-workers",
         type=int,
@@ -175,14 +162,11 @@ def make_parser(model_class):
     data_parser.add_argument("--val", action=ActionParser(parser=val_parser))
     parser.add_argument("--data", action=ActionParser(parser=data_parser))
     parser.link_arguments(
-        "data.train.dataset.class_file", "data.val.dataset.class_file"
+        "data.train.dataset.class_files", "data.val.dataset.class_files"
     )
     parser.link_arguments(
         "data.train.data_loader.num_workers", "data.val.data_loader.num_workers"
     )
-    parser.link_arguments(
-        "data.train.sampler.batch_size", "data.val.sampler.batch_size"
-    )
 
     parser.add_argument("--in-model-file", required=True)
     model_class.add_finetune_args(parser, prefix="model")
diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py
index 0f23fb0a..c6239b45 100755
--- a/hyperion/bin/finetune_xvector_from_wav.py
+++ b/hyperion/bin/finetune_xvector_from_wav.py
@@ -21,11 +21,9 @@
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch.utils import ddp
 
-# from hyperion.torch.models import XVector as XVec
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.data import AudioDataset as AD
 
-# from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.data import SegSamplerFactory
 from hyperion.torch.metrics import CategoricalAccuracy
diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py
index f1281904..7187c13c 100755
--- a/hyperion/bin/train_wav2vec2xvector.py
+++ b/hyperion/bin/train_wav2vec2xvector.py
@@ -27,7 +27,6 @@
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import SegSamplerFactory
 
-# from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
 from hyperion.torch.models import (
     HFWav2Vec2ResNet1dXVector,
@@ -74,36 +73,6 @@ def init_data(partition, rank, num_gpus, **kwargs):
     return data_loader
 
 
-# def init_data(partition, rank, num_gpus, **kwargs):
-
-#     kwargs = kwargs["data"][partition]
-#     ad_args = AD.filter_args(**kwargs["dataset"])
-#     sampler_args = Sampler.filter_args(**kwargs["sampler"])
-#     if rank == 0:
-#         logging.info("{} audio dataset args={}".format(partition, ad_args))
-#         logging.info("{} sampler args={}".format(partition, sampler_args))
-#         logging.info("init %s dataset", partition)
-
-#     ad_args["is_val"] = partition == "val"
-#     dataset = AD(**ad_args)
-
-#     if rank == 0:
-#         logging.info("init %s samplers", partition)
-
-#     sampler = Sampler(dataset, **sampler_args)
-
-#     if rank == 0:
-#         logging.info("init %s dataloader", partition)
-
-#     num_workers = kwargs["data_loader"]["num_workers"]
-#     num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus)
-#     largs = (
-#         {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {}
-#     )
-#     data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs)
-#     return data_loader
-
-
 def init_model(num_classes, rank, model_class, **kwargs):
     model_args = model_class.filter_args(**kwargs["model"])
     if rank == 0:
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py
index 439c00ba..a52e7ab3 100644
--- a/hyperion/torch/data/audio_dataset.py
+++ b/hyperion/torch/data/audio_dataset.py
@@ -697,6 +697,14 @@ def __getitem__(self, segment):
         else:
             r = [x]
 
+        # try:
+        #     import soundfile as sf
+
+        #     for i, z in enumerate(r):
+        #         sf.write(f"file_{seg_id}.wav", z, fs, "PCM_16")
+        # except:
+        #     print("soundfile failed", flush=True)
+
         # adds the segment labels
         seg_info = self._get_segment_info(seg_id)
         r.extend(seg_info)
diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py
index 07a61b8f..72b094d0 100644
--- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py
+++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py
@@ -235,7 +235,6 @@ def set_hard_prototypes(self, affinity_matrix):
             if np.all(mask_i == 0):
                 affinity_matrix[:, i] = -1000
 
-        # affinity_matrix[np.diag(affinity_matrix.shape[0])] = -1.0
         # hard prototypes for a class are itself and k-1 closest to it.
         self.hard_prototypes = torch.topk(
             affinity_matrix, self.num_hard_prototypes, dim=-1
diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py
index 3fed7143..bd1ec4cd 100644
--- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py
+++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py
@@ -83,6 +83,10 @@ def _fuse_hid_feats(self, hid_feats):
 
         return feats
 
+    @property
+    def sample_frequency(self):
+        return self.hf_feats.sample_frequency
+
     def compute_prototype_affinity(self):
         return self.xvector.compute_prototype_affinity()
 
diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py
index 15f0ce86..2939db5b 100644
--- a/hyperion/torch/models/xvectors/xvector.py
+++ b/hyperion/torch/models/xvectors/xvector.py
@@ -892,7 +892,7 @@ def add_finetune_args(parser, prefix=None):
         parser.add_argument(
             "--num-subcenters",
             default=2,
-            type=float,
+            type=int,
             help="number of subcenters in subcenter losses",
         )
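[Editor's note] The resampler cache added to extract_wav2vec2xvectors.py above is keyed on source_fs alone, which is harmless here because target_fs is always model.sample_frequency, but a process mixing target rates would silently reuse the wrong resampler. A standalone sketch of the same idea keyed on the (source, target) pair; the filter parameters are illustrative, not the tuned values from the patch:

import numpy as np
import torch
import torchaudio.transforms as tat

_resamplers = {}

def get_resampler(source_fs, target_fs):
    key = (int(source_fs), int(target_fs))
    if key not in _resamplers:
        resampler = tat.Resample(key[0], key[1], lowpass_filter_width=64)
        # wrap so numpy waveforms go in and come out, as in the patch
        _resamplers[key] = lambda x: resampler(torch.from_numpy(x)).numpy()
    return _resamplers[key]

x = np.random.randn(8000).astype("float32")
y = get_resampler(8000, 16000)(x)
assert y.shape[0] == 16000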
From 13dd879c04ebc962df89f3af6ce3379eb3ff9826 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Tue, 27 Dec 2022 10:45:18 -0500
Subject: [PATCH 067/154] update configs for w2v

---
 .../v1.1/conf/train_data_default.yaml         |  28 +++---
 egs/voxceleb/v1.1/conf/val_data_default.yaml  |  28 +++---
 ...c2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...c2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml |  40 ++++++--
 ...c2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml |  39 ++++++--
 ...vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml |  40 ++++++--
 ...vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml |  40 ++++++--
 ...baseplus6l_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...s6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...baseplus9l_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...s9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...lmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml |  43 +++++++++
 ...lmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml |  40 ++++++--
 ...lmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml |  40 ++++++--
 ...lus_linfus_ecapatdnn512x3_stage1_v1.0.yaml |  42 ++++++---
 ...lmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...lmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml |  40 ++++++--
 ...vlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml |  40 ++++++--
 ...wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml |  40 ++++++--
 ...wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml |  40 ++++++--
 egs/voxceleb/v2/run_011_train_xvector.sh      |  18 ++--
 hyperion/torch/trainers/__init__.py           |  18 ++--
 hyperion/torch/trainers/ae_trainer.py         |  86 ++++++++++------
 hyperion/torch/trainers/dvae_trainer.py       |  93 ++++++++++++------
 hyperion/torch/trainers/plda_trainer.py       |  70 +++++++-------
 hyperion/torch/trainers/torch_trainer.py      |  88 ++++++++++--------
 hyperion/torch/trainers/vae_trainer.py        |  85 ++++++++++------
 hyperion/torch/trainers/vq_dvae_trainer.py    |  92 +++++++++++-------
 hyperion/torch/trainers/vq_vae_trainer.py     |  86 ++++++++++------
 .../torch/trainers/xvector_adv_trainer.py     |  73 ++++++++-------
 .../trainers/xvector_adv_trainer_from_wav.py  |  75 ++++++++-------
 hyperion/torch/trainers/xvector_trainer.py    |  70 +++++++-------
 .../trainers/xvector_trainer_deep_feat_reg.py |  69 +++++++-------
 .../xvector_trainer_deep_feat_reg_from_wav.py |  79 +++++++++------
 .../trainers/xvector_trainer_from_wav.py      |  69 +++++++-------
 hyperion/torch/utils/devices.py               |  45 +++++++++
 hyperion/utils/__init__.py                    |  23 +++--
 hyperion/utils/ext_segment_list.py            |   4 +-
 hyperion/utils/fold_list.py                   |   2 +-
 hyperion/utils/info_table.py                  |   2 +-
 hyperion/utils/kaldi_matrix.py                |   1 +
 hyperion/utils/list_utils.py                  |   5 +-
 hyperion/utils/misc.py                        |  27 ++++++
 hyperion/utils/plotting.py                    |   6 +-
 hyperion/utils/queues.py                      |   8 +-
 hyperion/utils/rttm.py                        |   2 +-
 hyperion/utils/scp_list.py                    |   2 +-
 hyperion/utils/segment_list.py                |   2 +-
 hyperion/utils/sparse_trial_key.py            |   4 +-
 hyperion/utils/sparse_trial_scores.py         |  13 +--
 hyperion/utils/time_units.py                  |   1 +
 hyperion/utils/train_val_eval_list.py         |   2 +-
 hyperion/utils/trial_key.py                   |   4 +-
 hyperion/utils/trial_ndx.py                   |   4 +-
 hyperion/utils/trial_scores.py                |   8 +-
 hyperion/utils/trial_stats.py                 |   6 +-
 hyperion/utils/utt2info.py                    |   2 +-
 59 files changed, 1395 insertions(+), 709 deletions(-)

diff --git a/egs/voxceleb/v1.1/conf/train_data_default.yaml b/egs/voxceleb/v1.1/conf/train_data_default.yaml
index acd088e6..1f96d1f6 100644
--- a/egs/voxceleb/v1.1/conf/train_data_default.yaml
+++ b/egs/voxceleb/v1.1/conf/train_data_default.yaml
@@ -1,17 +1,19 @@
 dataset:
-  class_names:
+  dataset:
+    class_names:
     - class_id
-  aug_cfgs:
+    aug_cfgs:
     - conf/reverb_noise_aug.yaml
-  return_segment_info:
+    return_segment_info:
     - class_id
-sampler:
-  sampler_type: class_weighted_random_seg_chunk_sampler
-  batch_size: 32
-  max_chunk_length: 4.0
-  min_chunk_length: 4.0
-  num_chunks_per_seg_epoch: 6
-  class_name: class_id
-data_loader:
-  num_workers: 8
-  
\ No newline at end of file
+  sampler:
+    sampler_type: class_weighted_random_seg_chunk_sampler
+    min_batch_size: 32
+    max_chunk_length: 4.0
+    min_chunk_length: 4.0
+    num_chunks_per_seg_epoch: 6
+    class_name: class_id
+    seg_weight_mode: uniform
+    num_hard_prototypes: 0
+  data_loader:
+    num_workers: 8
diff --git a/egs/voxceleb/v1.1/conf/val_data_default.yaml b/egs/voxceleb/v1.1/conf/val_data_default.yaml
index acd088e6..1f96d1f6 100644
--- a/egs/voxceleb/v1.1/conf/val_data_default.yaml
+++ b/egs/voxceleb/v1.1/conf/val_data_default.yaml
@@ -1,17 +1,19 @@
 dataset:
-  class_names:
+  dataset:
+    class_names:
    - class_id
-  aug_cfgs:
+    aug_cfgs:
     - conf/reverb_noise_aug.yaml
-  return_segment_info:
+    return_segment_info:
     - class_id
-sampler:
-  sampler_type: class_weighted_random_seg_chunk_sampler
-  batch_size: 32
-  max_chunk_length: 4.0
-  min_chunk_length: 4.0
-  num_chunks_per_seg_epoch: 6
-  class_name: class_id
-data_loader:
-  num_workers: 8
-  
\ No newline at end of file
+  sampler:
+    sampler_type: class_weighted_random_seg_chunk_sampler
+    min_batch_size: 32
+    max_chunk_length: 4.0
+    min_chunk_length: 4.0
+    num_chunks_per_seg_epoch: 6
+    class_name: class_id
+    seg_weight_mode: uniform
+    num_hard_prototypes: 0
+  data_loader:
+    num_workers: 8
diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml
index e1d1b1ea..01ad8897 100644
--- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model: wav2vec2xlsr300m_ecapatdnn512x3.yaml
diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml
index 1298a056..90e3b14f 100644
--- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model:
diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml
index fb264a53..7a2f7bba 100644
--- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml
@@ -1,25 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 6.0
-      min_chunk_length: 6.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 16
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 16
+      max_chunk_length: 6.0
+      min_chunk_length: 6.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
       num_hard_prototypes: 8
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 8
     data_loader:
       num_workers: 8
 model:
diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml
index 247f8a7c..f424275d 100644
--- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage1_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
      num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model: wav2vec2xlsr53_ecapatdnn512x3.yaml
diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml
index 1298a056..90e3b14f 100644
--- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model:
diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml
index 2867cfef..69bcc097 100644
--- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 6.0
-      min_chunk_length: 6.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 16
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 16
+      max_chunk_length: 6.0
+      min_chunk_length: 6.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 8
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 8
     data_loader:
       num_workers: 8
 model:
diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml
index 570aad6a..86dec831 100644
--- a/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_ecapatdnn512x3_stage1_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model: wavlmbaseplus6l_ecapatdnn512x3.yaml
diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml
index 9838b855..e22620ca 100644
--- a/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus6l_linfus_ecapatdnn512x3_stage1_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model: wavlmbaseplus6l_linfus_ecapatdnn512x3.yaml
diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml
index 1028f79a..9860abfa 100644
--- a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model: wavlmbaseplus9l_ecapatdnn512x3.yaml
diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml
index 2c2e5b64..18b910d1 100644
--- a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_linfus_ecapatdnn512x3_stage1_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model: wavlmbaseplus9l_linfus_ecapatdnn512x3.yaml
diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml
index f62b2e14..34c6e8dc 100644
--- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml
@@ -1,5 +1,48 @@
 data:
   train:
+    dataset:
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
+      wav_scale: 1
+    sampler:
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
+    data_loader:
+      num_workers: 8
+  val:
+    dataset:
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
+      wav_scale: 1
+    sampler:
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
+    data_loader:
+      num_workers: 8
+
+train:
   dataset:
     max_chunk_length: 3.0
     min_chunk_length: 3.0
diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml
index 1298a056..90e3b14f 100644
--- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml
+++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml
@@ -1,24 +1,44 @@
 data:
   train:
     dataset:
-      max_chunk_length: 3.0
-      min_chunk_length: 3.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 3.0
+      min_chunk_length: 3.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
   val:
     dataset:
-      max_chunk_length: 4.0
-      min_chunk_length: 4.0
-      aug_cfg: conf/reverb_noise_aug.yaml
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+      target_sample_freq: 16000
       wav_scale: 1
     sampler:
-      batch_size: 32
-      iters_per_epoch: 6
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+      seg_weight_mode: uniform
+      num_hard_prototypes: 0
     data_loader:
       num_workers: 8
 model:
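[Editor's note] Every config diff in this commit is the same mechanical migration: the old batch_size/iters_per_epoch sampler options become an explicit class_weighted_random_seg_chunk_sampler block. A simplified, hypothetical sketch of what those fields control (uniform seg_weight_mode, num_hard_prototypes: 0); hyperion's real sampler additionally handles class weighting, hard-prototype mining, and distributed sharding:

import random

def sample_epoch(segs, num_chunks_per_seg_epoch=6, min_chunk_length=3.0,
                 max_chunk_length=3.0, min_batch_size=32):
    chunks = []
    for seg_id, dur in segs.items():  # visit each segment a fixed number of times
        for _ in range(num_chunks_per_seg_epoch):
            length = random.uniform(min_chunk_length, max_chunk_length)
            start = random.uniform(0.0, max(0.0, dur - length))
            chunks.append((seg_id, start, length))
    random.shuffle(chunks)
    # with fixed-length chunks, batches are simply groups of min_batch_size
    return [chunks[i:i + min_batch_size]
            for i in range(0, len(chunks), min_batch_size)]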
conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 data_loader: num_workers: 8 val: dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 data_loader: num_workers: 8 model: diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml index eb32ce0c..8c00d0fa 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_linfus_ecapatdnn512x3_stage1_v1.0.yaml @@ -1,24 +1,44 @@ data: train: dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 data_loader: - num_workers: 8 + num_workers: 0 val: dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 model: wavlmbaseplus_linfus_ecapatdnn512x3.yaml diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml index 895bcb2b..ad699556 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v1.0.yaml @@ -1,24 +1,44 @@ data: train: dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 val: dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: 
conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 model: wavlmlarge12l_ecapatdnn512x3.yaml diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml index 1721e337..69bcc097 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml @@ -1,24 +1,44 @@ data: train: dataset: - max_chunk_length: 6.0 - min_chunk_length: 6.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 data_loader: num_workers: 8 val: dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 data_loader: num_workers: 8 model: diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml index 181d8fd7..9602d562 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge6l_ecapatdnn512x3_stage1_v1.0.yaml @@ -1,24 +1,44 @@ data: train: dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 val: dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 model: wavlmlarge6l_ecapatdnn512x3.yaml diff --git 
a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml index 1af241ea..37b085f3 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v1.0.yaml @@ -1,24 +1,44 @@ data: train: dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 val: dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 model: wavlmlarge_ecapatdnn512x3.yaml diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml index 1298a056..90e3b14f 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml @@ -1,24 +1,44 @@ data: train: dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 val: dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 data_loader: num_workers: 8 model: diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml index 2867cfef..69bcc097 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml @@ -1,24 +1,44 @@ data: train: dataset: - max_chunk_length: 6.0 - min_chunk_length: 6.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + 
target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 16 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 data_loader: num_workers: 8 val: dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 wav_scale: 1 sampler: - batch_size: 32 - iters_per_epoch: 6 + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 data_loader: num_workers: 8 model: diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh index dc4e1dee..0eddb1a6 100755 --- a/egs/voxceleb/v2/run_011_train_xvector.sh +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ - --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ - --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ --data.val.dataset.audio-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ - --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir $args \ --num-gpus $ngpu @@ -73,11 +73,11 @@ if [ $stage -le 2 ]; then --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ - --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ - --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ --data.val.dataset.audio-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ - --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ --trainer.exp-path $nnet_s2_dir $args \ --num-gpus $ngpu \ @@ -98,11 +98,11 @@ if [ $stage -le 3 ]; then --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ - --data.train.dataset.key-file $list_dir/lists_xvec/train.scp \ - --data.train.dataset.class-file $list_dir/lists_xvec/class2int \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ --data.val.dataset.audio-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ - --data.val.dataset.key-file $list_dir/lists_xvec/val.scp \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s2 \ --trainer.exp-path $nnet_s3_dir $args \ --num-gpus $ngpu \ diff --git a/hyperion/torch/trainers/__init__.py 
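The recipe changes above rename --data.*.dataset.key-file to segments-file and pluralize class-file to class-files, matching the dataset's new class_names list. A reduced jsonargparse sketch of the renamed options; the parser built here is illustrative, not the training script's real one, which assembles these dotted keys through nested parsers.

from jsonargparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--data.train.dataset.segments-file")
parser.add_argument("--data.train.dataset.class-files", nargs="+")

args = parser.parse_args([
    "--data.train.dataset.segments-file", "lists_xvec/train.scp",
    "--data.train.dataset.class-files", "lists_xvec/class2int",
])
# class-files is plural now: one class file is expected per entry in
# class_names; these recipes use a single class_id class, so the list
# has one element.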
b/hyperion/torch/trainers/__init__.py index 8fef7df5..d8f0e908 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -3,17 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .dvae_trainer import DVAETrainer from .torch_trainer import TorchTrainer - +from .vae_trainer import VAETrainer +from .vq_dvae_trainer import VQDVAETrainer +from .vq_vae_trainer import VQVAETrainer +from .xvector_adv_trainer import XVectorAdvTrainer +from .xvector_adv_trainer_from_wav import XVectorAdvTrainerFromWav from .xvector_trainer import XVectorTrainer from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg -from .xvector_adv_trainer import XVectorAdvTrainer - +from .xvector_trainer_deep_feat_reg_from_wav import \ + XVectorTrainerDeepFeatRegFromWav from .xvector_trainer_from_wav import XVectorTrainerFromWav -from .xvector_trainer_deep_feat_reg_from_wav import XVectorTrainerDeepFeatRegFromWav -from .xvector_adv_trainer_from_wav import XVectorAdvTrainerFromWav - -from .vae_trainer import VAETrainer -from .dvae_trainer import DVAETrainer -from .vq_vae_trainer import VQVAETrainer -from .vq_dvae_trainer import VQDVAETrainer diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 21d53d32..6faaf684 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -3,14 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .torch_trainer import TorchTrainer @@ -44,7 +45,8 @@ class AETrainer(TorchTrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -75,38 +77,44 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="x", ): if loss is None: loss = nn.MSELoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # loss, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) def train_epoch(self, data_loader): """Training epoch loop @@ -191,3 +199,19 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super().add_class_args(parser, train_modes, skip=skip.union({"target_key"})) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index b75a94ab..3300d152 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -3,14 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .torch_trainer import TorchTrainer @@ -43,7 +44,8 @@ class DVAETrainer(TorchTrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -73,36 +75,40 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x_aug", + target_key="x", ): - - super().__init__( - model, - None, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # None, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) def train_epoch(self, data_loader): """Training epoch loop @@ -203,3 +209,26 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super().add_class_args( + parser, train_modes, skip=skip.union({"input_key", "target_key"}) + ) + if "input_key" not in skip: + parser.add_argument( + "--input-key", default="x_aug", help="dict. key for nnet input" + ) + + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py index ea5e57af..1c27c30d 100644 --- a/hyperion/torch/trainers/plda_trainer.py +++ b/hyperion/torch/trainers/plda_trainer.py @@ -2,17 +2,17 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch import torch.nn as nn +from ...utils.misc import filter_func_args +from ..losses import BCEWithLLR from ..utils import MetricAcc from ..utils.misc import get_selfsim_tarnon -from ..losses import BCEWithLLR from .torch_trainer import TorchTrainer @@ -48,6 +48,8 @@ class PLDATrainer(TorchTrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -80,38 +82,44 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="class_id", ): if loss is None: loss = nn.CrossEntropyLoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # loss, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) self.loss_bce = BCEWithLLR(p_tar) self.loss_weights = loss_weights diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 5f573904..5cadd57c 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -3,28 +3,28 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -import math import contextlib +import logging +import math +import os from collections import OrderedDict as ODict from enum import Enum -from jsonargparse import ArgumentParser, ActionParser -import logging from pathlib import Path import torch -import torch.nn as nn import torch.cuda.amp as amp -from torch.optim.swa_utils import AveragedModel, SWALR import torch.distributed as dist - +import torch.nn as nn from fairscale.optim.grad_scaler import ShardedGradScaler +from jsonargparse import ActionParser, ArgumentParser +from torch.optim.swa_utils import SWALR, AveragedModel -from ..utils import MetricAcc, TorchDDP, FairShardedDDP, FairFullyShardedDDP -from ..loggers import LoggerList, CSVLogger, ProgLogger, TensorBoardLogger, WAndBLogger -from ..optim import OptimizerFactory as OF -from ..lr_schedulers import LRSchedulerFactory as LRSF +from ...utils.misc import filter_func_args +from ..loggers import CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, WAndBLogger from ..lr_schedulers import LRScheduler as LRS +from ..lr_schedulers import LRSchedulerFactory as LRSF +from ..optim import OptimizerFactory as OF +from ..utils import FairFullyShardedDDP, FairShardedDDP, MetricAcc, TorchDDP class DDPType(str, Enum): @@ -66,6 +66,8 @@ class TorchTrainer(object): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -96,6 +98,8 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="class_id", ): self.model = model @@ -126,6 +130,8 @@ def __init__( self.swa_lr = swa_lr self.swa_anneal_epochs = swa_anneal_epochs self.amp_args = {} + self.input_key = input_key + self.target_key = target_key self.set_train_mode() @@ -150,9 +156,7 @@ def __init__( oss = False if ddp_type == DDPType.DDP else True self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( - self.model, - device_ids=[device], - output_device=device, + self.model, device_ids=[device], output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) @@ -616,32 +620,34 @@ def load_last_checkpoint(self): @staticmethod def filter_args(**kwargs): - valid_args = ( - "grad_acc_steps", - "eff_batch_size", - "epochs", - "log_interval", - "use_amp", - "ddp_type", - "grad_clip", - "grad_clip_norm", - "swa_start", - "swa_lr", - "swa_anneal_epochs", - "exp_path", - "optim", - "lrsched", - "cpu_offload", - "use_tensorboard", - "use_wandb", - "wandb", - "train_mode", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args = filter_func_args(TorchTrainer.__init__, kwargs) + + # valid_args = ( + # "grad_acc_steps", + # "eff_batch_size", + # "epochs", + # "log_interval", + # "use_amp", + # "ddp_type", + # "grad_clip", + # "grad_clip_norm", + # "swa_start", + # "swa_lr", + # "swa_anneal_epochs", + # "exp_path", + # "optim", + # "lrsched", + # "cpu_offload", + # "use_tensorboard", + # "use_wandb", + # "wandb", + # "train_mode", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @staticmethod - def add_class_args(parser, prefix=None, train_modes=None, skip=[]): + def add_class_args(parser, prefix=None, train_modes=None, skip={}): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -744,6 +750,14 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=[]): ) parser.add_argument("--exp-path", help="experiment path") + if "input_key" not in skip: + parser.add_argument( + "--input-key", default="x", help="dict. key for nnet input" + ) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="class_id", help="dict. key for nnet targets" + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index 284f07d0..8e75d768 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -3,14 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .torch_trainer import TorchTrainer @@ -43,7 +44,8 @@ class VAETrainer(TorchTrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -73,36 +75,41 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="x", ): - super().__init__( - model, - None, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # None, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) def train_epoch(self, data_loader): @@ -189,3 +196,19 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super().add_class_args(parser, train_modes, skip=skip.union({"target_key"})) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index 30d2d3b3..bac95b78 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -2,15 +2,16 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict - import logging import math +import os +from collections import OrderedDict as ODict import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .dvae_trainer import DVAETrainer @@ -43,7 +44,8 @@ class VQDVAETrainer(DVAETrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -73,35 +75,40 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x_aug", + target_key="x", ): - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) def train_epoch(self, data_loader): @@ -199,3 +206,26 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super().add_class_args( + parser, train_modes, skip=skip.union({"input_key", "target_key"}) + ) + if "input_key" not in skip: + parser.add_argument( + "--input-key", default="x_aug", help="dict. key for nnet input" + ) + + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index c484b5c7..c4b046c0 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -2,15 +2,16 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -from collections import OrderedDict as ODict - import logging import math +import os +from collections import OrderedDict as ODict import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .vae_trainer import VAETrainer @@ -43,7 +44,8 @@ class VQVAETrainer(VAETrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp - + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -73,35 +75,39 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="x", ): - - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) def train_epoch(self, data_loader): @@ -199,3 +205,19 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super().add_class_args(parser, train_modes, skip=skip.union({"target_key"})) + if "target_key" not in skip: + parser.add_argument( + "--target-key", default="x", help="dict. key for nnet targets" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 961597e5..22971deb 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -2,16 +2,16 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os -from collections import OrderedDict as ODict - import time -import logging -from jsonargparse import ArgumentParser, ActionParser +from collections import OrderedDict as ODict import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .xvector_trainer import XVectorTrainer @@ -48,6 +48,8 @@ class XVectorAdvTrainer(XVectorTrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -81,36 +83,41 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="class_id", ): - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # loss=loss, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) self.attack = attack self.attack.to(device) @@ -230,7 +237,7 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None, skip=[]): + def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 036ee46e..ac28b95a 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -2,16 +2,16 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os -from collections import OrderedDict as ODict - import time -import logging -from jsonargparse import ArgumentParser, ActionParser +from collections import OrderedDict as ODict import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .xvector_trainer_from_wav import XVectorTrainerFromWav @@ -49,6 +49,8 @@ class XVectorAdvTrainerFromWav(XVectorTrainerFromWav): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -83,37 +85,42 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="class_id", ): - super().__init__( - model, - feat_extractor, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # feat_extractor, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # loss=loss, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) self.attack = attack self.attack.to(device) @@ -239,7 +246,7 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None, skip=[]): + def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 9b04fdd0..6703ea5d 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -2,17 +2,17 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch import torch.nn as nn +from torch.distributed.elastic.multiprocessing.errors import record +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .torch_trainer import TorchTrainer -from torch.distributed.elastic.multiprocessing.errors import record class XVectorTrainer(TorchTrainer): @@ -45,6 +45,8 @@ class XVectorTrainer(TorchTrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -75,38 +77,44 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="class_id", ): if loss is None: loss = nn.CrossEntropyLoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # loss, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) @record def train_epoch(self, data_loader): diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 1c0c26b7..fdb2627e 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -2,15 +2,15 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os -from jsonargparse import ArgumentParser, ActionParser from collections import OrderedDict as ODict -import logging - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .xvector_trainer import XVectorTrainer @@ -50,6 +50,8 @@ class XVectorTrainerDeepFeatReg(XVectorTrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -86,36 +88,41 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="class_id", ): - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # loss=loss, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) self.prior_model = prior_model if reg_loss is None or reg_loss == "l1": diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 1ad4d24a..07882f31 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -2,14 +2,14 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch import torch.nn as nn +from ...utils.misc import filter_func_args from ..utils import MetricAcc from .torch_trainer import TorchTrainer from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg @@ -50,6 +50,8 @@ class XVectorTrainerDeepFeatRegFromWav(XVectorTrainerDeepFeatReg): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -87,42 +89,47 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="class_id", ): - super().__init__( - model, - prior_model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - reg_layers_enc=reg_layers_enc, - reg_layers_classif=reg_layers_classif, - reg_weight_enc=reg_weight_enc, - reg_weight_classif=reg_weight_classif, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - reg_loss=reg_loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # prior_model, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # reg_layers_enc=reg_layers_enc, + # reg_layers_classif=reg_layers_classif, + # reg_weight_enc=reg_weight_enc, + # reg_weight_classif=reg_weight_classif, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # loss=loss, + # reg_loss=reg_loss, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) self.feat_extractor = feat_extractor if device is not None: diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 64a1d187..dfbd8e00 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -2,14 +2,14 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - import torch import torch.nn as nn +from ...utils.misc import filter_func_args from ..utils import MetricAcc, TorchDDP from .xvector_trainer import XVectorTrainer @@ -44,6 +44,8 @@ class XVectorTrainerFromWav(XVectorTrainer): swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. 
""" def __init__( @@ -75,44 +77,45 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="class_id", ): - super().__init__( - model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + # super().__init__( + # model, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # loss=loss, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) self.feat_extractor = feat_extractor if device is not None: self.feat_extractor.to(device) - # if ddp: - # self.feat_extractor = TorchDDP(self.feat_extractor) - def train_epoch(self, data_loader): """Training epoch loop diff --git a/hyperion/torch/utils/devices.py b/hyperion/torch/utils/devices.py index 16c61a48..cb77f1e5 100644 --- a/hyperion/torch/utils/devices.py +++ b/hyperion/torch/utils/devices.py @@ -42,3 +42,48 @@ def find_free_gpus(num_gpus): except: gpu_ids = "0" return gpu_ids + + +def tensors_to_device(data, device): + if isinstance(data, dict): + for k in data: + data[k] = data[k].to(device) + elif isinstance(data, list): + for i, value in enumerate(data): + data[i] = value.to(device) + elif isinstance(data, torch.Tensor): + data = data.to(device) + else: + raise Exception(f"Unknown data type for {data}") + + return data + + +def tensors_to_cpu(data): + if isinstance(data, dict): + for k in data: + data[k] = data[k].cpu() + elif isinstance(data, list): + for i, value in enumerate(data): + data[i] = value.cpu() + elif isinstance(data, torch.Tensor): + data = data.cpu() + else: + raise Exception(f"Unknown data type for {data}") + + return data + + +def tensors_to_numpy(data): + if isinstance(data, dict): + for k in data: + data[k] = data[k].cpu().numpy() + elif isinstance(data, list): + for i, value in enumerate(data): + data[i] = value.cpu().numpy() + elif isinstance(data, torch.Tensor): + data = data.cpu().numpy() + else: + raise Exception(f"Unknown data type for {data}") + + return data diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index 251361ae..67f492f9 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,19 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .trial_ndx import TrialNdx -from .trial_key import TrialKey -from .trial_scores import TrialScores -from .sparse_trial_key import SparseTrialKey -from .sparse_trial_scores import SparseTrialScores +from .class_info import ClassInfo +from .feature_set import FeatureSet +from .kaldi_matrix import 
KaldiCompressedMatrix, KaldiMatrix +from .recording_set import RecordingSet +from .rttm import RTTM from .scp_list import SCPList -from .utt2info import Utt2Info - # from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList -from .kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix -from .rttm import RTTM -from .recording_set import RecordingSet -from .class_info import ClassInfo from .segment_set import SegmentSet -from .feature_set import FeatureSet +from .sparse_trial_key import SparseTrialKey +from .sparse_trial_scores import SparseTrialScores +from .trial_key import TrialKey +from .trial_ndx import TrialNdx +from .trial_scores import TrialScores +from .utt2info import Utt2Info diff --git a/hyperion/utils/ext_segment_list.py b/hyperion/utils/ext_segment_list.py index 9c7d81d3..132cf7ff 100644 --- a/hyperion/utils/ext_segment_list.py +++ b/hyperion/utils/ext_segment_list.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import logging -from copy import deepcopy +import os.path as path from collections import OrderedDict +from copy import deepcopy import numpy as np import pandas as pd diff --git a/hyperion/utils/fold_list.py b/hyperion/utils/fold_list.py index d5731f10..f22263cf 100644 --- a/hyperion/utils/fold_list.py +++ b/hyperion/utils/fold_list.py @@ -5,8 +5,8 @@ Class to make/read/write k-fold x-validation lists """ -import os.path as path import logging +import os.path as path from collections import OrderedDict from copy import deepcopy diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 80199a33..f2262217 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from pathlib import Path import logging from collections import OrderedDict from copy import deepcopy +from pathlib import Path import numpy as np import pandas as pd diff --git a/hyperion/utils/kaldi_matrix.py b/hyperion/utils/kaldi_matrix.py index 11726cc7..c8e26cbb 100644 --- a/hyperion/utils/kaldi_matrix.py +++ b/hyperion/utils/kaldi_matrix.py @@ -6,6 +6,7 @@ """ import struct + import numpy as np from ..hyp_defs import float_cpu diff --git a/hyperion/utils/list_utils.py b/hyperion/utils/list_utils.py index 6e805a25..4375183d 100644 --- a/hyperion/utils/list_utils.py +++ b/hyperion/utils/list_utils.py @@ -5,9 +5,10 @@ Utilities for lists. 
""" -import numpy as np -from operator import itemgetter from itertools import groupby +from operator import itemgetter + +import numpy as np def list2ndarray(a, dtype=None): diff --git a/hyperion/utils/misc.py b/hyperion/utils/misc.py index c185b9a3..b9bdf12b 100644 --- a/hyperion/utils/misc.py +++ b/hyperion/utils/misc.py @@ -4,6 +4,7 @@ Miscellaneous functions """ +from inspect import signature import numpy as np @@ -88,3 +89,29 @@ def filter_args(valid_args, kwargs): Dictionary with only valid_args keys if they exists """ return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + +def filter_func_args(func, kwargs, skip=set()): + """Filters arguments expected by a function + + Args: + func: function object + kwargs: dictionary containing arguments + skip: set with keys of func arguments to remove from kwargs + + Returns + Dictionary with arguments expected by the target function + """ + sig = signature(func) + valid_args = sig.parameters.keys() + skip.add("self") + for param in skip: + if param in kwargs: + del kwargs[param] + + my_kwargs = filter_args(valid_args, kwargs) + if "kwargs" in kwargs: + my_kwargs.update(kwargs["kwargs"]) + + args = sig.bind(**my_kwargs).arguments + return args diff --git a/hyperion/utils/plotting.py b/hyperion/utils/plotting.py index 7b87dbee..2341beb4 100644 --- a/hyperion/utils/plotting.py +++ b/hyperion/utils/plotting.py @@ -3,13 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np -import scipy.linalg as la - import matplotlib - # matplotlib.use('Agg') import matplotlib.pyplot as plt +import numpy as np +import scipy.linalg as la import scipy.stats as stats from mpl_toolkits.mplot3d import Axes3D as plt3d diff --git a/hyperion/utils/queues.py b/hyperion/utils/queues.py index ad4298be..8bfd0166 100644 --- a/hyperion/utils/queues.py +++ b/hyperion/utils/queues.py @@ -3,15 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import warnings import copy -import time -import numpy as np import multiprocessing import threading -import six +import time +import warnings from abc import abstractmethod +import numpy as np +import six try: import queue diff --git a/hyperion/utils/rttm.py b/hyperion/utils/rttm.py index 2ff3a4b0..c691fc17 100644 --- a/hyperion/utils/rttm.py +++ b/hyperion/utils/rttm.py @@ -9,8 +9,8 @@ import pandas as pd from .list_utils import * -from .vad_utils import * from .segment_list import SegmentList +from .vad_utils import * class RTTM(object): diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 8109d905..5abf76f2 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -3,11 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os import os.path as path from collections import OrderedDict from copy import deepcopy -import logging import numpy as np diff --git a/hyperion/utils/segment_list.py b/hyperion/utils/segment_list.py index 33b432bd..0151e967 100644 --- a/hyperion/utils/segment_list.py +++ b/hyperion/utils/segment_list.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import logging +import os.path as path from copy import deepcopy import numpy as np diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index f18dee3b..5afc72a0 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -3,15 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as 
path import copy +import os.path as path import numpy as np import scipy.sparse as sparse from .list_utils import * -from .trial_ndx import TrialNdx from .trial_key import TrialKey +from .trial_ndx import TrialNdx class SparseTrialKey(TrialKey): diff --git a/hyperion/utils/sparse_trial_scores.py b/hyperion/utils/sparse_trial_scores.py index 0684c57e..7ed9a1d1 100644 --- a/hyperion/utils/sparse_trial_scores.py +++ b/hyperion/utils/sparse_trial_scores.py @@ -4,22 +4,23 @@ """ -import os.path as path -import logging import copy +import logging +import os.path as path import numpy as np import scipy.sparse as sparse -# import h5py - from ..hyp_defs import float_cpu from .list_utils import * -from .trial_ndx import TrialNdx -from .trial_key import TrialKey from .sparse_trial_key import SparseTrialKey +from .trial_key import TrialKey +from .trial_ndx import TrialNdx from .trial_scores import TrialScores +# import h5py + + class SparseTrialScores(TrialScores): diff --git a/hyperion/utils/time_units.py b/hyperion/utils/time_units.py index f8ed0846..6004329b 100644 --- a/hyperion/utils/time_units.py +++ b/hyperion/utils/time_units.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import math + import numpy as np diff --git a/hyperion/utils/train_val_eval_list.py b/hyperion/utils/train_val_eval_list.py index f8cc4ca0..fd17e240 100644 --- a/hyperion/utils/train_val_eval_list.py +++ b/hyperion/utils/train_val_eval_list.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import logging +import os.path as path from collections import OrderedDict from copy import deepcopy diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index b22babda..9552d7c0 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -3,11 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import copy +import os.path as path -import numpy as np import h5py +import numpy as np from .list_utils import * from .trial_ndx import TrialNdx diff --git a/hyperion/utils/trial_ndx.py b/hyperion/utils/trial_ndx.py index 58a36aa7..e26d19e2 100644 --- a/hyperion/utils/trial_ndx.py +++ b/hyperion/utils/trial_ndx.py @@ -3,11 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import copy +import os.path as path -import numpy as np import h5py +import numpy as np from .list_utils import * diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index 164b39df..a486647d 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -4,17 +4,17 @@ """ -import os.path as path -import logging import copy +import logging +import os.path as path -import numpy as np import h5py +import numpy as np from ..hyp_defs import float_cpu from .list_utils import * -from .trial_ndx import TrialNdx from .trial_key import TrialKey +from .trial_ndx import TrialNdx class TrialScores(object): diff --git a/hyperion/utils/trial_stats.py b/hyperion/utils/trial_stats.py index 229bad3c..7d9d74d1 100644 --- a/hyperion/utils/trial_stats.py +++ b/hyperion/utils/trial_stats.py @@ -4,16 +4,16 @@ """ -import os.path as path -import logging import copy +import logging +import os.path as path import numpy as np import pandas as pd from ..hyp_defs import float_cpu -from .trial_ndx import TrialNdx from .trial_key import TrialKey +from .trial_ndx import TrialNdx class TrialStats(object): diff --git a/hyperion/utils/utt2info.py 
b/hyperion/utils/utt2info.py index 3cf4179b..9785d021 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os.path as path import logging +import os.path as path from collections import OrderedDict from copy import deepcopy From 4ba13a742f5eb7889f17f5cedf93052e9901f5a6 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 27 Dec 2022 10:52:30 -0500 Subject: [PATCH 068/154] isort data --- hyperion/torch/data/__init__.py | 4 +--- hyperion/torch/data/audio_dataset.py | 22 +++++++------------ hyperion/torch/data/bucketing_seg_sampler.py | 8 +++---- .../data/class_weighted_seg_chunk_sampler.py | 6 ++--- hyperion/torch/data/embed_dataset.py | 12 +++++----- hyperion/torch/data/feat_seq_dataset.py | 16 ++++++-------- hyperion/torch/data/hyp_sampler.py | 7 +++--- .../torch/data/paired_feat_seq_dataset.py | 5 ++--- hyperion/torch/data/seg_chunk_sampler.py | 8 +++---- hyperion/torch/data/seg_sampler.py | 6 ++--- hyperion/torch/data/seg_sampler_factory.py | 13 ++++++----- hyperion/torch/data/weighted_embed_sampler.py | 3 +-- hyperion/torch/data/weighted_seq_sampler.py | 7 +++--- 13 files changed, 53 insertions(+), 64 deletions(-) diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index 752cf0f5..ae2efca6 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -3,12 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .audio_dataset import AudioDataset # datasets from .feat_seq_dataset import FeatSeqDataset from .paired_feat_seq_dataset import PairedFeatSeqDataset - -from .audio_dataset import AudioDataset - # samplers # from .weighted_seq_sampler import ClassWeightedSeqSampler from .seg_sampler_factory import SegSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index a52e7ab3..fc100d12 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -4,27 +4,25 @@ """ import logging -from jsonargparse import ActionYesNo, ArgumentParser, ActionParser -import time import math +import time import numpy as np import pandas as pd - import torch +import torch.distributed as dist import torchaudio.transforms as tat -from ..torch_defs import floatstr_torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from torch.utils.data import Dataset + from ...io import RandomAccessAudioReader as AR # from ...utils.utt2info import Utt2Info from ...np.augment import SpeechAugment - -from torch.utils.data import Dataset -import torch.distributed as dist - -from hyperion.np import augment - +from ...utils.class_info import ClassInfo +from ...utils.segment_set import SegmentSet +from ..torch_defs import floatstr_torch # class AudioDataset1(Dataset): # def __init__( @@ -449,10 +447,6 @@ # add_argparse_args = add_class_args -from ...utils.class_info import ClassInfo -from ...utils.segment_set import SegmentSet - - class AudioDataset(Dataset): def __init__( self, diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 224660bb..02497f3b 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -3,16 +3,16 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math -from jsonargparse import ArgumentParser, ActionParser import logging +import math import numpy as np - import torch +import torch.distributed as dist 
+from jsonargparse import ActionParser, ArgumentParser + from .hyp_sampler import HypSampler from .seg_sampler import SegSampler -import torch.distributed as dist class BucketingSegSampler(HypSampler): diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 72b094d0..184c4ab0 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -3,15 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import logging +import math import time import numpy as np import pandas as pd - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/embed_dataset.py b/hyperion/torch/data/embed_dataset.py index aa244d81..e489e3a3 100644 --- a/hyperion/torch/data/embed_dataset.py +++ b/hyperion/torch/data/embed_dataset.py @@ -8,18 +8,20 @@ import logging import time -# import copy - import numpy as np import pandas as pd - import torch +from torch.utils.data import Dataset -from ..torch_defs import floatstr_torch from ...io import RandomAccessDataReaderFactory as RF from ...utils.utt2info import Utt2Info +from ..torch_defs import floatstr_torch + +# import copy + + + -from torch.utils.data import Dataset class EmbedDataset(Dataset): diff --git a/hyperion/torch/data/feat_seq_dataset.py b/hyperion/torch/data/feat_seq_dataset.py index 462bfe41..61fbd576 100644 --- a/hyperion/torch/data/feat_seq_dataset.py +++ b/hyperion/torch/data/feat_seq_dataset.py @@ -3,24 +3,22 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -import logging -from jsonargparse import ArgumentParser, ActionParser -import time import copy +import logging +import os +import sys import threading +import time import numpy as np import pandas as pd - import torch +from jsonargparse import ActionParser, ArgumentParser +from torch.utils.data import Dataset -from ..torch_defs import floatstr_torch from ...io import RandomAccessDataReaderFactory as RF from ...utils.utt2info import Utt2Info - -from torch.utils.data import Dataset +from ..torch_defs import floatstr_torch class FeatSeqDataset(Dataset): diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py index 18ae4b5d..c5097723 100644 --- a/hyperion/torch/data/hyp_sampler.py +++ b/hyperion/torch/data/hyp_sampler.py @@ -1,12 +1,11 @@ -import math -from jsonargparse import ArgumentParser, ActionParser import logging +import math import numpy as np - import torch -from torch.utils.data import Sampler import torch.distributed as dist +from jsonargparse import ActionParser, ArgumentParser +from torch.utils.data import Sampler class HypSampler(Sampler): diff --git a/hyperion/torch/data/paired_feat_seq_dataset.py b/hyperion/torch/data/paired_feat_seq_dataset.py index 671bb6bf..fc17593e 100644 --- a/hyperion/torch/data/paired_feat_seq_dataset.py +++ b/hyperion/torch/data/paired_feat_seq_dataset.py @@ -4,13 +4,12 @@ """ import logging -import numpy as np +import numpy as np import torch -from ..torch_defs import floatstr_torch - from ...utils.utt2info import Utt2Info +from ..torch_defs import floatstr_torch from .feat_seq_dataset import FeatSeqDataset diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py index 2f5cc610..76054cd8 100644 --- 
a/hyperion/torch/data/seg_chunk_sampler.py +++ b/hyperion/torch/data/seg_chunk_sampler.py @@ -3,18 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math -from jsonargparse import ArgumentParser, ActionParser import logging +import math import numpy as np import pandas as pd - import torch +import torch.distributed as dist +from jsonargparse import ActionParser, ArgumentParser + from ...utils.segment_set import SegmentSet from .hyp_sampler import HypSampler from .seg_sampler import SegSampler -import torch.distributed as dist class SegChunkSampler(HypSampler): diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 73319dca..1c54a021 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -3,13 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import logging +import math import numpy as np - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index 251d937b..64dbb89e 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -2,17 +2,18 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from typing import Union, Optional import logging -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +from typing import Optional, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .audio_dataset import AudioDataset +from .bucketing_seg_sampler import BucketingSegSampler +from .class_weighted_seg_chunk_sampler import \ + ClassWeightedRandomSegChunkSampler from .feat_seq_dataset import FeatSeqDataset - -from .seg_sampler import SegSampler -from .class_weighted_seg_chunk_sampler import ClassWeightedRandomSegChunkSampler from .seg_chunk_sampler import SegChunkSampler -from .bucketing_seg_sampler import BucketingSegSampler +from .seg_sampler import SegSampler sampler_dict = { "class_weighted_random_seg_chunk_sampler": ClassWeightedRandomSegChunkSampler, diff --git a/hyperion/torch/data/weighted_embed_sampler.py b/hyperion/torch/data/weighted_embed_sampler.py index 2c381365..22da93f9 100644 --- a/hyperion/torch/data/weighted_embed_sampler.py +++ b/hyperion/torch/data/weighted_embed_sampler.py @@ -2,12 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging # import os import math -import logging import numpy as np - import torch from torch.utils.data import Sampler diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py index c50d577d..345c2429 100644 --- a/hyperion/torch/data/weighted_seq_sampler.py +++ b/hyperion/torch/data/weighted_seq_sampler.py @@ -2,16 +2,15 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging # import os import math -from jsonargparse import ArgumentParser, ActionParser -import logging import numpy as np - import torch -from torch.utils.data import Sampler import torch.distributed as dist +from jsonargparse import ActionParser, ArgumentParser +from torch.utils.data import Sampler class ClassWeightedSeqSampler(Sampler): From 
dd7b3edac34f48a9619a2af6b2f9b73c7af0edbc Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 27 Dec 2022 11:28:41 -0500 Subject: [PATCH 069/154] fixed filter_func_args --- hyperion/utils/misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperion/utils/misc.py b/hyperion/utils/misc.py index b9bdf12b..6813c6b7 100644 --- a/hyperion/utils/misc.py +++ b/hyperion/utils/misc.py @@ -113,5 +113,5 @@ def filter_func_args(func, kwargs, skip=set()): if "kwargs" in kwargs: my_kwargs.update(kwargs["kwargs"]) - args = sig.bind(**my_kwargs).arguments + args = sig.bind_partial(**my_kwargs).arguments return args From b2748955ea4b2c74009fdfaf4c49a782e1edc5cd Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 28 Dec 2022 05:55:10 -0500 Subject: [PATCH 070/154] trainers use dictionaries --- hyperion/torch/data/audio_dataset.py | 89 ++++++++++++++----- hyperion/torch/trainers/ae_trainer.py | 33 +++---- hyperion/torch/trainers/dvae_trainer.py | 35 +++----- hyperion/torch/trainers/plda_trainer.py | 22 ++--- hyperion/torch/trainers/torch_trainer.py | 30 ++++--- hyperion/torch/trainers/vae_trainer.py | 41 +++++---- hyperion/torch/trainers/vq_dvae_trainer.py | 38 +++----- hyperion/torch/trainers/vq_vae_trainer.py | 37 +++----- .../torch/trainers/xvector_adv_trainer.py | 31 ++++--- .../trainers/xvector_adv_trainer_from_wav.py | 33 ++++--- hyperion/torch/trainers/xvector_trainer.py | 10 +-- .../trainers/xvector_trainer_deep_feat_reg.py | 16 ++-- .../xvector_trainer_deep_feat_reg_from_wav.py | 22 ++--- .../trainers/xvector_trainer_from_wav.py | 30 ++++--- hyperion/torch/utils/__init__.py | 8 +- hyperion/torch/utils/devices.py | 18 ++++ 16 files changed, 262 insertions(+), 231 deletions(-) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index fc100d12..4368ec94 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -615,8 +615,22 @@ def _read_audio(self, seg_id, start, duration): x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration) return x[0].astype(floatstr_torch(), copy=False), fs[0] + # def _apply_augs(self, x, num_samples, reverb_context_samples): + # x_augs = [] + # # for each type of augmentation + # for i, augmenter in enumerate(self.augmenters): + # # we do n_augs per augmentation type + # for j in range(self.num_augs): + # # augment x + # x_aug, aug_info = augmenter(x) + # # remove the extra left context used to compute the reverberation. + # x_aug = x_aug[reverb_context_samples : len(x)] + # x_augs.append(x_aug.astype(floatstr_torch(), copy=False)) + + # return x_augs + def _apply_augs(self, x, num_samples, reverb_context_samples): - x_augs = [] + x_augs = {} # for each type of augmentation for i, augmenter in enumerate(self.augmenters): # we do n_augs per augmentation type @@ -625,26 +639,31 @@ def _apply_augs(self, x, num_samples, reverb_context_samples): x_aug, aug_info = augmenter(x) # remove the extra left context used to compute the reverberation. 
x_aug = x_aug[reverb_context_samples : len(x)] - x_augs.append(x_aug.astype(floatstr_torch(), copy=False)) + x_aug = x_aug.astype(floatstr_torch(), copy=False) + x_augs[f"x_aug_{i}_{j}"] = x_aug + + if not self.return_orig and len(x_augs) == 1: + # if we just have one aug and we don't return the clean version, + # we just call x to the aug version + x_augs["x"] = x_augs.pop("x_aug_0_0") return x_augs def _get_segment_info(self, seg_id): - r = [] + seg_info = {} # converts the class_ids to integers for info_name in self.return_segment_info: - seg_info = self.seg_set.loc[seg_id, info_name] + seg_info_i = self.seg_set.loc[seg_id, info_name] if info_name in self.class_info: # if the type of information is a class-id # we use the class information table to # convert from id to integer class_info = self.class_info[info_name] - idx = class_info.loc[seg_info, "class_idx"] - seg_info = idx + seg_info_i = class_info.loc[seg_info_i, "class_idx"] - r.append(seg_info) + seg_info[info_name] = seg_info_i - return r + return seg_info def _get_resampler(self, fs): if fs in self.resamplers: @@ -671,39 +690,65 @@ def _resample(self, x, fs): except: return x, fs + # def __getitem__(self, segment): + + # seg_id, start, duration = self._parse_segment_item(segment) + # x, fs = self._read_audio(seg_id, start, duration) + # x, fs = self._resample(x, fs) + # if self.augmenters: + # # augmentations + # num_samples = int(duration * fs) + # reverb_context_samples = len(x) - num_samples + # x_augs = self._apply_augs(x, num_samples, reverb_context_samples) + # r = x_augs + + # # add original non augmented audio + # if self.return_orig: + # x_orig = x[reverb_context_samples:] + # r.append(x_orig) + + # else: + # r = [x] + + # # try: + # # import soundfile as sf + + # # for i, z in enumerate(r): + # # sf.write(f"file_{seg_id}.wav", z, fs, "PCM_16") + # # except: + # # print("soundfile failed", flush=True) + + # # adds the segment labels + # seg_info = self._get_segment_info(seg_id) + # r.extend(seg_info) + + # return (*r,) + def __getitem__(self, segment): seg_id, start, duration = self._parse_segment_item(segment) x, fs = self._read_audio(seg_id, start, duration) x, fs = self._resample(x, fs) + data = {"seg_id": seg_id, "sample_freq": fs} if self.augmenters: # augmentations num_samples = int(duration * fs) reverb_context_samples = len(x) - num_samples x_augs = self._apply_augs(x, num_samples, reverb_context_samples) - r = x_augs + data.update(x_augs) # add original non augmented audio if self.return_orig: x_orig = x[reverb_context_samples:] - r.append(x_orig) + data["x"] = x_orig else: - r = [x] - - # try: - # import soundfile as sf - - # for i, z in enumerate(r): - # sf.write(f"file_{seg_id}.wav", z, fs, "PCM_16") - # except: - # print("soundfile failed", flush=True) + data["x"] = x # adds the segment labels seg_info = self._get_segment_info(seg_id) - r.extend(seg_info) - - return (*r,) + data.update(seg_info) + return data @staticmethod def filter_args(**kwargs): diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 6faaf684..7d6c7f07 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -12,7 +12,7 @@ from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer @@ -122,26 +122,21 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and 
class labels. """ - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() for batch, data in enumerate(data_loader): - - if isinstance(data, (tuple, list)): - data, _ = data - self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data = data.to(self.device) - batch_size = data.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(data) - loss = self.loss(output, data).mean() / self.grad_acc_steps + output = self.model(input_data) + loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -155,7 +150,7 @@ def train_epoch(self, data_loader): batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, data) + batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -170,6 +165,7 @@ def train_epoch(self, data_loader): def validation_epoch(self, data_loader, swa_update_bn=False): + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): @@ -181,18 +177,15 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - if isinstance(data, (tuple, list)): - data, _ = data - - data = data.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(data) - loss = self.loss(output, data) + output = self.model(input_data) + loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, data) + batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index 3300d152..383a26d1 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -12,7 +12,7 @@ from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer @@ -116,28 +116,21 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning noisy and clean features """ - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() for batch, data in enumerate(data_loader): - - assert isinstance(data, (tuple, list)) - x = data[0] - x_target = data[1] - self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - x = x.to(self.device) - x_target = x_target.to(self.device) - batch_size = x.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(x, x_target=x_target, return_x_mean=True) + output = self.model(input_data, x_target=target, return_x_mean=True) elbo = output["elbo"].mean() loss = -elbo / self.grad_acc_steps @@ -157,7 +150,7 @@ def train_epoch(self, data_loader): for metric in ["log_px", "kldiv_z"]: 
batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x_target) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -175,6 +168,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): @@ -186,23 +180,16 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - - assert isinstance(data, (tuple, list)) - x = data[0] - x_target = data[1] - - x = x.to(self.device) - x_target = x_target.to(self.device) - batch_size = x.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(x, x_target=x_target, return_x_mean=True) + output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] for metric in ["elbo", "log_px", "kldiv_z"]: batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x_target) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py index 1c27c30d..54c4205f 100644 --- a/hyperion/torch/trainers/plda_trainer.py +++ b/hyperion/torch/trainers/plda_trainer.py @@ -11,7 +11,7 @@ from ...utils.misc import filter_func_args from ..losses import BCEWithLLR -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from ..utils.misc import get_selfsim_tarnon from .torch_trainer import TorchTrainer @@ -130,7 +130,7 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels. 
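
Across these trainers the data loader now yields a dictionary per batch, and each trainer picks the tensors it needs by key through the tensors_subset helper added to hyperion/torch/utils/devices.py later in this same patch. A minimal standalone sketch of that convention; the key names and shapes below are illustrative placeholders, not taken from a real dataset:

    import torch

    # toy batch in the dict format produced by the updated AudioDataset
    batch = {
        "seg_id": ["seg_0", "seg_1"],      # non-tensor metadata
        "x": torch.randn(2, 16000),        # input waveforms
        "class_id": torch.tensor([3, 7]),  # integer class labels
    }
    batch_keys = ["x", "class_id"]
    # equivalent to tensors_subset(batch, batch_keys) with no device move
    input_data, target = tuple(batch[k] for k in batch_keys)
    assert input_data.size(0) == target.size(0) == 2
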
""" - + batch_keys = [self.input_key, self.target_key] self.model.update_margin(self.cur_epoch) return_multi = self.loss_weights["multi"] > 0 @@ -140,20 +140,20 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc() batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if return_bin: target_bin, mask_bin = get_selfsim_tarnon(target, return_mask=True) with self.amp_autocast(): output = self.model( - data, + input_data, target, return_multi=return_multi, return_bin=return_bin, @@ -204,7 +204,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): Args: data_loader: PyTorch data loader return input/output pairs """ - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc() batch_metrics = ODict() return_multi = self.loss_weights["multi"] > 0 @@ -218,15 +218,15 @@ def validation_epoch(self, data_loader, swa_update_bn=False): log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if return_bin: target_bin, mask_bin = get_selfsim_tarnon(target, return_mask=True) with self.amp_autocast(): output = self.model( - data, return_multi=return_multi, return_bin=return_bin + input_data, return_multi=return_multi, return_bin=return_bin ) loss = 0 if return_multi: diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 5cadd57c..5099d829 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -24,7 +24,13 @@ from ..lr_schedulers import LRScheduler as LRS from ..lr_schedulers import LRSchedulerFactory as LRSF from ..optim import OptimizerFactory as OF -from ..utils import FairFullyShardedDDP, FairShardedDDP, MetricAcc, TorchDDP +from ..utils import ( + FairFullyShardedDDP, + FairShardedDDP, + MetricAcc, + TorchDDP, + tensors_subset, +) class DDPType(str, Enum): @@ -286,18 +292,20 @@ def train_epoch(self, data_loader): Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) + with self.amp_autocast(): - output = self.model(data) + output = self.model(input_data) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -310,7 +318,6 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - self._reduce_metric(loss) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, 
target) @@ -333,7 +340,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): data_loader: PyTorch data loader return input/output pairs. sw_update_bn: wheter or not, update batch-norm layers in SWA. """ - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(self.device) batch_metrics = ODict() with torch.no_grad(): @@ -344,12 +351,11 @@ def validation_epoch(self, data_loader, swa_update_bn=False): log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(data) + output = self.model(input_data) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index 8e75d768..2db9b24b 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -12,7 +12,7 @@ from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer @@ -112,26 +112,27 @@ def __init__( # ) def train_epoch(self, data_loader): + """Training epoch loop + Args: + data_loader: pytorch data loader returning noisy and clean features + """ + + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() for batch, data in enumerate(data_loader): - - if isinstance(data, (tuple, list)): - data, _ = data - self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data = data.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(data, return_x_mean=True) + output = self.model(input_data, x_target=target, return_x_mean=True) elbo = output["elbo"].mean() loss = -elbo / self.grad_acc_steps x_hat = output["x_mean"] @@ -150,20 +151,26 @@ def train_epoch(self, data_loader): for metric in ["log_px", "kldiv_z"]: batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, data) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs = ODict(("train_" + k, v) for k, v in logs.items()) logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + Args: + data_loader: PyTorch data loader return input/output pairs + """ + + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): @@ -175,21 +182,17 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - if isinstance(data, (tuple, list)): - data, _ = data - - data = data.to(self.device) - batch_size = data.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + 
batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(data, return_x_mean=True) + output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] for metric in ["elbo", "log_px", "kldiv_z"]: batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, data) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index bac95b78..1f345f7d 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -12,7 +12,7 @@ from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .dvae_trainer import DVAETrainer @@ -112,27 +112,20 @@ def __init__( def train_epoch(self, data_loader): + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() for batch, data in enumerate(data_loader): - - assert isinstance(data, (tuple, list)) - x = data[0] - x_target = data[1] - self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - x = x.to(self.device) - x_target = x_target.to(self.device) - batch_size = x.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(x, x_target=x_target, return_x_mean=True) + output = self.model(input_data, x_target=target, return_x_mean=True) loss = output["loss"] x_hat = output["x_mean"] loss = loss.mean() / self.grad_acc_steps @@ -154,7 +147,7 @@ def train_epoch(self, data_loader): output["log_perplexity"].mean().item() ) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x_target) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -167,29 +160,22 @@ def train_epoch(self, data_loader): return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = "train" + log_tag = "train_" self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - - assert isinstance(data, (tuple, list)) - x = data[0] - x_target = data[1] - - x = x.to(self.device) - x_target = x_target.to(self.device) - batch_size = x.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(x, x_target=x_target, return_x_mean=True) + output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] for metric in ["loss", "elbo", "log_px", "kldiv_z", "vq_loss"]: @@ -199,7 +185,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): ) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x_target) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index c4b046c0..83a30cc8 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -12,7 +12,7 
@@ from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .vae_trainer import VAETrainer @@ -110,28 +110,20 @@ def __init__( # ) def train_epoch(self, data_loader): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() for batch, data in enumerate(data_loader): - - if isinstance(data, (tuple, list)): - x = data[0] - else: - x = data - self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - x = x.to(self.device) - batch_size = x.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(x, return_x_mean=True) + output = self.model(input_data, x_target=target, return_x_mean=True) loss = output["loss"] x_hat = output["x_mean"] loss = loss.mean() / self.grad_acc_steps @@ -153,7 +145,7 @@ def train_epoch(self, data_loader): output["log_perplexity"].mean().item() ) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -166,7 +158,7 @@ def train_epoch(self, data_loader): return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): @@ -178,17 +170,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - - if isinstance(data, (tuple, list)): - x = data[0] - else: - x = data - - x = x.to(self.device) - batch_size = x.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(x, return_x_mean=True) + output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] for metric in ["loss", "elbo", "log_px", "kldiv_z", "vq_loss"]: @@ -198,7 +183,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): ) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(x_hat, x) + batch_metrics[k] = metric(x_hat, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 22971deb..0a45c97f 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -12,7 +12,7 @@ from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .xvector_trainer import XVectorTrainer @@ -136,34 +136,33 @@ def __init__( ) def train_epoch(self, data_loader): - + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if batch % self.grad_acc_steps == 0: 
if torch.rand(1) < self.p_attack: # generate adversarial attacks - logging.info("generating adv attack for batch=%d" % (batch)) + logging.info("generating adv attack for batch=%d", batch) self.model.eval() - data_adv = self.attack.generate(data, target) + data_adv = self.attack.generate(input_data, target) - max_delta = torch.max(torch.abs(data_adv - data)).item() + max_delta = torch.max(torch.abs(data_adv - input_data)).item() - logging.info("adv attack max perturbation=%f" % (max_delta)) - data = data_adv + logging.info("adv attack max perturbation=%f", max_delta) + input_data = data_adv self.model.train() self.optimizer.zero_grad() with self.amp_autocast(): - output = self.model(data, target) + output = self.model(input_data, target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -191,7 +190,7 @@ def train_epoch(self, data_loader): return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() @@ -202,14 +201,14 @@ def validation_epoch(self, data_loader, swa_update_bn=False): log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if torch.rand(1) < self.p_val_attack: # generate adversarial attacks self.model.eval() - data = self.attack.generate(data, target) + data = self.attack.generate(input_data, target) if swa_update_bn: self.model.train() diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index ac28b95a..1a25e9a9 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -12,7 +12,7 @@ from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .xvector_trainer_from_wav import XVectorTrainerFromWav @@ -134,41 +134,41 @@ def __init__( "first step of the gradient acc. 
loop given that" "adv optimization over-writes the gradients " "stored in the model" - ) - % (p_attack, 1.0 / self.grad_acc_steps) + ), + p_attack, + 1.0 / self.grad_acc_steps, ) def train_epoch(self, data_loader): - + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if batch % self.grad_acc_steps == 0: if torch.rand(1) < self.p_attack: # generate adversarial attacks # logging.info('generating adv attack for batch=%d' % (batch)) self.model.eval() - data_adv = self.attack.generate(data, target) + data_adv = self.attack.generate(input_data, target) - max_delta = torch.max(torch.abs(data_adv - data)).item() + max_delta = torch.max(torch.abs(data_adv - input_data)).item() # z = torch.abs(data_adv-data) > 100 # logging.info('zz {} {}'.format(data[z], data_adv[z])) # logging.info('adv attack max perturbation=%f' % (max_delta)) - data = data_adv + input_data = data_adv self.model.train() self.optimizer.zero_grad() with torch.no_grad(): - feats = self.feat_extractor(data) + feats = self.feat_extractor(input_data) with self.amp_autocast(): output = self.model(feats, y=target) @@ -199,7 +199,7 @@ def train_epoch(self, data_loader): return logs def validation_epoch(self, data_loader, swa_update_bn=False): - + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() @@ -211,18 +211,17 @@ self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) if torch.rand(1) < self.p_val_attack: # generate adversarial attacks self.model.eval() - data = self.attack.generate(data, target) + input_data = self.attack.generate(input_data, target) if swa_update_bn: self.model.train() with torch.no_grad(): - feats = self.feat_extractor(data) + feats = self.feat_extractor(input_data) with self.amp_autocast(): output = self.model(feats) loss = self.loss(output, target) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 6703ea5d..9b97fb63 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -11,7 +11,7 @@ from torch.distributed.elastic.multiprocessing.errors import record from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer @@ -123,7 +123,7 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels.
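
All of these train_epoch loops share one gradient-accumulation scheme: gradients are zeroed every grad_acc_steps batches and each loss is divided by grad_acc_steps, so the accumulated gradient averages over the window. A self-contained sketch of the scheme with a placeholder model, optimizer, and loader (the real trainers defer the optimizer step to update_model()):

    import torch

    model = torch.nn.Linear(10, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss_fn = torch.nn.CrossEntropyLoss()
    grad_acc_steps = 4
    loader = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(8)]

    for batch, (x, y) in enumerate(loader):
        if batch % grad_acc_steps == 0:
            optimizer.zero_grad()
        # scale so the summed backward passes equal the window average
        loss = loss_fn(model(x), y) / grad_acc_steps
        loss.backward()
        if (batch + 1) % grad_acc_steps == 0:
            optimizer.step()
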
""" - + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) @@ -135,10 +135,10 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): - output = self.model(data, y=target) + output = self.model(input_data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index fdb2627e..5dbdfd0f 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -11,7 +11,7 @@ from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .xvector_trainer import XVectorTrainer @@ -144,24 +144,23 @@ def train_epoch(self, data_loader): Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with self.amp_autocast(): outputs = self.model( - data, + input_data, y=target, return_enc_layers=self.reg_layers_enc, return_classif_layers=self.reg_layers_classif, @@ -179,7 +178,7 @@ def train_epoch(self, data_loader): batch_metrics["loss-classif"] = loss.item() prior_outputs = self.prior_model( - data, + input_data, return_enc_layers=self.reg_layers_enc, return_classif_layers=self.reg_layers_classif, return_output=False, @@ -230,7 +229,6 @@ def train_epoch(self, data_loader): logs = ODict(("train_" + k, v) for k, v in logs.items()) logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - # total_batches +=1 logs = metric_acc.metrics logs["lr"] = self._get_lr() diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 07882f31..6a9aa067 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -10,7 +10,7 @@ import torch.nn as nn from ...utils.misc import filter_func_args -from ..utils import MetricAcc +from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg @@ -141,23 +141,22 @@ def train_epoch(self, data_loader): Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, 
data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] - + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with torch.no_grad(): - feats = self.feat_extractor(data) + feats = self.feat_extractor(input_data) with self.amp_autocast(): outputs = self.model( @@ -241,6 +240,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): Args: data_loader: PyTorch data loader return input/output pairs """ + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() with torch.no_grad(): @@ -251,11 +251,11 @@ def validation_epoch(self, data_loader, swa_update_bn=False): log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) - feats = self.feat_extractor(data) + feats = self.feat_extractor(input_data) with self.amp_autocast(): output = self.model(feats) loss = self.loss(output, target) diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index dfbd8e00..4a66f0eb 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -7,10 +7,11 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from ...utils.misc import filter_func_args -from ..utils import MetricAcc, TorchDDP +from ..utils import MetricAcc, TorchDDP, tensors_subset from .xvector_trainer import XVectorTrainer @@ -122,24 +123,28 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels. """ - + batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.feat_extractor.train() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + # input_data, target = ( + # data[self.input_key].to(self.device), + # data[self.target_key].to(self.device), + # ) + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) with torch.no_grad(): - feats = self.feat_extractor(data) + feats = self.feat_extractor(input_data) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(feats, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps @@ -174,6 +179,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): data_loader: PyTorch data loader return input/output pairs. sw_update_bn: wheter or not, update batch-norm layers in SWA. 
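
The from-wav trainers split the forward pass in two: acoustic features are computed under torch.no_grad(), since the feature extractor is not being optimized here, and only the x-vector forward runs under autocast. A standalone sketch of that split; the Identity and Linear modules are stand-ins for the filter-bank frontend and the x-vector network:

    import torch
    import torch.cuda.amp as amp

    use_amp = torch.cuda.is_available()
    feat_extractor = torch.nn.Identity()  # placeholder frontend
    model = torch.nn.Linear(16000, 4)     # placeholder x-vector network
    wav = torch.randn(2, 16000)

    with torch.no_grad():                 # no graph for feature extraction
        feats = feat_extractor(wav)
    with amp.autocast(enabled=use_amp):   # mixed precision only for the model
        output = model(feats)
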
""" + batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.feat_extractor.eval() @@ -185,12 +191,12 @@ def validation_epoch(self, data_loader, swa_update_bn=False): log_tag = "val_" self.model.eval() - for batch, (data, target) in enumerate(data_loader): - data, target = data.to(self.device), target.to(self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + input_data, target = tensors_subset(data, batch_keys, self.device) + batch_size = input_data.size(0) - feats = self.feat_extractor(data) - with self.amp_autocast(): + feats = self.feat_extractor(input_data) + with amp.autocast(enabled=self.use_amp): output = self.model(feats) loss = self.loss(output, target) diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index 3a4692dc..da4a3773 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -3,7 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .devices import open_device +from .devices import ( + open_device, + tensors_to_device, + tensors_to_cpu, + tensors_to_numpy, + tensors_subset, +) from .metric_acc import MetricAcc from .masking import seq_lengths_to_mask, scale_seq_lengths from .collation import collate_seq_1d, collate_seq_2d, collate_seq_nd diff --git a/hyperion/torch/utils/devices.py b/hyperion/torch/utils/devices.py index cb77f1e5..19c124b2 100644 --- a/hyperion/torch/utils/devices.py +++ b/hyperion/torch/utils/devices.py @@ -51,6 +51,8 @@ def tensors_to_device(data, device): elif isinstance(data, list): for i, value in enumerate(data): data[i] = value.to(device) + elif isinstance(data, tuple): + data = tuple(value.to(device) for value in data) elif isinstance(data, torch.Tensor): data = data.to(device) else: @@ -66,6 +68,8 @@ def tensors_to_cpu(data): elif isinstance(data, list): for i, value in enumerate(data): data[i] = value.cpu() + elif isinstance(data, tuple): + data = tuple(value.cpu() for value in data) elif isinstance(data, torch.Tensor): data = data.cpu() else: @@ -81,9 +85,23 @@ def tensors_to_numpy(data): elif isinstance(data, list): for i, value in enumerate(data): data[i] = value.cpu().numpy() + elif isinstance(data, tuple): + data = tuple(value.cpu().numpy() for value in data) elif isinstance(data, torch.Tensor): data = data.cpu().numpy() else: raise Exception(f"Unknown data type for {data}") return data + + +def tensors_subset(data, keys, device=None, return_dict=False): + if return_dict: + data = {k: data[k] for k in keys} + else: + data = tuple(data[k] for k in keys) + + if device is not None: + data = tensors_to_device(data, device) + + return data From 4f7bc3ddec8e2ebc63e71dca37ea63199765b2f7 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 28 Dec 2022 06:06:40 -0500 Subject: [PATCH 071/154] fixed amp --- hyperion/torch/trainers/ae_trainer.py | 5 +++-- hyperion/torch/trainers/dvae_trainer.py | 5 +++-- hyperion/torch/trainers/plda_trainer.py | 7 +++++-- hyperion/torch/trainers/torch_trainer.py | 4 ++-- hyperion/torch/trainers/vae_trainer.py | 5 +++-- hyperion/torch/trainers/vq_dvae_trainer.py | 5 +++-- hyperion/torch/trainers/vq_vae_trainer.py | 5 +++-- hyperion/torch/trainers/xvector_adv_trainer.py | 5 +++-- hyperion/torch/trainers/xvector_adv_trainer_from_wav.py | 5 +++-- hyperion/torch/trainers/xvector_trainer.py | 3 ++- hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py | 3 ++- .../trainers/xvector_trainer_deep_feat_reg_from_wav.py | 5 +++-- 12 files changed, 
35 insertions(+), 22 deletions(-) diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 7d6c7f07..69e97cc6 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -8,6 +8,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser @@ -134,7 +135,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data) loss = self.loss(output, target).mean() / self.grad_acc_steps @@ -179,7 +180,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data) loss = self.loss(output, target) diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index 383a26d1..0523ad44 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -8,6 +8,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser @@ -129,7 +130,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) elbo = output["elbo"].mean() @@ -182,7 +183,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py index 54c4205f..d6761e87 100644 --- a/hyperion/torch/trainers/plda_trainer.py +++ b/hyperion/torch/trainers/plda_trainer.py @@ -7,6 +7,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from ...utils.misc import filter_func_args @@ -151,7 +152,8 @@ def train_epoch(self, data_loader): if return_bin: target_bin, mask_bin = get_selfsim_tarnon(target, return_mask=True) - with self.amp_autocast(): + + with amp.autocast(enabled=self.use_amp): output = self.model( input_data, target, @@ -224,7 +226,8 @@ def validation_epoch(self, data_loader, swa_update_bn=False): if return_bin: target_bin, mask_bin = get_selfsim_tarnon(target, return_mask=True) - with self.amp_autocast(): + + with amp.autocast(enabled=self.use_amp): output = self.model( input_data, return_multi=return_multi, return_bin=return_bin ) diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 5099d829..ad3df161 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -304,7 +304,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = 
input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data) loss = self.loss(output, target).mean() / self.grad_acc_steps @@ -354,7 +354,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data) loss = self.loss(output, target) diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index 2db9b24b..ba401cb7 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -8,6 +8,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser @@ -131,7 +132,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) elbo = output["elbo"].mean() loss = -elbo / self.grad_acc_steps @@ -184,7 +185,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index 1f345f7d..03800e0d 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -8,6 +8,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser @@ -124,7 +125,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) loss = output["loss"] x_hat = output["x_mean"] @@ -174,7 +175,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index 83a30cc8..40b6b10d 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -8,6 +8,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser @@ -122,7 +123,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) loss = output["loss"] 
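
This commit replaces the self.amp_autocast() wrapper with an explicit amp.autocast(enabled=self.use_amp) context. Autocast only chooses per-op precision; the loss scaling that keeps fp16 gradients from underflowing comes from the GradScaler calls already visible in the train loops (self.grad_scaler.scale(loss).backward()). A minimal sketch of the full pairing, using a placeholder model and data, with AMP simply disabled when no GPU is present:

    import torch
    import torch.cuda.amp as amp

    use_amp = torch.cuda.is_available()
    device = "cuda" if use_amp else "cpu"
    model = torch.nn.Linear(10, 2).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = amp.GradScaler(enabled=use_amp)
    x = torch.randn(8, 10, device=device)
    y = torch.randint(0, 2, (8,), device=device)

    optimizer.zero_grad()
    with amp.autocast(enabled=use_amp):
        loss = torch.nn.functional.cross_entropy(model(x), y)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)         # unscales grads; skips step on inf/nan
    scaler.update()
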
x_hat = output["x_mean"] @@ -172,7 +173,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 0a45c97f..af915d6b 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -8,6 +8,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser @@ -161,7 +162,7 @@ def train_epoch(self, data_loader): self.optimizer.zero_grad() - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, target) loss = self.loss(output, target).mean() / self.grad_acc_steps @@ -213,7 +214,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.train() with torch.no_grad(): - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(data, **self.amp_args) loss = self.loss(output, target) diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 1a25e9a9..1e1b1778 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -8,6 +8,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser @@ -170,7 +171,7 @@ def train_epoch(self, data_loader): with torch.no_grad(): feats = self.feat_extractor(input_data) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(feats, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps @@ -222,7 +223,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with torch.no_grad(): feats = self.feat_extractor(input_data) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(feats) loss = self.loss(output, target) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 9b97fb63..a9a9d98f 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -7,6 +7,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from torch.distributed.elastic.multiprocessing.errors import record @@ -137,7 +138,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(input_data, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 5dbdfd0f..4e791347 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -7,6 +7,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn 
as nn from jsonargparse import ActionParser, ArgumentParser @@ -158,7 +159,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): outputs = self.model( input_data, y=target, diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 6a9aa067..6d06eac8 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -7,6 +7,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn from ...utils.misc import filter_func_args @@ -158,7 +159,7 @@ def train_epoch(self, data_loader): with torch.no_grad(): feats = self.feat_extractor(input_data) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): outputs = self.model( feats, y=target, @@ -256,7 +257,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_size = input_data.size(0) feats = self.feat_extractor(input_data) - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp): output = self.model(feats) loss = self.loss(output, target) From 5a138e8ff6251760febfd3897b80ddc9b8602406 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 29 Dec 2022 06:29:52 -0500 Subject: [PATCH 072/154] update feature, embeddings datasets and samplers, added fwse and cfwse, added IDRnD Resnets, ResNet should work with x_lengths --- hyperion/torch/data/__init__.py | 3 + hyperion/torch/data/audio_dataset.py | 8 +- .../data/class_weighted_embed_sampler.py | 280 ++++++++++ hyperion/torch/data/embed_dataset.py | 187 ++++--- hyperion/torch/data/embed_sampler.py | 112 ++++ hyperion/torch/data/embed_sampler_factory.py | 125 +++++ hyperion/torch/data/feat_seq_dataset.py | 369 +++++-------- hyperion/torch/layer_blocks/__init__.py | 10 +- .../torch/layer_blocks/res2net1d_blocks.py | 4 +- .../torch/layer_blocks/res2net2d_blocks.py | 4 +- hyperion/torch/layer_blocks/res2net_blocks.py | 35 +- hyperion/torch/layer_blocks/se_blocks.py | 99 +++- .../torch/layer_blocks/seresnet_blocks.py | 44 +- hyperion/torch/layers/spec_augment.py | 46 +- hyperion/torch/models/xvectors/xvector.py | 39 +- hyperion/torch/narchs/dc1d_decoder.py | 41 +- hyperion/torch/narchs/dc1d_encoder.py | 42 +- hyperion/torch/narchs/dc2d_decoder.py | 41 +- hyperion/torch/narchs/dc2d_encoder.py | 41 +- hyperion/torch/narchs/resnet.py | 483 +++++++++++++++++- hyperion/torch/narchs/resnet1d_decoder.py | 43 +- hyperion/torch/narchs/resnet1d_encoder.py | 44 +- hyperion/torch/narchs/resnet2d_decoder.py | 41 +- hyperion/torch/narchs/resnet2d_encoder.py | 42 +- hyperion/torch/narchs/resnet_factory.py | 83 ++- hyperion/torch/trainers/torch_trainer.py | 2 +- 26 files changed, 1729 insertions(+), 539 deletions(-) create mode 100644 hyperion/torch/data/class_weighted_embed_sampler.py create mode 100644 hyperion/torch/data/embed_sampler.py create mode 100644 hyperion/torch/data/embed_sampler_factory.py diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index ae2efca6..65608a0c 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -4,9 +4,12 @@ """ from .audio_dataset import AudioDataset + # datasets from .feat_seq_dataset import FeatSeqDataset from .paired_feat_seq_dataset import PairedFeatSeqDataset + # samplers # from 
.weighted_seq_sampler import ClassWeightedSeqSampler from .seg_sampler_factory import SegSamplerFactory +from .embed_sampler_factory import EmbedSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 4368ec94..02b81efa 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -482,16 +482,16 @@ def __init__( self.r = AR(audio_file, wav_scale=wav_scale) if rank == 0: - logging.info("loading segments file %s" % segments_file) + logging.info("loading segments file %s", segments_file) self.seg_set = SegmentSet.load(segments_file) if rank == 0: - logging.info("dataset contains %d seqs" % len(self.seg_set)) + logging.info("dataset contains %d seqs", len(self.seg_set)) self.is_val = is_val if time_durs_file is not None: if rank == 0: - logging.info("loading durations file %s" % time_durs_file) + logging.info("loading durations file %s", time_durs_file) time_durs = SegmentSet.load(time_durs_file) self.seg_set["duration"] = time_durs.loc[ @@ -771,7 +771,7 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None, skip={}): + def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/data/class_weighted_embed_sampler.py b/hyperion/torch/data/class_weighted_embed_sampler.py new file mode 100644 index 00000000..aed9105d --- /dev/null +++ b/hyperion/torch/data/class_weighted_embed_sampler.py @@ -0,0 +1,280 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math +import time + +import numpy as np +import pandas as pd +import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from .hyp_sampler import HypSampler + + +class ClassWeightedEmbedSampler(HypSampler): + def __init__( + self, + embed_set, + class_info, + batch_size=1, + num_embeds_per_class=1, + weight_exponent=1.0, + weight_mode="custom", + num_hard_prototypes=0, + affinity_matrix=None, + class_name="class_id", + shuffle=False, + seed=1234, + ): + super().__init__(shuffle=shuffle, seed=seed) + self.class_name = class_name + self.embed_set = embed_set + self.class_info = class_info + self.batch_size = batch_size + self.avg_batch_size = batch_size + + self.num_embeds_per_class = num_embeds_per_class + + self.weight_exponent = weight_exponent + self.weight_mode = weight_mode + + self.num_hard_prototypes = num_hard_prototypes + self.batch = 0 + + self._compute_len() + self._compute_num_classes_per_batch() + self._gather_class_info() + self._set_class_weights() + + self.set_hard_prototypes(affinity_matrix) + + logging.info( + ("sampler batches/epoch=%d batch-size=%d, " "classes/batch=%.2f "), + self._len, + self.batch_size, + self.num_classes_per_batch, + ) + + def _set_seed(self): + if self.shuffle: + self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.rank) + else: + self.rng.manual_seed(self.seed + 100 * self.rank) + + def _compute_len(self): + self._len = int( + math.ceil(len(self.embed_set) / self.avg_batch_size / self.world_size) + ) + + def __len__(self): + return self._len + + def _gather_class_info(self): + # we get some extra info that we need for the classes. 
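+        # assumed table layout (not enforced here): class_info has columns
+        # "id", "class_idx" and "weights", and embed_set.df has one row per
+        # embedding with an "id" column plus a column named self.class_name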
+        # we need the mapping from class index to id
+        self.map_class_idx_to_ids = self.class_info[["class_idx", "id"]]
+        self.map_class_idx_to_ids.set_index("class_idx", inplace=True)
+
+        # we need the list of embeddings from each class
+        # to speed up embedding sampling;
+        # searching them in each batch is too slow
+        map_class_to_embeds = self.embed_set.df[["id", self.class_name]].set_index(
+            self.class_name
+        )
+        self.map_class_to_embed_idx = {}
+        for class_id in self.class_info["id"].values:
+            if class_id in map_class_to_embeds.index:
+                embed_ids = map_class_to_embeds.loc[class_id, "id"]
+                if isinstance(embed_ids, str):
+                    embed_ids = [embed_ids]
+                else:
+                    embed_ids = embed_ids.values
+
+                embed_idx = self.embed_set.get_loc(embed_ids)
+            else:
+                embed_idx = []
+                self.class_info.loc[class_id, "weights"] = 0.0
+                self.class_info.renorm_weights()
+
+            self.map_class_to_embed_idx[class_id] = embed_idx
+
+    def _set_class_weights(self):
+        if self.weight_mode == "uniform":
+            self.class_info.set_uniform_weights()
+        elif self.weight_mode == "data-prior":
+            weights = self.class_info["total_duration"].values
+            self.class_info.set_weights(weights)
+
+        if self.weight_exponent != 1.0:
+            self.class_info.exp_weights(self.weight_exponent)
+
+    @property
+    def hard_prototype_mining(self):
+        return self.num_hard_prototypes > 1
+
+    def set_hard_prototypes(self, affinity_matrix):
+        if affinity_matrix is None:
+            self.hard_prototypes = None
+            return
+
+        # don't sample hard negatives from classes with zero weight or absent classes
+        zero_w = self.class_info["weights"] == 0
+        if np.any(zero_w):
+            zero_w_idx = self.class_info.loc[zero_w, "class_idx"].values
+            affinity_matrix[:, zero_w_idx] = -1000
+
+        for i in range(affinity_matrix.size(1)):
+            mask_i = self.class_info["class_idx"] == i
+            if np.all(mask_i == 0):
+                affinity_matrix[:, i] = -1000
+
+        # hard prototypes for a class are itself and the k-1 classes closest to it.
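+        # torch.topk along the class dimension keeps, for each class, the
+        # num_hard_prototypes classes with highest affinity (the class itself
+        # should rank first), so batches pair each class with its confusables.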
+ self.hard_prototypes = torch.topk( + affinity_matrix, self.num_hard_prototypes, dim=-1 + ).indices + + def get_hard_prototypes(self, class_idx): + return self.hard_prototypes[class_idx].flatten().numpy() + + def _compute_num_classes_per_batch(self): + num_classes = self.batch_size / self.num_embeds_per_class + if self.hard_prototype_mining: + num_classes /= self.num_hard_prototypes + self.num_classes_per_batch = int(math.ceil(num_classes)) + + def _get_class_weights(self,): + return torch.as_tensor(self.class_info["weights"].values) + + def _sample_classes(self): + weights = self._get_class_weights() + row_idx = torch.multinomial( + weights, + num_samples=self.num_classes_per_batch, + replacement=True, + generator=self.rng, + ).numpy() + + class_ids = self.class_info.iloc[row_idx].id.values + if self.hard_prototype_mining: + # map class ids to class indexes + class_idx = self.class_info.loc[class_ids, "class_idx"].values + class_idx = self.get_hard_prototypes(class_idx) + # map back to class ids + class_ids = self.map_class_idx_to_ids.loc[class_idx, "id"].values + + return class_ids + + def _sample_embeds(self, class_ids): + + id_col_idx = self.embed_set.get_col_idx("id") + embed_ids = [] + for c in class_ids: + # get embeds belonging to c + embed_idx_c = self.map_class_to_embed_idx[c] + # sample num_embeds_per_class randomly + if len(embed_idx_c) == 0: + logging.error("no embeddings found with class=%s", c) + + sel_idx = torch.randint( + low=0, + high=len(embed_idx_c), + size=(self.num_embeds_per_class,), + generator=self.rng, + ).numpy() + + sel_embed_idx_c = embed_idx_c[sel_idx] + sel_embed_ids_c = list(self.embed_set.iloc[sel_embed_idx_c, id_col_idx]) + embed_ids.extend(sel_embed_ids_c) + + return embed_ids + + def __next__(self): + + if self.batch == self._len: + raise StopIteration + + class_ids = self._sample_classes() + embed_ids = self._sample_embeds(class_ids) + if self.batch == 0: + logging.info("batch 0 uttidx=%s", str(embed_ids[:10])) + + self.batch += 1 + return embed_ids + + @staticmethod + def filter_args(**kwargs): + + valid_args = ( + "batch_size", + "num_embeds_per_class", + "weight_exponent", + "weight_mode", + "num_hard_prototypes", + "class_name", + "shuffle", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--batch-size", type=int, default=1, help=("batch size per gpu"), + ) + + parser.add_argument( + "--num-embeds-per-class", + type=int, + default=1, + help=("number of embeds per class in batch"), + ) + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--num-hard-prototypes", + type=int, + default=0, + help=("number of hard prototype classes per batch"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the embeddings at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the info table indicates the class", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, 
action=ActionParser(parser=parser))

diff --git a/hyperion/torch/data/embed_dataset.py b/hyperion/torch/data/embed_dataset.py
index e489e3a3..2963854d 100644
--- a/hyperion/torch/data/embed_dataset.py
+++ b/hyperion/torch/data/embed_dataset.py
@@ -11,56 +11,63 @@
 import numpy as np
 import pandas as pd
 import torch
+import torch.distributed as dist
+
+from jsonargparse import ActionParser, ArgumentParser, ActionYesNo
 from torch.utils.data import Dataset
 
 from ...io import RandomAccessDataReaderFactory as RF
-from ...utils.utt2info import Utt2Info
+from ...utils.misc import filter_func_args
+from ...utils.class_info import ClassInfo
+from ...utils.info_table import InfoTable
 from ..torch_defs import floatstr_torch
 
-# import copy
-
-
-
-
-
 class EmbedDataset(Dataset):
     def __init__(
         self,
         embeds=None,
-        class_ids=None,
-        class_weights=None,
-        rspecifier=None,
-        key_file=None,
-        class_file=None,
+        embed_info=None,
+        class_info=None,
+        embed_file=None,
+        embed_info_file=None,
+        class_names=None,
+        class_files=None,
+        return_segment_info=None,
         path_prefix=None,
         preload_embeds=False,
-        return_class=True,
         is_val=False,
     ):
-        assert embeds is not None or rspecifier is not None
-        assert rspecifier is None or key_file is not None
-        assert class_ids is not None or key_file is not None
+        assert embeds is not None or embed_file is not None
+        assert embed_info is not None or embed_info_file is not None
+        assert class_info is not None or class_files is not None
+        super().__init__()
+        try:
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+        except:
+            rank = 0
+            world_size = 1
+
+        self.rank = rank
+        self.world_size = world_size
 
         self.preload_embeds = preload_embeds
-        if key_file is not None:
-            if isinstance(key_file, Utt2Info):
-                self.u2c = key_file
-            else:
-                logging.info("loading utt2info file %s", key_file)
-                self.u2c = Utt2Info.load(key_file, sep=" ")
-            self.num_embeds = len(self.u2c)
-        else:
-            assert embeds is not None
-            self.u2c = None
-            self.num_embeds = len(embeds)
+
+        if embed_info is None:
+            embed_info = InfoTable.load(embed_info_file)
+
+        self.embed_info = embed_info
+        if rank == 0:
+            logging.info("dataset contains %d embeddings", len(self.embed_info))
 
         if embeds is None:
-            logging.info("opening dataset %s", rspecifier)
-            self.r = RF.create(rspecifier, path_prefix=path_prefix, scp_sep=" ")
+            if rank == 0:
+                logging.info("opening dataset %s", embed_file)
+            self.r = RF.create(embed_file, path_prefix=path_prefix, scp_sep=" ")
             if self.preload_embeds:
-                self.embeds = self.r.load(u2c.key, squeeze=True).astype(
+                self.embeds = self.r.load(embed_info["id"], squeeze=True).astype(
                     floatstr_torch(), copy=False
                 )
                 del self.r
@@ -70,65 +77,80 @@ def __init__(
             self.embeds = embeds.astype(floatstr_torch(), copy=False)
 
         self.is_val = is_val
-        self._prepare_class_info(class_file, class_ids, class_weights)
-        self.return_class = return_class
-
-        logging.info("dataset contains %d embeds", self.num_embeds)
+        if rank == 0:
+            logging.info("loading class-info files")
+        self._load_class_infos(class_names, class_files, is_val)
+
+        self.return_segment_info = (
+            [] if return_segment_info is None else return_segment_info
+        )
+
+    def _load_class_infos(self, class_names, class_files, is_val):
+        self.class_info = {}
+        if class_names is None:
+            assert class_files is None
+            return
+
+        assert len(class_names) == len(class_files)
+        for name, file in zip(class_names, class_files):
+            assert (
+                name in self.embed_info
+            ), f"class_name {name} not present in the embedding set"
+            if self.rank == 0:
+                logging.info("loading class-info file %s", file)
+            table = ClassInfo.load(file)
+            self.class_info[name] = table
+            if not is_val:
+                # check that all classes are present in the training embeddings
+                class_ids = table["id"]
+                embed_class_ids = self.embed_info[name].unique()
+                for c_id in class_ids:
+                    if c_id not in embed_class_ids:
+                        logging.warning(
+                            "%s class: %s not present in dataset", name, c_id
+                        )
+
+    @property
+    def num_embeds(self):
+        return len(self.embed_info)
 
     def __len__(self):
         return self.num_embeds
 
-    def _prepare_class_info(self, class_file, class_idx=None, class_weights=None):
-        if class_file is None:
-            if self.u2c is not None:
-                classes, class_idx = np.unique(self.u2c.info, return_inverse=True)
-            self.num_classes = np.max(class_idx) + 1
-        else:
-            logging.info("reading class-file %s", class_file)
-            class_info = pd.read_csv(class_file, header=None, sep=" ")
-            class2idx = {str(k): i for i, k in enumerate(class_info[0])}
-            self.num_classes = len(class2idx)
-            class_idx = np.array([class2idx[k] for k in self.u2c.info], dtype=int)
-            if class_info.shape[1] == 2:
-                class_weights = np.array(class_info[1]).astype(
-                    floatstr_torch(), copy=False
-                )
+    @property
+    def num_classes(self):
+        return {k: t.num_classes for k, t in self.class_info.items()}
 
-        class2utt_idx = {}
-        class2num_utt = np.zeros((self.num_classes,), dtype=int)
-
-        for k in range(self.num_classes):
-            idx = (class_idx == k).nonzero()[0]
-            class2utt_idx[k] = idx
-            class2num_utt[k] = len(idx)
-            if class2num_utt[k] == 0:
-                if not self.is_val:
-                    logging.warning("class %d doesn't have any samples", k)
-                if class_weights is None:
-                    class_weights = np.ones((self.num_classes,), dtype=floatstr_torch())
-                class_weights[k] = 0
-
-        count_empty = np.sum(class2num_utt == 0)
-        if count_empty > 0:
-            logging.warning("%d classes have 0 samples", count_empty)
-
-        self.utt_idx2class = class_idx
-        self.class2utt_idx = class2utt_idx
-        self.class2num_utt = class2num_utt
-        if class_weights is not None:
-            class_weights /= np.sum(class_weights)
-            class_weights = torch.Tensor(class_weights)
-        self.class_weights = class_weights
-
-    def __getitem__(self, index):
+    def _read_embeds(self, embed_id):
         if self.preload_embeds:
+            index = self.embed_info.index.get_loc(embed_id)
             x = self.embeds[index]
         else:
-            key = self.u2c.key[index]
-            x = self.r.read([key])[0].astype(floatstr_torch(), copy=False)
-
-        if not self.return_class:
-            return x
-
-        class_idx = self.utt_idx2class[index]
-        return x, class_idx
+            x = self.r.read([embed_id])[0].astype(floatstr_torch(), copy=False)
+        return x
+
+    def _get_embed_info(self, embed_id):
+        embed_info = {}
+        # converts the class_ids to integers
+        for info_name in self.return_segment_info:
+            embed_info_i = self.embed_info.loc[embed_id, info_name]
+            if info_name in self.class_info:
+                # if the type of information is a class-id
+                # we use the class information table to
+                # convert from id to integer
+                class_info = self.class_info[info_name]
+                embed_info_i = class_info.loc[embed_info_i, "class_idx"]
+
+            embed_info[info_name] = embed_info_i
+
+        return embed_info
+
+    def __getitem__(self, embed_id):
+
+        x = self._read_embeds(embed_id)
+
+        data = {"embed_id": embed_id, "x": x}
+        # adds the embed labels
+        embed_info = self._get_embed_info(embed_id)
+        data.update(embed_info)
+        return data

diff --git a/hyperion/torch/data/embed_sampler.py b/hyperion/torch/data/embed_sampler.py
new file mode 100644
index 00000000..8836fe2a
--- /dev/null
+++ b/hyperion/torch/data/embed_sampler.py
@@ -0,0 +1,112 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+""" + +import logging +import math + +import numpy as np +import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from .hyp_sampler import HypSampler + + +class EmbedSampler(HypSampler): + def __init__( + self, embed_set, batch_size=1, shuffle=False, drop_last=False, seed=1234, + ): + super().__init__(shuffle=shuffle, seed=seed) + self.embed_set = embed_set + self.batch_size = batch_size + self.avg_batch_size = batch_size + + num_batches = len(self.embed_set) / batch_size / self.world_size + if drop_last: + self._len = int(num_batches) + else: + self._len = int(math.ceil(num_batches)) + + self._permutation = None + + def __len__(self): + return self._len + + def _shuffle_embeds(self): + self._permutation = torch.randperm( + len(self.embed_set), generator=self.rng + ).numpy() + + def __iter__(self): + super().__iter__() + if self.shuffle: + self._shuffle_segs() + + self.start = self.rank + return self + + def __next__(self): + + if self.batch == self._len: + raise StopIteration + + stop = min( + self.start + self.world_size * self.min_batch_size, len(self.embed_set) + ) + if self.shuffle: + idx = self._permutation[self.start : stop : self.world_size] + else: + idx = slice(self.start, stop, self.world_size) + + self.start += self.world_size * self.min_batch_size + + embed_ids = self.embed_set.iloc[idx].id + + if self.batch == 0: + logging.info("batch 0 chunks=%s", str(embed_ids[:10])) + + self.batch += 1 + return embed_ids + + @staticmethod + def filter_args(**kwargs): + + valid_args = ( + "batch_size", + "shuffle", + "drop_last", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--batch-size", type=int, default=1, help=("minimum batch size per gpu"), + ) + + parser.add_argument( + "--drop-last", action=ActionYesNo, help="drops the last batch of the epoch", + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the segments or chunks at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/embed_sampler_factory.py b/hyperion/torch/data/embed_sampler_factory.py new file mode 100644 index 00000000..43d00b1d --- /dev/null +++ b/hyperion/torch/data/embed_sampler_factory.py @@ -0,0 +1,125 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Optional, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from .embed_dataset import EmbedDataset +from .bucketing_seg_sampler import BucketingSegSampler +from .class_weighted_embed_sampler import ClassWeightedEmbedSampler +from .embed_sampler import EmbedSampler + +sampler_dict = { + "class_weighted_embed_sampler": ClassWeightedEmbedSampler, + "embed_sampler": EmbedSampler, +} + + +class EmbedSamplerFactory(object): + """Factory class to create different types of samplers for + embeddings like x-vectors. 
+ """ + + @staticmethod + def create( + dataset: EmbedDataset, + sampler_type: str = "class_weighted_embed_sampler", + **kwargs, + ): + """Functions that creates a sampler based on a dataset, sampler_type and sampler arguments. + + Args: + dataset: embeddings dataset object containing the data info + sampler_type: string indicating the sampler type. + """ + + sampler_class = sampler_dict[sampler_type] + sampler_kwargs = sampler_class.filter_args(**kwargs) + + if sampler_type in ["class_weighted_embed_sampler"]: + try: + class_name = sampler_kwargs["class_name"] + except: + class_name = "class_id" + sampler_kwargs["class_info"] = dataset.class_info[class_name] + + logging.info(f"sampler-args={sampler_kwargs}") + + return sampler_class(dataset.embed_info, **sampler_kwargs) + + @staticmethod + def filter_args(**kwargs): + + valid_args = ( + "batch_size", + "num_embeds_per_class", + "weight_exponent", + "weight_mode", + "num_hard_prototypes", + "class_name", + "shuffle", + "seed", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--batch-size", type=int, default=1, help=("batch size per gpu"), + ) + + parser.add_argument( + "--num-embeds-per-class", + type=int, + default=1, + help=("number of embeds per class in batch"), + ) + parser.add_argument( + "--weight-exponent", + default=1.0, + type=float, + help=("exponent for class weights"), + ) + parser.add_argument( + "--weight-mode", + default="custom", + choices=["custom", "uniform", "data-prior"], + help=("method to get the class weights"), + ) + + parser.add_argument( + "--num-hard-prototypes", + type=int, + default=0, + help=("number of hard prototype classes per batch"), + ) + + parser.add_argument( + "--shuffle", + action=ActionYesNo, + help="shuffles the embeddings at the beginning of the epoch", + ) + + parser.add_argument( + "--seed", + type=int, + default=1234, + help=("seed for sampler random number generator"), + ) + + parser.add_argument( + "--class-name", + default="class_id", + help="which column in the info table indicates the class", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/feat_seq_dataset.py b/hyperion/torch/data/feat_seq_dataset.py index 61fbd576..1605ead3 100644 --- a/hyperion/torch/data/feat_seq_dataset.py +++ b/hyperion/torch/data/feat_seq_dataset.py @@ -13,11 +13,15 @@ import numpy as np import pandas as pd import torch -from jsonargparse import ActionParser, ArgumentParser +import torch.distributed as dist + +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo from torch.utils.data import Dataset from ...io import RandomAccessDataReaderFactory as RF -from ...utils.utt2info import Utt2Info +from ...utils.misc import filter_func_args +from ...utils.class_info import ClassInfo +from ...utils.segment_set import SegmentSet from ..torch_defs import floatstr_torch @@ -25,82 +29,100 @@ class FeatSeqDataset(Dataset): def __init__( self, feat_file, - key_file, - class_file=None, + segments_file, + class_names=None, + class_files=None, num_frames_file=None, + return_segment_info=None, path_prefix=None, - min_chunk_length=1, - max_chunk_length=None, - return_fullseqs=False, - return_class=True, transpose_input=True, is_val=False, ): - logging.info("opening dataset %s", feat_file) + super().__init__() + try: + rank = 
dist.get_rank() + world_size = dist.get_world_size() + except: + rank = 0 + world_size = 1 + + if rank == 0: + logging.info("opening feature reader %s", feat_file) + self.r = RF.create(feat_file, path_prefix=path_prefix, scp_sep=" ") - logging.info("loading utt2info file %s" % key_file) - self.u2c = Utt2Info.load(key_file, sep=" ") - logging.info("dataset contains %d seqs" % self.num_seqs) + + if rank == 0: + logging.info("loading segments file %s" % segments_file) + + self.seg_set = SegmentSet.load(segments_file) + if rank == 0: + logging.info("dataset contains %d seqs", len(self.seg_set)) self.is_val = is_val - self._seq_lengths = None if num_frames_file is not None: - self._read_num_frames_file(num_frames_file) - self._prune_short_seqs(min_chunk_length) - - self.short_seq_exist = self._seq_shorter_than_max_length_exists( - max_chunk_length - ) + if rank == 0: + logging.info("loading durations file %s", num_frames_file) - self._prepare_class_info(class_file) + time_durs = SegmentSet.load(num_frames_file) + self.seg_set["num_frames"] = time_durs.loc[ + self.seg_set["id"] + ].class_id.values.astype(int, copy=False) + else: + assert "num_frames" in self.seg_set - if max_chunk_length is None: - max_chunk_length = min_chunk_length - self._min_chunk_length = min_chunk_length - self._max_chunk_length = max_chunk_length + logging.info("loading class-info files") + self._load_class_infos(class_names, class_files, is_val) - self.return_fullseqs = return_fullseqs - self.return_class = return_class + self.return_segment_info = ( + [] if return_segment_info is None else return_segment_info + ) self.transpose_input = transpose_input - def _read_num_frames_file(self, file_path): - logging.info("reading num_frames file %s" % file_path) - nf_df = pd.read_csv(file_path, header=None, sep=" ") - nf_df.index = nf_df[0] - self._seq_lengths = nf_df.loc[self.u2c.key, 1].values + def _load_class_infos(self, class_names, class_files, is_val): + self.class_info = {} + if class_names is None: + assert class_files is None + return + + assert len(class_names) == len(class_files) + for name, file in zip(class_names, class_files): + assert ( + name in self.seg_set + ), f"class_name {name} not present in the segment set" + if self.rank == 0: + logging.info("loading class-info file %s" % file) + table = ClassInfo.load(file) + self.class_info[name] = table + if not is_val: + # check that all classes are present in the training segments + class_ids = table["id"] + segment_class_ids = self.seg_set[name].unique() + for c_id in class_ids: + if c_id not in segment_class_ids: + logging.warning( + "%s class: %s not present in dataset", name, c_id + ) + + def set_epoch(self, epoch): + self.epoch = epoch @property def num_seqs(self): - return len(self.u2c) + return len(self.seg_set) def __len__(self): return self.num_seqs @property def seq_lengths(self): - if self._seq_lengths is None: - self._seq_lengths = self.r.read_num_rows(self.u2c.key) - - return self._seq_lengths + return self.seg_set["num_frames"] @property def total_length(self): return np.sum(self.seq_lengths) - @property - def min_chunk_length(self): - if self.return_fullseqs: - self._min_chunk_length = np.min(self.seq_lengths) - return self._min_chunk_length - - @property - def max_chunk_length(self): - if self._max_chunk_length is None: - self._max_chunk_length = np.max(self.seq_lengths) - return self._max_chunk_length - @property def min_seq_length(self): return np.min(self.seq_lengths) @@ -109,239 +131,114 @@ def min_seq_length(self): def max_seq_length(self): 
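+        """Returns the number of frames of the longest segment."""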
return np.max(self.seq_lengths) - def _prune_short_seqs(self, min_length): - logging.info("pruning short seqs") - keep_idx = self.seq_lengths >= min_length - self.u2c = self.u2c.filter_index(keep_idx) - self._seq_lengths = self.seq_lengths[keep_idx] - logging.info( - "pruned seqs with min_length < %d," - "keep %d/%d seqs" % (min_length, self.num_seqs, len(keep_idx)) - ) - - def _prepare_class_info(self, class_file): - class_weights = None - if class_file is None: - classes, class_idx = np.unique(self.u2c.info, return_inverse=True) - class2idx = {k: i for i, k in enumerate(classes)} - else: - logging.info("reading class-file %s" % (class_file)) - class_info = pd.read_csv(class_file, header=None, sep=" ") - class2idx = {str(k): i for i, k in enumerate(class_info[0])} - class_idx = np.array([class2idx[k] for k in self.u2c.info], dtype=int) - if class_info.shape[1] == 2: - class_weights = np.array(class_info[1]).astype( - floatstr_torch(), copy=False - ) - - self.num_classes = len(class2idx) - - class2utt_idx = {} - class2num_utt = np.zeros((self.num_classes,), dtype=int) - - for k in range(self.num_classes): - idx = (class_idx == k).nonzero()[0] - class2utt_idx[k] = idx - class2num_utt[k] = len(idx) - if class2num_utt[k] == 0: - if not self.is_val: - logging.warning("class %d doesn't have any samples" % (k)) - if class_weights is None: - class_weights = np.ones((self.num_classes,), dtype=floatstr_torch()) - class_weights[k] = 0 - - count_empty = np.sum(class2num_utt == 0) - if count_empty > 0: - logging.warning("%d classes have 0 samples" % (count_empty)) - - self.utt_idx2class = class_idx - self.class2utt_idx = class2utt_idx - self.class2num_utt = class2num_utt - if class_weights is not None: - class_weights /= np.sum(class_weights) - class_weights = torch.Tensor(class_weights) - self.class_weights = class_weights - - if self.short_seq_exist: - # if there are seq shorter than max_chunk_lenght we need some extra variables - # we will need class_weights to put to 0 classes that have all utts shorter than the batch chunk length - if self.class_weights is None: - self.class_weights = torch.ones((self.num_classes,)) - - # we need the max length of the utterances of each class - class2max_length = torch.zeros((self.num_classes,), dtype=torch.int) - for c in range(self.num_classes): - if class2num_utt[c] > 0: - class2max_length[c] = int( - np.max(self.seq_lengths[self.class2utt_idx[c]]) - ) - - self.class2max_length = class2max_length - - def _seq_shorter_than_max_length_exists(self, max_length): - return np.any(self.seq_lengths < max_length) - @property - def var_chunk_length(self): - return self.min_chunk_length < self.max_chunk_length - - def get_random_chunk_length(self): - - if self.var_chunk_length: - return torch.randint( - low=self.min_chunk_length, high=self.max_chunk_length + 1, size=(1,) - ).item() - - return self.max_chunk_length - - # def get_random_chunk_length(self, index): - - # if self.min_chunk_length < self.max_chunk_length: - # if self.short_seq_exist: - # max_chunk_length = min(int(np.min(self.seq_lengths[index])), - # self.max_chunk_length) - # else: - # max_chunk_length = self.max_chunk_length + def num_classes(self): + return {k: t.num_classes for k, t in self.class_info.items()} + + def _parse_segment_item(self, segment): + if isinstance(segment, (tuple, list)): + seg_id, start, num_frames = segment + assert num_frames <= self.seg_set.loc[seg_id].num_frames, ( + f"{seg_id} with start={start} num_frames " + f"({self.seg_set.loc[seg_id].num_frames}) < " + f"chunk duration 
({num_frames})" + ) + else: + seg_id, start, num_frames = segment, 0, 0 - # chunk_length = torch.randint( - # low=self.min_chunk_length, high=max_chunk_length+1, size=(1,)).item() + if "start" in self.seg_set: + start += self.seg_set.loc[seg_id].start - # # logging.info('{} {} {} set_random_chunk_length={}'.format( - # # self,os.getpid(), threading.get_ident(), chunk_length)) - # return chunk_length + return seg_id, int(start), int(num_frames) - # return self.max_chunk_length + def _read_feats(self, seg_id, start, num_frames): + x = self.r.read(seg_id, row_offset=start, num_rows=num_frames)[0].astype( + floatstr_torch(), copy=False + ) + return x - def __getitem__(self, index): - # logging.info('{} {} {} get item {}'.format( - # self, os.getpid(), threading.get_ident(), index)) - if self.return_fullseqs: - return self._get_fullseq(index) - else: - return self._get_random_chunk(index) + def _get_segment_info(self, seg_id): + seg_info = {} + # converts the class_ids to integers + for info_name in self.return_segment_info: + seg_info_i = self.seg_set.loc[seg_id, info_name] + if info_name in self.class_info: + # if the type of information is a class-id + # we use the class information table to + # convert from id to integer + class_info = self.class_info[info_name] + seg_info_i = class_info.loc[seg_info_i, "class_idx"] - def _get_fullseq(self, index): - key = self.u2c.key[index] - x = self.r.read([key])[0].astype(floatstr_torch(), copy=False) - if self.transpose_input: - x = x.T - if not self.return_class: - return x + seg_info[info_name] = seg_info_i - class_idx = self.utt_idx2class[index] - return x, class_idx + return seg_info - def _get_random_chunk(self, index): + def __getitem__(self, segment): - if len(index) == 2: - index, chunk_length = index - else: - chunk_length = self.max_chunk_length - - key = self.u2c.key[index] - full_seq_length = int(self.seq_lengths[index]) - assert ( - chunk_length <= full_seq_length - ), "chunk_length(%d) <= full_seq_length(%d)" % (chunk_length, full_seq_length) - first_frame = torch.randint( - low=0, high=full_seq_length - chunk_length + 1, size=(1,) - ).item() - - x = self.r.read([key], row_offset=first_frame, num_rows=chunk_length)[0].astype( - floatstr_torch(), copy=False - ) + seg_id, start, num_frames = self._parse_segment_item(segment) + x = self._read_feats(seg_id, start, num_frames) + num_frames = x.shape[0] if self.transpose_input: x = x.T - if not self.return_class: - return x + data = {"seg_id": seg_id, "x": x, "x_lengths": num_frames} - class_idx = self.utt_idx2class[index] - return x, class_idx + # adds the segment labels + seg_info = self._get_segment_info(seg_id) + data.update(seg_info) + return data @staticmethod def filter_args(**kwargs): - valid_args = ( - "feat_file", - "key_file", - "path_prefix", - "class_file", - "num_frames_file", - "min_chunk_length", - "max_chunk_length", - "return_fullseqs", - "part_idx", - "num_parts", - ) - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return filter_func_args(FeatSeqDataset.__init__, kwargs) @staticmethod - def add_class_args(parser, prefix=None, skip={"feat_file", "key_file"}): + def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") if "feat_file" not in skip: parser.add_argument( - "--feat-file", - required=True, - help=("acoustic features manifest file"), + "--audio-file", required=True, help=("feature manifest file"), ) - if "key_file" not in skip: + if "segments_file" not in skip: 
parser.add_argument( - "--key-file", - required=True, - help=("key manifest file"), + "--segments-file", required=True, help=("segments manifest file"), ) parser.add_argument( - "--path-prefix", default="", help=("path prefix for rspecifier scp file") + "--class-names", + default=None, + nargs="+", + help=( + "list with the names of the types of classes in the datasets, e.g., speaker, language" + ), ) parser.add_argument( - "--class-file", - default=None, - help=("ordered list of classes keys, it can contain class weights"), + "--class-files", default=None, nargs="+", help=("list of class info files"), ) parser.add_argument( "--num-frames-file", default=None, - help=( - "utt to num_frames file, if None it reads from the dataset " - "but it is slow" - ), + help=("segment to num-frames file, if durations are not in segments_file"), ) parser.add_argument( - "--min-chunk-length", - type=int, - default=None, - help=("minimum length of sequence chunks"), - ) - parser.add_argument( - "--max-chunk-length", - type=int, + "--return-segment-info", default=None, - help=("maximum length of sequence chunks"), + nargs="+", + help=( + "list of columns of the segment file which should be returned as supervisions" + ), ) parser.add_argument( - "--return-fullseqs", - default=False, - action="store_true", - help=("returns full sequences instead of chunks"), + "--path-prefix", default="", help=("path prefix for rspecifier scp file") ) - - # parser.add_argument('--part-idx', - # type=int, default=1, - # help=('splits the list of files in num-parts and process part_idx')) - # parser.add_argument('--num-parts', - # type=int, default=1, - # help=('splits the list of files in num-parts and process part_idx')) + RF.add_class_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='feature sequence dataset options') - - add_argparse_args = add_class_args diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index c292f09a..2fa71766 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -4,7 +4,15 @@ """ from .fc_blocks import FCBlock -from .se_blocks import SEBlock2D, TSEBlock2D, SEBlock2d, TSEBlock2d, SEBlock1d +from .se_blocks import ( + SEBlock2D, + TSEBlock2D, + SEBlock2d, + TSEBlock2d, + FwSEBlock2d, + CFwSEBlock2d, + SEBlock1d, +) from .tdnn_blocks import TDNNBlock from .etdnn_blocks import ETDNNBlock from .resetdnn_blocks import ResETDNNBlock diff --git a/hyperion/torch/layer_blocks/res2net1d_blocks.py b/hyperion/torch/layer_blocks/res2net1d_blocks.py index 59706f61..804dbbd3 100644 --- a/hyperion/torch/layer_blocks/res2net1d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net1d_blocks.py @@ -213,7 +213,7 @@ def forward(self, x, x_mask=None): x_i = self.bn1s[i](x_i) x_i = self.act1(x_i) if not self.norm_before: - x_i = self.bn1(x_i) + x_i = self.bn1s[i](x_i) x.append(x_i) if self.scale > 1: @@ -405,7 +405,7 @@ def forward(self, x, x_mask=None): x_i = self.bn2s[i](x_i) x_i = self.act2(x_i) if not self.norm_before: - x_i = self.bn2(x_i) + x_i = self.bn2s[i](x_i) x.append(x_i) if self.scale > 1: diff --git a/hyperion/torch/layer_blocks/res2net2d_blocks.py b/hyperion/torch/layer_blocks/res2net2d_blocks.py index e426d809..26d19a9a 100644 --- a/hyperion/torch/layer_blocks/res2net2d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net2d_blocks.py @@ -213,7 +213,7 @@ def forward(self, x, x_mask=None): x_i = self.bn1s[i](x_i) x_i = self.act1(x_i) if not 
self.norm_before:
-                x_i = self.bn1(x_i)
+                x_i = self.bn1s[i](x_i)
 
             x.append(x_i)
 
         if self.scale > 1:
@@ -402,7 +402,7 @@ def forward(self, x, x_mask=None):
                 x_i = self.bn2s[i](x_i)
             x_i = self.act2(x_i)
             if not self.norm_before:
-                x_i = self.bn2(x_i)
+                x_i = self.bn2s[i](x_i)
 
             x.append(x_i)
 
         if self.scale > 1:

diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py
index 83ce7601..072926c9 100644
--- a/hyperion/torch/layer_blocks/res2net_blocks.py
+++ b/hyperion/torch/layer_blocks/res2net_blocks.py
@@ -8,7 +8,7 @@
 from torch.nn import Conv2d, BatchNorm2d, Dropout2d
 
 from ..layers import ActivationFactory as AF
-from .se_blocks import SEBlock2D, TSEBlock2D
+from .se_blocks import SEBlock2d, TSEBlock2d, FwSEBlock2d, CFwSEBlock2d
 
 
 def _conv3x3(in_channels, out_channels, stride=1, groups=1, dilation=1, bias=False):
@@ -80,6 +80,7 @@ def __init__(
         norm_layer=None,
         norm_before=True,
         se_r=None,
+        se_type="cw-se",
         time_se=False,
         num_feats=None,
     ):
@@ -148,9 +149,16 @@ def __init__(
 
         if se_r is not None:
             if time_se:
-                self.se_layer = TSEBlock2D(channels, num_feats, se_r, activation)
-            else:
-                self.se_layer = SEBlock2D(channels, se_r, activation)
+                se_type = "t-se"
+
+            if se_type == "t-se":
+                self.se_layer = TSEBlock2d(channels, num_feats, se_r, activation)
+            elif se_type == "cw-se":
+                self.se_layer = SEBlock2d(channels, se_r, activation)
+            elif se_type == "fw-se":
+                self.se_layer = FwSEBlock2d(num_feats, se_r, activation)
+            elif se_type == "cfw-se":
+                self.se_layer = CFwSEBlock2d(channels, num_feats, se_r, activation)
         else:
             self.se_layer = None
 
@@ -255,6 +263,7 @@ def __init__(
         norm_layer=None,
         norm_before=True,
         se_r=None,
+        se_type="cw-se",
         time_se=False,
         num_feats=None,
     ):
@@ -318,11 +327,17 @@ def __init__(
 
         if se_r is not None:
             if time_se:
-                self.se_layer = TSEBlock2D(
-                    channels * self.expansion, num_feats, se_r, activation
-                )
-            else:
-                self.se_layer = SEBlock2D(channels * self.expansion, se_r, activation)
+                se_type = "t-se"
+
+            se_channels = channels * self.expansion
+            if se_type == "t-se":
+                self.se_layer = TSEBlock2d(se_channels, num_feats, se_r, activation)
+            elif se_type == "cw-se":
+                self.se_layer = SEBlock2d(se_channels, se_r, activation)
+            elif se_type == "fw-se":
+                self.se_layer = FwSEBlock2d(num_feats, se_r, activation)
+            elif se_type == "cfw-se":
+                self.se_layer = CFwSEBlock2d(se_channels, num_feats, se_r, activation)
         else:
             self.se_layer = None
 
@@ -362,7 +377,7 @@ def forward(self, x, x_mask=None):
                 x_i = self.bn2s[i](x_i)
             x_i = self.act2(x_i)
             if not self.norm_before:
-                x_i = self.bn2(x_i)
+                x_i = self.bn2s[i](x_i)
 
             x.append(x_i)
 
         if self.scale > 1:

diff --git a/hyperion/torch/layer_blocks/se_blocks.py b/hyperion/torch/layer_blocks/se_blocks.py
index c3ba8e20..e99d545e 100644
--- a/hyperion/torch/layer_blocks/se_blocks.py
+++ b/hyperion/torch/layer_blocks/se_blocks.py
@@ -10,7 +10,7 @@
 from ..layers import ActivationFactory as AF
 
 
-class SEBlock2D(nn.Module):
+class SEBlock2d(nn.Module):
     """Squeeze-excitation block 2d
        from https://arxiv.org/abs/1709.01507.
 
@@ -43,8 +43,8 @@ def _standardize_mask(self, mask):
 
         return mask
 
-    def forward(self, x, x_mask=None):
-        """Forward function.
+    def compute_scale_logits(self, x, x_mask=None):
+        """Computes the scale logits before the sigmoid.
 
         Args:
           x: input tensor with shape = (batch, channels, heigh, width).
@@ -61,12 +61,26 @@ def forward(self, x, x_mask=None):
             total = torch.mean(x_mask, dim=(2, 3), keepdim=True)
             z = torch.mean(x * x_mask, dim=(2, 3), keepdim=True) / total
 
-        scale = self.sigmoid(self.conv2(self.act(self.conv1(z))))
+        return self.conv2(self.act(self.conv1(z)))
+
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, channels, height, width).
+          x_mask: Binary mask indicating which spatial dimensions are valid of
+                  shape=(batch, time), (batch, 1, time), (batch, height, width)
+
+        Returns:
+          Tensor with shape = (batch, channels, height, width).
+        """
+        scale_logits = self.compute_scale_logits(x, x_mask)
+        scale = self.sigmoid(scale_logits)
         y = scale * x
         return y
 
 
-class TSEBlock2D(nn.Module):
+class TSEBlock2d(nn.Module):
     """From https://arxiv.org/abs/1709.01507
        Modified to do pooling only in time dimension.
 
@@ -138,6 +152,77 @@ def forward(self, x, x_mask=None):
         return y
 
 
+class FwSEBlock2d(SEBlock2d):
+    """Frequency-wise squeeze-excitation block 2d.
+
+    Attributes:
+      num_feats: number of input/output frequency bins.
+      r: Squeeze-excitation compression ratio.
+      activation: Non-linear activation object, string of configuration dictionary.
+
+    """
+
+    def __init__(self, num_feats, r=16, activation={"name": "relu", "inplace": True}):
+        super().__init__(num_feats, r, activation)
+
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, channels, height, width).
+          x_mask: Binary mask indicating which spatial dimensions are valid of
+                  shape=(batch, time), (batch, 1, time)
+        Returns:
+          Tensor with shape = (batch, channels, height, width).
+        """
+        x = x.transpose(1, 2)
+        y = super().forward(x, x_mask)
+        y = y.transpose(1, 2).contiguous()
+        return y
+
+
+class CFwSEBlock2d(nn.Module):
+    """2-d channel and frequency wise squeeze-excitation block
+
+    Attributes:
+      num_channels: input/output channels.
+      num_feats: Number of features in dimension 2.
+      r: Squeeze-excitation compression ratio.
+      activation: Non-linear activation object, string of configuration dictionary.
+
+    """
+
+    def __init__(
+        self,
+        num_channels,
+        num_feats,
+        r=16,
+        activation={"name": "relu", "inplace": True},
+    ):
+        super().__init__()
+        self.cw_se = SEBlock2d(num_channels, r, activation)
+        self.fw_se = SEBlock2d(num_feats, r, activation)
+
+    def forward(self, x, x_mask=None):
+        """Forward function.
+
+        Args:
+          x: input tensor with shape = (batch, channels, height, width).
+          x_mask: Binary mask indicating which spatial dimensions are valid of
+                  shape=(batch, time), (batch, 1, time)
+        Returns:
+          Tensor with shape = (batch, channels, height, width).
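+
+        Note: the channel-wise and frequency-wise scale logits are computed
+        separately, summed, and passed through a single sigmoid, so the two
+        gates are fused rather than applied one after the other.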
+ """ + cw_scale_logits = self.cw_se.compute_scale_logits(x, x_mask) + fw_scale_logits = self.fw_se.compute_scale_logits( + x.transpose(1, 2), x_mask + ).transpose(1, 2) + scale_logits = cw_scale_logits + fw_scale_logits + scale = torch.sigmoid(scale_logits) + y = scale * x + return y + + class SEBlock1d(nn.Module): """1d Squeeze Excitation version of https://arxiv.org/abs/1709.01507 @@ -191,5 +276,5 @@ def forward(self, x, x_mask=None): # aliases to mantein backwards compatibility -SEBlock2d = SEBlock2D -TSEBlock2d = TSEBlock2D +SEBlock2D = SEBlock2d +TSEBlock2D = TSEBlock2d diff --git a/hyperion/torch/layer_blocks/seresnet_blocks.py b/hyperion/torch/layer_blocks/seresnet_blocks.py index 7f8d0bae..5074f466 100644 --- a/hyperion/torch/layer_blocks/seresnet_blocks.py +++ b/hyperion/torch/layer_blocks/seresnet_blocks.py @@ -8,7 +8,7 @@ from torch.nn import Conv2d, Linear, BatchNorm2d, Dropout2d from ..layers import ActivationFactory as AF -from .se_blocks import SEBlock2D, TSEBlock2D +from .se_blocks import SEBlock2d, TSEBlock2d, FwSEBlock2d, CFwSEBlock2d from .resnet_blocks import ResNetBasicBlock, ResNetBNBlock @@ -42,6 +42,7 @@ def __init__( norm_layer=None, norm_before=True, se_r=16, + se_type="cw-se", time_se=False, num_feats=None, ): @@ -59,16 +60,24 @@ def __init__( ) if time_se: - self.se_layer = TSEBlock2D(channels, num_feats, se_r, activation) - else: - self.se_layer = SEBlock2D(channels, se_r, activation) + se_type = "t-se" + + if se_type == "t-se": + self.se_layer = TSEBlock2d(channels, num_feats, se_r, activation) + elif se_type == "cw-se": + self.se_layer = SEBlock2d(channels, se_r, activation) + elif se_type == "fw-se": + self.se_layer = FwSEBlock2d(num_feats, se_r, activation) + elif se_type == "cfw-se": + self.se_layer = CFwSEBlock2d(channels, num_feats, se_r, activation) def forward(self, x, x_mask=None): """Forward function. Args: x: input tensor with shape = (batch, in_channels, in_heigh, in_width). - x_mask: unused. + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) Returns: Tensor with shape = (batch, out_channels, out_heigh, out_width). @@ -92,7 +101,7 @@ def forward(self, x, x_mask=None): if self.downsample is not None: residual = self.downsample(residual) - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) x += residual x = self.act2(x) @@ -135,6 +144,7 @@ def __init__( norm_layer=None, norm_before=True, se_r=16, + se_type="cw-se", time_se=False, num_feats=None, ): @@ -152,18 +162,26 @@ def __init__( ) if time_se: - self.se_layer = TSEBlock2D( - channels * self.expansion, num_feats, se_r, activation - ) - else: - self.se_layer = SEBlock2D(channels * self.expansion, se_r, activation) + se_type = "t-se" + + se_channels = channels * self.expansion + if se_type == "t-se": + self.se_layer = TSEBlock2d(se_channels, num_feats, se_r, activation) + elif se_type == "cw-se": + self.se_layer = SEBlock2d(se_channels, se_r, activation) + elif se_type == "fw-se": + self.se_layer = FwSEBlock2d(num_feats, se_r, activation) + elif se_type == "cfw-se": + self.se_layer = CFwSEBlock2d(se_channels, num_feats, se_r, activation) def forward(self, x, x_mask=None): """Forward function. Args: x: input tensor with shape = (batch, in_channels, in_heigh, in_width). - x_mask: unused. + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time), (batch, height, width) + Returns: Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" @@ -190,7 +208,7 @@ def forward(self, x, x_mask=None): if self.downsample is not None: residual = self.downsample(residual) - x = self.se_layer(x) + x = self.se_layer(x, x_mask=x_mask) x += residual x = self.act3(x) diff --git a/hyperion/torch/layers/spec_augment.py b/hyperion/torch/layers/spec_augment.py index 1366172b..a7ebcfb1 100644 --- a/hyperion/torch/layers/spec_augment.py +++ b/hyperion/torch/layers/spec_augment.py @@ -22,7 +22,7 @@ class AxisMasker(nn.Module): min_num_mask: minimum number of masks. max_num_mask: maximum number of masks. dim: axis where we apply the mask - fill_value: masking value + mask_value: masking value """ def __init__( @@ -32,7 +32,8 @@ def __init__( min_num_masks=1, max_num_masks=2, dim=-1, - fill_value=0, + mask_method="constant", + mask_value=0, ): super().__init__() assert min_width >= 0 @@ -45,13 +46,14 @@ def __init__( self.min_num_masks = min_num_masks self.max_num_masks = max_num_masks self.dim = dim - self.fill_value = fill_value + self.mask_method = mask_method + self.mask_value = mask_value def __repr__(self): s = ( "{}(min_width={}, max_width={}, " "min_num_masks={}, max_num_masks={}, " - "dim={}, fill_value={})" + "dim={}, mask_method={}, mask_value={})" ).format( self.__class__.__name__, self.min_width, @@ -59,7 +61,8 @@ def __repr__(self): self.min_num_masks, self.max_num_masks, self.dim, - self.fill_value, + self.mask_method, + self.mask_value, ) return s @@ -111,7 +114,14 @@ def forward(self, x): else: mask = mask.unsqueeze(-1) - x = x.masked_fill(mask, self.fill_value) + if self.mask_method == "mean": + mask_value = x.mean().item() + elif self.mask_method == "min": + mask_value = x.min().item() + else: + mask_value = self.mask_value + + x = x.masked_fill(mask, mask_value) if ndim > 3: x = x.view(in_shape) @@ -225,7 +235,7 @@ class SpecAugment(nn.Module): freq_max_width: maximum width of the frequency mask. freq_min_num_mask: minimum number of frequency masks. freq_max_num_mask: maximum number of frequency masks. - fill_value: masking value. + mask_value: masking value. """ def __init__( @@ -243,7 +253,8 @@ def __init__( freq_mask_max_width=20, freq_mask_min_num_masks=1, freq_mask_max_num_masks=2, - fill_value=0, + mask_method="constant", + mask_value=0, ): super().__init__() @@ -260,7 +271,7 @@ def __init__( self.freq_mask_max_width = freq_mask_max_width self.freq_mask_min_num_masks = freq_mask_min_num_masks self.freq_mask_max_num_masks = freq_mask_max_num_masks - self.fill_value = fill_value + self.mask_value = mask_value self.time_masker = None self.freq_masker = None @@ -273,7 +284,8 @@ def __init__( min_num_masks=time_mask_min_num_masks, max_num_masks=time_mask_max_num_masks, dim=-2, - fill_value=fill_value, + mask_method=mask_method, + mask_value=mask_value, ) if self.freq_mask_prob > 0: @@ -283,7 +295,8 @@ def __init__( min_num_masks=freq_mask_min_num_masks, max_num_masks=freq_mask_max_num_masks, dim=-1, - fill_value=fill_value, + mask_method=mask_method, + mask_value=mask_value, ) if self.time_warp_prob > 0: @@ -368,7 +381,8 @@ def filter_args(**kwargs): "freq_mask_min_width", "freq_mask_max_num_masks", "freq_mask_min_num_masks", - "fill_value", + "mask_value", + "mask_method", ) d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -463,9 +477,15 @@ def add_class_args(parser, prefix=None): default=2, help="max. 
number of freq mask",
         )
+        parser.add_argument(
+            "--mask-method",
+            default="constant",
+            choices=["constant", "min", "mean"],
+            help="method to get the masked value",
+        )
 
         parser.add_argument(
-            "--fill-value",
+            "--mask-value",
             type=float,
             default=0.0,
             help="filling value for the masked spec. bins",

diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py
index 2939db5b..2072241d 100644
--- a/hyperion/torch/models/xvectors/xvector.py
+++ b/hyperion/torch/models/xvectors/xvector.py
@@ -655,13 +655,13 @@ def valid_train_modes():
 
     @staticmethod
     def filter_args(**kwargs):
-        if "wo_norm" in kwargs:
-            kwargs["use_norm"] = not kwargs["wo_norm"]
-            del kwargs["wo_norm"]
+        # if "wo_norm" in kwargs:
+        #     kwargs["use_norm"] = not kwargs["wo_norm"]
+        #     del kwargs["wo_norm"]
 
-        if "norm_after" in kwargs:
-            kwargs["norm_before"] = not kwargs["norm_after"]
-            del kwargs["norm_after"]
+        # if "norm_after" in kwargs:
+        #     kwargs["norm_before"] = not kwargs["norm_after"]
+        #     del kwargs["norm_after"]
 
         # get arguments for pooling
         pool_args = PF.filter_args(**kwargs["pool_net"])
@@ -792,18 +792,31 @@ def add_class_args(parser, prefix=None, skip=set()):
         except:
             pass
 
+        # parser.add_argument(
+        #     "--wo-norm",
+        #     default=False,
+        #     action="store_true",
+        #     help="without batch normalization",
+        # )
+
+        # parser.add_argument(
+        #     "--norm-after",
+        #     default=False,
+        #     action="store_true",
+        #     help="batch normalizaton after activation",
+        # )
         parser.add_argument(
-            "--wo-norm",
-            default=False,
-            action="store_true",
-            help="without batch normalization",
+            "--use-norm",
+            default=True,
+            action=ActionYesNo,
+            help="use batch normalization",
         )
 
         parser.add_argument(
-            "--norm-after",
-            default=False,
-            action="store_true",
-            help="batch normalizaton after activation",
+            "--norm-before",
+            default=True,
+            action=ActionYesNo,
+            help="batch normalization before activation",
         )
 
         try:

diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py
index 82ac5a8a..22f63de6 100644
--- a/hyperion/torch/narchs/dc1d_decoder.py
+++ b/hyperion/torch/narchs/dc1d_decoder.py
@@ -4,7 +4,7 @@
 import math
 
-from jsonargparse import ArgumentParser, ActionParser
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
 
 import torch
 import torch.nn as nn
@@ -279,13 +279,13 @@ def get_config(self):
 
     @staticmethod
     def filter_args(**kwargs):
-        if "wo_norm" in kwargs:
-            kwargs["use_norm"] = not kwargs["wo_norm"]
-            del kwargs["wo_norm"]
+        # if "wo_norm" in kwargs:
+        #     kwargs["use_norm"] = not kwargs["wo_norm"]
+        #     del kwargs["wo_norm"]
 
-        if "norm_after" in kwargs:
-            kwargs["norm_before"] = not kwargs["norm_after"]
-            del kwargs["norm_after"]
+        # if "norm_after" in kwargs:
+        #     kwargs["norm_before"] = not kwargs["norm_after"]
+        #     del kwargs["norm_after"]
 
         valid_args = (
             "in_channels",
@@ -418,18 +418,31 @@ def add_class_args(parser, prefix=None, head_channels=False):
         except:
             pass
 
+        # parser.add_argument(
+        #     "--wo-norm",
+        #     default=False,
+        #     action="store_true",
+        #     help="without batch normalization",
+        # )
+
+        # parser.add_argument(
+        #     "--norm-after",
+        #     default=False,
+        #     action="store_true",
+        #     help="batch normalizaton after activation",
+        # )
         parser.add_argument(
-            "--wo-norm",
-            default=False,
-            action="store_true",
-            help="without batch normalization",
+            "--use-norm",
+            default=True,
+            action=ActionYesNo,
+            help="use batch normalization",
        )
 
         parser.add_argument(
-            "--norm-after",
-            default=False,
-            action="store_true",
-            help="batch normalizaton after activation",
+            "--norm-before",
+            default=True,
+            action=ActionYesNo,
+            help="batch normalization before activation",
         )
 
         if prefix is not None:

diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py
index c2fb3d02..619851bb 100644
--- a/hyperion/torch/narchs/dc1d_encoder.py
+++ b/hyperion/torch/narchs/dc1d_encoder.py
@@ -2,7 +2,7 @@
  Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-from jsonargparse import ArgumentParser, ActionParser
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
 import math
 
 import torch
@@ -252,13 +252,13 @@ def get_config(self):
 
     @staticmethod
     def filter_args(**kwargs):
-        if "wo_norm" in kwargs:
-            kwargs["use_norm"] = not kwargs["wo_norm"]
-            del kwargs["wo_norm"]
+        # if "wo_norm" in kwargs:
+        #     kwargs["use_norm"] = not kwargs["wo_norm"]
+        #     del kwargs["wo_norm"]
 
-        if "norm_after" in kwargs:
-            kwargs["norm_before"] = not kwargs["norm_after"]
-            del kwargs["norm_after"]
+        # if "norm_after" in kwargs:
+        #     kwargs["norm_before"] = not kwargs["norm_after"]
+        #     del kwargs["norm_after"]
 
         valid_args = (
             "in_feats",
@@ -392,18 +392,32 @@ def add_class_args(parser, prefix=None, head_channels=False, in_feats=False):
         except:
             pass
 
+        # parser.add_argument(
+        #     "--wo-norm",
+        #     default=False,
+        #     action="store_true",
+        #     help="without batch normalization",
+        # )
+
+        # parser.add_argument(
+        #     "--norm-after",
+        #     default=False,
+        #     action="store_true",
+        #     help="batch normalizaton after activation",
+        # )
+
         parser.add_argument(
-            "--wo-norm",
-            default=False,
-            action="store_true",
-            help="without batch normalization",
+            "--use-norm",
+            default=True,
+            action=ActionYesNo,
+            help="use batch normalization",
         )
 
         parser.add_argument(
-            "--norm-after",
-            default=False,
-            action="store_true",
-            help="batch normalizaton after activation",
+            "--norm-before",
+            default=True,
+            action=ActionYesNo,
+            help="batch normalization before activation",
         )
 
         if prefix is not None:

diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py
index e21d615a..0166baca 100644
--- a/hyperion/torch/narchs/dc2d_decoder.py
+++ b/hyperion/torch/narchs/dc2d_decoder.py
@@ -3,7 +3,7 @@
  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 
-from jsonargparse import ArgumentParser, ActionParser
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
 import math
 
 import torch
@@ -300,13 +300,13 @@ def get_config(self):
 
     @staticmethod
     def filter_args(**kwargs):
-        if "wo_norm" in kwargs:
-            kwargs["use_norm"] = not kwargs["wo_norm"]
-            del kwargs["wo_norm"]
+        # if "wo_norm" in kwargs:
+        #     kwargs["use_norm"] = not kwargs["wo_norm"]
+        #     del kwargs["wo_norm"]
 
-        if "norm_after" in kwargs:
-            kwargs["norm_before"] = not kwargs["norm_after"]
-            del kwargs["norm_after"]
+        # if "norm_after" in kwargs:
+        #     kwargs["norm_before"] = not kwargs["norm_after"]
+        #     del kwargs["norm_after"]
 
         valid_args = (
             "in_channels",
@@ -439,18 +439,31 @@ def add_class_args(parser, prefix=None, head_channels=False):
         except:
             pass
 
+        # parser.add_argument(
+        #     "--wo-norm",
+        #     default=False,
+        #     action="store_true",
+        #     help="without batch normalization",
+        # )
+
+        # parser.add_argument(
+        #     "--norm-after",
+        #     default=False,
+        #     action="store_true",
+        #     help="batch normalizaton after activation",
+        # )
         parser.add_argument(
-            "--wo-norm",
-            default=False,
-            action="store_true",
-            help="without batch normalization",
+            "--use-norm",
+            default=True,
+            action=ActionYesNo,
+            help="use batch normalization",
         )
 
         parser.add_argument(
-            "--norm-after",
-            default=False,
-            action="store_true",
-            help="batch normalizaton after activation",
+            "--norm-before",
+
default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) if prefix is not None: diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index 4102c4f7..e847dbb6 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -4,7 +4,7 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import torch import torch.nn as nn @@ -258,13 +258,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -397,18 +397,31 @@ def add_class_args(parser, prefix=None, head_channels=False): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) if prefix is not None: diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index 9185964c..34ac9b81 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -10,6 +10,7 @@ import torch.nn as nn from torch.nn import Conv1d, Linear, BatchNorm1d +from ..utils import seq_lengths_to_mask, scale_seq_lengths from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..layer_blocks import ( @@ -89,10 +90,12 @@ def __init__( do_maxpool=True, in_norm=True, se_r=16, - time_se=False, + se_type="cw-se", in_feats=None, res2net_scale=4, res2net_width_factor=1, + resb_channels=None, + time_se=False, ): super().__init__() @@ -100,6 +103,7 @@ def __init__( self.block = block self.has_se = False self.is_res2net = False + if isinstance(block, str): if block == "basic": self._block = ResNetBasicBlock @@ -117,7 +121,7 @@ def __init__( elif block == "res2bn": self._block = Res2NetBNBlock self.is_res2net = True - elif block == "seres2bn" or block == "tseres2bn": + elif block in ("seres2bn", "tseres2bn"): self._block = Res2NetBNBlock self.has_se = True self.is_res2net = True @@ -140,9 +144,13 @@ def __init__( # self.width_per_group = width_per_group self.se_r = se_r self.time_se = time_se + if time_se: + se_type = "t-se" + self.se_type = se_type self.in_feats = in_feats self.res2net_scale = res2net_scale self.res2net_width_factor = res2net_width_factor + self.resb_channels = resb_channels self.multilevel = multilevel self.endpoint_channels = endpoint_channels @@ -186,25 +194,31 @@ def __init__( self._context = self.in_block.context self._downsample_factor = self.in_block.downsample_factor + if resb_channels is None: + resb_channels = 
[base_channels * (2 ** i) for i in range(4)]
+
         self.cur_in_channels = conv_channels
-        self.layer1 = self._make_layer(self._block, base_channels, num_layers[0])
+        self.layer1 = self._make_layer(self._block, resb_channels[0], num_layers[0])
         self.layer2 = self._make_layer(
             self._block,
-            2 * base_channels,
+            # 2 * base_channels,
+            resb_channels[1],
             num_layers[1],
             stride=2,
             dilate=replace_stride_with_dilation[0],
         )
         self.layer3 = self._make_layer(
             self._block,
-            4 * base_channels,
+            # 4 * base_channels,
+            resb_channels[2],
             num_layers[2],
             stride=2,
             dilate=replace_stride_with_dilation[1],
         )
         self.layer4 = self._make_layer(
             self._block,
-            8 * base_channels,
+            # 8 * base_channels,
+            resb_channels[3],
             num_layers[3],
             stride=2,
             dilate=replace_stride_with_dilation[2],
@@ -277,8 +291,6 @@ def __init__(
                     nn.init.constant_(m.bn2.weight, 0)

     def _make_layer(self, block, channels, num_blocks, stride=1, dilate=False):
-        norm_layer = self._norm_layer
-
         downsample = None
         previous_dilation = self.dilation
         if dilate:
             self.dilation *= stride
@@ -286,11 +298,11 @@ def _make_layer(self, block, channels, num_blocks, stride=1, dilate=False):

         kwargs = {}
         if self.has_se:
-            if self.time_se:
+            if self.se_type == "cw-se":
+                kwargs = {"se_r": self.se_r}
+            else:
                 num_feats = int(self.in_feats / (self._downsample_factor * stride))
                 kwargs = {"se_r": self.se_r, "time_se": True, "num_feats": num_feats}
-            else:
-                kwargs = {"se_r": self.se_r}

         if self.is_res2net:
             kwargs["scale"] = self.res2net_scale
@@ -401,6 +413,15 @@ def out_shape(self, in_shape=None):

         return (in_shape[0], self.layer4[-1].out_channels, H, W)

+    def _forward_layer_with_lens(self, layer, x, in_lengths, max_in_length):
+        x_lengths = scale_seq_lengths(in_lengths, x.size(-1), max_in_length)
+        x_mask = seq_lengths_to_mask(x_lengths, x.size(-1), time_dim=3)
+
+        for sub_layer in layer:
+            x = sub_layer(x, x_mask)
+
+        return x
+
     def forward(self, x, x_lengths=None):
         """forward function

@@ -414,21 +435,39 @@
            otherwise, it returns tensor of represeantions of size=(batch, Cout, Hout, Wout)

         """
+        if x_lengths is not None:
+            # if all lengths are eq.
to the max length, we set x_lengths to None + max_length = x.size(-1) + if torch.all(x_lengths == max_length): + x_lengths = None if self.in_norm: x = self.in_bn(x) feats = [] x = self.in_block(x) - x = self.layer1(x) - x = self.layer2(x) - if self.multilevel: - feats.append(x) - x = self.layer3(x) - if self.multilevel: - feats.append(x) - x = self.layer4(x) - if self.multilevel: - feats.append(x) + + if x_lengths is None: + x = self.layer1(x) + x = self.layer2(x) + if self.multilevel: + feats.append(x) + x = self.layer3(x) + if self.multilevel: + feats.append(x) + x = self.layer4(x) + if self.multilevel: + feats.append(x) + else: + x = self._forward_layer_with_lens(self.layer1, x, x_lengths, max_length) + x = self._forward_layer_with_lens(self.layer2, x, x_lengths, max_length) + if self.multilevel: + feats.append(x) + x = self._forward_layer_with_lens(self.layer3, x, x_lengths, max_length) + if self.multilevel: + feats.append(x) + x = self._forward_layer_with_lens(self.layer4, x, x_lengths, max_length) + if self.multilevel: + feats.append(x) if self.multilevel: out2 = self.endpoint2(feats[0]) @@ -547,9 +586,11 @@ def get_config(self): "out_act": out_act, "hid_act": hid_act, "se_r": self.se_r, + "se_type": self.se_type, "in_feats": self.in_feats, "res2net_scale": self.res2net_scale, "res2net_width_factor": self.res2net_width_factor, + "resb_channels": self.resb_channels, } base_config = super().get_config() @@ -608,6 +649,20 @@ def __init__(self, in_channels, **kwargs): super().__init__("bn", [3, 4, 23, 3], in_channels, **kwargs) +class IdRndResNet100(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + super().__init__("basic", [6, 16, 24, 3], in_channels, **kwargs) + + +class IdRndResNet202(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + super().__init__("basic", [6, 16, 75, 3], in_channels, **kwargs) + + class LResNet18(ResNet): def __init__(self, in_channels, **kwargs): kwargs["conv_channels"] = 16 @@ -636,6 +691,16 @@ def __init__(self, in_channels, **kwargs): super().__init__("bn", [3, 4, 6, 3], in_channels, **kwargs) +# multi-level feature ResNet +class LResNet34_345(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["multilevel"] = True + kwargs["endpoint_channels"] = 64 + super().__init__("basic", [3, 4, 6, 3], in_channels, **kwargs) + + # Squezee-Excitation ResNets @@ -813,6 +878,228 @@ def __init__(self, in_channels, **kwargs): super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) +# Freq-wise Squezee-Excitation ResNets + + +class FwSEResNet18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [2, 2, 2, 2], in_channels, **kwargs) + + +class FwSEResNet34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEResNet101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSEResNet152(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + 
super().__init__("sebn", [3, 8, 36, 3], in_channels, **kwargs) + + +class FwSEResNext50_32x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEResNext101_32x8d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 256 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSEWideResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEWideResNet101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSELResNet18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [2, 2, 2, 2], in_channels, **kwargs) + + +class FwSELResNet34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("sebasic", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSELResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSELResNext50_4x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 4 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEIdRndResNet100(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + kwargs["se_type"] = "fw-se" + super().__init__("basic", [6, 16, 24, 3], in_channels, **kwargs) + + +class FwSEIdRndResNet202(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + kwargs["se_type"] = "fw-se" + super().__init__("basic", [6, 16, 75, 3], in_channels, **kwargs) + + +# Channel-Freq-wise Squezee-Excitation ResNets + + +class CFwSEResNet18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [2, 2, 2, 2], in_channels, **kwargs) + + +class CFwSEResNet34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEResNet101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSEResNet152(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 8, 36, 3], in_channels, **kwargs) + + +class CFwSEResNext50_32x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", 
[3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEResNext101_32x8d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 256 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSEWideResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEWideResNet101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSELResNet18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [2, 2, 2, 2], in_channels, **kwargs) + + +class CFwSELResNet34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("sebasic", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSELResNet50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSELResNext50_4x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 4 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("sebn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEIdRndResNet100(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + kwargs["se_type"] = "cfw-se" + super().__init__("basic", [6, 16, 24, 3], in_channels, **kwargs) + + +class CFwSEIdRndResNet202(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["resb_channels"] = [128, 128, 256, 256] + kwargs["se_type"] = "cfw-se" + super().__init__("basic", [6, 16, 75, 3], in_channels, **kwargs) + + #################### Res2Net variants ######################## # Standard Res2Nets @@ -1024,11 +1311,155 @@ def __init__(self, in_channels, **kwargs): super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) -# multi-level feature ResNet -class LResNet34_345(ResNet): +# frequency-wise Squezee-Excitation Res2Nets +class FwSERes2Net18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("se2basic", [2, 2, 2, 2], in_channels, **kwargs) + + +class FwSERes2Net34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("se2basic", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSERes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSERes2Net101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSERes2Net152(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 8, 36, 3], in_channels, **kwargs) + + +class FwSERes2Next50_32x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 128 + kwargs["se_type"] = 
"fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSERes2Next101_32x8d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 256 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSEWideRes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSEWideRes2Net101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class FwSELRes2Net50(ResNet): def __init__(self, in_channels, **kwargs): kwargs["conv_channels"] = 16 kwargs["base_channels"] = 16 - kwargs["multilevel"] = True - kwargs["endpoint_channels"] = 64 - super().__init__("basic", [3, 4, 6, 3], in_channels, **kwargs) + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class FwSELRes2Next50_4x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 4 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "fw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +# channel-frequency-wise Squezee-Excitation Res2Nets +class CFwSERes2Net18(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("se2basic", [2, 2, 2, 2], in_channels, **kwargs) + + +class CFwSERes2Net34(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("se2basic", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSERes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSERes2Net101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSERes2Net152(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 8, 36, 3], in_channels, **kwargs) + + +class CFwSERes2Next50_32x4d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSERes2Next101_32x8d(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["groups"] = 32 + kwargs["base_channels"] = 256 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSEWideRes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSEWideRes2Net101(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["base_channels"] = 128 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 23, 3], in_channels, **kwargs) + + +class CFwSELRes2Net50(ResNet): + def __init__(self, in_channels, **kwargs): + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) + + +class CFwSELRes2Next50_4x4d(ResNet): + def __init__(self, in_channels, **kwargs): + 
kwargs["groups"] = 4 + kwargs["base_channels"] = 16 + kwargs["se_type"] = "cfw-se" + super().__init__("seres2bn", [3, 4, 6, 3], in_channels, **kwargs) diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index f24887fe..3ab454ae 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -2,7 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import math import torch @@ -323,13 +323,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -349,7 +349,7 @@ def filter_args(**kwargs): "head_act", "dropout_rate", "use_norm", - "norm-layer", + "norm_layer", "norm_before", ) @@ -478,18 +478,31 @@ def add_class_args(parser, prefix=None): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) parser.add_argument( diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 794f8144..deef9c59 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -537,13 +537,13 @@ def change_dropouts(self, dropout_rate, drop_connect_rate): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_feats", @@ -722,18 +722,31 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton 
after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) parser.add_argument( @@ -754,10 +767,7 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): ) parser.add_argument( - "--res2net-scale", - default=1, - type=int, - help=("res2net scaling parameter "), + "--res2net-scale", default=1, type=int, help=("res2net scaling parameter "), ) parser.add_argument( diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 6457ada1..22b1e7a7 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -4,7 +4,7 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import torch import torch.nn as nn @@ -330,13 +330,13 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -485,18 +485,31 @@ def add_class_args(parser, prefix=None): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) parser.add_argument( - "--wo-norm", - default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) parser.add_argument( diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index b27e883d..3af174cf 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -4,7 +4,7 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import logging import torch @@ -372,13 +372,13 @@ def change_dropouts(self, dropout_rate, drop_connect_rate): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: - kwargs["use_norm"] = not kwargs["wo_norm"] - del kwargs["wo_norm"] + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] valid_args = ( "in_channels", @@ -540,18 +540,32 @@ def add_class_args(parser, prefix=None, skip=set()): except: pass + # parser.add_argument( + # "--wo-norm", + # default=False, + # action="store_true", + # help="without batch normalization", + # ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) + parser.add_argument( - "--wo-norm", - 
default=False, - action="store_true", + "--use-norm", + default=True, + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) parser.add_argument( diff --git a/hyperion/torch/narchs/resnet_factory.py b/hyperion/torch/narchs/resnet_factory.py index 645b7f2b..c9d5806e 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -21,6 +21,7 @@ "lresnet34": LResNet34, "lresnet50": LResNet50, "lresnext50_4x4d": LResNext50_4x4d, + "lresnet34_345": LResNet34_345, "seresnet18": SEResNet18, "seresnet34": SEResNet34, "seresnet50": SEResNet50, @@ -47,6 +48,32 @@ "tselresnet34": TSELResNet34, "tselresnet50": TSELResNet50, "tselresnext50_4x4d": TSELResNext50_4x4d, + "fwseresnet18": FwSEResNet18, + "fwseresnet34": FwSEResNet34, + "fwseresnet50": FwSEResNet50, + "fwseresnet101": FwSEResNet101, + "fwseresnet152": FwSEResNet152, + "fwseresnext50_32x4d": FwSEResNext50_32x4d, + "fwseresnext101_32x8d": FwSEResNext101_32x8d, + "fwsewideresnet50": FwSEWideResNet50, + "fwsewideresnet101": FwSEWideResNet101, + "fwselresnet18": FwSELResNet18, + "fwselresnet34": FwSELResNet34, + "fwselresnet50": FwSELResNet50, + "fwselresnext50_4x4d": FwSELResNext50_4x4d, + "cfwseresnet18": CFwSEResNet18, + "cfwseresnet34": CFwSEResNet34, + "cfwseresnet50": CFwSEResNet50, + "cfwseresnet101": CFwSEResNet101, + "cfwseresnet152": CFwSEResNet152, + "cfwseresnext50_32x4d": CFwSEResNext50_32x4d, + "cfwseresnext101_32x8d": CFwSEResNext101_32x8d, + "cfwsewideresnet50": CFwSEWideResNet50, + "cfwsewideresnet101": CFwSEWideResNet101, + "cfwselresnet18": CFwSELResNet18, + "cfwselresnet34": CFwSELResNet34, + "cfwselresnet50": CFwSELResNet50, + "cfwselresnext50_4x4d": CFwSELResNext50_4x4d, "res2net18": Res2Net18, "res2net34": Res2Net34, "res2net50": Res2Net50, @@ -80,7 +107,34 @@ "tsewideres2net101": TSEWideRes2Net101, "tselres2net50": TSELRes2Net50, "tselres2next50_4x4d": TSELRes2Next50_4x4d, - "lresnet34_345": LResNet34_345, + "fwseres2net18": FwSERes2Net18, + "fwseres2net34": FwSERes2Net34, + "fwseres2net50": FwSERes2Net50, + "fwseres2net101": FwSERes2Net101, + "fwseres2net152": FwSERes2Net152, + "fwseres2next50_32x4d": FwSERes2Next50_32x4d, + "fwseres2next101_32x8d": FwSERes2Next101_32x8d, + "fwsewideres2net50": FwSEWideRes2Net50, + "fwsewideres2net101": FwSEWideRes2Net101, + "fwselres2net50": FwSELRes2Net50, + "fwselres2next50_4x4d": FwSELRes2Next50_4x4d, + "cfwseres2net18": CFwSERes2Net18, + "cfwseres2net34": CFwSERes2Net34, + "cfwseres2net50": CFwSERes2Net50, + "cfwseres2net101": CFwSERes2Net101, + "cfwseres2net152": CFwSERes2Net152, + "cfwseres2next50_32x4d": CFwSERes2Next50_32x4d, + "cfwseres2next101_32x8d": CFwSERes2Next101_32x8d, + "cfwsewideres2net50": CFwSEWideRes2Net50, + "cfwsewideres2net101": CFwSEWideRes2Net101, + "cfwselres2net50": CFwSELRes2Net50, + "cfwselres2next50_4x4d": CFwSELRes2Next50_4x4d, + "idrndresnet100": IdRndResNet100, + "idrndresnet202": IdRndResNet202, + "fwseidrndresnet100": FwSEIdRndResNet100, + "fwseidrndresnet202": FwSEIdRndResNet202, + "cfwseidrndresnet100": CFwSEIdRndResNet100, + "cfwseidrndresnet202": CFwSEIdRndResNet202, } @@ -141,9 +195,9 @@ def create( return resnet def filter_args(**kwargs): - if "norm_after" in kwargs: - kwargs["norm_before"] = not kwargs["norm_after"] - del kwargs["norm_after"] + # if "norm_after" in kwargs: + # 
kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] if "no_maxpool" in kwargs: kwargs["do_maxpool"] = not kwargs["no_maxpool"] @@ -247,21 +301,21 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--in-norm", default=False, - action="store_true", + action=ActionYesNo, help="batch normalization at the input", ) parser.add_argument( "--no-maxpool", default=False, - action="store_true", + action=ActionYesNo, help="don't do max pooling after first convolution", ) parser.add_argument( "--zero-init-residual", default=False, - action="store_true", + action=ActionYesNo, help="Zero-initialize the last BN in each residual branch", ) @@ -293,11 +347,18 @@ def add_class_args(parser, prefix=None): try: parser.add_argument( - "--norm-after", - default=False, - action="store_true", - help="batch normalizaton after activation", + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalizaton before activation", ) + + # parser.add_argument( + # "--norm-after", + # default=False, + # action="store_true", + # help="batch normalizaton after activation", + # ) except: pass diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index ad3df161..93571acf 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -653,7 +653,7 @@ def filter_args(**kwargs): return args @staticmethod - def add_class_args(parser, prefix=None, train_modes=None, skip={}): + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") From 15db427110988b942dfc3c6caf58c02e7aed13d9 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 29 Dec 2022 07:59:34 -0500 Subject: [PATCH 073/154] added scripts to extract xvectors from feats back to bin --- hyperion/bin/compute_mfcc_feats.py | 116 ++++++ hyperion/bin/extract_xvectors_from_feats.py | 247 ++++++++++++ hyperion/bin/extract_xvectors_from_wav.py | 48 ++- .../extract_xvectors_slidwin_from_feats.py | 306 +++++++++++++++ .../bin/extract_xvectors_slidwin_from_wav.py | 358 ++++++++++++++++++ 5 files changed, 1050 insertions(+), 25 deletions(-) create mode 100755 hyperion/bin/compute_mfcc_feats.py create mode 100755 hyperion/bin/extract_xvectors_from_feats.py create mode 100755 hyperion/bin/extract_xvectors_slidwin_from_feats.py create mode 100755 hyperion/bin/extract_xvectors_slidwin_from_wav.py diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py new file mode 100755 index 00000000..b7e90056 --- /dev/null +++ b/hyperion/bin/compute_mfcc_feats.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +""" + Copyright 2018 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import SequentialDataReaderFactory as DRF +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import compression_methods +from hyperion.np.feats import MFCC + + +def compute_mfcc_feats( + input_path, output_path, compress, compression_method, write_num_frames, **kwargs +): + + mfcc_args = MFCC.filter_args(**kwargs) + mfcc = MFCC(**mfcc_args) + + if mfcc.input_step == "wave": + input_args = AR.filter_args(**kwargs) 
+ reader = AR(input_path, **input_args) + else: + input_args = DRF.filter_args(**kwargs) + reader = DRF.create(input_path, **input_args) + + writer = DWF.create( + output_path, + scp_sep=" ", + compress=compress, + compression_method=compression_method, + ) + + if write_num_frames is not None: + f_num_frames = open(write_num_frames, "w") + + for data in reader: + if mfcc.input_step == "wave": + key, x, fs = data + else: + key, x = data + logging.info("Extracting MFCC for %s num_samples=%d" % (key, len(x))) + t1 = time.time() + y = mfcc.compute(x) + dt = (time.time() - t1) * 1000 + rtf = dt / (mfcc.frame_shift * y.shape[0]) + logging.info( + "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f" + % (key, y.shape[0], dt, rtf) + ) + writer.write([key], [y]) + + if write_num_frames is not None: + f_num_frames.write("%s %d\n" % (key, y.shape[0])) + + mfcc.reset() + + if write_num_frames is not None: + f_num_frames.close() + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Compute MFCC features") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--write-num-frames", default=None) + + AR.add_class_args(parser) + DRF.add_class_args(parser) + MFCC.add_class_args(parser) + parser.add_argument( + "--compress", + dest="compress", + default=False, + action="store_true", + help="Compress the features", + ) + parser.add_argument( + "--compression-method", + dest="compression_method", + default="auto", + choices=compression_methods, + help="Compression method", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + compute_mfcc_feats(**namespace_to_dict(args)) diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py new file mode 100755 index 00000000..9fb1006c --- /dev/null +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +import torch + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.feats import MeanVarianceNorm as MVN + +from hyperion.torch.utils import open_device +from hyperion.torch import TorchModelLoader as TML + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_mvn(device, **kwargs): + mvn_args = MVN.filter_args(**kwargs["mvn"]) + logging.info("mvn args={}".format(mvn_args)) + mvn = MVN(**mvn_args) + if mvn.norm_mean or mvn.norm_var: + return mvn + return None + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + 
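
# Illustration: the core call pattern extract_xvectors_from_feats.py is built
# around, stripped of I/O, VAD, and logging. The checkpoint path and feature
# matrix here are hypothetical; extract_embed expects features shaped
# (batch, feat_dim, time), hence the transpose, and chunk_length=0 means the
# whole utterance goes through the encoder in one forward pass:

import numpy as np
import torch
from hyperion.torch import TorchModelLoader as TML

model = TML.load("exp/xvector_nnet/model.pth")   # hypothetical checkpoint
model.eval()
x = np.random.randn(500, 80).astype("float32")   # (frames, feat_dim)
xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype())
with torch.no_grad():
    y = model.extract_embed(xx, chunk_length=0, embed_layer=None)
print(y.shape)                                   # (1, embed_dim)
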
logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): + utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + if utt_length < x.shape[1]: + first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d" + % (key, x.shape[1], first_frame) + ) + return x + + +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_num_frames_spec, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + use_gpu, + **kwargs +): + + logging.info("initializing") + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + mvn = init_mvn(device, **kwargs) + model = load_model(model_path, device) + + if write_num_frames_spec is not None: + keys = [] + info = [] + + dr_args = DRF.filter_args(**kwargs) + logging.info("opening output stream: %s" % (output_spec)) + with DWF.create(output_spec) as writer: + + logging.info("opening input stream: %s" % (input_spec)) + with DRF.create(input_spec, **dr_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, data = reader.read(1) + if len(key) == 0: + break + t2 = time.time() + logging.info("processing utt %s" % (key[0])) + x = data[0] + if mvn is not None: + x = mvn.normalize(x) + t3 = time.time() + tot_frames = x.shape[0] + if vad_spec is not None: + vad = v_reader.read(key, num_frames=x.shape[0])[0].astype( + "bool", copy=False + ) + x = x[vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key[0], x.shape[0], tot_frames, x.shape[0] / tot_frames * 100) + ) + + if random_utt_length: + x = select_random_chunk(key, x, min_utt_length, max_utt_length, rng) + + t4 = time.time() + if x.shape[0] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) + with torch.no_grad(): + y = ( + model.extract_embed( + xx, chunk_length=chunk_length, embed_layer=embed_layer + ) + .detach() + .cpu() + .numpy()[0] + ) + + t5 = time.time() + writer.write(key, [y]) + if write_num_frames_spec is not None: + keys.append(key[0]) + info.append(str(x.shape[0])) + t6 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key[0], + t6 - t1, + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + t6 - t5, + x.shape[0] * 1e-2 / (t6 - t1), + ) + ) + + if write_num_frames_spec is not None: + logging.info("writing num-frames to %s" % (write_num_frames_spec)) + u2nf = Utt2Info.create(keys, info) + u2nf.save(write_num_frames_spec) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Extracts x-vectors from features") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + DRF.add_class_args(parser) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + MVN.add_class_args(parser, 
prefix="mvn") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk of the utterance", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=500, + help=("minimum utterance length when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=12000, + help=("maximum utterance length when using random utt length"), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index e9746897..c1cdf02d 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -88,8 +88,10 @@ def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -132,7 +134,7 @@ def extract_xvectors( num_augs = 1 ar_args = AR.filter_args(**kwargs) - logging.info("opening output stream: %s" % (output_spec)) + logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec, scp_sep=scp_sep) as writer: logging.info( @@ -141,7 +143,7 @@ def extract_xvectors( with AR(input_spec, **ar_args) as reader: if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create( vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep ) @@ -156,7 +158,7 @@ def extract_xvectors( key0 = key[0] t2 = time.time() - logging.info("processing utt %s" % (key0)) + logging.info("processing utt %s", key0) for aug_id in range(num_augs): t3 = time.time() key, x = augment(key0, x0, augmenter, aug_df, aug_id) @@ -175,13 +177,11 @@ def extract_xvectors( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) if random_utt_length: @@ -219,22 +219,20 @@ def extract_xvectors( "aug-time=%.3f feat-time=%.3f " "vad-time=%.3f embed-time=%.3f write-time=%.3f " "rt-factor=%.2f" - ) - % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - ) + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, ) if 
write_num_frames_spec is not None: - logging.info("writing num-frames to %s" % (write_num_frames_spec)) + logging.info("writing num-frames to %s", write_num_frames_spec) u2nf = Utt2Info.create(keys, info) u2nf.save(write_num_frames_spec) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py new file mode 100755 index 00000000..d14f16f3 --- /dev/null +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +import torch +import yaml + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.feats import MeanVarianceNorm as MVN + +from hyperion.torch.utils import open_device +from hyperion.torch import TorchModelLoader as TML + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_mvn(device, **kwargs): + mvn_args = MVN.filter_args(**kwargs["mvn"]) + logging.info("mvn args={}".format(mvn_args)) + mvn = MVN(**mvn_args) + if mvn.norm_mean or mvn.norm_var: + return mvn + return None + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_timestamps_spec, + slidwin_params_path, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_shift, + feat_snip_edges, + use_gpu, + **kwargs +): + + logging.info("initializing") + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + mvn = init_mvn(device, **kwargs) + model = load_model(model_path, device) + + if write_timestamps_spec is not None: + time_writer = DWF.create(write_timestamps_spec, scp_sep=" ") + + dr_args = DRF.filter_args(**kwargs) + logging.info("opening output stream: %s" % (output_spec)) + with DWF.create(output_spec) as writer: + + logging.info("opening input stream: %s" % (output_spec)) + with DRF.create(input_spec, **dr_args) as reader: + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, data = reader.read(1) + if len(key) == 0: + break + t2 = time.time() + logging.info("processing utt %s" % (key[0])) + x = data[0] + if mvn is not None: + x = mvn.normalize(x) + t3 = time.time() + tot_frames = x.shape[0] + if vad_spec is not None: + vad = v_reader.read(key, num_frames=x.shape[0])[0].astype( + "bool", copy=False + ) + x = x[vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key[0], x.shape[0], tot_frames, x.shape[0] / tot_frames * 100) + ) + + t4 = time.time() + if x.shape[0] == 0: + y = 
np.zeros((1, model.embed_dim,), dtype=float_cpu(),)
+                else:
+                    xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype())
+                    with torch.no_grad():
+                        y = (
+                            model.extract_embed_slidwin(
+                                xx,
+                                win_length,
+                                win_shift,
+                                snip_edges=snip_edges,
+                                feat_frame_length=feat_frame_length,
+                                feat_frame_shift=feat_frame_shift,
+                                chunk_length=chunk_length,
+                                embed_layer=embed_layer,
+                                detach_chunks=True,
+                            )
+                            .detach()
+                            .cpu()
+                            .numpy()[0]
+                        )
+
+                t5 = time.time()
+                y = y.T
+                writer.write(key, [y])
+
+                if write_timestamps_spec is not None:
+                    num_wins = y.shape[0]
+                    timestamps = model.compute_slidwin_timestamps(
+                        num_wins,
+                        win_length,
+                        win_shift,
+                        snip_edges,
+                        feat_frame_length,
+                        feat_frame_shift,
+                        feat_snip_edges,
+                    ).numpy()
+                    logging.info("{}".format(timestamps))
+                    time_writer.write(key, [timestamps])
+                t6 = time.time()
+                logging.info(
+                    (
+                        "utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f "
+                        "vad-time=%.3f embed-time=%.3f write-time=%.3f "
+                        "rt-factor=%.2f"
+                    )
+                    % (
+                        key[0],
+                        t6 - t1,
+                        t2 - t1,
+                        t3 - t2,
+                        t4 - t3,
+                        t5 - t4,
+                        t6 - t5,
+                        x.shape[0] * 1e-2 / (t6 - t1),
+                    )
+                )
+
+    if write_timestamps_spec is not None:
+        time_writer.close()
+
+    if slidwin_params_path is not None:
+        params = {
+            "padding": model.compute_slidwin_left_padding(
+                win_length,
+                win_shift,
+                snip_edges,
+                feat_frame_length,
+                feat_frame_shift,
+                feat_snip_edges,
+            ),
+            "win_length": win_length,
+            "win_shift": win_shift,
+        }
+        with open(slidwin_params_path, "w") as f:
+            yaml.dump(params, f)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Extract x-vectors over a sliding window")
+
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument("--input", dest="input_spec", required=True)
+    DRF.add_class_args(parser)
+    parser.add_argument("--vad", dest="vad_spec", default=None)
+    parser.add_argument(
+        "--write-timestamps", dest="write_timestamps_spec", default=None
+    )
+    parser.add_argument("--slidwin-params-path", default=None)
+    parser.add_argument(
+        "--vad-path-prefix",
+        dest="vad_path_prefix",
+        default=None,
+        help=("scp file_path prefix for vad"),
+    )
+
+    MVN.add_class_args(parser, prefix="mvn")
+
+    parser.add_argument("--model-path", required=True)
+    parser.add_argument(
+        "--win-length",
+        type=float,
+        default=1.5,
+        help=("window length for x-vector extraction in seconds"),
+    )
+    parser.add_argument(
+        "--win-shift",
+        type=float,
+        default=0.25,
+        help=("window shift for x-vector extraction in seconds"),
+    )
+    parser.add_argument(
+        "--snip-edges",
+        default=False,
+        action="store_true",
+        help=(
+            "If true, end effects will be handled by outputting "
+            "only windows that completely fit in the file, "
+            "and the number of windows depends on the window-length. "
+            "If false, the number of windows depends only on "
+            "the window-shift, and we reflect the data at the ends."
+        ),
+    )
+
+    parser.add_argument(
+        "--feat-frame-length",
+        type=float,
+        default=25,
+        help=("frame-length used to compute the acoustic features in msecs"),
+    )
+    parser.add_argument(
+        "--feat-frame-shift",
+        type=float,
+        default=10,
+        help=("frame-shift used to compute the acoustic features in msecs"),
+    )
+    parser.add_argument(
+        "--feat-snip-edges",
+        default=False,
+        action="store_true",
+        help=(
+            "If true, end effects will be handled by outputting only windows "
+            "that completely fit in the file, and the number of windows "
+            "depends on the feat-frame-length.
" + "If false, the number of feature frames depends only on the " + "feat-frame-shift, and we reflect the waveform at the ends." + ), + ) + + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py new file mode 100755 index 00000000..b4bd2b0d --- /dev/null +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np +import pandas as pd +import yaml + +import torch + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment + +from hyperion.torch.utils import open_device +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch import TorchModelLoader as TML + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(device, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + feat_extractor.to(device) + return feat_extractor + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_timestamps_spec, + slidwin_params_path, + scp_sep, + vad_path_prefix, + model_path, + chunk_length, + 
embed_layer, + win_length, + win_shift, + snip_edges, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs +): + + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + feat_extractor = init_feats(device, **kwargs) + model = load_model(model_path, device) + + feat_args = kwargs["feats"]["audio_feats"] + feat_frame_length = feat_args["frame_length"] + feat_frame_shift = feat_args["frame_shift"] + feat_snip_edges = feat_args["snip_edges"] + + if write_timestamps_spec is not None: + time_writer = DWF.create(write_timestamps_spec, scp_sep=scp_sep) + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec, scp_sep=scp_sep) as writer: + + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) + with AR(input_spec, **ar_args) as reader: + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create( + vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + ) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + t2 = time.time() + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + + x = feat_extractor(x) + t5 = time.time() + tot_frames = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0, num_frames=tot_frames)[0] + vad = torch.tensor(vad, dtype=torch.bool).to(device) + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + else: + x = x.transpose(1, 2).contiguous() + y = ( + model.extract_embed_slidwin( + x, + win_length, + win_shift, + snip_edges=snip_edges, + feat_frame_length=feat_frame_length, + feat_frame_shift=feat_frame_shift, + chunk_length=chunk_length, + embed_layer=embed_layer, + detach_chunks=True, + ) + .detach() + .cpu() + .numpy()[0] + ) + + t7 = time.time() + y = y.T + writer.write([key], [y]) + + if write_timestamps_spec is not None: + num_wins = y.shape[0] + timestamps = model.compute_slidwin_timestamps( + num_wins, + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_shift, + feat_snip_edges, + ).numpy() + logging.info("{}".format(timestamps)) + time_writer.write([key], [timestamps]) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + + if write_timestamps_spec is not None: + time_writer.close() + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + if slidwin_params_path is not None: + params = { + "padding": model.compute_slidwin_left_padding( + win_length, + win_shift,
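+ # compute_slidwin_left_padding presumably converts the x-vector window
+ # geometry (seconds) and the acoustic-feature framing (msecs) into the
+ # left padding the extractor assumed, so that tools reading the dumped
+ # YAML can realign window timestamps with the original waveform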
+ snip_edges, + feat_frame_length, + feat_frame_shift, + feat_snip_edges, + ), + "win_length": win_length, + "win_shift": win_shift, + } + with open(slidwin_params_path, "w") as f: + yaml.dump(params, f) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + "Extract x-vectors over a sliding window " + "from waveform computing " + "acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-timestamps", dest="write_timestamps_spec", default=None + ) + parser.add_argument("--slidwin-params-path", default=None) + + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_argparse_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--win-length", + type=float, + default=1.5, + help=("window length for x-vector extraction in seconds"), + ) + parser.add_argument( + "--win-shift", + type=float, + default=0.25, + help=("window shift for x-vector extraction in seconds"), + ) + parser.add_argument( + "--snip-edges", + default=False, + action="store_true", + help=( + "If true, end effects will be handled by outputting " + "only windows that completely fit in the file, " + "and the number of windows depends on the window-length. " + "If false, the number of windows depends only on " + "the window-shift, and we reflect the data at the ends." 
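+ # rough sketch of the two modes, assuming Kaldi-style rounding, for an
+ # utterance of T seconds:
+ #   snip_edges=True  -> num_wins = 1 + floor((T - win_length) / win_shift)
+ #   snip_edges=False -> num_wins = round(T / win_shift), with the signal
+ #                       reflected at the edges to fill partial windows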
+ ), + ) + + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) From f9e4a2409b7edc6144cb92711849f056fb98f0c3 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 29 Dec 2022 08:08:35 -0500 Subject: [PATCH 074/154] isorted everything --- hyperion/__init__.py | 7 +- hyperion/bin/compute_energy_vad.py | 14 ++-- hyperion/bin/compute_mfcc_feats.py | 14 ++-- hyperion/bin/extract_wav2vec2xvectors.py | 20 ++--- hyperion/bin/extract_xvectors_from_feats.py | 18 ++--- hyperion/bin/extract_xvectors_from_wav.py | 20 ++--- .../extract_xvectors_slidwin_from_feats.py | 20 ++--- .../bin/extract_xvectors_slidwin_from_wav.py | 20 ++--- hyperion/bin/finetune_wav2vec2xvector.py | 32 +++----- .../bin/finetune_xvector_dfr_from_feats.py | 27 +++---- hyperion/bin/finetune_xvector_dfr_from_wav.py | 23 +++--- hyperion/bin/finetune_xvector_from_feats.py | 27 +++---- hyperion/bin/finetune_xvector_from_wav.py | 34 ++++----- hyperion/bin/make_babble_noise_audio_files.py | 20 ++--- hyperion/bin/pack_wav_rirs.py | 16 ++-- hyperion/bin/plot_embedding_tsne.py | 20 ++--- hyperion/bin/plot_embedding_tsne_per_class.py | 23 ++---- hyperion/bin/preprocess_audio_files.py | 20 ++--- hyperion/bin/train_wav2vec2xvector.py | 28 +++---- hyperion/bin/train_xvector_from_feats.py | 29 +++---- hyperion/bin/train_xvector_from_wav.py | 33 ++++---- hyperion/bin_deprec/ark2hyp.py | 4 +- hyperion/bin_deprec/arkvad2nist.py | 7 +- hyperion/bin_deprec/compute-gmm-post.py | 13 ++-- .../bin_deprec/eval-2class-performance.py | 10 +-- hyperion/bin_deprec/eval-elbo-ubm.py | 10 +-- .../bin_deprec/eval-q-scoring-homo-gbe.py | 14 ++-- hyperion/bin_deprec/eval-score-norm.py | 8 +- hyperion/bin_deprec/h5vad2nist.py | 6 +- hyperion/bin_deprec/init-ubm.py | 11 ++- hyperion/bin_deprec/scores2lre_format.py | 8 +- .../torch-train-conformer-enc-v1-vq-dvae.py | 17 ++--- .../torch-train-conformer-enc-v1-vq-vae.py | 17 ++--- hyperion/bin_deprec/torch-train-dc1d-dvae.py | 17 ++--- hyperion/bin_deprec/torch-train-dc1d-vae.py | 17 ++--- hyperion/bin_deprec/torch-train-dc2d-dvae.py | 17 ++--- hyperion/bin_deprec/torch-train-dc2d-vae.py | 17 ++--- .../bin_deprec/torch-train-resnet1d-dvae.py | 17 ++--- .../bin_deprec/torch-train-resnet1d-vae.py | 17 ++--- .../torch-train-resnet1d-vq-dvae.py | 17 ++--- .../bin_deprec/torch-train-resnet1d-vq-vae.py | 17 ++--- .../bin_deprec/torch-train-resnet2d-dvae.py | 17 ++--- .../bin_deprec/torch-train-resnet2d-vae.py | 27 +++---- .../torch-train-resnet2d-vq-dvae.py | 17 ++--- .../bin_deprec/torch-train-resnet2d-vq-vae.py | 17 ++--- .../torch-train-transformer-enc-v1-dvae.py | 17 ++--- .../torch-train-transformer-enc-v1-vae.py | 17 ++--- .../torch-train-transformer-enc-v1-vq-dvae.py | 17 ++--- .../torch-train-transformer-enc-v1-vq-vae.py | 17 ++--- 
hyperion/bin_deprec/torch-train-xvector.py | 25 +++--- .../bin_deprec/train-q-scoring-homo-gbe.py | 10 +-- hyperion/bin_deprec/vectors2scores.py | 6 +- .../bin_deprec2/apply-mvn-select-frames.py | 20 ++--- hyperion/bin_deprec2/compute-mfcc-feats.py | 14 ++-- hyperion/bin_deprec2/copy-feats.py | 7 +- hyperion/bin_deprec2/eval-cos-1vs1.py | 12 +-- hyperion/bin_deprec2/eval-linear-gbe-up.py | 16 ++-- hyperion/bin_deprec2/eval-linear-gbe.py | 16 ++-- hyperion/bin_deprec2/eval-linear-svmc.py | 16 ++-- .../bin_deprec2/eval-logistic-regression.py | 16 ++-- hyperion/bin_deprec2/eval-plda-1vs1.py | 14 ++-- hyperion/bin_deprec2/eval-plda-nvs1.py | 14 ++-- hyperion/bin_deprec2/merge-h5-files.py | 5 +- hyperion/bin_deprec2/pack-audio-files.py | 12 +-- hyperion/bin_deprec2/plot-vector-hist.py | 10 +-- hyperion/bin_deprec2/rttm-to-bin-vad.py | 8 +- hyperion/bin_deprec2/segments-to-bin-vad.py | 14 ++-- .../torch-adv-finetune-xvec-from-wav.py | 28 +++---- .../bin_deprec2/torch-adv-finetune-xvec.py | 29 +++---- .../bin_deprec2/torch-compute-mfcc-feats.py | 16 ++-- hyperion/bin_deprec2/torch-eval-vae.py | 18 ++--- ...osine-scoring-from-adv-test-wav-wavegan.py | 36 ++++----- ...l-xvec-cosine-scoring-from-adv-test-wav.py | 31 +++----- ...l-xvec-cosine-scoring-from-art-test-wav.py | 36 ++++----- ...-eval-xvec-cosine-scoring-from-test-wav.py | 24 +++--- ...sine-scoring-from-transfer-adv-test-wav.py | 31 +++----- ...sine-scoring-from-transfer-art-test-wav.py | 36 ++++----- .../torch-eval-xvec-logits-from-wav.py | 20 ++--- ...rch-extract-xvectors-from-wav-with-rttm.py | 20 ++--- ...torch-extract-xvectors-slidwin-from-wav.py | 20 ++--- .../torch-extract-xvectors-slidwin.py | 18 ++--- .../torch-extract-xvectors-vae-preproc.py | 18 ++--- .../bin_deprec2/torch-extract-xvectors.py | 18 ++--- ...ch-generate-adv-attacks-xvector-classif.py | 25 +++--- ...orch-generate-adv-attacks-xvector-verif.py | 31 +++----- hyperion/bin_deprec2/torch-train-dc1d-ae.py | 17 ++--- hyperion/bin_deprec2/torch-train-dvae.py | 34 ++++----- .../torch-train-efficientnet-xvec-from-wav.py | 22 ++---- .../torch-train-efficientnet-xvec.py | 26 +++---- .../torch-train-resnet-xvec-from-wav.py | 37 ++++----- .../bin_deprec2/torch-train-resnet-xvec.py | 26 +++---- .../torch-train-resnet1d-xvec-from-wav.py | 25 +++--- .../torch-train-spinenet-xvec-from-wav.py | 21 ++--- .../torch-train-tdnn-xvec-from-wav.py | 22 ++---- hyperion/bin_deprec2/torch-train-tdnn-xvec.py | 26 +++---- ...orch-train-transformer-xvec-v1-from-wav.py | 22 ++---- .../torch-train-transformer-xvec-v1.py | 26 +++---- hyperion/bin_deprec2/torch-train-vae.py | 34 ++++----- hyperion/bin_deprec2/torch-train-vq-dvae.py | 34 ++++----- hyperion/bin_deprec2/torch-train-vq-vae.py | 34 ++++----- hyperion/bin_deprec2/train-cw-up.py | 10 +-- hyperion/bin_deprec2/train-cw.py | 10 +-- hyperion/bin_deprec2/train-gaussianizer.py | 10 +-- hyperion/bin_deprec2/train-lda.py | 10 +-- hyperion/bin_deprec2/train-linear-gbe-up.py | 10 +-- hyperion/bin_deprec2/train-linear-gbe.py | 10 +-- hyperion/bin_deprec2/train-linear-svmc.py | 10 +-- .../bin_deprec2/train-logistic-regression.py | 10 +-- hyperion/bin_deprec2/train-mvn.py | 10 +-- hyperion/bin_deprec2/train-nda.py | 10 +-- hyperion/bin_deprec2/train-pca.py | 10 +-- hyperion/bin_deprec2/train-plda.py | 10 +-- hyperion/helpers/__init__.py | 10 +-- hyperion/helpers/classif_trial_data_reader.py | 10 +-- .../helpers/multi_test_trial_data_reader.py | 8 +- .../multi_test_trial_data_reader_v2.py | 8 +- hyperion/helpers/plda_factory.py | 2 +- 
hyperion/helpers/tracking_data_reader.py | 8 +- hyperion/helpers/trial_data_reader.py | 10 +-- hyperion/helpers/vector_class_reader.py | 10 +-- hyperion/helpers/vector_reader.py | 10 +-- hyperion/io/__init__.py | 30 +++----- hyperion/io/ark_data_reader.py | 10 ++- hyperion/io/ark_data_writer.py | 6 +- hyperion/io/audio_reader.py | 6 +- hyperion/io/audio_writer.py | 4 +- hyperion/io/bin_vad_reader.py | 3 +- hyperion/io/data_reader.py | 5 +- hyperion/io/data_rw_factory.py | 16 ++-- hyperion/io/h5_data_reader.py | 11 +-- hyperion/io/h5_data_writer.py | 7 +- hyperion/io/h5_merger.py | 1 + hyperion/io/hyp_data_reader.py | 5 +- hyperion/io/hyp_data_writer.py | 5 +- hyperion/io/kaldi_data_reader.py | 5 +- hyperion/io/packed_audio_reader.py | 10 +-- hyperion/io/packed_audio_writer.py | 2 +- hyperion/io/segment_vad_reader.py | 3 +- hyperion/io/vad_reader.py | 1 + hyperion/io/vad_rw_factory.py | 3 +- hyperion/np/augment/__init__.py | 4 +- hyperion/np/augment/noise_augment.py | 2 +- hyperion/np/augment/reverb_augment.py | 4 +- hyperion/np/augment/speech_augment.py | 3 +- hyperion/np/augment/speed_augment.py | 3 +- .../np/calibration/unsup_gauss_calibration.py | 1 + hyperion/np/classifiers/__init__.py | 6 +- hyperion/np/classifiers/greedy_fusion.py | 4 +- hyperion/np/classifiers/linear_gbe.py | 5 +- hyperion/np/classifiers/linear_gbe_up.py | 10 +-- hyperion/np/classifiers/linear_svmc.py | 6 +- .../np/classifiers/logistic_regression.py | 4 +- hyperion/np/classifiers/q_scoring_homo_gbe.py | 3 +- hyperion/np/classifiers/svmc.py | 8 +- hyperion/np/clustering/__init__.py | 2 +- hyperion/np/clustering/ahc.py | 6 +- hyperion/np/clustering/kmeans.py | 5 +- hyperion/np/diarization/diar_ahc_plda.py | 3 +- hyperion/np/feats/__init__.py | 10 +-- hyperion/np/feats/feature_normalization.py | 2 +- hyperion/np/feats/filter_banks.py | 2 +- hyperion/np/feats/mfcc.py | 2 +- hyperion/np/metrics/__init__.py | 5 +- hyperion/np/metrics/confusion_matrix.py | 3 +- hyperion/np/metrics/roc.py | 2 +- hyperion/np/metrics/utils.py | 2 +- hyperion/np/metrics/verification_evaluator.py | 8 +- hyperion/np/np_model.py | 6 +- hyperion/np/pdfs/__init__.py | 4 +- hyperion/np/pdfs/core/__init__.py | 4 +- hyperion/np/pdfs/core/normal.py | 20 ++--- hyperion/np/pdfs/core/normal_diag_cov.py | 10 +-- hyperion/np/pdfs/hmm/hmm.py | 2 +- hyperion/np/pdfs/jfa/jfa_total.py | 9 +-- hyperion/np/pdfs/mixtures/__init__.py | 4 +- .../np/pdfs/mixtures/exp_family_mixture.py | 6 +- hyperion/np/pdfs/mixtures/gmm.py | 26 ++----- hyperion/np/pdfs/mixtures/gmm_diag_cov.py | 14 ++-- .../np/pdfs/mixtures/gmm_tied_diag_cov.py | 14 ++-- hyperion/np/pdfs/plda/__init__.py | 4 +- hyperion/np/pdfs/plda/plda_base.py | 2 +- hyperion/np/score_norm/__init__.py | 6 +- hyperion/np/score_norm/adapt_s_norm.py | 2 +- hyperion/np/score_norm/s_norm.py | 2 +- hyperion/np/score_norm/t_norm.py | 2 +- hyperion/np/score_norm/zt_norm.py | 2 +- hyperion/np/transforms/__init__.py | 19 +++-- hyperion/np/transforms/cent_whiten.py | 5 +- hyperion/np/transforms/cent_whiten_up.py | 3 +- hyperion/np/transforms/coral.py | 3 +- hyperion/np/transforms/gaussianizer.py | 4 +- hyperion/np/transforms/lda.py | 3 +- hyperion/np/transforms/lnorm.py | 2 +- hyperion/np/transforms/lnorm_up.py | 2 +- hyperion/np/transforms/mvn.py | 3 +- hyperion/np/transforms/nap.py | 3 +- hyperion/np/transforms/nda.py | 5 +- hyperion/np/transforms/pca.py | 7 +- hyperion/np/transforms/sb_sw.py | 5 +- hyperion/np/transforms/skl_tsne.py | 3 +- hyperion/np/transforms/transform_list.py | 13 ++-- 
hyperion/torch/adv_attacks/__init__.py | 13 ++-- .../torch/adv_attacks/art_attack_factory.py | 2 +- hyperion/torch/adv_attacks/attack_factory.py | 12 +-- .../torch/adv_attacks/carlini_wagner_l2.py | 2 +- hyperion/torch/adv_attacks/pgd_attack.py | 3 +- .../adv_attacks/random_attack_factory.py | 4 +- hyperion/torch/adv_defenses/wave_gan_white.py | 5 +- hyperion/torch/data/__init__.py | 4 +- hyperion/torch/data/audio_dataset.py | 7 +- hyperion/torch/data/bucketing_seg_sampler.py | 3 +- .../data/class_weighted_embed_sampler.py | 3 +- .../data/class_weighted_seg_chunk_sampler.py | 3 +- hyperion/torch/data/embed_dataset.py | 6 +- hyperion/torch/data/embed_sampler.py | 3 +- hyperion/torch/data/embed_sampler_factory.py | 2 +- hyperion/torch/data/feat_seq_dataset.py | 6 +- hyperion/torch/data/hyp_sampler.py | 3 +- .../torch/data/paired_feat_seq_dataset.py | 1 + hyperion/torch/data/seg_chunk_sampler.py | 3 +- hyperion/torch/data/seg_sampler.py | 3 +- hyperion/torch/data/weighted_embed_sampler.py | 1 + hyperion/torch/data/weighted_seq_sampler.py | 3 +- hyperion/torch/layer_blocks/__init__.py | 76 +++++++------------ .../layer_blocks/conformer_encoder_v1.py | 2 +- hyperion/torch/layer_blocks/dc1d_blocks.py | 2 +- hyperion/torch/layer_blocks/dc2d_blocks.py | 2 +- hyperion/torch/layer_blocks/etdnn_blocks.py | 2 +- hyperion/torch/layer_blocks/fc_blocks.py | 2 +- hyperion/torch/layer_blocks/mbconv_blocks.py | 5 +- .../torch/layer_blocks/res2net1d_blocks.py | 5 +- .../torch/layer_blocks/res2net2d_blocks.py | 3 +- hyperion/torch/layer_blocks/res2net_blocks.py | 5 +- .../torch/layer_blocks/resetdnn_blocks.py | 2 +- .../torch/layer_blocks/resnet1d_blocks.py | 4 +- .../torch/layer_blocks/resnet2d_blocks.py | 2 +- hyperion/torch/layer_blocks/resnet_blocks.py | 2 +- hyperion/torch/layer_blocks/se_blocks.py | 2 +- .../torch/layer_blocks/seresnet_blocks.py | 4 +- hyperion/torch/layer_blocks/spine_blocks.py | 8 +- hyperion/torch/layer_blocks/tdnn_blocks.py | 2 +- hyperion/torch/layers/__init__.py | 35 ++++----- hyperion/torch/layers/activation_factory.py | 1 + hyperion/torch/layers/audio_feats.py | 4 +- hyperion/torch/layers/audio_feats_factory.py | 5 +- hyperion/torch/layers/global_pool.py | 1 + hyperion/torch/layers/margin_losses.py | 4 +- hyperion/torch/layers/mvn.py | 2 +- hyperion/torch/layers/pdf_storage.py | 2 +- hyperion/torch/layers/pool_factory.py | 3 +- hyperion/torch/layers/spec_augment.py | 3 +- hyperion/torch/layers/tensor2pdf.py | 2 +- hyperion/torch/layers/tensor2pdf1.py | 2 +- hyperion/torch/layers/vq.py | 2 +- hyperion/torch/loggers/__init__.py | 2 +- hyperion/torch/loggers/csv_logger.py | 3 +- hyperion/torch/loggers/logger.py | 1 + hyperion/torch/loggers/logger_list.py | 1 + hyperion/torch/loggers/prog_logger.py | 2 +- hyperion/torch/loggers/tensorboard_logger.py | 1 + hyperion/torch/loggers/wandb_logger.py | 2 +- hyperion/torch/lr_schedulers/__init__.py | 8 +- hyperion/torch/lr_schedulers/cos_lr.py | 2 +- hyperion/torch/lr_schedulers/factory.py | 6 +- hyperion/torch/lr_schedulers/noam_lr.py | 5 +- hyperion/torch/lr_schedulers/triangular_lr.py | 2 +- hyperion/torch/metrics/__init__.py | 4 +- hyperion/torch/metrics/accuracy.py | 2 +- hyperion/torch/models/__init__.py | 23 +++--- hyperion/torch/models/ae/ae.py | 2 +- hyperion/torch/models/plda/plda_base.py | 4 +- hyperion/torch/models/plda/splda.py | 2 +- hyperion/torch/models/tvector/__init__.py | 2 +- .../torch/models/tvector/resnet_tvector.py | 2 +- hyperion/torch/models/tvector/tvector.py | 7 +- hyperion/torch/models/vae/vae.py | 8 +- 
hyperion/torch/models/vae/vq_vae.py | 6 +- .../torch/models/wav2xvectors/__init__.py | 12 ++- .../hf_hubert2resnet1d_xvector.py | 7 +- .../hf_wav2vec2resnet1d_xvector.py | 7 +- .../models/wav2xvectors/hf_wav2xvector.py | 10 ++- .../wav2xvectors/hf_wavlm2resnet1d_xvector.py | 7 +- .../wav2xvectors/wav2resnet1d_xvector.py | 5 +- .../models/wav2xvectors/wav2resnet_xvector.py | 5 +- .../torch/models/wav2xvectors/wav2xvector.py | 5 +- hyperion/torch/models/xvectors/__init__.py | 10 +-- .../models/xvectors/efficient_net_xvector.py | 5 +- .../torch/models/xvectors/resnet1d_xvector.py | 5 +- .../torch/models/xvectors/resnet_xvector.py | 5 +- .../torch/models/xvectors/spinenet_xvector.py | 5 +- .../torch/models/xvectors/tdnn_xvector.py | 5 +- .../models/xvectors/transformer_xvector_v1.py | 5 +- hyperion/torch/models/xvectors/xvector.py | 5 +- hyperion/torch/narchs/__init__.py | 42 ++++------ hyperion/torch/narchs/audio_feats_mvn.py | 2 +- hyperion/torch/narchs/classif_head.py | 7 +- hyperion/torch/narchs/conformer_encoder_v1.py | 9 ++- hyperion/torch/narchs/dc1d_decoder.py | 8 +- hyperion/torch/narchs/dc1d_encoder.py | 5 +- hyperion/torch/narchs/dc2d_decoder.py | 8 +- hyperion/torch/narchs/dc2d_encoder.py | 5 +- hyperion/torch/narchs/efficient_net.py | 7 +- hyperion/torch/narchs/etdnn.py | 2 +- hyperion/torch/narchs/fcnet.py | 4 +- hyperion/torch/narchs/resetdnn.py | 4 +- hyperion/torch/narchs/resnet.py | 18 ++--- hyperion/torch/narchs/resnet1d_decoder.py | 11 ++- hyperion/torch/narchs/resnet1d_encoder.py | 20 ++--- hyperion/torch/narchs/resnet2d_decoder.py | 11 ++- hyperion/torch/narchs/resnet2d_encoder.py | 14 ++-- hyperion/torch/narchs/resnet_factory.py | 2 +- hyperion/torch/narchs/spinenet.py | 12 +-- hyperion/torch/narchs/spinenet_factory.py | 2 +- hyperion/torch/narchs/tdnn.py | 2 +- hyperion/torch/narchs/tdnn_factory.py | 4 +- hyperion/torch/narchs/torch_na_loader.py | 34 ++++----- .../torch/narchs/transformer_encoder_v1.py | 6 +- hyperion/torch/optim/__init__.py | 2 +- hyperion/torch/optim/factory.py | 5 +- hyperion/torch/optim/radam.py | 1 + hyperion/torch/seq_embed/__init__.py | 8 +- hyperion/torch/torch_defs.py | 1 - hyperion/torch/torch_model_loader.py | 4 +- hyperion/torch/tpm/__init__.py | 2 +- hyperion/torch/tpm/hf/__init__.py | 2 +- hyperion/torch/tpm/hf/hf_hubert.py | 12 +-- hyperion/torch/tpm/hf/hf_wav2vec2.py | 12 +-- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 13 ++-- hyperion/torch/tpm/hf/hf_wavlm.py | 12 +-- hyperion/torch/trainers/ae_trainer.py | 3 +- hyperion/torch/trainers/dvae_trainer.py | 3 +- hyperion/torch/trainers/torch_trainer.py | 17 ++--- hyperion/torch/trainers/vae_trainer.py | 3 +- hyperion/torch/trainers/vq_dvae_trainer.py | 3 +- hyperion/torch/trainers/vq_vae_trainer.py | 3 +- .../torch/trainers/xvector_adv_trainer.py | 3 +- .../trainers/xvector_adv_trainer_from_wav.py | 3 +- .../trainers/xvector_trainer_deep_feat_reg.py | 3 +- hyperion/torch/utils/__init__.py | 17 ++--- hyperion/torch/utils/ddp.py | 13 ++-- hyperion/torch/utils/devices.py | 2 +- hyperion/torch/utils/eval_utils.py | 1 + hyperion/torch/utils/metric_acc.py | 1 + hyperion/torch/utils/misc.py | 2 +- 344 files changed, 1557 insertions(+), 1952 deletions(-) diff --git a/hyperion/__init__.py b/hyperion/__init__.py index 055441cd..fc35423c 100644 --- a/hyperion/__init__.py +++ b/hyperion/__init__.py @@ -4,11 +4,6 @@ """ -from . import utils -from . import np -from . import io -from . import torch -from . import helpers - +from . 
import helpers, io, np, torch, utils __version__ = "0.4.0a" diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 99f562cf..15d74f3a 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -3,22 +3,18 @@ Copyright 2018 Jesus Villalba (Johns Hopkins University) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from hyperion.hyp_defs import config_logger -from hyperion.io import SequentialAudioReader as AR from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index b7e90056..a83f95d1 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -3,23 +3,19 @@ Copyright 2018 Jesus Villalba (Johns Hopkins University) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF -from hyperion.io import DataWriterFactory as DWF from hyperion.io import compression_methods from hyperion.np.feats import MFCC diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index a09e5c11..2a92a83e 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -4,32 +4,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np import pandas as pd - -import torch import torchaudio.transforms as tat +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.np.augment import SpeechAugment - -from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info resamplers = {} diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index 9fb1006c..926e0bcc 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -4,30 +4,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys 
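+# NOTE: across this commit, imports are regrouped stdlib -> third-party ->
+# first-party and long from-imports are wrapped in isort's default grid
+# style (presumably a plain `isort .` run with no profile configured)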
import time -import logging import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF from hyperion.np.feats import MeanVarianceNorm as MVN - -from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index c1cdf02d..addabbcf 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -4,32 +4,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np import pandas as pd +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.np.augment import SpeechAugment - -from hyperion.torch.utils import open_device -from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index d14f16f3..e3d2fcbb 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -4,31 +4,25 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np - -import torch import yaml +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF from hyperion.np.feats import MeanVarianceNorm as MVN - -from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index b4bd2b0d..2b1bba3b 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -4,33 +4,27 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import 
( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np import pandas as pd import yaml +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.np.augment import SpeechAugment - -from hyperion.torch.utils import open_device -from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index 718aeeb9..b3edd9b5 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -3,37 +3,29 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory - from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import ( - HFWav2Vec2ResNet1dXVector, - HFHubert2ResNet1dXVector, - HFWavLM2ResNet1dXVector, -) -from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.models import (HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector) +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py index a26c14fb..2ac01025 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_feats.py +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -4,32 +4,27 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.models import XVector as XVec -from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer -from hyperion.torch.data import FeatSeqDataset as 
SD +from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.models import XVector as XVec +from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer +from hyperion.torch.utils import ddp, open_device def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py index 437c76f0..ff97d3ca 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_wav.py +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -3,32 +3,27 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.models import XVector as XVec -from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer +from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import XVector as XVec from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer +from hyperion.torch.utils import ddp, open_device def init_data( diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py index ec6386c8..7a1fb5a9 100755 --- a/hyperion/bin/finetune_xvector_from_feats.py +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -3,31 +3,26 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.models import XVector as XVec -from hyperion.torch.trainers import XVectorTrainer as Trainer -from hyperion.torch.data import FeatSeqDataset as SD +from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.models import XVector as XVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import 
ddp, open_device def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index c6239b45..7b68b9dd 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -3,37 +3,31 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path -import torch +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +import torch from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import ddp - -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.data import AudioDataset as AD - from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch.models import ResNetXVector as RXVec -from hyperion.torch.models import ResNet1dXVector as R1dXVec from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec from hyperion.torch.models import TDNNXVector as TDXVec from hyperion.torch.models import TransformerXVectorV1 as TFXVec -from hyperion.torch.models import SpineNetXVector as SpineXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 460f4044..972ff01f 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -3,26 +3,22 @@ Copyright 2020 Jesus Villalba (Johns Hopkins University) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging +import math import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging -import math import numpy as np -from scipy import signal, ndimage +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from scipy import ndimage, signal from hyperion.hyp_defs import config_logger -from hyperion.utils import Utt2Info -from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import AudioWriter as Writer +from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info def make_noise(xs): diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index 00177988..dccf58da 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -3,23 +3,19 @@ Copyright 2020 Jesus Villalba (Johns Hopkins University) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import 
logging +import math import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging -import math import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from hyperion.hyp_defs import config_logger -from hyperion.io import SequentialAudioReader as AR from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR def pack_wav_rirs(input_path, output_spec, **kwargs): diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py index e514252f..e011dfe8 100755 --- a/hyperion/bin/plot_embedding_tsne.py +++ b/hyperion/bin/plot_embedding_tsne.py @@ -4,28 +4,22 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -import sys import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, - ActionYesNo, -) +import sys import time from pathlib import Path -import numpy as np -import pandas as pd import matplotlib - import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, + ArgumentParser, namespace_to_dict) from hyperion.hyp_defs import config_logger -from hyperion.utils import SegmentSet from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.transforms import PCA, SklTSNE, LNorm +from hyperion.np.transforms import PCA, LNorm, SklTSNE +from hyperion.utils import SegmentSet matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 5e832bff..6f35f074 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -4,31 +4,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -import sys import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, - ActionYesNo, -) +import sys import time from pathlib import Path -import numpy as np -import pandas as pd import matplotlib - import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, + ArgumentParser, namespace_to_dict) from hyperion.hyp_defs import config_logger -from hyperion.utils import SegmentSet -from hyperion.utils.math import cosine_scoring from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.transforms import PCA, SklTSNE, LNorm from hyperion.np.clustering import AHC - +from hyperion.np.transforms import PCA, LNorm, SklTSNE +from hyperion.utils import SegmentSet +from hyperion.utils.math import cosine_scoring matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index 67b1cf61..2f4e5cbc 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -3,26 +3,22 @@ Copyright 2020 Jesus Villalba (Johns Hopkins University) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging +import math import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging -import math import numpy as np -from scipy import signal, ndimage +from jsonargparse 
import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from scipy import ndimage, signal from hyperion.hyp_defs import config_logger -from hyperion.utils import Utt2Info -from hyperion.io import SequentialAudioReader as AR from hyperion.io import AudioWriter as Writer +from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info def process_vad(vad, length, fs, dilation, erosion): diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 7187c13c..cb2b4a6d 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -3,36 +3,28 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging +import multiprocessing # import sys import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) import time -import logging -import multiprocessing +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory - from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import ( - HFWav2Vec2ResNet1dXVector, - HFHubert2ResNet1dXVector, - HFWavLM2ResNet1dXVector, -) +from hyperion.torch.models import (HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector) +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index c09f15a4..7f4ab0fa 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -3,36 +3,31 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainer as Trainer -from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import ResNetXVector as RXVec -from hyperion.torch.models import ResNet1dXVector as R1dXVec from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec from 
hyperion.torch.models import TDNNXVector as TDXVec from hyperion.torch.models import TransformerXVectorV1 as TFXVec -from hyperion.torch.models import SpineNetXVector as SpineXVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index 0e074977..57a33b56 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -3,36 +3,31 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path -import torch +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +import torch from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.data import AudioDataset as AD - # from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch.models import ResNetXVector as RXVec -from hyperion.torch.models import ResNet1dXVector as R1dXVec from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec from hyperion.torch.models import TDNNXVector as TDXVec from hyperion.torch.models import TransformerXVectorV1 as TFXVec -from hyperion.torch.models import SpineNetXVector as SpineXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin_deprec/ark2hyp.py b/hyperion/bin_deprec/ark2hyp.py index 45a20712..abcb4457 100755 --- a/hyperion/bin_deprec/ark2hyp.py +++ b/hyperion/bin_deprec/ark2hyp.py @@ -7,9 +7,9 @@ Converts from Ark format to h5 format (deprecated, use copy-feats.py) """ -import sys -import os import argparse +import os +import sys import time import numpy as np diff --git a/hyperion/bin_deprec/arkvad2nist.py b/hyperion/bin_deprec/arkvad2nist.py index bd15592a..559371be 100755 --- a/hyperion/bin_deprec/arkvad2nist.py +++ b/hyperion/bin_deprec/arkvad2nist.py @@ -7,15 +7,14 @@ Converts from Ark format to NIST OpenSAT """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np - from hyperion.io import KaldiDataReader diff --git a/hyperion/bin_deprec/compute-gmm-post.py b/hyperion/bin_deprec/compute-gmm-post.py index 1b0a8d04..58675336 100755 --- a/hyperion/bin_deprec/compute-gmm-post.py +++ b/hyperion/bin_deprec/compute-gmm-post.py @@ -7,21 +7,20 @@ Computes GMM posteriors """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np - from keras import backend as K -from hyperion.hyp_defs import 
set_float_cpu, float_cpu, config_logger -from hyperion.io import HypDataWriter from hyperion.helpers import SequenceReader as SR -from hyperion.transforms import TransformList +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter from hyperion.pdfs import DiagGMM +from hyperion.transforms import TransformList def to_sparse(r, num_comp): diff --git a/hyperion/bin_deprec/eval-2class-performance.py b/hyperion/bin_deprec/eval-2class-performance.py index a10ec5c0..eff16830 100755 --- a/hyperion/bin_deprec/eval-2class-performance.py +++ b/hyperion/bin_deprec/eval-2class-performance.py @@ -7,18 +7,18 @@ Evals EER, DCF, DET """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np from hyperion.hyp_defs import config_logger -from hyperion.utils.trial_scores import TrialScores -from hyperion.utils.trial_key import TrialKey from hyperion.metrics import compute_eer +from hyperion.utils.trial_key import TrialKey +from hyperion.utils.trial_scores import TrialScores def eval_2class_performance(score_file, key_file, output_path): diff --git a/hyperion/bin_deprec/eval-elbo-ubm.py b/hyperion/bin_deprec/eval-elbo-ubm.py index 5cf1aa0d..bf4839db 100755 --- a/hyperion/bin_deprec/eval-elbo-ubm.py +++ b/hyperion/bin_deprec/eval-elbo-ubm.py @@ -7,18 +7,18 @@ Evaluate the likelihood of the ubm on some data """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import SequenceReader as SR -from hyperion.transforms import TransformList +from hyperion.hyp_defs import config_logger, float_cpu from hyperion.pdfs import DiagGMM +from hyperion.transforms import TransformList def eval_elbo( diff --git a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py index 9e2880f8..4548e49b 100755 --- a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py +++ b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py @@ -7,21 +7,21 @@ Evals Q-scoring back-end """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np +from hyperion.classifiers import QScoringHomoGBE as GBE +from hyperion.helpers import ClassifTrialDataReader as TDR from hyperion.hyp_defs import config_logger -from hyperion.utils.trial_ndx import TrialNdx -from hyperion.utils.trial_scores import TrialScores from hyperion.io import HypDataWriter as HDW -from hyperion.helpers import ClassifTrialDataReader as TDR from hyperion.transforms import TransformList -from hyperion.classifiers import QScoringHomoGBE as GBE +from hyperion.utils.trial_ndx import TrialNdx +from hyperion.utils.trial_scores import TrialScores def eval_qscoring_gbe( diff --git a/hyperion/bin_deprec/eval-score-norm.py b/hyperion/bin_deprec/eval-score-norm.py index fd6e2e00..4b620518 100755 --- a/hyperion/bin_deprec/eval-score-norm.py +++ b/hyperion/bin_deprec/eval-score-norm.py @@ -7,18 +7,18 @@ Score Normalization """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np from hyperion.hyp_defs import config_logger from hyperion.score_norm import * -from hyperion.utils.trial_scroes import TrialScores from hyperion.utils.trial_ndx import TrialNdx +from hyperion.utils.trial_scroes import TrialScores def load_scores(score_file, enr_coh_file, coh_test_file, coh_coh_file): diff 
--git a/hyperion/bin_deprec/h5vad2nist.py b/hyperion/bin_deprec/h5vad2nist.py index 804c8637..fb45c22b 100755 --- a/hyperion/bin_deprec/h5vad2nist.py +++ b/hyperion/bin_deprec/h5vad2nist.py @@ -7,11 +7,11 @@ Converts from Ark format to NIST OpenSAT """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np diff --git a/hyperion/bin_deprec/init-ubm.py b/hyperion/bin_deprec/init-ubm.py index 8a162314..204ca855 100755 --- a/hyperion/bin_deprec/init-ubm.py +++ b/hyperion/bin_deprec/init-ubm.py @@ -8,20 +8,19 @@ Initialize UBM """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np - from keras import backend as K -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger -from hyperion.utils.multithreading import threadsafe_generator from hyperion.helpers import SequenceReader as SR +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.pdfs import DiagGMM +from hyperion.utils.multithreading import threadsafe_generator @threadsafe_generator diff --git a/hyperion/bin_deprec/scores2lre_format.py b/hyperion/bin_deprec/scores2lre_format.py index 50e9147f..717c1535 100755 --- a/hyperion/bin_deprec/scores2lre_format.py +++ b/hyperion/bin_deprec/scores2lre_format.py @@ -4,12 +4,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time -import re import logging +import os +import re +import sys +import time import numpy as np diff --git a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py index 9adb2cfd..608a5271 100755 --- a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py +++ b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedSeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs import ConformerEncoderV1 as Encoder -from hyperion.torch.narchs import ConformerEncoderV1 as Decoder from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import ConformerEncoderV1 as Decoder +from hyperion.torch.narchs import ConformerEncoderV1 as Encoder from hyperion.torch.trainers import VQDVAETrainer as Trainer -from hyperion.torch.data import PairedSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_vae( diff --git a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py index d227a8b2..a4cc54e6 100755 --- a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py +++ b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import 
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import ConformerEncoderV1 as Encoder
-from hyperion.torch.narchs import ConformerEncoderV1 as Decoder
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import ConformerEncoderV1 as Decoder
+from hyperion.torch.narchs import ConformerEncoderV1 as Encoder
 from hyperion.torch.trainers import VQVAETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-dc1d-dvae.py b/hyperion/bin_deprec/torch-train-dc1d-dvae.py
index 343807c2..1b88beba 100755
--- a/hyperion/bin_deprec/torch-train-dc1d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-dc1d-dvae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedSeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import DC1dEncoder as Encoder
-from hyperion.torch.narchs import DC1dDecoder as Decoder
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import DC1dDecoder as Decoder
+from hyperion.torch.narchs import DC1dEncoder as Encoder
 from hyperion.torch.trainers import DVAETrainer as Trainer
-from hyperion.torch.data import PairedSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-dc1d-vae.py b/hyperion/bin_deprec/torch-train-dc1d-vae.py
index daa67b3e..dd5d2e72 100755
--- a/hyperion/bin_deprec/torch-train-dc1d-vae.py
+++ b/hyperion/bin_deprec/torch-train-dc1d-vae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs.dc1d_encoder import DC1dEncoder as Encoder
-from hyperion.torch.narchs.dc1d_decoder import DC1dDecoder as Decoder
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs.dc1d_decoder import DC1dDecoder as Decoder
+from hyperion.torch.narchs.dc1d_encoder import DC1dEncoder as Encoder
 from hyperion.torch.trainers.vae_trainer import VAETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-dc2d-dvae.py b/hyperion/bin_deprec/torch-train-dc2d-dvae.py
index 2e32b9f9..3f7cb17d 100755
--- a/hyperion/bin_deprec/torch-train-dc2d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-dc2d-dvae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedSeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import DC2dEncoder as Encoder
-from hyperion.torch.narchs import DC2dDecoder as Decoder
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import DC2dDecoder as Decoder
+from hyperion.torch.narchs import DC2dEncoder as Encoder
 from hyperion.torch.trainers import DVAETrainer as Trainer
-from hyperion.torch.data import PairedSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-dc2d-vae.py b/hyperion/bin_deprec/torch-train-dc2d-vae.py
index d8675ae9..5b97f55c 100755
--- a/hyperion/bin_deprec/torch-train-dc2d-vae.py
+++ b/hyperion/bin_deprec/torch-train-dc2d-vae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import DC2dEncoder as Encoder
-from hyperion.torch.narchs import DC2dDecoder as Decoder
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import DC2dDecoder as Decoder
+from hyperion.torch.narchs import DC2dEncoder as Encoder
 from hyperion.torch.trainers import VAETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-dvae.py b/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
index 420cf7b2..ca6f6996 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
@@ -3,27 +3,26 @@
 Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedSeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import ResNet1dEncoder as Encoder
-from hyperion.torch.narchs import ResNet1dDecoder as Decoder
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import ResNet1dDecoder as Decoder
+from hyperion.torch.narchs import ResNet1dEncoder as Encoder
 from hyperion.torch.trainers import DVAETrainer as Trainer
-from hyperion.torch.data import PairedSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vae.py b/hyperion/bin_deprec/torch-train-resnet1d-vae.py
index a8edb3c3..a6218567 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import ResNet1dEncoder as Encoder
-from hyperion.torch.narchs import ResNet1dDecoder as Decoder
 from hyperion.torch.models.vae.vae import VAE
+from hyperion.torch.narchs import ResNet1dDecoder as Decoder
+from hyperion.torch.narchs import ResNet1dEncoder as Encoder
 from hyperion.torch.trainers import VAETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py b/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
index 9571eff8..89448754 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedSeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import ResNet1dEncoder as Encoder
-from hyperion.torch.narchs import ResNet1dDecoder as Decoder
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import ResNet1dDecoder as Decoder
+from hyperion.torch.narchs import ResNet1dEncoder as Encoder
 from hyperion.torch.trainers import VQDVAETrainer as Trainer
-from hyperion.torch.data import PairedSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py b/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
index 373be8f3..4a84bbff 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import ResNet1dEncoder as Encoder
-from hyperion.torch.narchs import ResNet1dDecoder as Decoder
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import ResNet1dDecoder as Decoder
+from hyperion.torch.narchs import ResNet1dEncoder as Encoder
 from hyperion.torch.trainers import VQVAETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-dvae.py b/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
index 6845750f..3f6cd6ba 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedSeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import ResNet2dEncoder as Encoder
-from hyperion.torch.narchs import ResNet2dDecoder as Decoder
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import ResNet2dDecoder as Decoder
+from hyperion.torch.narchs import ResNet2dEncoder as Encoder
 from hyperion.torch.trainers import DVAETrainer as Trainer
-from hyperion.torch.data import PairedSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vae.py b/hyperion/bin_deprec/torch-train-resnet2d-vae.py
index 575c5575..4e853230 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vae.py
@@ -3,32 +3,27 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path

 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device, ddp
-from hyperion.torch.narchs import ResNet2dEncoder as Encoder
-from hyperion.torch.narchs import ResNet2dDecoder as Decoder
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import ResNet2dDecoder as Decoder
+from hyperion.torch.narchs import ResNet2dEncoder as Encoder
 from hyperion.torch.trainers import VAETrainer as Trainer
-from hyperion.torch.data import FeatSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import ddp, open_device


 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py b/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
index 95eb3923..5e0add50 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedSeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import ResNet2dEncoder as Encoder
-from hyperion.torch.narchs import ResNet2dDecoder as Decoder
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import ResNet2dDecoder as Decoder
+from hyperion.torch.narchs import ResNet2dEncoder as Encoder
 from hyperion.torch.trainers import VQDVAETrainer as Trainer
-from hyperion.torch.data import PairedSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py b/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
index 07f25d5f..6398d959 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import ResNet2dEncoder as Encoder
-from hyperion.torch.narchs import ResNet2dDecoder as Decoder
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import ResNet2dDecoder as Decoder
+from hyperion.torch.narchs import ResNet2dEncoder as Encoder
 from hyperion.torch.trainers import VQVAETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
index 39ee2974..0137e101 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
@@ -4,27 +4,26 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedSeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import TransformerEncoderV1 as Encoder
-from hyperion.torch.narchs import TransformerEncoderV1 as Decoder
 from hyperion.torch.models import VAE
+from hyperion.torch.narchs import TransformerEncoderV1 as Decoder
+from hyperion.torch.narchs import TransformerEncoderV1 as Encoder
 from hyperion.torch.trainers import DVAETrainer as Trainer
-from hyperion.torch.data import PairedSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
index 9f5cbdf8..71021825 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import TransformerEncoderV1 as Encoder
-from hyperion.torch.narchs import TransformerEncoderV1 as Decoder
 from hyperion.torch.models.vae.vae import VAE
+from hyperion.torch.narchs import TransformerEncoderV1 as Decoder
+from hyperion.torch.narchs import TransformerEncoderV1 as Encoder
 from hyperion.torch.trainers import VAETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
index c6246fe3..a6908c4f 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import PairedSeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import TransformerEncoderV1 as Encoder
-from hyperion.torch.narchs import TransformerEncoderV1 as Decoder
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import TransformerEncoderV1 as Decoder
+from hyperion.torch.narchs import TransformerEncoderV1 as Encoder
 from hyperion.torch.trainers import VQDVAETrainer as Trainer
-from hyperion.torch.data import PairedSeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
index 4659e0d8..b3b07682 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
@@ -3,27 +3,26 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset as SD
 from hyperion.torch.helpers import OptimizerFactory as OF
 from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
-from hyperion.torch.narchs import TransformerEncoderV1 as Encoder
-from hyperion.torch.narchs import TransformerEncoderV1 as Decoder
 from hyperion.torch.models import VQVAE as VAE
+from hyperion.torch.narchs import TransformerEncoderV1 as Decoder
+from hyperion.torch.narchs import TransformerEncoderV1 as Encoder
 from hyperion.torch.trainers import VQVAETrainer as Trainer
-from hyperion.torch.data import SeqDataset as SD
-from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.utils import open_device


 def train_vae(
diff --git a/hyperion/bin_deprec/torch-train-xvector.py b/hyperion/bin_deprec/torch-train-xvector.py
index 4cc443ae..4c69eb25 100755
--- a/hyperion/bin_deprec/torch-train-xvector.py
+++ b/hyperion/bin_deprec/torch-train-xvector.py
@@ -3,26 +3,27 @@
 Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
-import torch
-from torch.utils.data import DataLoader

-from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger
-from hyperion.torch.torch_defs import float_torch
-from hyperion.torch.utils import open_device
-from hyperion.torch.data import SeqDataset, ClassWeightedSeqSampler as Sampler
-from hyperion.torch.helpers import TorchNALoader
+import torch
+from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
+from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import SeqDataset
 from hyperion.torch.helpers import OptimizerFactory as OF
-from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
+from hyperion.torch.helpers import TorchNALoader
 from hyperion.torch.layers import GlobalPool1dFactory as PF
-from hyperion.torch.seq_embed import XVector, XVectorTrainer
+from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.seq_embed import XVector, XVectorTrainer
+from hyperion.torch.torch_defs import float_torch
+from hyperion.torch.utils import open_device
+from torch.utils.data import DataLoader


 def train_xvector(
diff --git a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
index 69780865..8a348728 100755
--- a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
+++ b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
@@ -7,18 +7,18 @@
 Trains Q-scoring back-end
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np

-from hyperion.hyp_defs import config_logger
+from hyperion.classifiers import QScoringHomoGBE as GBE
 from hyperion.helpers import VectorClassReader as VCR
+from hyperion.hyp_defs import config_logger
 from hyperion.transforms import TransformList
-from hyperion.classifiers import QScoringHomoGBE as GBE


 def train_qscoring_backend(iv_file, train_list, preproc_file, output_path, **kwargs):
diff --git a/hyperion/bin_deprec/vectors2scores.py b/hyperion/bin_deprec/vectors2scores.py
index cc936115..ab4be8ac 100755
--- a/hyperion/bin_deprec/vectors2scores.py
+++ b/hyperion/bin_deprec/vectors2scores.py
@@ -4,11 +4,11 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
+import os
 import re
+import sys
+import time

 import numpy as np

diff --git a/hyperion/bin_deprec2/apply-mvn-select-frames.py b/hyperion/bin_deprec2/apply-mvn-select-frames.py
index 4f73628e..a2456dc9 100755
--- a/hyperion/bin_deprec2/apply-mvn-select-frames.py
+++ b/hyperion/bin_deprec2/apply-mvn-select-frames.py
@@ -4,27 +4,23 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)

 from hyperion.hyp_defs import config_logger
-from hyperion.utils.kaldi_matrix import compression_methods
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
-from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import RandomAccessDataReaderFactory as RDRF
-from hyperion.np.feats import MeanVarianceNorm as MVN
+from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.np.feats import FrameSelector as FSel
+from hyperion.np.feats import MeanVarianceNorm as MVN
+from hyperion.utils import Utt2Info
+from hyperion.utils.kaldi_matrix import compression_methods


 def process_feats(
diff --git a/hyperion/bin_deprec2/compute-mfcc-feats.py b/hyperion/bin_deprec2/compute-mfcc-feats.py
index b7e90056..a83f95d1 100755
--- a/hyperion/bin_deprec2/compute-mfcc-feats.py
+++ b/hyperion/bin_deprec2/compute-mfcc-feats.py
@@ -3,23 +3,19 @@
 Copyright 2018 Jesus Villalba (Johns Hopkins University)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)

 from hyperion.hyp_defs import config_logger
+from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import SequentialDataReaderFactory as DRF
-from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import compression_methods
 from hyperion.np.feats import MFCC
diff --git a/hyperion/bin_deprec2/copy-feats.py b/hyperion/bin_deprec2/copy-feats.py
index 1ef044f5..0385cc55 100755
--- a/hyperion/bin_deprec2/copy-feats.py
+++ b/hyperion/bin_deprec2/copy-feats.py
@@ -5,18 +5,17 @@
 Copy features/vectors and change format
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np

 from hyperion.hyp_defs import config_logger
 from hyperion.io import CopyFeats as CF
-
 if __name__ == "__main__":

     parser = argparse.ArgumentParser(
diff --git a/hyperion/bin_deprec2/eval-cos-1vs1.py b/hyperion/bin_deprec2/eval-cos-1vs1.py
index 16c9122a..de508333 100755
--- a/hyperion/bin_deprec2/eval-cos-1vs1.py
+++ b/hyperion/bin_deprec2/eval-cos-1vs1.py
@@ -7,19 +7,19 @@
 Evals cosine scoring
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np

-from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger
+from hyperion.helpers import TrialDataReader as TDR
+from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
+from hyperion.np.transforms import LNorm, TransformList
 from hyperion.utils.trial_ndx import TrialNdx
 from hyperion.utils.trial_scores import TrialScores
-from hyperion.helpers import TrialDataReader as TDR
-from hyperion.np.transforms import TransformList, LNorm


 def eval_cos(
diff --git a/hyperion/bin_deprec2/eval-linear-gbe-up.py b/hyperion/bin_deprec2/eval-linear-gbe-up.py
index a8c3d999..d82bf967 100755
--- a/hyperion/bin_deprec2/eval-linear-gbe-up.py
+++ b/hyperion/bin_deprec2/eval-linear-gbe-up.py
@@ -7,21 +7,21 @@
 Evals linear GBE with uncertainty propagation.
""" -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger -from hyperion.utils.trial_ndx import TrialNdx -from hyperion.utils.trial_scores import TrialScores -from hyperion.io import HypDataWriter as HDW from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.np.transforms import TransformList +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter as HDW from hyperion.np.classifiers import LinearGBEUP as GBE +from hyperion.np.transforms import TransformList +from hyperion.utils.trial_ndx import TrialNdx +from hyperion.utils.trial_scores import TrialScores def eval_linear_gbe( diff --git a/hyperion/bin_deprec2/eval-linear-gbe.py b/hyperion/bin_deprec2/eval-linear-gbe.py index 0970bb5e..cf788392 100755 --- a/hyperion/bin_deprec2/eval-linear-gbe.py +++ b/hyperion/bin_deprec2/eval-linear-gbe.py @@ -7,21 +7,21 @@ Evals linear GBE """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger -from hyperion.utils.trial_ndx import TrialNdx -from hyperion.utils.trial_scores import TrialScores -from hyperion.io import HypDataWriter as HDW from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.np.transforms import TransformList +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter as HDW from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.transforms import TransformList +from hyperion.utils.trial_ndx import TrialNdx +from hyperion.utils.trial_scores import TrialScores def eval_linear_gbe( diff --git a/hyperion/bin_deprec2/eval-linear-svmc.py b/hyperion/bin_deprec2/eval-linear-svmc.py index d6c96c11..ba4c5e81 100755 --- a/hyperion/bin_deprec2/eval-linear-svmc.py +++ b/hyperion/bin_deprec2/eval-linear-svmc.py @@ -7,21 +7,21 @@ Evals SVM """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger -from hyperion.utils.trial_ndx import TrialNdx -from hyperion.utils.trial_scores import TrialScores -from hyperion.io import HypDataWriter as HDW from hyperion.helpers import ClassifTrialDataReader as TDR -from hyperion.np.transforms import TransformList +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import HypDataWriter as HDW from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.transforms import TransformList +from hyperion.utils.trial_ndx import TrialNdx +from hyperion.utils.trial_scores import TrialScores def eval_svm( diff --git a/hyperion/bin_deprec2/eval-logistic-regression.py b/hyperion/bin_deprec2/eval-logistic-regression.py index 91a092ea..992ca7b8 100755 --- a/hyperion/bin_deprec2/eval-logistic-regression.py +++ b/hyperion/bin_deprec2/eval-logistic-regression.py @@ -7,21 +7,21 @@ Evals logistic regression """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger -from hyperion.utils.trial_ndx import TrialNdx -from hyperion.utils.trial_scores import TrialScores -from hyperion.io import HypDataWriter as HDW from hyperion.helpers import 
 from hyperion.helpers import ClassifTrialDataReader as TDR
-from hyperion.np.transforms import TransformList
+from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
+from hyperion.io import HypDataWriter as HDW
 from hyperion.np.classifiers import LogisticRegression as LR
+from hyperion.np.transforms import TransformList
+from hyperion.utils.trial_ndx import TrialNdx
+from hyperion.utils.trial_scores import TrialScores


 def eval_lr(
diff --git a/hyperion/bin_deprec2/eval-plda-1vs1.py b/hyperion/bin_deprec2/eval-plda-1vs1.py
index eadf4a87..5a810cf7 100755
--- a/hyperion/bin_deprec2/eval-plda-1vs1.py
+++ b/hyperion/bin_deprec2/eval-plda-1vs1.py
@@ -7,20 +7,20 @@
 Evals PLDA LLR
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np

-from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger
-from hyperion.utils.trial_ndx import TrialNdx
-from hyperion.utils.trial_scores import TrialScores
-from hyperion.helpers import TrialDataReader as TDR
 from hyperion.helpers import PLDAFactory as F
+from hyperion.helpers import TrialDataReader as TDR
+from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.np.transforms import TransformList
+from hyperion.utils.trial_ndx import TrialNdx
+from hyperion.utils.trial_scores import TrialScores


 def eval_plda(
diff --git a/hyperion/bin_deprec2/eval-plda-nvs1.py b/hyperion/bin_deprec2/eval-plda-nvs1.py
index 5a63e5a5..5c5d200c 100755
--- a/hyperion/bin_deprec2/eval-plda-nvs1.py
+++ b/hyperion/bin_deprec2/eval-plda-nvs1.py
@@ -7,20 +7,20 @@
 Evals PLDA LLR
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np

-from hyperion.hyp_defs import set_float_cpu, float_cpu, config_logger
-from hyperion.utils.trial_ndx import TrialNdx
-from hyperion.utils.trial_scores import TrialScores
-from hyperion.helpers import TrialDataReader as TDR
 from hyperion.helpers import PLDAFactory as F
+from hyperion.helpers import TrialDataReader as TDR
+from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.np.transforms import TransformList
+from hyperion.utils.trial_ndx import TrialNdx
+from hyperion.utils.trial_scores import TrialScores


 def eval_plda(
diff --git a/hyperion/bin_deprec2/merge-h5-files.py b/hyperion/bin_deprec2/merge-h5-files.py
index a807c69c..aeda3bab 100755
--- a/hyperion/bin_deprec2/merge-h5-files.py
+++ b/hyperion/bin_deprec2/merge-h5-files.py
@@ -6,10 +6,11 @@
 """
 Merges multiple hdf5 files into one file
 """
-import sys
-import os
 import argparse
+import os
+import sys
 import time
+
 import numpy as np

 from hyperion.io import H5Merger
diff --git a/hyperion/bin_deprec2/pack-audio-files.py b/hyperion/bin_deprec2/pack-audio-files.py
index 4953d345..5d544df4 100755
--- a/hyperion/bin_deprec2/pack-audio-files.py
+++ b/hyperion/bin_deprec2/pack-audio-files.py
@@ -3,19 +3,19 @@
 Copyright 2020 Jesus Villalba (Johns Hopkins University)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
-
 import math
+import os
+import sys
+import time
+
 import numpy as np
-from scipy import signal, ndimage
+from scipy import ndimage, signal

 from hyperion.hyp_defs import config_logger
-from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import PackedAudioWriter as Writer
+from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.io import WSpecifier as WS
diff --git a/hyperion/bin_deprec2/plot-vector-hist.py b/hyperion/bin_deprec2/plot-vector-hist.py
index 60560a80..75236726 100755
--- a/hyperion/bin_deprec2/plot-vector-hist.py
+++ b/hyperion/bin_deprec2/plot-vector-hist.py
@@ -4,20 +4,20 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

-import numpy as np
 import matplotlib
+import numpy as np

 matplotlib.use("Agg")
 import matplotlib.pyplot as plt

-from hyperion.hyp_defs import config_logger
 from hyperion.helpers import VectorReader as VR
+from hyperion.hyp_defs import config_logger
 from hyperion.np.transforms import TransformList

diff --git a/hyperion/bin_deprec2/rttm-to-bin-vad.py b/hyperion/bin_deprec2/rttm-to-bin-vad.py
index 9c51ba2c..19e98d8f 100755
--- a/hyperion/bin_deprec2/rttm-to-bin-vad.py
+++ b/hyperion/bin_deprec2/rttm-to-bin-vad.py
@@ -3,18 +3,18 @@
 # Apache 2.0.
 #
-import sys
-import os
 import argparse
-import time
 import logging
+import os
+import sys
+import time

 import numpy as np
 import pandas as pd

 from hyperion.hyp_defs import config_logger
-from hyperion.utils import SegmentList, RTTM
 from hyperion.io import DataWriterFactory as DWF
+from hyperion.utils import RTTM, SegmentList


 def rttm_to_bin_vad(
diff --git a/hyperion/bin_deprec2/segments-to-bin-vad.py b/hyperion/bin_deprec2/segments-to-bin-vad.py
index 2b3a7d91..24021a4b 100755
--- a/hyperion/bin_deprec2/segments-to-bin-vad.py
+++ b/hyperion/bin_deprec2/segments-to-bin-vad.py
@@ -3,23 +3,19 @@
 # Apache 2.0.
 #
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)

 from hyperion.hyp_defs import config_logger
-from hyperion.utils import SegmentList
 from hyperion.io import DataWriterFactory as DWF
+from hyperion.utils import SegmentList


 def segments_to_bin_vad(
diff --git a/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
index eb118102..ad33515c 100755
--- a/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
@@ -3,35 +3,29 @@
 Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path

 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device
-from hyperion.torch.utils import ddp
-from hyperion.torch.models import XVector as XVec
-from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks import AttackFactory
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
 from hyperion.torch.metrics import CategoricalAccuracy
+from hyperion.torch.models import XVector as XVec
 from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.adv_attacks import AttackFactory
-from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer
+from hyperion.torch.utils import ddp, open_device


 def init_data(
diff --git a/hyperion/bin_deprec2/torch-adv-finetune-xvec.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
index ae2cb37b..850233e2 100755
--- a/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
+++ b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
@@ -4,32 +4,27 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path

 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import open_device, ddp
-from hyperion.torch.models import XVector as XVec
-from hyperion.torch.trainers import XVectorAdvTrainer as Trainer
-from hyperion.torch.data import FeatSeqDataset as SD
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks import AttackFactory
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
+from hyperion.torch.data import FeatSeqDataset as SD
 from hyperion.torch.metrics import CategoricalAccuracy
-from hyperion.torch.adv_attacks import AttackFactory
-from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.models import XVector as XVec
+from hyperion.torch.trainers import XVectorAdvTrainer as Trainer
+from hyperion.torch.utils import ddp, open_device


 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec2/torch-compute-mfcc-feats.py b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
index 4fc6bec2..07f71bfb 100755
--- a/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
+++ b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
@@ -3,23 +3,19 @@
 Copyright 2018 Jesus Villalba (Johns Hopkins University)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

-import torch
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)

+import torch
 from hyperion.hyp_defs import config_logger
+from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import SequentialDataReaderFactory as DRF
-from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import compression_methods
 from hyperion.torch.layers import AudioFeatsFactory as AFF
diff --git a/hyperion/bin_deprec2/torch-eval-vae.py b/hyperion/bin_deprec2/torch-eval-vae.py
index 44ed0bfb..d676b0f1 100755
--- a/hyperion/bin_deprec2/torch-eval-vae.py
+++ b/hyperion/bin_deprec2/torch-eval-vae.py
@@ -3,19 +3,15 @@
 Copyright 2020 Jesus Villalba (Johns Hopkins University)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import time
 import logging
+import time
 from pathlib import Path

-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import matplotlib
 import numpy as np
 import pandas as pd
-import matplotlib
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)

 matplotlib.use("Agg")
 # matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
@@ -23,16 +19,14 @@
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.np.feats import MeanVarianceNorm as MVN
-
-from hyperion.torch.utils import open_device
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.utils import open_device
+from hyperion.utils import Utt2Info


 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
index 411873ac..aaa91214 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
@@ -3,43 +3,35 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
+# [Added Sonal May21]
+from pathlib import Path

 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.np.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.layers import LinBinCalibrator as Calibrator
-from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
 from hyperion.torch import TorchModelLoader as TML
-
 from hyperion.torch.adv_attacks import AttackFactory
-
-# [Added Sonal May21]
-from pathlib import Path
 from hyperion.torch.adv_defenses.wave_gan_white import WaveGANDefender
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember

 torch.backends.cudnn.enabled = False

diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
index 18d6843f..437127b2 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
@@ -3,39 +3,32 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.np.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.layers import LinBinCalibrator as Calibrator
-from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
 from hyperion.torch import TorchModelLoader as TML
-
 from hyperion.torch.adv_attacks import AttackFactory
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember


 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
index 73da6088..8d4add76 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
@@ -4,42 +4,34 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

 import numpy as np
 import pandas as pd
+from art.classifiers import PyTorchClassifier
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.np.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks.art_attack_factory import \
+    ARTAttackFactory as AttackFactory
 from hyperion.torch.layers import LinBinCalibrator as Calibrator
 from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
-from hyperion.torch import TorchModelLoader as TML
-
-from art.classifiers import PyTorchClassifier
-from hyperion.torch.adv_attacks.art_attack_factory import (
-    ARTAttackFactory as AttackFactory,
-)
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember


 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
index a8b4b962..0e9493c0 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
@@ -4,35 +4,29 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import RandomAccessAudioReader as AR
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.np.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
+from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.layers import LinBinCalibrator as Calibrator
 from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
 from hyperion.torch.utils.misc import l2_norm
-from hyperion.torch import TorchModelLoader as TML
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember


 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
index 51a8afbb..e0754498 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
@@ -3,39 +3,32 @@
 Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.np.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.layers import LinBinCalibrator as Calibrator
-from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
 from hyperion.torch import TorchModelLoader as TML
-
 from hyperion.torch.adv_attacks import AttackFactory
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember


 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
index 9fcc8f30..0f9f375d 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
@@ -4,42 +4,34 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging

 import numpy as np
 import pandas as pd
+from art.classifiers import PyTorchClassifier
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.io import RandomAccessDataReaderFactory as DRF
-from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import AudioWriter as AW
-from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores
-from hyperion.utils.list_utils import ismember
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.np.classifiers import BinaryLogisticRegression as LR
-
-from hyperion.torch.utils import open_device
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks.art_attack_factory import \
+    ARTAttackFactory as AttackFactory
 from hyperion.torch.layers import LinBinCalibrator as Calibrator
 from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack
-from hyperion.torch import TorchModelLoader as TML
-
-from art.classifiers import PyTorchClassifier
-from hyperion.torch.adv_attacks.art_attack_factory import (
-    ARTAttackFactory as AttackFactory,
-)
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember


 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py
index 61acebd4..da6389fb 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py
@@ -4,32 +4,26 @@
(http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np import pandas as pd +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.np.augment import SpeechAugment - -from hyperion.torch.utils import open_device -from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py index 101d6a10..fc494448 100755 --- a/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py +++ b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py @@ -4,32 +4,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np import pandas as pd +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info, RTTM from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.np.augment import SpeechAugment - -from hyperion.torch.utils import open_device -from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import RTTM, Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py index ecf65037..c85fe4c9 100755 --- a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py +++ b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py @@ -4,33 +4,27 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np import pandas as pd import yaml +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.np.augment import SpeechAugment - -from hyperion.torch.utils import open_device -from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import 
TorchModelLoader as TML +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py index 7d6d9f11..6da57e16 100755 --- a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py +++ b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py @@ -4,30 +4,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF from hyperion.np.feats import MeanVarianceNorm as MVN - -from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py index afa7a117..6edf60ed 100755 --- a/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py +++ b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py @@ -4,30 +4,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF from hyperion.np.feats import MeanVarianceNorm as MVN - -from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin_deprec2/torch-extract-xvectors.py b/hyperion/bin_deprec2/torch-extract-xvectors.py index f36e35e2..76d941e0 100755 --- a/hyperion/bin_deprec2/torch-extract-xvectors.py +++ b/hyperion/bin_deprec2/torch-extract-xvectors.py @@ -4,30 +4,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.utils import Utt2Info from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF from hyperion.np.feats import MeanVarianceNorm as MVN - -from 
hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info def init_device(use_gpu): diff --git a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py index 274bdf32..88b0b1d9 100755 --- a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py +++ b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py @@ -3,37 +3,30 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging from pathlib import Path import numpy as np import pandas as pd import yaml +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import AudioWriter as AW -from hyperion.utils import Utt2Info, TrialNdx +from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import VADReaderFactory as VRF - -from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.narchs import AudioFeatsMVN as AF -from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack - from hyperion.torch.adv_attacks import RandomAttackFactory +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialNdx, Utt2Info def read_utt_list(list_file, class2int_file, part_idx, num_parts): diff --git a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py index c13bd815..a4df5091 100755 --- a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py +++ b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py @@ -3,41 +3,34 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging from pathlib import Path import numpy as np import pandas as pd import yaml +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu -from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import AudioWriter as AW -from hyperion.utils import Utt2Info, TrialNdx, TrialKey, TrialScores -from hyperion.utils.list_utils import ismember +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR - -from hyperion.torch.utils import open_device -from hyperion.torch.layers import LinBinCalibrator as Calibrator -from hyperion.torch.narchs import AudioFeatsMVN 
as AF -from hyperion.torch.utils.misc import l2_norm, compute_stats_adv_attack from hyperion.torch import TorchModelLoader as TML - from hyperion.torch.adv_attacks import RandomAttackFactory +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember class MyModel(nn.Module): diff --git a/hyperion/bin_deprec2/torch-train-dc1d-ae.py b/hyperion/bin_deprec2/torch-train-dc1d-ae.py index e7547927..50ac7d42 100755 --- a/hyperion/bin_deprec2/torch-train-dc1d-ae.py +++ b/hyperion/bin_deprec2/torch-train-dc1d-ae.py @@ -3,27 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SeqDataset as SD from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.narchs.dc1d_encoder import DC1dEncoder as Encoder -from hyperion.torch.narchs.dc1d_decoder import DC1dDecoder as Decoder from hyperion.torch.models import AE +from hyperion.torch.narchs.dc1d_decoder import DC1dDecoder as Decoder +from hyperion.torch.narchs.dc1d_encoder import DC1dEncoder as Encoder from hyperion.torch.trainers import AETrainer as Trainer -from hyperion.torch.data import SeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import open_device def train_ae( diff --git a/hyperion/bin_deprec2/torch-train-dvae.py b/hyperion/bin_deprec2/torch-train-dvae.py index e13c26ac..808bfbba 100755 --- a/hyperion/bin_deprec2/torch-train-dvae.py +++ b/hyperion/bin_deprec2/torch-train-dvae.py @@ -3,36 +3,30 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.narchs import DC1dEncoder, DC1dDecoder -from hyperion.torch.narchs import DC2dEncoder, DC2dDecoder -from hyperion.torch.narchs import ResNet1dEncoder, ResNet1dDecoder -from hyperion.torch.narchs import ResNet2dEncoder, ResNet2dDecoder -from hyperion.torch.narchs import TransformerEncoderV1 -from hyperion.torch.narchs import ConformerEncoderV1 +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedFeatSeqDataset as SD from hyperion.torch.models import VAE +from hyperion.torch.narchs import (ConformerEncoderV1, DC1dDecoder, + DC1dEncoder, DC2dDecoder, DC2dEncoder, 
+ ResNet1dDecoder, ResNet1dEncoder, + ResNet2dDecoder, ResNet2dEncoder, + TransformerEncoderV1) from hyperion.torch.trainers import DVAETrainer as Trainer -from hyperion.torch.data import PairedFeatSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import ddp, open_device enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py index 6d7c41ee..f256f735 100755 --- a/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py +++ b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py @@ -3,32 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.models import EfficientNetXVector as XVec from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import EfficientNetXVector as XVec from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp, open_device def init_data( diff --git a/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py index c259a590..622ac62e 100755 --- a/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py +++ b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py @@ -4,32 +4,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainer as Trainer -from hyperion.torch.models import EfficientNetXVector as XVec -from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import EfficientNetXVector as XVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp, open_device def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py 
b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py index 436e4001..3d135b18 100755 --- a/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py +++ b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py @@ -3,41 +3,36 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - -# import torch.multiprocessing as mp - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp - -# from hyperion.torch.helpers import OptimizerFactory as OF -# from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.models import ResNetXVector as XVec from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy - +from hyperion.torch.models import ResNetXVector as XVec # from hyperion.torch.layers import AudioFeatsFactory as AFF # from hyperion.torch.layers import MeanVarianceNorm as MVN from hyperion.torch.narchs import AudioFeatsMVN as AF +# from hyperion.torch.helpers import OptimizerFactory as OF +# from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp, open_device + +# import torch.multiprocessing as mp + + + # from torch.utils.data import dataloader # from torch.multiprocessing import reductions diff --git a/hyperion/bin_deprec2/torch-train-resnet-xvec.py b/hyperion/bin_deprec2/torch-train-resnet-xvec.py index 6e7f4242..f976cc6e 100755 --- a/hyperion/bin_deprec2/torch-train-resnet-xvec.py +++ b/hyperion/bin_deprec2/torch-train-resnet-xvec.py @@ -4,32 +4,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainer as Trainer -from hyperion.torch.models import ResNetXVector as XVec -from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ResNetXVector as XVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp, open_device def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git 
a/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py index bf531745..3ee6bf18 100755 --- a/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py +++ b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py @@ -3,34 +3,27 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.models import ResNet1dXVector as XVec from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy - +from hyperion.torch.models import ResNet1dXVector as XVec from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp, open_device def init_data( diff --git a/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py index 7bac503c..0857ce5c 100755 --- a/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py +++ b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py @@ -4,32 +4,27 @@ Copyright 2020 Magdalena Rybicka Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.models import SpineNetXVector as XVec from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import SpineNetXVector as XVec from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp def init_data( diff --git a/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py index 0ab0cb67..7bbbff03 100755 --- a/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py +++ b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py @@ -3,32 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import 
logging import multiprocessing +import os +import sys +import time import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.models import TDNNXVector as XVec from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import TDNNXVector as XVec from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp, open_device def init_data( diff --git a/hyperion/bin_deprec2/torch-train-tdnn-xvec.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py index 2075ca34..5614f1b9 100755 --- a/hyperion/bin_deprec2/torch-train-tdnn-xvec.py +++ b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py @@ -4,32 +4,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainer as Trainer -from hyperion.torch.models import TDNNXVector as XVec -from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import TDNNXVector as XVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp, open_device def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py index 636fb390..6b361583 100755 --- a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py +++ b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py @@ -3,32 +3,26 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer -from hyperion.torch.models import TransformerXVectorV1 as XVec from hyperion.torch.data import AudioDataset as AD 
from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import TransformerXVectorV1 as XVec from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer +from hyperion.torch.utils import ddp, open_device def init_data( diff --git a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py index 033408b6..62164f15 100755 --- a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py +++ b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py @@ -4,32 +4,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device -from hyperion.torch.utils import ddp -from hyperion.torch.trainers import XVectorTrainer as Trainer -from hyperion.torch.models import TransformerXVectorV1 as XVec -from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import TransformerXVectorV1 as XVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp, open_device def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin_deprec2/torch-train-vae.py b/hyperion/bin_deprec2/torch-train-vae.py index 7ceb3014..4c41d49c 100755 --- a/hyperion/bin_deprec2/torch-train-vae.py +++ b/hyperion/bin_deprec2/torch-train-vae.py @@ -3,36 +3,30 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.narchs import DC1dEncoder, DC1dDecoder -from hyperion.torch.narchs import DC2dEncoder, DC2dDecoder -from hyperion.torch.narchs import ResNet1dEncoder, ResNet1dDecoder -from hyperion.torch.narchs import ResNet2dEncoder, ResNet2dDecoder -from hyperion.torch.narchs import TransformerEncoderV1 -from hyperion.torch.narchs import ConformerEncoderV1 +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.models import VAE +from hyperion.torch.narchs import (ConformerEncoderV1, DC1dDecoder, + DC1dEncoder, DC2dDecoder, DC2dEncoder, + ResNet1dDecoder, ResNet1dEncoder, + ResNet2dDecoder, ResNet2dEncoder, + TransformerEncoderV1) from 
hyperion.torch.trainers import VAETrainer as Trainer -from hyperion.torch.data import FeatSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import ddp, open_device enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/torch-train-vq-dvae.py b/hyperion/bin_deprec2/torch-train-vq-dvae.py index 6e49df08..5de1bbd4 100755 --- a/hyperion/bin_deprec2/torch-train-vq-dvae.py +++ b/hyperion/bin_deprec2/torch-train-vq-dvae.py @@ -3,36 +3,30 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.narchs import DC1dEncoder, DC1dDecoder -from hyperion.torch.narchs import DC2dEncoder, DC2dDecoder -from hyperion.torch.narchs import ResNet1dEncoder, ResNet1dDecoder -from hyperion.torch.narchs import ResNet2dEncoder, ResNet2dDecoder -from hyperion.torch.narchs import TransformerEncoderV1 -from hyperion.torch.narchs import ConformerEncoderV1 +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import PairedFeatSeqDataset as SD from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import (ConformerEncoderV1, DC1dDecoder, + DC1dEncoder, DC2dDecoder, DC2dEncoder, + ResNet1dDecoder, ResNet1dEncoder, + ResNet2dDecoder, ResNet2dEncoder, + TransformerEncoderV1) from hyperion.torch.trainers import VQDVAETrainer as Trainer -from hyperion.torch.data import PairedFeatSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import ddp, open_device enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/torch-train-vq-vae.py b/hyperion/bin_deprec2/torch-train-vq-vae.py index fa8b336c..2a95f853 100755 --- a/hyperion/bin_deprec2/torch-train-vq-vae.py +++ b/hyperion/bin_deprec2/torch-train-vq-vae.py @@ -3,36 +3,30 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -from pathlib import Path -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging import multiprocessing +import os +import sys +import time +from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn - from hyperion.hyp_defs import config_logger, set_float_cpu -from hyperion.torch.utils import open_device, ddp -from hyperion.torch.narchs import DC1dEncoder, DC1dDecoder -from hyperion.torch.narchs import DC2dEncoder, DC2dDecoder -from hyperion.torch.narchs import ResNet1dEncoder, ResNet1dDecoder -from hyperion.torch.narchs import ResNet2dEncoder, ResNet2dDecoder -from hyperion.torch.narchs import TransformerEncoderV1 -from hyperion.torch.narchs import ConformerEncoderV1 +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data 
import FeatSeqDataset as SD from hyperion.torch.models import VQVAE as VAE +from hyperion.torch.narchs import (ConformerEncoderV1, DC1dDecoder, + DC1dEncoder, DC2dDecoder, DC2dEncoder, + ResNet1dDecoder, ResNet1dEncoder, + ResNet2dDecoder, ResNet2dEncoder, + TransformerEncoderV1) from hyperion.torch.trainers import VQVAETrainer as Trainer -from hyperion.torch.data import FeatSeqDataset as SD -from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.utils import ddp, open_device enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/train-cw-up.py b/hyperion/bin_deprec2/train-cw-up.py index a7392a32..c1c372ad 100755 --- a/hyperion/bin_deprec2/train-cw-up.py +++ b/hyperion/bin_deprec2/train-cw-up.py @@ -7,18 +7,18 @@ Trains Centering and whitening with uncertainty prop. """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR +from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal -from hyperion.np.transforms import TransformList, CentWhitenUP, LNormUP +from hyperion.np.transforms import CentWhitenUP, LNormUP, TransformList def load_model(input_path, with_lnorm, name, **kwargs): diff --git a/hyperion/bin_deprec2/train-cw.py b/hyperion/bin_deprec2/train-cw.py index a70485a6..cabca7c2 100755 --- a/hyperion/bin_deprec2/train-cw.py +++ b/hyperion/bin_deprec2/train-cw.py @@ -7,18 +7,18 @@ Trains Centering and whitening """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR +from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal -from hyperion.np.transforms import TransformList, CentWhiten, LNorm +from hyperion.np.transforms import CentWhiten, LNorm, TransformList def load_model(input_path, with_lnorm, name, **kwargs): diff --git a/hyperion/bin_deprec2/train-gaussianizer.py b/hyperion/bin_deprec2/train-gaussianizer.py index a265403e..aeb51e46 100755 --- a/hyperion/bin_deprec2/train-gaussianizer.py +++ b/hyperion/bin_deprec2/train-gaussianizer.py @@ -7,18 +7,18 @@ Trains Gaussianization for i-vectors. 
""" -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR +from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal -from hyperion.np.transforms import TransformList, Gaussianizer +from hyperion.np.transforms import Gaussianizer, TransformList def load_model(input_path, **kwargs): diff --git a/hyperion/bin_deprec2/train-lda.py b/hyperion/bin_deprec2/train-lda.py index 36217c8f..1887a72f 100755 --- a/hyperion/bin_deprec2/train-lda.py +++ b/hyperion/bin_deprec2/train-lda.py @@ -6,17 +6,17 @@ """ Trains LDA """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.np.transforms import TransformList, LDA, SbSw +from hyperion.hyp_defs import config_logger +from hyperion.np.transforms import LDA, SbSw, TransformList def train_lda( diff --git a/hyperion/bin_deprec2/train-linear-gbe-up.py b/hyperion/bin_deprec2/train-linear-gbe-up.py index 5accb785..9986b6bc 100755 --- a/hyperion/bin_deprec2/train-linear-gbe-up.py +++ b/hyperion/bin_deprec2/train-linear-gbe-up.py @@ -7,18 +7,18 @@ Trains linear GBE with uncertainty propagation """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.np.transforms import TransformList +from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LinearGBEUP as GBE +from hyperion.np.transforms import TransformList def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs): diff --git a/hyperion/bin_deprec2/train-linear-gbe.py b/hyperion/bin_deprec2/train-linear-gbe.py index a7ac5236..e9455cb8 100755 --- a/hyperion/bin_deprec2/train-linear-gbe.py +++ b/hyperion/bin_deprec2/train-linear-gbe.py @@ -7,18 +7,18 @@ Trains linear Gaussian back-end """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.np.transforms import TransformList +from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.transforms import TransformList def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs): diff --git a/hyperion/bin_deprec2/train-linear-svmc.py b/hyperion/bin_deprec2/train-linear-svmc.py index 6b589491..90ff8768 100755 --- a/hyperion/bin_deprec2/train-linear-svmc.py +++ b/hyperion/bin_deprec2/train-linear-svmc.py @@ -7,18 +7,18 @@ Trains linear SVM classifier """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.np.transforms import TransformList +from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.transforms import TransformList def train_svm(iv_file, train_list, preproc_file, output_path, **kwargs): diff --git a/hyperion/bin_deprec2/train-logistic-regression.py 
b/hyperion/bin_deprec2/train-logistic-regression.py index 1d657dc4..1aa128a3 100755 --- a/hyperion/bin_deprec2/train-logistic-regression.py +++ b/hyperion/bin_deprec2/train-logistic-regression.py @@ -7,18 +7,18 @@ Trains linear logistic regression classifier """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.np.transforms import TransformList +from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LogisticRegression as LR +from hyperion.np.transforms import TransformList def train_lr(iv_file, train_list, preproc_file, output_path, **kwargs): diff --git a/hyperion/bin_deprec2/train-mvn.py b/hyperion/bin_deprec2/train-mvn.py index a0204fd5..2d10b116 100755 --- a/hyperion/bin_deprec2/train-mvn.py +++ b/hyperion/bin_deprec2/train-mvn.py @@ -7,18 +7,18 @@ Trains global mean and variance normalization of i-vectors. """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR +from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal -from hyperion.np.transforms import TransformList, MVN, SbSw +from hyperion.np.transforms import MVN, SbSw, TransformList def train_mvn( diff --git a/hyperion/bin_deprec2/train-nda.py b/hyperion/bin_deprec2/train-nda.py index 11cd7da3..946a8baa 100755 --- a/hyperion/bin_deprec2/train-nda.py +++ b/hyperion/bin_deprec2/train-nda.py @@ -7,17 +7,17 @@ Trains NDA """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorClassReader as VCR -from hyperion.np.transforms import TransformList, NDA, NSbSw +from hyperion.hyp_defs import config_logger +from hyperion.np.transforms import NDA, NSbSw, TransformList def train_nda( diff --git a/hyperion/bin_deprec2/train-pca.py b/hyperion/bin_deprec2/train-pca.py index d1ab1c7e..25dcb366 100755 --- a/hyperion/bin_deprec2/train-pca.py +++ b/hyperion/bin_deprec2/train-pca.py @@ -6,17 +6,17 @@ """ Trains PCA """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger from hyperion.helpers import VectorReader as VR -from hyperion.np.transforms import TransformList, PCA +from hyperion.hyp_defs import config_logger +from hyperion.np.transforms import PCA, TransformList def load_model(input_path, name, **kwargs): diff --git a/hyperion/bin_deprec2/train-plda.py b/hyperion/bin_deprec2/train-plda.py index 26f6e0a8..520f4cd7 100755 --- a/hyperion/bin_deprec2/train-plda.py +++ b/hyperion/bin_deprec2/train-plda.py @@ -7,17 +7,17 @@ Trains PLDA """ -import sys -import os import argparse -import time import logging +import os +import sys +import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.helpers import VectorClassReader as VCR from hyperion.helpers import PLDAFactory as F +from hyperion.helpers import VectorClassReader as VCR +from hyperion.hyp_defs import config_logger from hyperion.np.transforms import TransformList diff --git a/hyperion/helpers/__init__.py b/hyperion/helpers/__init__.py index 48bf1476..8b48b161 100644 --- a/hyperion/helpers/__init__.py +++ 
b/hyperion/helpers/__init__.py @@ -3,12 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .vector_reader import VectorReader -from .vector_class_reader import VectorClassReader - -from .trial_data_reader import TrialDataReader +from .classif_trial_data_reader import ClassifTrialDataReader from .multi_test_trial_data_reader import MultiTestTrialDataReader from .multi_test_trial_data_reader_v2 import MultiTestTrialDataReaderV2 -from .classif_trial_data_reader import ClassifTrialDataReader - from .plda_factory import PLDAFactory +from .trial_data_reader import TrialDataReader +from .vector_class_reader import VectorClassReader +from .vector_reader import VectorReader diff --git a/hyperion/helpers/classif_trial_data_reader.py b/hyperion/helpers/classif_trial_data_reader.py index f7aeb727..2f577621 100644 --- a/hyperion/helpers/classif_trial_data_reader.py +++ b/hyperion/helpers/classif_trial_data_reader.py @@ -3,18 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os -import logging import argparse -import time import copy +import logging +import os +import sys +import time import numpy as np from ..io import HypDataReader -from ..utils import TrialNdx, SCPList from ..np.transforms import TransformList +from ..utils import SCPList, TrialNdx class ClassifTrialDataReader(object): diff --git a/hyperion/helpers/multi_test_trial_data_reader.py b/hyperion/helpers/multi_test_trial_data_reader.py index eeea60f2..bd2d5a35 100644 --- a/hyperion/helpers/multi_test_trial_data_reader.py +++ b/hyperion/helpers/multi_test_trial_data_reader.py @@ -3,17 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF -from ..utils import TrialNdx, TrialKey, Utt2Info from ..np.transforms import TransformList +from ..utils import TrialKey, TrialNdx, Utt2Info class MultiTestTrialDataReader(object): diff --git a/hyperion/helpers/multi_test_trial_data_reader_v2.py b/hyperion/helpers/multi_test_trial_data_reader_v2.py index 43fd1254..226131bf 100644 --- a/hyperion/helpers/multi_test_trial_data_reader_v2.py +++ b/hyperion/helpers/multi_test_trial_data_reader_v2.py @@ -3,17 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF -from ..utils import Utt2Info, TrialNdx, TrialKey from ..np.transforms import TransformList +from ..utils import TrialKey, TrialNdx, Utt2Info class MultiTestTrialDataReaderV2(object): diff --git a/hyperion/helpers/plda_factory.py b/hyperion/helpers/plda_factory.py index 0fdd2609..16cf01c4 100644 --- a/hyperion/helpers/plda_factory.py +++ b/hyperion/helpers/plda_factory.py @@ -5,7 +5,7 @@ import numpy as np -from ..np.pdfs.plda import FRPLDA, SPLDA, PLDA +from ..np.pdfs.plda import FRPLDA, PLDA, SPLDA class PLDAFactory(object): diff --git a/hyperion/helpers/tracking_data_reader.py b/hyperion/helpers/tracking_data_reader.py index 4bac5be2..f6741d9a 100644 --- a/hyperion/helpers/tracking_data_reader.py +++ b/hyperion/helpers/tracking_data_reader.py @@ -3,17 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import 
RandomAccessDataReaderFactory as DRF -from ..utils import Utt2Info, TrialNdx, ExtSegmentList from ..np.transforms import TransformList +from ..utils import ExtSegmentList, TrialNdx, Utt2Info class TrackingDataReader(object): diff --git a/hyperion/helpers/trial_data_reader.py b/hyperion/helpers/trial_data_reader.py index 219ee6ce..4f33770b 100644 --- a/hyperion/helpers/trial_data_reader.py +++ b/hyperion/helpers/trial_data_reader.py @@ -2,18 +2,18 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF -from ..utils.utt2info import Utt2Info -from ..utils import TrialNdx, TrialKey # , SparseTrialNdx, SparseTrialKey from ..np.transforms import TransformList +from ..utils import TrialKey, TrialNdx # , SparseTrialNdx, SparseTrialKey +from ..utils.utt2info import Utt2Info class TrialDataReader(object): diff --git a/hyperion/helpers/vector_class_reader.py b/hyperion/helpers/vector_class_reader.py index 0c6f346d..c4c531ad 100644 --- a/hyperion/helpers/vector_class_reader.py +++ b/hyperion/helpers/vector_class_reader.py @@ -3,18 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np from ..io import RandomAccessDataReaderFactory as DRF -from ..utils.utt2info import Utt2Info -from ..utils.tensors import to3D_by_class from ..np.transforms import TransformList +from ..utils.tensors import to3D_by_class +from ..utils.utt2info import Utt2Info class VectorClassReader(object): diff --git a/hyperion/helpers/vector_reader.py b/hyperion/helpers/vector_reader.py index 0ac1b11a..4f480d6d 100644 --- a/hyperion/helpers/vector_reader.py +++ b/hyperion/helpers/vector_reader.py @@ -2,18 +2,18 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser -import sys -import os import argparse -import time import copy +import os +import sys +import time import numpy as np +from jsonargparse import ActionParser, ArgumentParser from ..io import RandomAccessDataReaderFactory as DRF -from ..utils.scp_list import SCPList from ..np.transforms import TransformList +from ..utils.scp_list import SCPList class VectorReader(object): diff --git a/hyperion/io/__init__.py b/hyperion/io/__init__.py index 5ddf131b..14b1b35f 100644 --- a/hyperion/io/__init__.py +++ b/hyperion/io/__init__.py @@ -5,29 +5,21 @@ from .ark_data_reader import * from .ark_data_writer import * -from .h5_data_reader import * -from .h5_data_writer import * -from .data_rw_factory import * -from .copy_feats import CopyFeats - - -from .bin_vad_reader import BinVADReader -from .segment_vad_reader import SegmentVADReader -from .vad_rw_factory import VADReaderFactory - from .audio_reader import * from .audio_writer import * -from .packed_audio_reader import ( - SequentialPackedAudioReader, - RandomAccessPackedAudioReader, -) -from .packed_audio_writer import PackedAudioWriter - - +from .bin_vad_reader import BinVADReader +from .copy_feats import CopyFeats +from .data_rw_factory import * +from .h5_data_reader import * +from .h5_data_writer import * +from .h5_merger import * from .hyp_data_reader import * from .hyp_data_writer import * -from .h5_merger import * from 
.kaldi_data_reader import * - +from .packed_audio_reader import (RandomAccessPackedAudioReader, + SequentialPackedAudioReader) +from .packed_audio_writer import PackedAudioWriter +from .segment_vad_reader import SegmentVADReader +from .vad_rw_factory import VADReaderFactory # from .queues import * diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 7f6ec350..3919ddfa 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -3,15 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import multiprocessing as threading import sys + import numpy as np -import multiprocessing as threading from ..hyp_defs import float_cpu +from ..utils.kaldi_io_funcs import (init_kaldi_input_stream, is_token, peek, + read_token) +from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from ..utils.scp_list import SCPList -from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix -from ..utils.kaldi_io_funcs import is_token, read_token, peek, init_kaldi_input_stream -from .data_reader import SequentialDataReader, RandomAccessDataReader +from .data_reader import RandomAccessDataReader, SequentialDataReader class SequentialArkDataReader(SequentialDataReader): diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 50fdd3f6..58f5c0a1 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -4,12 +4,14 @@ """ import sys + import numpy as np from ..hyp_defs import float_save +from ..utils.kaldi_io_funcs import (init_kaldi_output_stream, is_token, + write_token) +from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from ..utils.scp_list import SCPList -from ..utils.kaldi_io_funcs import is_token, write_token, init_kaldi_output_stream -from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix from .data_writer import DataWriter diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 043ae778..0c2f0446 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -3,14 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -import logging import io +import logging import math +import os import subprocess -import soundfile as sf import numpy as np +import soundfile as sf from ..hyp_defs import float_cpu from ..utils import SCPList, SegmentList diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index 2fb9ce3c..f98a3251 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -5,13 +5,13 @@ import os import re -import soundfile as sf import numpy as np +import soundfile as sf from ..hyp_defs import float_cpu -from ..utils.scp_list import SCPList from ..utils.kaldi_io_funcs import is_token +from ..utils.scp_list import SCPList from .audio_reader import valid_ext subtype_to_npdtype = { diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index 452eb106..e4e64777 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -4,12 +4,13 @@ """ import logging + import numpy as np from ..hyp_defs import float_cpu from ..utils.vad_utils import bin_vad_to_timestamps -from .vad_reader import VADReader from .data_rw_factory import RandomAccessDataReaderFactory as DRF +from .vad_reader import VADReader class BinVADReader(VADReader): diff --git a/hyperion/io/data_reader.py b/hyperion/io/data_reader.py index da0279e1..bbefa62d 100644 --- a/hyperion/io/data_reader.py +++ b/hyperion/io/data_reader.py @@ -4,13 +4,14 @@ """ 
import logging +import multiprocessing from abc import ABCMeta, abstractmethod + import numpy as np -import multiprocessing from ..hyp_defs import float_cpu -from ..utils.scp_list import SCPList from ..np.transforms import TransformList +from ..utils.scp_list import SCPList class DataReader(object): diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index 0c49cd9f..7868baae 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -4,19 +4,21 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser from ..utils.kaldi_matrix import compression_methods -from .rw_specifiers import ArchiveType, WSpecifier, RSpecifier, WSpecType, RSpecType -from .h5_data_writer import H5DataWriter as H5DW -from .ark_data_writer import ArkDataWriter as ADW +from .ark_data_reader import RandomAccessArkDataReader as RADR from .ark_data_reader import SequentialArkFileDataReader as SAFDR from .ark_data_reader import SequentialArkScriptDataReader as SASDR -from .ark_data_reader import RandomAccessArkDataReader as RADR -from .h5_data_reader import SequentialH5FileDataReader as SH5FDR -from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR +from .ark_data_writer import ArkDataWriter as ADW from .h5_data_reader import RandomAccessH5FileDataReader as RH5FDR from .h5_data_reader import RandomAccessH5ScriptDataReader as RH5SDR +from .h5_data_reader import SequentialH5FileDataReader as SH5FDR +from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR +from .h5_data_writer import H5DataWriter as H5DW +from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, + WSpecType) class DataWriterFactory(object): diff --git a/hyperion/io/h5_data_reader.py b/hyperion/io/h5_data_reader.py index 7ade2549..dfefbec3 100644 --- a/hyperion/io/h5_data_reader.py +++ b/hyperion/io/h5_data_reader.py @@ -5,18 +5,19 @@ Classes to read data from hdf5 files. 
""" +import multiprocessing import sys import time -import numpy as np + import h5py -import multiprocessing +import numpy as np from ..hyp_defs import float_cpu +from ..utils.kaldi_io_funcs import is_token +from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from ..utils.list_utils import split_list, split_list_group_by_key from ..utils.scp_list import SCPList -from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix -from ..utils.kaldi_io_funcs import is_token -from .data_reader import SequentialDataReader, RandomAccessDataReader +from .data_reader import RandomAccessDataReader, SequentialDataReader def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index 0685d9b8..fed91d1e 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -4,13 +4,14 @@ """ import sys -import numpy as np + import h5py +import numpy as np from ..hyp_defs import float_save -from ..utils.scp_list import SCPList -from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix from ..utils.kaldi_io_funcs import is_token +from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix +from ..utils.scp_list import SCPList from .data_writer import DataWriter diff --git a/hyperion/io/h5_merger.py b/hyperion/io/h5_merger.py index f1b408e7..3e73608e 100644 --- a/hyperion/io/h5_merger.py +++ b/hyperion/io/h5_merger.py @@ -4,6 +4,7 @@ """ import sys + import numpy as np from .hyp_data_reader import HypDataReader as HR diff --git a/hyperion/io/hyp_data_reader.py b/hyperion/io/hyp_data_reader.py index 9219187a..575c3087 100644 --- a/hyperion/io/hyp_data_reader.py +++ b/hyperion/io/hyp_data_reader.py @@ -4,11 +4,12 @@ """ import sys -import numpy as np + import h5py +import numpy as np from ..hyp_defs import float_cpu -from ..utils.list_utils import list2ndarray, ismember +from ..utils.list_utils import ismember, list2ndarray class HypDataReader(object): diff --git a/hyperion/io/hyp_data_writer.py b/hyperion/io/hyp_data_writer.py index 9a5b5906..81ad2501 100644 --- a/hyperion/io/hyp_data_writer.py +++ b/hyperion/io/hyp_data_writer.py @@ -4,11 +4,12 @@ """ import sys -import numpy as np + import h5py +import numpy as np from ..hyp_defs import float_save -from ..utils.list_utils import list2ndarray, ismember +from ..utils.list_utils import ismember, list2ndarray class HypDataWriter(object): diff --git a/hyperion/io/kaldi_data_reader.py b/hyperion/io/kaldi_data_reader.py index 6313cb29..60b55bfd 100644 --- a/hyperion/io/kaldi_data_reader.py +++ b/hyperion/io/kaldi_data_reader.py @@ -3,9 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import gzip +import re +import struct import sys -import gzip, struct, re from collections import OrderedDict + import numpy as np from ..hyp_defs import float_cpu diff --git a/hyperion/io/packed_audio_reader.py b/hyperion/io/packed_audio_reader.py index 61ebbd65..17f78bc2 100644 --- a/hyperion/io/packed_audio_reader.py +++ b/hyperion/io/packed_audio_reader.py @@ -2,15 +2,15 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import soundfile as sf - -import time -import math import logging -import numpy as np +import math import multiprocessing +import time from copy import deepcopy +import numpy as np +import soundfile as sf + from ..hyp_defs import float_cpu from ..utils import SCPList, SegmentList diff --git a/hyperion/io/packed_audio_writer.py 
b/hyperion/io/packed_audio_writer.py index 3a15227a..ceda0d90 100644 --- a/hyperion/io/packed_audio_writer.py +++ b/hyperion/io/packed_audio_writer.py @@ -4,9 +4,9 @@ """ import os import re -import soundfile as sf import numpy as np +import soundfile as sf from ..utils.kaldi_io_funcs import is_token from .audio_reader import valid_ext diff --git a/hyperion/io/segment_vad_reader.py b/hyperion/io/segment_vad_reader.py index df8d39e5..01bf413e 100644 --- a/hyperion/io/segment_vad_reader.py +++ b/hyperion/io/segment_vad_reader.py @@ -3,13 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging + import numpy as np from ..hyp_defs import float_cpu from ..utils import SegmentList from ..utils.vad_utils import vad_timestamps_to_bin -from .vad_reader import VADReader from .data_reader import DataReader +from .vad_reader import VADReader class SegmentVADReader(VADReader): diff --git a/hyperion/io/vad_reader.py b/hyperion/io/vad_reader.py index c56a8ffe..40e2dda2 100644 --- a/hyperion/io/vad_reader.py +++ b/hyperion/io/vad_reader.py @@ -4,6 +4,7 @@ """ import logging + import numpy as np from ..hyp_defs import float_cpu diff --git a/hyperion/io/vad_rw_factory.py b/hyperion/io/vad_rw_factory.py index 7b855b07..32032d1d 100644 --- a/hyperion/io/vad_rw_factory.py +++ b/hyperion/io/vad_rw_factory.py @@ -5,8 +5,9 @@ import logging -from .rw_specifiers import ArchiveType, WSpecifier, RSpecifier, WSpecType, RSpecType from .bin_vad_reader import BinVADReader as BVR +from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, + WSpecType) from .segment_vad_reader import SegmentVADReader as SVR diff --git a/hyperion/np/augment/__init__.py b/hyperion/np/augment/__init__.py index 210f54e7..1f99ffb0 100644 --- a/hyperion/np/augment/__init__.py +++ b/hyperion/np/augment/__init__.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .speech_augment import SpeechAugment -from .speed_augment import SpeedAugment from .noise_augment import NoiseAugment from .reverb_augment import ReverbAugment +from .speech_augment import SpeechAugment +from .speed_augment import SpeedAugment diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index e180a292..799db930 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -6,10 +6,10 @@ import logging import math import multiprocessing -import yaml from copy import deepcopy import numpy as np +import yaml from ...hyp_defs import float_cpu from ...io import RandomAccessAudioReader as AR diff --git a/hyperion/np/augment/reverb_augment.py b/hyperion/np/augment/reverb_augment.py index ef5293d6..cf4cc6cb 100644 --- a/hyperion/np/augment/reverb_augment.py +++ b/hyperion/np/augment/reverb_augment.py @@ -3,15 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import time import logging import math import multiprocessing -import yaml +import time from copy import deepcopy from enum import Enum import numpy as np +import yaml from scipy import signal from ...hyp_defs import float_cpu diff --git a/hyperion/np/augment/speech_augment.py b/hyperion/np/augment/speech_augment.py index e3eab4ea..0b1233f1 100644 --- a/hyperion/np/augment/speech_augment.py +++ b/hyperion/np/augment/speech_augment.py @@ -5,12 +5,11 @@ import logging import math -import yaml import numpy as np +import yaml from ...hyp_defs import float_cpu - from .noise_augment import NoiseAugment from .reverb_augment import ReverbAugment from .speed_augment import 
SpeedAugment diff --git a/hyperion/np/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py index 2f353ebe..4400a4b4 100644 --- a/hyperion/np/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -5,8 +5,9 @@ import logging from copy import deepcopy -import yaml + import numpy as np +import yaml from librosa.effects import time_stretch from ...hyp_defs import float_cpu diff --git a/hyperion/np/calibration/unsup_gauss_calibration.py b/hyperion/np/calibration/unsup_gauss_calibration.py index 5f368a71..fd440995 100644 --- a/hyperion/np/calibration/unsup_gauss_calibration.py +++ b/hyperion/np/calibration/unsup_gauss_calibration.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import sys + import numpy as np from ..pdfs.mixtures.diag_gmm_tiedcovs import DiagGMMTiedCovs as GMM diff --git a/hyperion/np/classifiers/__init__.py b/hyperion/np/classifiers/__init__.py index 92a9305d..d9d02ed0 100644 --- a/hyperion/np/classifiers/__init__.py +++ b/hyperion/np/classifiers/__init__.py @@ -3,11 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .linear_gbe import LinearGBE -from .linear_gbe_up import LinearGBEUP -from .logistic_regression import LogisticRegression from .binary_logistic_regression import BinaryLogisticRegression from .greedy_fusion import GreedyFusionBinaryLR +from .linear_gbe import LinearGBE +from .linear_gbe_up import LinearGBEUP from .linear_svmc import LinearSVMC +from .logistic_regression import LogisticRegression from .q_scoring_homo_gbe import QScoringHomoGBE from .svmc import GaussianSVMC diff --git a/hyperion/np/classifiers/greedy_fusion.py b/hyperion/np/classifiers/greedy_fusion.py index 2102bc22..842b850e 100644 --- a/hyperion/np/classifiers/greedy_fusion.py +++ b/hyperion/np/classifiers/greedy_fusion.py @@ -4,12 +4,12 @@ """ import logging + import numpy as np from ...hyp_defs import float_cpu, float_save -from ..np_model import NPModel from ..metrics import dcf - +from ..np_model import NPModel from .binary_logistic_regression import BinaryLogisticRegression as BLR diff --git a/hyperion/np/classifiers/linear_gbe.py b/hyperion/np/classifiers/linear_gbe.py index 00a8b1bf..a6b8c7cc 100644 --- a/hyperion/np/classifiers/linear_gbe.py +++ b/hyperion/np/classifiers/linear_gbe.py @@ -4,13 +4,14 @@ """ import logging + import numpy as np -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from scipy.special import gammaln from ...hyp_defs import float_cpu +from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel -from ...utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax class LinearGBE(NPModel): diff --git a/hyperion/np/classifiers/linear_gbe_up.py b/hyperion/np/classifiers/linear_gbe_up.py index 4a489639..8566aeab 100644 --- a/hyperion/np/classifiers/linear_gbe_up.py +++ b/hyperion/np/classifiers/linear_gbe_up.py @@ -4,18 +4,14 @@ """ import logging + import numpy as np from scipy.special import gammaln from ...hyp_defs import float_cpu +from ...utils.math import (fullcov_varfloor, int2onehot, invert_pdmat, + logdet_pdmat, softmax) from ..np_model import NPModel -from ...utils.math import ( - int2onehot, - logdet_pdmat, - invert_pdmat, - softmax, - fullcov_varfloor, -) from .linear_gbe import LinearGBE diff --git a/hyperion/np/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py index 607d83de..5d743a46 100644 --- 
a/hyperion/np/classifiers/linear_svmc.py +++ b/hyperion/np/classifiers/linear_svmc.py @@ -4,14 +4,14 @@ """ import logging -import numpy as np -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from sklearn.svm import LinearSVC as SVC from ...hyp_defs import float_cpu -from ..np_model import NPModel from ...utils.math import softmax +from ..np_model import NPModel class LinearSVMC(NPModel): diff --git a/hyperion/np/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py index 932a28e3..8e3d7e2e 100644 --- a/hyperion/np/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -4,13 +4,13 @@ """ import logging -import numpy as np +import numpy as np from sklearn.linear_model import LogisticRegression as LR from ...hyp_defs import float_cpu -from ..np_model import NPModel from ...utils.math import softmax +from ..np_model import NPModel class LogisticRegression(NPModel): diff --git a/hyperion/np/classifiers/q_scoring_homo_gbe.py b/hyperion/np/classifiers/q_scoring_homo_gbe.py index 8ef42052..9e54e0f4 100644 --- a/hyperion/np/classifiers/q_scoring_homo_gbe.py +++ b/hyperion/np/classifiers/q_scoring_homo_gbe.py @@ -4,12 +4,13 @@ """ import logging + import numpy as np from scipy.special import gammaln from ...hyp_defs import float_cpu +from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel -from ...utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax class QScoringHomoGBE(NPModel): diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 77a05ff9..9311b8e8 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -3,17 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import logging +import os import pickle -import numpy as np -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from sklearn.svm import SVC as SVC from ...hyp_defs import float_cpu -from ..np_model import NPModel from ...utils.math import softmax +from ..np_model import NPModel class GaussianSVMC(NPModel): diff --git a/hyperion/np/clustering/__init__.py b/hyperion/np/clustering/__init__.py index f22aa6f3..0841d47e 100644 --- a/hyperion/np/clustering/__init__.py +++ b/hyperion/np/clustering/__init__.py @@ -3,5 +3,5 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .kmeans import KMeans from .ahc import AHC +from .kmeans import KMeans diff --git a/hyperion/np/clustering/ahc.py b/hyperion/np/clustering/ahc.py index f2f0b93b..e6e0d81b 100644 --- a/hyperion/np/clustering/ahc.py +++ b/hyperion/np/clustering/ahc.py @@ -3,12 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np -import h5py from copy import copy +import h5py +import numpy as np from scipy.cluster.hierarchy import linkage -from sklearn.metrics import homogeneity_score, completeness_score +from sklearn.metrics import completeness_score, homogeneity_score from ...hyp_defs import float_cpu from ..np_model import NPModel diff --git a/hyperion/np/clustering/kmeans.py b/hyperion/np/clustering/kmeans.py index dc5b67c0..abb88463 100644 --- a/hyperion/np/clustering/kmeans.py +++ b/hyperion/np/clustering/kmeans.py @@ -3,10 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys import logging 
-import numpy as np +import sys + import h5py +import numpy as np from ...hyp_defs import float_cpu from ..np_model import NPModel diff --git a/hyperion/np/diarization/diar_ahc_plda.py b/hyperion/np/diarization/diar_ahc_plda.py index b8fb0fa6..4bfbc06b 100644 --- a/hyperion/np/diarization/diar_ahc_plda.py +++ b/hyperion/np/diarization/diar_ahc_plda.py @@ -5,14 +5,13 @@ import logging from pathlib import Path -import numpy as np import h5py import matplotlib +import numpy as np matplotlib.use("Agg") import matplotlib.pyplot as plt - from ..clustering import AHC from ..pdfs import GMMTiedDiagCov as GMM from ..transforms import PCA, LNorm diff --git a/hyperion/np/feats/__init__.py b/hyperion/np/feats/__init__.py index 9d77e032..5173bf4b 100644 --- a/hyperion/np/feats/__init__.py +++ b/hyperion/np/feats/__init__.py @@ -4,10 +4,10 @@ """ # -from .filter_banks import FilterBankFactory -from .feature_windows import FeatureWindowFactory -from .stft import * -from .mfcc import MFCC from .energy_vad import EnergyVAD -from .frame_selector import FrameSelector from .feature_normalization import MeanVarianceNorm +from .feature_windows import FeatureWindowFactory +from .filter_banks import FilterBankFactory +from .frame_selector import FrameSelector +from .mfcc import MFCC +from .stft import * diff --git a/hyperion/np/feats/feature_normalization.py b/hyperion/np/feats/feature_normalization.py index 2a8cf6e2..27683739 100644 --- a/hyperion/np/feats/feature_normalization.py +++ b/hyperion/np/feats/feature_normalization.py @@ -4,7 +4,7 @@ """ import numpy as np -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser from scipy.signal import convolve2d from ...hyp_defs import float_cpu diff --git a/hyperion/np/feats/filter_banks.py b/hyperion/np/feats/filter_banks.py index 3b0da644..0e0eaf84 100644 --- a/hyperion/np/feats/filter_banks.py +++ b/hyperion/np/feats/filter_banks.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import logging import numpy as np +from jsonargparse import ActionParser, ArgumentParser from librosa.filters import mel as make_mel_librosa from ...hyp_defs import float_cpu diff --git a/hyperion/np/feats/mfcc.py b/hyperion/np/feats/mfcc.py index d6b8dd3f..cd98840d 100644 --- a/hyperion/np/feats/mfcc.py +++ b/hyperion/np/feats/mfcc.py @@ -13,7 +13,7 @@ from ...utils.misc import str2bool from .feature_windows import FeatureWindowFactory as FWF from .filter_banks import FilterBankFactory as FBF -from .stft import strft, st_logE +from .stft import st_logE, strft class MFCCSteps(Enum): diff --git a/hyperion/np/metrics/__init__.py b/hyperion/np/metrics/__init__.py index 6725621a..36afdbf5 100644 --- a/hyperion/np/metrics/__init__.py +++ b/hyperion/np/metrics/__init__.py @@ -3,8 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .utils import effective_prior from .acc import compute_accuracy from .confusion_matrix import * +from .dcf import (compute_act_dcf, compute_dcf, compute_min_dcf, + fast_eval_dcf_eer) from .eer import compute_eer, compute_prbep -from .dcf import compute_dcf, compute_min_dcf, compute_act_dcf, fast_eval_dcf_eer +from .utils import effective_prior diff --git a/hyperion/np/metrics/confusion_matrix.py b/hyperion/np/metrics/confusion_matrix.py index 084aa7a9..57f8f1ab 100644 --- a/hyperion/np/metrics/confusion_matrix.py +++ b/hyperion/np/metrics/confusion_matrix.py @@ -4,8 +4,9 @@ """ import sys -import numpy as np 
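The metrics hunks only reorder exports, but they spell out the public surface of the relocated hyperion.np.metrics package. A small usage sketch built from those names; the signature of effective_prior is the one visible in the utils.py hunk above, and the comment assumes it computes the usual NIST collapse of a (prior, costs) operating point:

from hyperion.np.metrics import effective_prior

# Fold (p_tar, c_miss, c_fa) into a single effective target prior:
# p_eff = p_tar * c_miss / (p_tar * c_miss + (1 - p_tar) * c_fa)
p_eff = effective_prior(p_tar=0.05, c_miss=10, c_fa=1)
print(p_eff)  # ~0.345 if the formula above is what utils.py implements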
+ import matplotlib.pyplot as plt +import numpy as np from sklearn.metrics import confusion_matrix from ...utils.list_utils import list2ndarray diff --git a/hyperion/np/metrics/roc.py b/hyperion/np/metrics/roc.py index 38e4fa3c..f8df8d10 100644 --- a/hyperion/np/metrics/roc.py +++ b/hyperion/np/metrics/roc.py @@ -3,9 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import matplotlib.pyplot as plt import numpy as np import scipy.linalg as sla -import matplotlib.pyplot as plt from .utils import pavx diff --git a/hyperion/np/metrics/utils.py b/hyperion/np/metrics/utils.py index c5871dfd..0715d809 100644 --- a/hyperion/np/metrics/utils.py +++ b/hyperion/np/metrics/utils.py @@ -8,7 +8,7 @@ import numpy as np from ...hyp_defs import float_cpu -from ...utils.math import softmax, logsumexp +from ...utils.math import logsumexp, softmax def effective_prior(p_tar, c_miss, c_fa): diff --git a/hyperion/np/metrics/verification_evaluator.py b/hyperion/np/metrics/verification_evaluator.py index 9c9c3208..2adf15cf 100644 --- a/hyperion/np/metrics/verification_evaluator.py +++ b/hyperion/np/metrics/verification_evaluator.py @@ -4,13 +4,13 @@ """ +import copy import logging import re -import numpy as np -import pandas as pd -import copy import matplotlib +import numpy as np +import pandas as pd matplotlib.use("Agg") matplotlib.rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"]}) @@ -20,8 +20,8 @@ from ...hyp_defs import float_cpu from ...utils import TrialKey, TrialScores from ...utils.trial_stats import TrialStats -from .utils import effective_prior from .dcf import fast_eval_dcf_eer +from .utils import effective_prior class VerificationEvaluator(object): diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py index db49f6d5..8ee84ee8 100644 --- a/hyperion/np/np_model.py +++ b/hyperion/np/np_model.py @@ -2,14 +2,14 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import json +import os from copy import deepcopy -import numpy as np import h5py +import numpy as np -from ..hyp_defs import float_save, float_cpu +from ..hyp_defs import float_cpu, float_save class NPModel(object): diff --git a/hyperion/np/pdfs/__init__.py b/hyperion/np/pdfs/__init__.py index 91af5497..8a91e269 100644 --- a/hyperion/np/pdfs/__init__.py +++ b/hyperion/np/pdfs/__init__.py @@ -4,7 +4,7 @@ """ from .core import * +from .hmm import * +from .jfa import * from .mixtures import * from .plda import * -from .jfa import * -from .hmm import * diff --git a/hyperion/np/pdfs/core/__init__.py b/hyperion/np/pdfs/core/__init__.py index 2defe6d4..0f6287f2 100644 --- a/hyperion/np/pdfs/core/__init__.py +++ b/hyperion/np/pdfs/core/__init__.py @@ -4,7 +4,7 @@ """ -from .pdf import PDF from .exp_family import ExpFamily -from .normal_diag_cov import NormalDiagCov, DiagNormal from .normal import Normal +from .normal_diag_cov import DiagNormal, NormalDiagCov +from .pdf import PDF diff --git a/hyperion/np/pdfs/core/normal.py b/hyperion/np/pdfs/core/normal.py index 4c3c70cf..b8f8bb54 100644 --- a/hyperion/np/pdfs/core/normal.py +++ b/hyperion/np/pdfs/core/normal.py @@ -7,21 +7,11 @@ import scipy.linalg as la from ....hyp_defs import float_cpu -from ....utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) -from ....utils.math import ( - invert_pdmat, - invert_trimat, - symmat2vec, - vec2symmat, - fullcov_varfloor, - logdet_pdmat, -) - +from ....utils.math 
import (fullcov_varfloor, invert_pdmat, invert_trimat, + logdet_pdmat, symmat2vec, vec2symmat) +from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D) from .exp_family import ExpFamily diff --git a/hyperion/np/pdfs/core/normal_diag_cov.py b/hyperion/np/pdfs/core/normal_diag_cov.py index 8a896cd5..c9986f4c 100644 --- a/hyperion/np/pdfs/core/normal_diag_cov.py +++ b/hyperion/np/pdfs/core/normal_diag_cov.py @@ -7,13 +7,9 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) - +from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D) from .exp_family import ExpFamily diff --git a/hyperion/np/pdfs/hmm/hmm.py b/hyperion/np/pdfs/hmm/hmm.py index 704f0991..80232e36 100644 --- a/hyperion/np/pdfs/hmm/hmm.py +++ b/hyperion/np/pdfs/hmm/hmm.py @@ -6,7 +6,7 @@ import numpy as np from ....hyp_defs import float_cpu -from ....utils.math import softmax, logsumexp +from ....utils.math import logsumexp, softmax from ..core import PDF diff --git a/hyperion/np/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py index 993da9d6..041431fb 100644 --- a/hyperion/np/pdfs/jfa/jfa_total.py +++ b/hyperion/np/pdfs/jfa/jfa_total.py @@ -7,13 +7,8 @@ from scipy import linalg as la from ....hyp_defs import float_cpu -from ....utils.math import ( - invert_pdmat, - invert_trimat, - logdet_pdmat, - vec2symmat, - symmat2vec, -) +from ....utils.math import (invert_pdmat, invert_trimat, logdet_pdmat, + symmat2vec, vec2symmat) from ..core.pdf import PDF diff --git a/hyperion/np/pdfs/mixtures/__init__.py b/hyperion/np/pdfs/mixtures/__init__.py index f9168905..dccad8d1 100644 --- a/hyperion/np/pdfs/mixtures/__init__.py +++ b/hyperion/np/pdfs/mixtures/__init__.py @@ -5,6 +5,6 @@ from .exp_family_mixture import ExpFamilyMixture -from .gmm_diag_cov import GMMDiagCov, DiagGMM -from .gmm_tied_diag_cov import GMMTiedDiagCov, DiagGMMTiedCov from .gmm import GMM +from .gmm_diag_cov import DiagGMM, GMMDiagCov +from .gmm_tied_diag_cov import DiagGMMTiedCov, GMMTiedDiagCov diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index f684e453..5560882c 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -2,12 +2,12 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np - import logging +import numpy as np + from ....hyp_defs import float_cpu -from ....utils.math import softmax, logsumexp +from ....utils.math import logsumexp, softmax from ....utils.queues import GeneratorQueue from ..core import PDF diff --git a/hyperion/np/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py index 4f6d599e..ca197142 100644 --- a/hyperion/np/pdfs/mixtures/gmm.py +++ b/hyperion/np/pdfs/mixtures/gmm.py @@ -2,31 +2,19 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py +import numpy as np import scipy.linalg as la from scipy.special import erf - from ....hyp_defs import float_cpu -from ....utils.math import ( - softmax, - logsumexp, - invert_pdmat, - invert_trimat, - symmat2vec, - vec2symmat, - fullcov_varfloor, - logdet_pdmat, -) -from 
....utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) +from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, + logdet_pdmat, logsumexp, softmax, symmat2vec, + vec2symmat) +from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D) from ...clustering import KMeans - from ..core import Normal from .exp_family_mixture import ExpFamilyMixture diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py index 4a0ba27d..90141573 100644 --- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py @@ -3,20 +3,16 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py +import numpy as np from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import softmax, logsumexp -from ....utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) +from ....utils.math import logsumexp, softmax +from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D) from ...clustering import KMeans - from .exp_family_mixture import ExpFamilyMixture diff --git a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py index ff02ec62..4dc8f46e 100644 --- a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py @@ -2,20 +2,16 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py +import numpy as np from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import softmax, logsumexp -from ....utils.plotting import ( - plot_gaussian_1D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D, - plot_gaussian_3D, -) +from ....utils.math import logsumexp, softmax +from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D) from ...clustering import KMeans - from .gmm_diag_cov import GMMDiagCov diff --git a/hyperion/np/pdfs/plda/__init__.py b/hyperion/np/pdfs/plda/__init__.py index 9d11ad38..13bc2d81 100644 --- a/hyperion/np/pdfs/plda/__init__.py +++ b/hyperion/np/pdfs/plda/__init__.py @@ -4,7 +4,7 @@ """ -from .plda_base import PLDABase from .frplda import FRPLDA -from .splda import SPLDA from .plda import PLDA +from .plda_base import PLDABase +from .splda import SPLDA diff --git a/hyperion/np/pdfs/plda/plda_base.py b/hyperion/np/pdfs/plda/plda_base.py index 72503965..9dde58b1 100644 --- a/hyperion/np/pdfs/plda/plda_base.py +++ b/hyperion/np/pdfs/plda/plda_base.py @@ -6,8 +6,8 @@ import numpy as np from ....hyp_defs import float_cpu -from ..core.pdf import PDF from ...transforms import LNorm +from ..core.pdf import PDF class PLDABase(PDF): diff --git a/hyperion/np/score_norm/__init__.py b/hyperion/np/score_norm/__init__.py index b0eb8000..7707b669 100644 --- a/hyperion/np/score_norm/__init__.py +++ b/hyperion/np/score_norm/__init__.py @@ -4,9 +4,9 @@ """ +from .adapt_s_norm import AdaptSNorm +from .s_norm import SNorm from .t_norm import TNorm +from .tz_norm import TZNorm from .z_norm import ZNorm from .zt_norm import ZTNorm -from .tz_norm import TZNorm -from .s_norm import SNorm -from .adapt_s_norm import 
AdaptSNorm diff --git a/hyperion/np/score_norm/adapt_s_norm.py b/hyperion/np/score_norm/adapt_s_norm.py index 46d1fc14..a5ae6f13 100644 --- a/hyperion/np/score_norm/adapt_s_norm.py +++ b/hyperion/np/score_norm/adapt_s_norm.py @@ -4,8 +4,8 @@ """ -import numpy as np import h5py +import numpy as np from .score_norm import ScoreNorm diff --git a/hyperion/np/score_norm/s_norm.py b/hyperion/np/score_norm/s_norm.py index 2cf81ffc..4c991d95 100644 --- a/hyperion/np/score_norm/s_norm.py +++ b/hyperion/np/score_norm/s_norm.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py +import numpy as np from .score_norm import ScoreNorm from .t_norm import TNorm diff --git a/hyperion/np/score_norm/t_norm.py b/hyperion/np/score_norm/t_norm.py index a5a80def..bf514b3d 100644 --- a/hyperion/np/score_norm/t_norm.py +++ b/hyperion/np/score_norm/t_norm.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py +import numpy as np from .score_norm import ScoreNorm diff --git a/hyperion/np/score_norm/zt_norm.py b/hyperion/np/score_norm/zt_norm.py index 415ddca8..078dd8ce 100644 --- a/hyperion/np/score_norm/zt_norm.py +++ b/hyperion/np/score_norm/zt_norm.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py +import numpy as np from .score_norm import ScoreNorm from .t_norm import TNorm diff --git a/hyperion/np/transforms/__init__.py b/hyperion/np/transforms/__init__.py index 3f6c5f45..c963e32b 100644 --- a/hyperion/np/transforms/__init__.py +++ b/hyperion/np/transforms/__init__.py @@ -4,17 +4,16 @@ """ from .cent_whiten import CentWhiten -from .lnorm import LNorm -from .sb_sw import SbSw -from .pca import PCA -from .lda import LDA -from .nda import NDA -from .nap import NAP -from .mvn import MVN +from .cent_whiten_up import CentWhitenUP from .coral import CORAL from .gaussianizer import Gaussianizer +from .lda import LDA +from .lnorm import LNorm +from .lnorm_up import LNormUP +from .mvn import MVN +from .nap import NAP +from .nda import NDA +from .pca import PCA +from .sb_sw import SbSw from .skl_tsne import SklTSNE from .transform_list import TransformList - -from .cent_whiten_up import CentWhitenUP -from .lnorm_up import LNormUP diff --git a/hyperion/np/transforms/cent_whiten.py b/hyperion/np/transforms/cent_whiten.py index 5f71c173..35e79d80 100644 --- a/hyperion/np/transforms/cent_whiten.py +++ b/hyperion/np/transforms/cent_whiten.py @@ -2,11 +2,10 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo -import numpy as np import h5py - +import numpy as np import scipy.linalg as la +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..np_model import NPModel from ..pdfs import Normal diff --git a/hyperion/np/transforms/cent_whiten_up.py b/hyperion/np/transforms/cent_whiten_up.py index 9290eae6..7e677d16 100644 --- a/hyperion/np/transforms/cent_whiten_up.py +++ b/hyperion/np/transforms/cent_whiten_up.py @@ -3,9 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py - +import numpy as np import scipy.linalg as la from ..np_model import NPModel diff --git a/hyperion/np/transforms/coral.py b/hyperion/np/transforms/coral.py index 54bd27bc..90cc9774 100644 --- a/hyperion/np/transforms/coral.py +++ b/hyperion/np/transforms/coral.py @@ -3,9 +3,8 @@ 
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py - +import numpy as np import scipy.linalg as la from ..np_model import NPModel diff --git a/hyperion/np/transforms/gaussianizer.py b/hyperion/np/transforms/gaussianizer.py index 393364b6..2c208e02 100644 --- a/hyperion/np/transforms/gaussianizer.py +++ b/hyperion/np/transforms/gaussianizer.py @@ -4,9 +4,9 @@ """ import logging -import numpy as np -import h5py +import h5py +import numpy as np import scipy.linalg as la from scipy.special import erfinv diff --git a/hyperion/np/transforms/lda.py b/hyperion/np/transforms/lda.py index b4f5cbc8..fc886ede 100644 --- a/hyperion/np/transforms/lda.py +++ b/hyperion/np/transforms/lda.py @@ -3,9 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py - +import numpy as np import scipy.linalg as la from ..np_model import NPModel diff --git a/hyperion/np/transforms/lnorm.py b/hyperion/np/transforms/lnorm.py index 9b4f36fe..302dedbe 100644 --- a/hyperion/np/transforms/lnorm.py +++ b/hyperion/np/transforms/lnorm.py @@ -2,8 +2,8 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py +import numpy as np from .cent_whiten import CentWhiten diff --git a/hyperion/np/transforms/lnorm_up.py b/hyperion/np/transforms/lnorm_up.py index 0814f9fe..2f3c1baf 100644 --- a/hyperion/np/transforms/lnorm_up.py +++ b/hyperion/np/transforms/lnorm_up.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py +import numpy as np from .cent_whiten_up import CentWhitenUP diff --git a/hyperion/np/transforms/mvn.py b/hyperion/np/transforms/mvn.py index 484a6913..f8154148 100644 --- a/hyperion/np/transforms/mvn.py +++ b/hyperion/np/transforms/mvn.py @@ -3,9 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py - +import numpy as np import scipy.linalg as la from ..np_model import NPModel diff --git a/hyperion/np/transforms/nap.py b/hyperion/np/transforms/nap.py index c6f8f8de..c826e887 100644 --- a/hyperion/np/transforms/nap.py +++ b/hyperion/np/transforms/nap.py @@ -3,9 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py - +import numpy as np import scipy.linalg as la from ..np_model import NPModel diff --git a/hyperion/np/transforms/nda.py b/hyperion/np/transforms/nda.py index 71910c92..13fe6aef 100644 --- a/hyperion/np/transforms/nda.py +++ b/hyperion/np/transforms/nda.py @@ -3,13 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py - +import numpy as np import scipy.linalg as la -from ..np_model import NPModel from ...hyp_defs import float_cpu +from ..np_model import NPModel from .sb_sw import NSbSw diff --git a/hyperion/np/transforms/pca.py b/hyperion/np/transforms/pca.py index 36f6012b..eabb200d 100644 --- a/hyperion/np/transforms/pca.py +++ b/hyperion/np/transforms/pca.py @@ -2,12 +2,11 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo -import numpy as np import h5py - -from numpy.linalg import matrix_rank +import numpy as np import scipy.linalg as la +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from numpy.linalg import matrix_rank from ..np_model import NPModel diff --git 
a/hyperion/np/transforms/sb_sw.py b/hyperion/np/transforms/sb_sw.py index 6d013e55..e182c8e6 100644 --- a/hyperion/np/transforms/sb_sw.py +++ b/hyperion/np/transforms/sb_sw.py @@ -2,14 +2,13 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import h5py - +import numpy as np import scipy.linalg as la from sklearn.neighbors import BallTree -from ..np_model import NPModel from ...hyp_defs import float_cpu +from ..np_model import NPModel class SbSw(NPModel): diff --git a/hyperion/np/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py index 71a3e084..3f60c4be 100644 --- a/hyperion/np/transforms/skl_tsne.py +++ b/hyperion/np/transforms/skl_tsne.py @@ -2,9 +2,8 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import numpy as np - +from jsonargparse import ActionParser, ArgumentParser from sklearn.manifold import TSNE from ..np_model import NPModel diff --git a/hyperion/np/transforms/transform_list.py b/hyperion/np/transforms/transform_list.py index 1ddceeaa..58da16eb 100644 --- a/hyperion/np/transforms/transform_list.py +++ b/hyperion/np/transforms/transform_list.py @@ -5,21 +5,20 @@ import logging -import numpy as np import h5py +import numpy as np from ..np_model import NPModel - from .cent_whiten import CentWhiten from .cent_whiten_up import CentWhitenUP +from .gaussianizer import Gaussianizer +from .lda import LDA from .lnorm import LNorm from .lnorm_up import LNormUP -from .pca import PCA -from .lda import LDA -from .nda import NDA -from .nap import NAP from .mvn import MVN -from .gaussianizer import Gaussianizer +from .nap import NAP +from .nda import NDA +from .pca import PCA class TransformList(NPModel): diff --git a/hyperion/torch/adv_attacks/__init__.py b/hyperion/torch/adv_attacks/__init__.py index 906b8740..5fda4ac9 100644 --- a/hyperion/torch/adv_attacks/__init__.py +++ b/hyperion/torch/adv_attacks/__init__.py @@ -3,14 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .fgsm_attack import FGSMAttack -from .snr_fgsm_attack import SNRFGSMAttack -from .rand_fgsm_attack import RandFGSMAttack -from .iter_fgsm_attack import IterFGSMAttack +from .attack_factory import AttackFactory +from .carlini_wagner_l0 import CarliniWagnerL0 from .carlini_wagner_l2 import CarliniWagnerL2 from .carlini_wagner_linf import CarliniWagnerLInf -from .carlini_wagner_l0 import CarliniWagnerL0 +from .fgsm_attack import FGSMAttack +from .iter_fgsm_attack import IterFGSMAttack from .pgd_attack import PGDAttack - -from .attack_factory import AttackFactory +from .rand_fgsm_attack import RandFGSMAttack from .random_attack_factory import RandomAttackFactory +from .snr_fgsm_attack import SNRFGSMAttack diff --git a/hyperion/torch/adv_attacks/art_attack_factory.py b/hyperion/torch/adv_attacks/art_attack_factory.py index e09c62ff..ba103acf 100644 --- a/hyperion/torch/adv_attacks/art_attack_factory.py +++ b/hyperion/torch/adv_attacks/art_attack_factory.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import numpy as np +from jsonargparse import ActionParser, ArgumentParser try: from art.attacks import evasion as attacks diff --git a/hyperion/torch/adv_attacks/attack_factory.py b/hyperion/torch/adv_attacks/attack_factory.py index 8ea952ad..5d53f6bc 100644 --- 
a/hyperion/torch/adv_attacks/attack_factory.py +++ b/hyperion/torch/adv_attacks/attack_factory.py @@ -2,16 +2,16 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser -from .fgsm_attack import FGSMAttack -from .snr_fgsm_attack import SNRFGSMAttack -from .rand_fgsm_attack import RandFGSMAttack -from .iter_fgsm_attack import IterFGSMAttack -from .carlini_wagner_l2 import CarliniWagnerL2 from .carlini_wagner_l0 import CarliniWagnerL0 +from .carlini_wagner_l2 import CarliniWagnerL2 from .carlini_wagner_linf import CarliniWagnerLInf +from .fgsm_attack import FGSMAttack +from .iter_fgsm_attack import IterFGSMAttack from .pgd_attack import PGDAttack +from .rand_fgsm_attack import RandFGSMAttack +from .snr_fgsm_attack import SNRFGSMAttack class AttackFactory(object): diff --git a/hyperion/torch/adv_attacks/carlini_wagner_l2.py b/hyperion/torch/adv_attacks/carlini_wagner_l2.py index 27cffe97..e8b545b5 100644 --- a/hyperion/torch/adv_attacks/carlini_wagner_l2.py +++ b/hyperion/torch/adv_attacks/carlini_wagner_l2.py @@ -2,8 +2,8 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math import logging +import math import torch import torch.nn as nn diff --git a/hyperion/torch/adv_attacks/pgd_attack.py b/hyperion/torch/adv_attacks/pgd_attack.py index 879531ed..ca496e64 100644 --- a/hyperion/torch/adv_attacks/pgd_attack.py +++ b/hyperion/torch/adv_attacks/pgd_attack.py @@ -2,10 +2,11 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math import logging +import math import torch + from .adv_attack import AdvAttack diff --git a/hyperion/torch/adv_attacks/random_attack_factory.py b/hyperion/torch/adv_attacks/random_attack_factory.py index e333b119..0c83bc56 100644 --- a/hyperion/torch/adv_attacks/random_attack_factory.py +++ b/hyperion/torch/adv_attacks/random_attack_factory.py @@ -4,9 +4,11 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch + from .attack_factory import AttackFactory as AF diff --git a/hyperion/torch/adv_defenses/wave_gan_white.py b/hyperion/torch/adv_defenses/wave_gan_white.py index ad7f985e..5d045f08 100644 --- a/hyperion/torch/adv_defenses/wave_gan_white.py +++ b/hyperion/torch/adv_defenses/wave_gan_white.py @@ -2,15 +2,15 @@ # Added wave_gan_model_ckpt to test using different model ckpts [Sonal 24Aug20] import logging +import math from pathlib import Path from typing import Tuple -import math import librosa import numpy as np +import yaml import torch -import yaml try: # import parallel_wavegan.models @@ -21,6 +21,7 @@ pass from sklearn.preprocessing import StandardScaler + from torch import nn diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index 65608a0c..239b278d 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -4,12 +4,10 @@ """ from .audio_dataset import AudioDataset - +from .embed_sampler_factory import EmbedSamplerFactory # datasets from .feat_seq_dataset import FeatSeqDataset from .paired_feat_seq_dataset import PairedFeatSeqDataset - # samplers # from .weighted_seq_sampler import ClassWeightedSeqSampler from .seg_sampler_factory import SegSamplerFactory -from 
.embed_sampler_factory import EmbedSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 02b81efa..60d4bc98 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -9,15 +9,14 @@ import numpy as np import pandas as pd -import torch -import torch.distributed as dist import torchaudio.transforms as tat - from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +import torch +import torch.distributed as dist from torch.utils.data import Dataset from ...io import RandomAccessAudioReader as AR - # from ...utils.utt2info import Utt2Info from ...np.augment import SpeechAugment from ...utils.class_info import ClassInfo diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 02497f3b..c9ba677f 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -7,9 +7,10 @@ import math import numpy as np +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse import ActionParser, ArgumentParser from .hyp_sampler import HypSampler from .seg_sampler import SegSampler diff --git a/hyperion/torch/data/class_weighted_embed_sampler.py b/hyperion/torch/data/class_weighted_embed_sampler.py index aed9105d..edf1c00d 100644 --- a/hyperion/torch/data/class_weighted_embed_sampler.py +++ b/hyperion/torch/data/class_weighted_embed_sampler.py @@ -9,9 +9,10 @@ import numpy as np import pandas as pd -import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +import torch + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 184c4ab0..81e9082f 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -9,9 +9,10 @@ import numpy as np import pandas as pd -import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +import torch + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/embed_dataset.py b/hyperion/torch/data/embed_dataset.py index 2963854d..519f498d 100644 --- a/hyperion/torch/data/embed_dataset.py +++ b/hyperion/torch/data/embed_dataset.py @@ -10,16 +10,16 @@ import numpy as np import pandas as pd +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.distributed as dist - -from jsonargparse import ActionParser, ArgumentParser, ActionYesNo from torch.utils.data import Dataset from ...io import RandomAccessDataReaderFactory as RF -from ...utils.misc import filter_func_args from ...utils.class_info import ClassInfo from ...utils.info_table import InfoTable +from ...utils.misc import filter_func_args from ..torch_defs import floatstr_torch diff --git a/hyperion/torch/data/embed_sampler.py b/hyperion/torch/data/embed_sampler.py index 8836fe2a..65adcba6 100644 --- a/hyperion/torch/data/embed_sampler.py +++ b/hyperion/torch/data/embed_sampler.py @@ -7,9 +7,10 @@ import math import numpy as np -import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +import torch + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/embed_sampler_factory.py b/hyperion/torch/data/embed_sampler_factory.py index 43d00b1d..aea35ddf 100644 --- a/hyperion/torch/data/embed_sampler_factory.py +++ b/hyperion/torch/data/embed_sampler_factory.py 
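Note that the torch/data hunks are not plain alphabetization: torch, torch.distributed, and torch.utils.data are consistently split out of the third-party block into a group of their own. A sketch of the resulting four-group layout, mirroring the sampler modules above (this is what an isort configuration with a dedicated torch section would produce, but that config is not part of these hunks):

import math

import numpy as np
from jsonargparse import ActionParser, ArgumentParser

import torch
import torch.distributed as dist
from torch.utils.data import Sampler

from hyperion.torch.data.hyp_sampler import HypSampler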
@@ -7,9 +7,9 @@ from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from .embed_dataset import EmbedDataset from .bucketing_seg_sampler import BucketingSegSampler from .class_weighted_embed_sampler import ClassWeightedEmbedSampler +from .embed_dataset import EmbedDataset from .embed_sampler import EmbedSampler sampler_dict = { diff --git a/hyperion/torch/data/feat_seq_dataset.py b/hyperion/torch/data/feat_seq_dataset.py index 1605ead3..bb487dda 100644 --- a/hyperion/torch/data/feat_seq_dataset.py +++ b/hyperion/torch/data/feat_seq_dataset.py @@ -12,15 +12,15 @@ import numpy as np import pandas as pd +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.distributed as dist - -from jsonargparse import ActionParser, ArgumentParser, ActionYesNo from torch.utils.data import Dataset from ...io import RandomAccessDataReaderFactory as RF -from ...utils.misc import filter_func_args from ...utils.class_info import ClassInfo +from ...utils.misc import filter_func_args from ...utils.segment_set import SegmentSet from ..torch_defs import floatstr_torch diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py index c5097723..d1bcb0a8 100644 --- a/hyperion/torch/data/hyp_sampler.py +++ b/hyperion/torch/data/hyp_sampler.py @@ -2,9 +2,10 @@ import math import numpy as np +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse import ActionParser, ArgumentParser from torch.utils.data import Sampler diff --git a/hyperion/torch/data/paired_feat_seq_dataset.py b/hyperion/torch/data/paired_feat_seq_dataset.py index fc17593e..eff2ed58 100644 --- a/hyperion/torch/data/paired_feat_seq_dataset.py +++ b/hyperion/torch/data/paired_feat_seq_dataset.py @@ -6,6 +6,7 @@ import logging import numpy as np + import torch from ...utils.utt2info import Utt2Info diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py index 76054cd8..2933dcc6 100644 --- a/hyperion/torch/data/seg_chunk_sampler.py +++ b/hyperion/torch/data/seg_chunk_sampler.py @@ -8,9 +8,10 @@ import numpy as np import pandas as pd +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse import ActionParser, ArgumentParser from ...utils.segment_set import SegmentSet from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 1c54a021..ac66eaf6 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -7,9 +7,10 @@ import math import numpy as np -import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +import torch + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/weighted_embed_sampler.py b/hyperion/torch/data/weighted_embed_sampler.py index 22da93f9..5870512a 100644 --- a/hyperion/torch/data/weighted_embed_sampler.py +++ b/hyperion/torch/data/weighted_embed_sampler.py @@ -7,6 +7,7 @@ import math import numpy as np + import torch from torch.utils.data import Sampler diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py index 345c2429..b6f0b670 100644 --- a/hyperion/torch/data/weighted_seq_sampler.py +++ b/hyperion/torch/data/weighted_seq_sampler.py @@ -7,9 +7,10 @@ import math import numpy as np +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse 
import ActionParser, ArgumentParser from torch.utils.data import Sampler diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 2fa71766..7ec806a5 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -3,56 +3,34 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .fc_blocks import FCBlock -from .se_blocks import ( - SEBlock2D, - TSEBlock2D, - SEBlock2d, - TSEBlock2d, - FwSEBlock2d, - CFwSEBlock2d, - SEBlock1d, -) -from .tdnn_blocks import TDNNBlock -from .etdnn_blocks import ETDNNBlock -from .resetdnn_blocks import ResETDNNBlock -from .resnet_blocks import ResNetInputBlock, ResNetBasicBlock, ResNetBNBlock -from .resnet_blocks import ResNetEndpointBlock -from .seresnet_blocks import SEResNetBasicBlock, SEResNetBNBlock -from .res2net_blocks import Res2NetBasicBlock, Res2NetBNBlock -from .mbconv_blocks import MBConvBlock, MBConvInOutBlock -from .transformer_feedforward import PositionwiseFeedForward, Conv1dx2, Conv1dLinear -from .transformer_encoder_v1 import TransformerEncoderBlockV1 -from .transformer_conv2d_subsampler import TransformerConv2dSubsampler from .conformer_conv import ConformerConvBlock from .conformer_encoder_v1 import ConformerEncoderBlockV1 -from .dc1d_blocks import DC1dEncBlock, DC1dDecBlock -from .dc2d_blocks import DC2dEncBlock, DC2dDecBlock -from .resnet1d_blocks import ( - ResNet1dBasicBlock, - ResNet1dBasicDecBlock, - ResNet1dBNBlock, - ResNet1dBNDecBlock, - ResNet1dEndpoint, -) -from .resnet1d_blocks import ( - SEResNet1dBasicBlock, - SEResNet1dBasicDecBlock, - SEResNet1dBNBlock, - SEResNet1dBNDecBlock, -) +from .dc1d_blocks import DC1dDecBlock, DC1dEncBlock +from .dc2d_blocks import DC2dDecBlock, DC2dEncBlock +from .etdnn_blocks import ETDNNBlock +from .fc_blocks import FCBlock +from .mbconv_blocks import MBConvBlock, MBConvInOutBlock from .res2net1d_blocks import Res2Net1dBasicBlock, Res2Net1dBNBlock -from .resnet2d_blocks import ( - ResNet2dBasicBlock, - ResNet2dBasicDecBlock, - ResNet2dBNBlock, - ResNet2dBNDecBlock, -) -from .resnet2d_blocks import ( - SEResNet2dBasicBlock, - SEResNet2dBasicDecBlock, - SEResNet2dBNBlock, - SEResNet2dBNDecBlock, -) from .res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock -from .spine_blocks import BlockSpec, SpineResample, SpineEndpoints, SpineConv +from .res2net_blocks import Res2NetBasicBlock, Res2NetBNBlock +from .resetdnn_blocks import ResETDNNBlock +from .resnet1d_blocks import (ResNet1dBasicBlock, ResNet1dBasicDecBlock, + ResNet1dBNBlock, ResNet1dBNDecBlock, + ResNet1dEndpoint, SEResNet1dBasicBlock, + SEResNet1dBasicDecBlock, SEResNet1dBNBlock, + SEResNet1dBNDecBlock) +from .resnet2d_blocks import (ResNet2dBasicBlock, ResNet2dBasicDecBlock, + ResNet2dBNBlock, ResNet2dBNDecBlock, + SEResNet2dBasicBlock, SEResNet2dBasicDecBlock, + SEResNet2dBNBlock, SEResNet2dBNDecBlock) +from .resnet_blocks import (ResNetBasicBlock, ResNetBNBlock, + ResNetEndpointBlock, ResNetInputBlock) +from .se_blocks import (CFwSEBlock2d, FwSEBlock2d, SEBlock1d, SEBlock2D, + SEBlock2d, TSEBlock2D, TSEBlock2d) +from .seresnet_blocks import SEResNetBasicBlock, SEResNetBNBlock +from .spine_blocks import BlockSpec, SpineConv, SpineEndpoints, SpineResample +from .tdnn_blocks import TDNNBlock +from .transformer_conv2d_subsampler import TransformerConv2dSubsampler +from .transformer_encoder_v1 import TransformerEncoderBlockV1 +from .transformer_feedforward import (Conv1dLinear, Conv1dx2, + PositionwiseFeedForward) diff --git 
a/hyperion/torch/layer_blocks/conformer_encoder_v1.py b/hyperion/torch/layer_blocks/conformer_encoder_v1.py index a54e3b99..b2eab352 100644 --- a/hyperion/torch/layer_blocks/conformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/conformer_encoder_v1.py @@ -8,8 +8,8 @@ import torch.nn as nn from ..layers.attention import * -from .transformer_feedforward import * from .conformer_conv import ConformerConvBlock +from .transformer_feedforward import * class ConformerEncoderBlockV1(nn.Module): diff --git a/hyperion/torch/layer_blocks/dc1d_blocks.py b/hyperion/torch/layer_blocks/dc1d_blocks.py index da643c34..780af960 100644 --- a/hyperion/torch/layer_blocks/dc1d_blocks.py +++ b/hyperion/torch/layer_blocks/dc1d_blocks.py @@ -4,7 +4,7 @@ """ import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear from ..layers import ActivationFactory as AF from ..layers import Dropout1d diff --git a/hyperion/torch/layer_blocks/dc2d_blocks.py b/hyperion/torch/layer_blocks/dc2d_blocks.py index bae8e203..a99f9211 100644 --- a/hyperion/torch/layer_blocks/dc2d_blocks.py +++ b/hyperion/torch/layer_blocks/dc2d_blocks.py @@ -4,7 +4,7 @@ """ import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF from ..layers.subpixel_convs import SubPixelConv2d diff --git a/hyperion/torch/layer_blocks/etdnn_blocks.py b/hyperion/torch/layer_blocks/etdnn_blocks.py index 17f3f8ef..b6afdd29 100644 --- a/hyperion/torch/layer_blocks/etdnn_blocks.py +++ b/hyperion/torch/layer_blocks/etdnn_blocks.py @@ -6,7 +6,7 @@ import numpy as np import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear from ..layers import ActivationFactory as AF from ..layers import Dropout1d diff --git a/hyperion/torch/layer_blocks/fc_blocks.py b/hyperion/torch/layer_blocks/fc_blocks.py index 567474bf..e56ab83e 100644 --- a/hyperion/torch/layer_blocks/fc_blocks.py +++ b/hyperion/torch/layer_blocks/fc_blocks.py @@ -7,7 +7,7 @@ # import numpy as np import torch.nn as nn -from torch.nn import Linear, BatchNorm1d, Dropout +from torch.nn import BatchNorm1d, Dropout, Linear from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/layer_blocks/mbconv_blocks.py b/hyperion/torch/layer_blocks/mbconv_blocks.py index 89c746ea..8a956b21 100644 --- a/hyperion/torch/layer_blocks/mbconv_blocks.py +++ b/hyperion/torch/layer_blocks/mbconv_blocks.py @@ -7,12 +7,13 @@ import torch import torch.nn as nn -# from torch.nn import Conv2d, BatchNorm2d - from ..layers import ActivationFactory as AF from ..layers import DropConnect2d from .se_blocks import SEBlock2D, TSEBlock2D +# from torch.nn import Conv2d, BatchNorm2d + + def _conv1x1(in_channels, out_channels, stride=1, bias=False): """1x1 convolution""" diff --git a/hyperion/torch/layer_blocks/res2net1d_blocks.py b/hyperion/torch/layer_blocks/res2net1d_blocks.py index 804dbbd3..1decc327 100644 --- a/hyperion/torch/layer_blocks/res2net1d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net1d_blocks.py @@ -3,12 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import math + import torch import torch.nn as nn -from torch.nn import Conv1d, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d from ..layers import ActivationFactory as AF -from ..layers import Dropout1d, DropConnect1d +from ..layers import DropConnect1d, Dropout1d from .se_blocks import SEBlock1d diff 
--git a/hyperion/torch/layer_blocks/res2net2d_blocks.py b/hyperion/torch/layer_blocks/res2net2d_blocks.py index 26d19a9a..d833a5e3 100644 --- a/hyperion/torch/layer_blocks/res2net2d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net2d_blocks.py @@ -3,9 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import math + import torch import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF from .se_blocks import SEBlock2d, TSEBlock2d diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 072926c9..6a785956 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -3,12 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import math + import torch import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF -from .se_blocks import SEBlock2d, TSEBlock2d, FwSEBlock2d, CFwSEBlock2d +from .se_blocks import CFwSEBlock2d, FwSEBlock2d, SEBlock2d, TSEBlock2d def _conv3x3(in_channels, out_channels, stride=1, groups=1, dilation=1, bias=False): diff --git a/hyperion/torch/layer_blocks/resetdnn_blocks.py b/hyperion/torch/layer_blocks/resetdnn_blocks.py index 775118d1..dfea3720 100644 --- a/hyperion/torch/layer_blocks/resetdnn_blocks.py +++ b/hyperion/torch/layer_blocks/resetdnn_blocks.py @@ -7,7 +7,7 @@ import numpy as np import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear from ..layers import ActivationFactory as AF from ..layers import Dropout1d diff --git a/hyperion/torch/layer_blocks/resnet1d_blocks.py b/hyperion/torch/layer_blocks/resnet1d_blocks.py index ca99bb3d..dd914eba 100644 --- a/hyperion/torch/layer_blocks/resnet1d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet1d_blocks.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import torch.nn as nn -from torch.nn import Conv1d, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d from ..layers import ActivationFactory as AF -from ..layers import Dropout1d, DropConnect1d, Interpolate +from ..layers import DropConnect1d, Dropout1d, Interpolate from ..layers.subpixel_convs import SubPixelConv1d from .se_blocks import SEBlock1d diff --git a/hyperion/torch/layer_blocks/resnet2d_blocks.py b/hyperion/torch/layer_blocks/resnet2d_blocks.py index 65761526..7fe89b56 100644 --- a/hyperion/torch/layer_blocks/resnet2d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet2d_blocks.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF from ..layers.subpixel_convs import SubPixelConv2d diff --git a/hyperion/torch/layer_blocks/resnet_blocks.py b/hyperion/torch/layer_blocks/resnet_blocks.py index 83e6d174..e25c0cbb 100644 --- a/hyperion/torch/layer_blocks/resnet_blocks.py +++ b/hyperion/torch/layer_blocks/resnet_blocks.py @@ -4,8 +4,8 @@ """ import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d import torch.nn.functional as nnf +from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/layer_blocks/se_blocks.py 
b/hyperion/torch/layer_blocks/se_blocks.py index e99d545e..c53d5ecc 100644 --- a/hyperion/torch/layer_blocks/se_blocks.py +++ b/hyperion/torch/layer_blocks/se_blocks.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn -from torch.nn import Conv2d, Conv1d +from torch.nn import Conv1d, Conv2d from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/layer_blocks/seresnet_blocks.py b/hyperion/torch/layer_blocks/seresnet_blocks.py index 5074f466..4807e94b 100644 --- a/hyperion/torch/layer_blocks/seresnet_blocks.py +++ b/hyperion/torch/layer_blocks/seresnet_blocks.py @@ -5,11 +5,11 @@ import torch import torch.nn as nn -from torch.nn import Conv2d, Linear, BatchNorm2d, Dropout2d +from torch.nn import BatchNorm2d, Conv2d, Dropout2d, Linear from ..layers import ActivationFactory as AF -from .se_blocks import SEBlock2d, TSEBlock2d, FwSEBlock2d, CFwSEBlock2d from .resnet_blocks import ResNetBasicBlock, ResNetBNBlock +from .se_blocks import CFwSEBlock2d, FwSEBlock2d, SEBlock2d, TSEBlock2d class SEResNetBasicBlock(ResNetBasicBlock): diff --git a/hyperion/torch/layer_blocks/spine_blocks.py b/hyperion/torch/layer_blocks/spine_blocks.py index c97cb027..bb7a454a 100644 --- a/hyperion/torch/layer_blocks/spine_blocks.py +++ b/hyperion/torch/layer_blocks/spine_blocks.py @@ -3,14 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging + import torch.nn as nn -from torch.nn import Conv2d, BatchNorm2d, Dropout2d import torch.nn.functional as nnf +from torch.nn import BatchNorm2d, Conv2d, Dropout2d -from ..layers.subpixel_convs import SubPixelConv2d from ..layers import ActivationFactory as AF - -import logging +from ..layers.subpixel_convs import SubPixelConv2d class Interpolate(nn.Module): diff --git a/hyperion/torch/layer_blocks/tdnn_blocks.py b/hyperion/torch/layer_blocks/tdnn_blocks.py index e979b7db..c1a21d52 100644 --- a/hyperion/torch/layer_blocks/tdnn_blocks.py +++ b/hyperion/torch/layer_blocks/tdnn_blocks.py @@ -4,7 +4,7 @@ """ import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear from ..layers import ActivationFactory as AF from ..layers import Dropout1d diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index 45ce75f8..42b40303 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -3,29 +3,20 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .dropout import Dropout1d, DropConnect2d, DropConnect1d -from .global_pool import * - from .activation_factory import ActivationFactory -from .norm_layer_factory import NormLayer2dFactory, NormLayer1dFactory -from .pool_factory import GlobalPool1dFactory - -from .margin_losses import CosLossOutput, ArcLossOutput, SubCenterArcLossOutput - +from .attention import (LocalScaledDotProdAttRelPosEncV1, + LocalScaledDotProdAttV1, ScaledDotProdAttRelPosEncV1, + ScaledDotProdAttV1) from .audio_feats import * from .audio_feats_factory import AudioFeatsFactory -from .spec_augment import AxisMasker, SpecWarper, SpecAugment -from .mvn import MeanVarianceNorm - -from .attention import ( - ScaledDotProdAttV1, - LocalScaledDotProdAttV1, - ScaledDotProdAttRelPosEncV1, - LocalScaledDotProdAttRelPosEncV1, -) -from .pos_encoder import PosEncoder, RelPosEncoder, NoPosEncoder - -from .subpixel_convs import SubPixelConv1d, SubPixelConv2d, ICNR1d, ICNR2d -from .interpolate import Interpolate - from .calibrators import LinBinCalibrator +from .dropout import DropConnect1d, DropConnect2d, 
Dropout1d +from .global_pool import * +from .interpolate import Interpolate +from .margin_losses import ArcLossOutput, CosLossOutput, SubCenterArcLossOutput +from .mvn import MeanVarianceNorm +from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory +from .pool_factory import GlobalPool1dFactory +from .pos_encoder import NoPosEncoder, PosEncoder, RelPosEncoder +from .spec_augment import AxisMasker, SpecAugment, SpecWarper +from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d diff --git a/hyperion/torch/layers/activation_factory.py b/hyperion/torch/layers/activation_factory.py index 1d3bdfd2..d07b184e 100644 --- a/hyperion/torch/layers/activation_factory.py +++ b/hyperion/torch/layers/activation_factory.py @@ -5,6 +5,7 @@ # import torch.nn as nn + from .swish import Swish act_dict = { diff --git a/hyperion/torch/layers/audio_feats.py b/hyperion/torch/layers/audio_feats.py index 34cb9aa3..3bc4add9 100644 --- a/hyperion/torch/layers/audio_feats.py +++ b/hyperion/torch/layers/audio_feats.py @@ -4,12 +4,12 @@ """ # -import math import logging +import math import torch -import torch.nn as nn import torch.cuda.amp as amp +import torch.nn as nn try: from torch.fft import rfft as torch_rfft diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py index 71c3a8e8..a8398dac 100644 --- a/hyperion/torch/layers/audio_feats_factory.py +++ b/hyperion/torch/layers/audio_feats_factory.py @@ -2,11 +2,12 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import re -from ...utils.misc import str2bool +from jsonargparse import ActionParser, ArgumentParser + from ...np.feats.filter_banks import FilterBankFactory as FBF +from ...utils.misc import str2bool from .audio_feats import * FFT = "fft" diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 5001bfd0..5e38494f 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -4,6 +4,7 @@ """ import logging import math + import numpy as np import torch diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index acb7a514..3f991567 100644 --- a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -3,13 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys import logging import math +import sys import torch -import torch.nn as nn import torch.cuda.amp as amp +import torch.nn as nn def _l2_norm(x, axis=-1): diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py index 4f569089..4b4c5927 100644 --- a/hyperion/torch/layers/mvn.py +++ b/hyperion/torch/layers/mvn.py @@ -2,7 +2,7 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn diff --git a/hyperion/torch/layers/pdf_storage.py b/hyperion/torch/layers/pdf_storage.py index f3f34b37..6a87cd0d 100644 --- a/hyperion/torch/layers/pdf_storage.py +++ b/hyperion/torch/layers/pdf_storage.py @@ -5,8 +5,8 @@ # import torch -import torch.nn as nn import torch.distributions as pdf +import torch.nn as nn class StdNormal(nn.Module): diff --git a/hyperion/torch/layers/pool_factory.py 
b/hyperion/torch/layers/pool_factory.py index 723c64a4..84d0cbf1 100644 --- a/hyperion/torch/layers/pool_factory.py +++ b/hyperion/torch/layers/pool_factory.py @@ -2,7 +2,8 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser + import torch.nn as nn from .global_pool import * diff --git a/hyperion/torch/layers/spec_augment.py b/hyperion/torch/layers/spec_augment.py index a7ebcfb1..f4e03842 100644 --- a/hyperion/torch/layers/spec_augment.py +++ b/hyperion/torch/layers/spec_augment.py @@ -3,7 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn diff --git a/hyperion/torch/layers/tensor2pdf.py b/hyperion/torch/layers/tensor2pdf.py index 55c890a3..41d1bc37 100644 --- a/hyperion/torch/layers/tensor2pdf.py +++ b/hyperion/torch/layers/tensor2pdf.py @@ -5,9 +5,9 @@ # import torch +import torch.distributions as pdf import torch.nn as nn import torch.nn.functional as nnf -import torch.distributions as pdf class Tensor2PDF(nn.Module): diff --git a/hyperion/torch/layers/tensor2pdf1.py b/hyperion/torch/layers/tensor2pdf1.py index 87ba3475..45c51f17 100644 --- a/hyperion/torch/layers/tensor2pdf1.py +++ b/hyperion/torch/layers/tensor2pdf1.py @@ -4,8 +4,8 @@ """ import torch -import torch.nn as nn import torch.distributions as pdf +import torch.nn as nn class Tensor2PDF(nn.Module): diff --git a/hyperion/torch/layers/vq.py b/hyperion/torch/layers/vq.py index c56b58f6..4a59b305 100644 --- a/hyperion/torch/layers/vq.py +++ b/hyperion/torch/layers/vq.py @@ -5,9 +5,9 @@ import math import torch +import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F -import torch.distributed as dist from ..utils import seq_lengths_to_mask diff --git a/hyperion/torch/loggers/__init__.py b/hyperion/torch/loggers/__init__.py index c48b9965..8842393c 100644 --- a/hyperion/torch/loggers/__init__.py +++ b/hyperion/torch/loggers/__init__.py @@ -3,9 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .csv_logger import CSVLogger from .logger import Logger from .logger_list import LoggerList -from .csv_logger import CSVLogger from .prog_logger import ProgLogger from .tensorboard_logger import TensorBoardLogger from .wandb_logger import WAndBLogger diff --git a/hyperion/torch/loggers/csv_logger.py b/hyperion/torch/loggers/csv_logger.py index 402ddcd5..67fdc464 100644 --- a/hyperion/torch/loggers/csv_logger.py +++ b/hyperion/torch/loggers/csv_logger.py @@ -3,9 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import csv +import os from collections import OrderedDict as ODict + import numpy as np from .logger import Logger diff --git a/hyperion/torch/loggers/logger.py b/hyperion/torch/loggers/logger.py index 46c1130d..7e9c91f2 100644 --- a/hyperion/torch/loggers/logger.py +++ b/hyperion/torch/loggers/logger.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch.distributed as dist diff --git a/hyperion/torch/loggers/logger_list.py b/hyperion/torch/loggers/logger_list.py index 20ae58ec..0291a01f 100644 --- a/hyperion/torch/loggers/logger_list.py +++ b/hyperion/torch/loggers/logger_list.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch.distributed as dist from .tensorboard_logger import 
TensorBoardLogger as TBL diff --git a/hyperion/torch/loggers/prog_logger.py b/hyperion/torch/loggers/prog_logger.py index 26479197..8df63b15 100644 --- a/hyperion/torch/loggers/prog_logger.py +++ b/hyperion/torch/loggers/prog_logger.py @@ -3,8 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import time import logging +import time from collections import OrderedDict import numpy as np diff --git a/hyperion/torch/loggers/tensorboard_logger.py b/hyperion/torch/loggers/tensorboard_logger.py index 314757d1..a80fa175 100644 --- a/hyperion/torch/loggers/tensorboard_logger.py +++ b/hyperion/torch/loggers/tensorboard_logger.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import re + from torch.utils.tensorboard import SummaryWriter from .logger import Logger diff --git a/hyperion/torch/loggers/wandb_logger.py b/hyperion/torch/loggers/wandb_logger.py index c864e9b1..094f619a 100644 --- a/hyperion/torch/loggers/wandb_logger.py +++ b/hyperion/torch/loggers/wandb_logger.py @@ -2,8 +2,8 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import re import os +import re try: import wandb diff --git a/hyperion/torch/lr_schedulers/__init__.py b/hyperion/torch/lr_schedulers/__init__.py index be77dc15..7d1b07db 100644 --- a/hyperion/torch/lr_schedulers/__init__.py +++ b/hyperion/torch/lr_schedulers/__init__.py @@ -4,11 +4,11 @@ """ -from .lr_scheduler import LRScheduler -from .red_lr_on_plateau import ReduceLROnPlateau +from .cos_lr import AdamCosineLR, CosineLR from .exp_lr import ExponentialLR -from .cos_lr import CosineLR, AdamCosineLR +from .factory import LRSchedulerFactory from .invpow_lr import InvPowLR +from .lr_scheduler import LRScheduler from .noam_lr import NoamLR +from .red_lr_on_plateau import ReduceLROnPlateau from .triangular_lr import TriangularLR -from .factory import LRSchedulerFactory diff --git a/hyperion/torch/lr_schedulers/cos_lr.py b/hyperion/torch/lr_schedulers/cos_lr.py index 5caf12bb..b9e7d069 100644 --- a/hyperion/torch/lr_schedulers/cos_lr.py +++ b/hyperion/torch/lr_schedulers/cos_lr.py @@ -4,8 +4,8 @@ """ -import math import logging +import math import torch diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py index 3fef6e93..d3111140 100644 --- a/hyperion/torch/lr_schedulers/factory.py +++ b/hyperion/torch/lr_schedulers/factory.py @@ -2,15 +2,15 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser import torch -from .red_lr_on_plateau import ReduceLROnPlateau +from .cos_lr import AdamCosineLR, CosineLR from .exp_lr import ExponentialLR from .invpow_lr import InvPowLR -from .cos_lr import CosineLR, AdamCosineLR from .noam_lr import NoamLR +from .red_lr_on_plateau import ReduceLROnPlateau from .triangular_lr import TriangularLR diff --git a/hyperion/torch/lr_schedulers/noam_lr.py b/hyperion/torch/lr_schedulers/noam_lr.py index 4acdc3b9..7d034f77 100644 --- a/hyperion/torch/lr_schedulers/noam_lr.py +++ b/hyperion/torch/lr_schedulers/noam_lr.py @@ -2,12 +2,13 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math import logging +import math + +from .invpow_lr import InvPowLR # import torch -from .invpow_lr import InvPowLR class NoamLR(InvPowLR): 
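[Annotation, not part of the patch] The noam_lr.py hunk above shows that NoamLR is implemented as a subclass of InvPowLR. For readers unfamiliar with the rule, the sketch below is a minimal standalone version of the Noam schedule from Vaswani et al. (linear warmup followed by inverse-square-root decay); it is illustrative only, and the function name and default values are assumptions, not hyperion's actual API.

import math

def noam_lr(step, d_model=512, warmup_steps=4000):
    # Linear warmup for warmup_steps updates, then decay as step**-0.5,
    # i.e. an inverse-power schedule with exponent 0.5 (hence InvPowLR).
    step = max(step, 1)  # guard against step 0
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

At step == warmup_steps the two branches of the min() coincide, so the warmup ramp meets the decay tail smoothly; writing the tail as a power law is what lets NoamLR reuse the inverse-power scheduler instead of duplicating it.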
diff --git a/hyperion/torch/lr_schedulers/triangular_lr.py b/hyperion/torch/lr_schedulers/triangular_lr.py index f2578e1d..10e3f83d 100644 --- a/hyperion/torch/lr_schedulers/triangular_lr.py +++ b/hyperion/torch/lr_schedulers/triangular_lr.py @@ -4,8 +4,8 @@ """ -import math import logging +import math import torch diff --git a/hyperion/torch/metrics/__init__.py b/hyperion/torch/metrics/__init__.py index b4a2eaac..33d67c21 100644 --- a/hyperion/torch/metrics/__init__.py +++ b/hyperion/torch/metrics/__init__.py @@ -4,6 +4,6 @@ """ -from .metrics import TorchMetric -from .accuracy_functional import * from .accuracy import * +from .accuracy_functional import * +from .metrics import TorchMetric diff --git a/hyperion/torch/metrics/accuracy.py b/hyperion/torch/metrics/accuracy.py index ebd02e32..93d71683 100644 --- a/hyperion/torch/metrics/accuracy.py +++ b/hyperion/torch/metrics/accuracy.py @@ -5,8 +5,8 @@ import torch -from .metrics import TorchMetric from .accuracy_functional import * +from .metrics import TorchMetric class CategoricalAccuracy(TorchMetric): diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index e953f58c..db984616 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -4,19 +4,14 @@ """ -from .xvectors.xvector import XVector -from .xvectors.tdnn_xvector import TDNNXVector -from .xvectors.resnet_xvector import ResNetXVector -from .xvectors.efficient_net_xvector import EfficientNetXVector -from .xvectors.transformer_xvector_v1 import TransformerXVectorV1 -from .xvectors.spinenet_xvector import SpineNetXVector -from .xvectors.resnet1d_xvector import ResNet1dXVector - -from .wav2xvectors import ( - HFWav2Vec2ResNet1dXVector, - HFHubert2ResNet1dXVector, - HFWavLM2ResNet1dXVector, -) - from .vae.vae import VAE from .vae.vq_vae import VQVAE +from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector) +from .xvectors.efficient_net_xvector import EfficientNetXVector +from .xvectors.resnet1d_xvector import ResNet1dXVector +from .xvectors.resnet_xvector import ResNetXVector +from .xvectors.spinenet_xvector import SpineNetXVector +from .xvectors.tdnn_xvector import TDNNXVector +from .xvectors.transformer_xvector_v1 import TransformerXVectorV1 +from .xvectors.xvector import XVector diff --git a/hyperion/torch/models/ae/ae.py b/hyperion/torch/models/ae/ae.py index 57d30edc..32cd68ea 100644 --- a/hyperion/torch/models/ae/ae.py +++ b/hyperion/torch/models/ae/ae.py @@ -8,8 +8,8 @@ import torch import torch.nn as nn -from ...torch_model import TorchModel from ...narchs import TorchNALoader +from ...torch_model import TorchModel class AE(TorchModel): diff --git a/hyperion/torch/models/plda/plda_base.py b/hyperion/torch/models/plda/plda_base.py index d6100a36..2556627d 100644 --- a/hyperion/torch/models/plda/plda_base.py +++ b/hyperion/torch/models/plda/plda_base.py @@ -2,15 +2,15 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import time import logging import math +import time import torch import torch.nn as nn from ...torch_model import TorchModel -from ...utils.misc import l2_norm, get_selfsim_tarnon +from ...utils.misc import get_selfsim_tarnon, l2_norm class PLDABase(TorchModel): diff --git a/hyperion/torch/models/plda/splda.py b/hyperion/torch/models/plda/splda.py index 0025e4e7..2272793e 100644 --- a/hyperion/torch/models/plda/splda.py +++ b/hyperion/torch/models/plda/splda.py @@ -2,8 +2,8 @@ 
Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import time import logging +import time import torch import torch.nn as nn diff --git a/hyperion/torch/models/tvector/__init__.py b/hyperion/torch/models/tvector/__init__.py index 98db2561..36999146 100644 --- a/hyperion/torch/models/tvector/__init__.py +++ b/hyperion/torch/models/tvector/__init__.py @@ -3,6 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .resnet_tvector import ResNetTVector # t-vectors from .tvector import TVector -from .resnet_tvector import ResNetTVector diff --git a/hyperion/torch/models/tvector/resnet_tvector.py b/hyperion/torch/models/tvector/resnet_tvector.py index d74272aa..c84a38fc 100644 --- a/hyperion/torch/models/tvector/resnet_tvector.py +++ b/hyperion/torch/models/tvector/resnet_tvector.py @@ -9,8 +9,8 @@ import torch import torch.nn as nn -from .xvector import XVector from ..narchs import ResNetFactory as RNF +from .xvector import XVector class ResNetXVector(XVector): diff --git a/hyperion/torch/models/tvector/tvector.py b/hyperion/torch/models/tvector/tvector.py index 8a3758fb..a46fc324 100644 --- a/hyperion/torch/models/tvector/tvector.py +++ b/hyperion/torch/models/tvector/tvector.py @@ -4,14 +4,15 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from ..layers import GlobalPool1dFactory as PF -from ..layer_blocks import TDNNBlock from ...narchs import ClassifHead, ConformerEncoderV1, TorchNALoader +from ..layer_blocks import TDNNBlock +from ..layers import GlobalPool1dFactory as PF from ..torch_model import TorchModel from ..utils import eval_nnet_by_chunks diff --git a/hyperion/torch/models/vae/vae.py b/hyperion/torch/models/vae/vae.py index 32239718..86938bf2 100644 --- a/hyperion/torch/models/vae/vae.py +++ b/hyperion/torch/models/vae/vae.py @@ -6,13 +6,13 @@ import logging import torch -import torch.nn as nn import torch.distributions as pdf +import torch.nn as nn -from ...torch_model import TorchModel -from ...narchs import TorchNALoader -from ...layers import tensor2pdf as t2pdf from ...layers import pdf_storage +from ...layers import tensor2pdf as t2pdf +from ...narchs import TorchNALoader +from ...torch_model import TorchModel class VAE(TorchModel): diff --git a/hyperion/torch/models/vae/vq_vae.py b/hyperion/torch/models/vae/vq_vae.py index 9fcc22a0..e86cd04f 100644 --- a/hyperion/torch/models/vae/vq_vae.py +++ b/hyperion/torch/models/vae/vq_vae.py @@ -6,13 +6,13 @@ import logging import torch -import torch.nn as nn import torch.distributions as pdf +import torch.nn as nn -from ...torch_model import TorchModel -from ...narchs import TorchNALoader from ...layers import tensor2pdf as t2pdf from ...layers import vq +from ...narchs import TorchNALoader +from ...torch_model import TorchModel class VQVAE(TorchModel): diff --git a/hyperion/torch/models/wav2xvectors/__init__.py b/hyperion/torch/models/wav2xvectors/__init__.py index 015c8d0f..62123d13 100644 --- a/hyperion/torch/models/wav2xvectors/__init__.py +++ b/hyperion/torch/models/wav2xvectors/__init__.py @@ -4,14 +4,12 @@ """ -# from .wav2tdnn_xvector import Wav2TDNNXVector -from .wav2resnet_xvector import Wav2ResNetXVector - +from .hf_hubert2resnet1d_xvector import HFHubert2ResNet1dXVector +from .hf_wav2vec2resnet1d_xvector import HFWav2Vec2ResNet1dXVector +from .hf_wavlm2resnet1d_xvector import HFWavLM2ResNet1dXVector # from 
.wav2efficient_net_xvector import Wav2EfficientNetXVector # from .wav2transformer_xvector_v1 import Wav2TransformerXVectorV1 # from .wav2spinenet_xvector import Wav2SpineNetXVector from .wav2resnet1d_xvector import Wav2ResNet1dXVector - -from .hf_wav2vec2resnet1d_xvector import HFWav2Vec2ResNet1dXVector -from .hf_hubert2resnet1d_xvector import HFHubert2ResNet1dXVector -from .hf_wavlm2resnet1d_xvector import HFWavLM2ResNet1dXVector +# from .wav2tdnn_xvector import Wav2TDNNXVector +from .wav2resnet_xvector import Wav2ResNetXVector diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py index bd5c3f1b..b75ac53f 100644 --- a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py @@ -3,14 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from jsonargparse import ArgumentParser, ActionParser -from typing import Union, Dict, Optional +from typing import Dict, Optional, Union + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from ..xvectors import ResNet1dXVector from ...tpm import HFHubert +from ..xvectors import ResNet1dXVector from .hf_wav2xvector import HFWav2XVector diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py index a5166d4d..43ab2382 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py @@ -3,14 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from jsonargparse import ArgumentParser, ActionParser -from typing import Union, Dict, Optional +from typing import Dict, Optional, Union + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from ..xvectors import ResNet1dXVector from ...tpm import HFWav2Vec2 +from ..xvectors import ResNet1dXVector from .hf_wav2xvector import HFWav2XVector diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index bd1ec4cd..8a65f12e 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -2,18 +2,20 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import logging import contextlib -from jsonargparse import ArgumentParser, ActionParser +import logging + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -# import torch.nn.functional as nnf - from ...torch_model import TorchModel from ...utils import remove_silence +# import torch.nn.functional as nnf + + class HFWav2XVector(TorchModel): """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. 
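[Annotation, not part of the patch] The hf_wav2xvector.py hunk above reorders the imports of HFWav2XVector, the abstract base that couples a Hugging Face speech encoder (HFWav2Vec2, HFHubert, or HFWavLM, per the sibling files in this patch) with a ResNet1dXVector back-end. As a rough mental model only — the class name, constructor, and tensor shapes below are hypothetical, not hyperion's real signatures — the composition looks like this:

import torch
import torch.nn as nn

class HFFeats2XVector(nn.Module):
    # Hypothetical analogue of HFWav2XVector: a pretrained encoder maps
    # waveforms to frame-level features, and an x-vector network pools
    # them into a fixed-size speaker embedding.
    def __init__(self, hf_encoder: nn.Module, xvector_head: nn.Module):
        super().__init__()
        self.hf_encoder = hf_encoder
        self.xvector_head = xvector_head

    def forward(self, wav: torch.Tensor) -> torch.Tensor:
        feats = self.hf_encoder(wav)     # (batch, time, feat_dim)
        feats = feats.transpose(1, 2)    # (batch, feat_dim, time)
        return self.xvector_head(feats)  # (batch, embed_dim)

Each concrete subclass touched by this patch (HFWav2Vec2ResNet1dXVector, HFHubert2ResNet1dXVector, HFWavLM2ResNet1dXVector) pairs one such encoder with the same x-vector head, which is why they can share a single abstract base.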
diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py index 2f4b66ce..56a19130 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py @@ -3,14 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from jsonargparse import ArgumentParser, ActionParser -from typing import Union, Dict, Optional +from typing import Dict, Optional, Union + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from ..xvectors import ResNet1dXVector from ...tpm import HFWavLM +from ..xvectors import ResNet1dXVector from .hf_wav2xvector import HFWav2XVector diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index 983fbac2..0d9f1bc4 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -4,13 +4,14 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from .wav2xvector import Wav2XVector from ..xvectors import ResNet1dXVector +from .wav2xvector import Wav2XVector class Wav2ResNet1dXVector(Wav2XVector): diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py index dea2e442..1f7283a0 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -4,13 +4,14 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from .wav2xvector import Wav2XVector from ..xvectors import ResNetXVector +from .wav2xvector import Wav2XVector class Wav2ResNetXVector(Wav2XVector): diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index c7a77f3e..824b5830 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -3,13 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from ...torch_model import TorchModel from ...narchs import AudioFeatsMVN +from ...torch_model import TorchModel from ...utils import remove_silence diff --git a/hyperion/torch/models/xvectors/__init__.py b/hyperion/torch/models/xvectors/__init__.py index 408de716..57819128 100644 --- a/hyperion/torch/models/xvectors/__init__.py +++ b/hyperion/torch/models/xvectors/__init__.py @@ -4,10 +4,10 @@ """ -from .xvector import XVector -from .tdnn_xvector import TDNNXVector -from .resnet_xvector import ResNetXVector from .efficient_net_xvector import EfficientNetXVector -from .transformer_xvector_v1 import TransformerXVectorV1 -from .spinenet_xvector import SpineNetXVector from .resnet1d_xvector import ResNet1dXVector +from .resnet_xvector import ResNetXVector +from .spinenet_xvector import SpineNetXVector +from .tdnn_xvector import TDNNXVector +from .transformer_xvector_v1 import TransformerXVectorV1 +from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py 
b/hyperion/torch/models/xvectors/efficient_net_xvector.py index df5965cd..a8663cd9 100644 --- a/hyperion/torch/models/xvectors/efficient_net_xvector.py +++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py @@ -4,13 +4,14 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from .xvector import XVector from ...narchs import EfficientNet as EN +from .xvector import XVector class EfficientNetXVector(XVector): diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index 5957c9f5..1bce0f87 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -4,13 +4,14 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from .xvector import XVector from ...narchs import ResNet1dEncoder as Encoder +from .xvector import XVector class ResNet1dXVector(XVector): diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index fe88ff57..c6889626 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -4,13 +4,14 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from .xvector import XVector from ...narchs import ResNetFactory as RNF +from .xvector import XVector class ResNetXVector(XVector): diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index 1e616570..203008be 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -3,14 +3,15 @@ Copyright 2020 Magdalena Rybicka Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from .xvector import XVector from ...narchs import SpineNetFactory as SNF +from .xvector import XVector class SpineNetXVector(XVector): diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index f28dc9b3..530ca63b 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -4,13 +4,14 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from .xvector import XVector from ...narchs import TDNNFactory as TF +from .xvector import XVector class TDNNXVector(XVector): diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index b3428783..7c55844a 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -4,13 +4,14 @@ """ import logging -from jsonargparse import ArgumentParser, ActionParser + +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from .xvector import XVector from ...narchs import TransformerEncoderV1 as TE +from .xvector import XVector class TransformerXVectorV1(XVector): diff --git 
a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 2072241d..3807bbd8 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -4,14 +4,15 @@ """ import logging from enum import Enum -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo from typing import Optional +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from ...layers import GlobalPool1dFactory as PF from ...layer_blocks import TDNNBlock +from ...layers import GlobalPool1dFactory as PF from ...narchs import ClassifHead, TorchNALoader from ...torch_model import TorchModel from ...utils import eval_nnet_by_chunks, scale_seq_lengths diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index 71cd9de4..c8504425 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -3,36 +3,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .fcnet import FCNetV1, FCNetV2 - -from .tdnn import TDNNV1 +from .audio_feats_mvn import AudioFeatsMVN +from .classif_head import ClassifHead +from .conformer_encoder_v1 import ConformerEncoderV1 +from .dc1d_decoder import DC1dDecoder +from .dc1d_encoder import DC1dEncoder +from .dc2d_decoder import DC2dDecoder +from .dc2d_encoder import DC2dEncoder +from .efficient_net import EfficientNet from .etdnn import ETDNNV1 +from .fcnet import FCNetV1, FCNetV2 from .resetdnn import ResETDNNV1 -from .tdnn_factory import TDNNFactory - from .resnet import * +from .resnet1d_decoder import ResNet1dDecoder +from .resnet1d_encoder import ResNet1dEncoder +from .resnet2d_decoder import ResNet2dDecoder +from .resnet2d_encoder import ResNet2dEncoder from .resnet_factory import ResNetFactory - from .spinenet import * from .spinenet_factory import SpineNetFactory - -from .transformer_encoder_v1 import TransformerEncoderV1 -from .conformer_encoder_v1 import ConformerEncoderV1 - -from .dc1d_encoder import DC1dEncoder -from .dc1d_decoder import DC1dDecoder -from .dc2d_encoder import DC2dEncoder -from .dc2d_decoder import DC2dDecoder - -from .resnet1d_encoder import ResNet1dEncoder -from .resnet1d_decoder import ResNet1dDecoder -from .resnet2d_encoder import ResNet2dEncoder -from .resnet2d_decoder import ResNet2dDecoder - -from .efficient_net import EfficientNet - -from .classif_head import ClassifHead - -from .audio_feats_mvn import AudioFeatsMVN - +from .tdnn import TDNNV1 +from .tdnn_factory import TDNNFactory from .torch_na_loader import TorchNALoader +from .transformer_encoder_v1 import TransformerEncoderV1 diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 9092e9d8..160ee61b 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -2,7 +2,7 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index adfeceb3..5d179fdb 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -3,16 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, 
ArgumentParser import torch import torch.nn as nn from torch.nn import Linear +from ..layer_blocks import FCBlock from ..layers import ActivationFactory as AF -from ..layers import CosLossOutput, ArcLossOutput, SubCenterArcLossOutput +from ..layers import ArcLossOutput, CosLossOutput from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import FCBlock +from ..layers import SubCenterArcLossOutput from .net_arch import NetArch diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index 4fabe8d2..3acd44d2 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -3,16 +3,17 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn -from ..layers import ActivationFactory as AF -from ..layers import NormLayer1dFactory as NLF -from ..layers import PosEncoder, RelPosEncoder, NoPosEncoder from ..layer_blocks import ConformerEncoderBlockV1 as EBlock from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler +from ..layers import ActivationFactory as AF +from ..layers import NoPosEncoder +from ..layers import NormLayer1dFactory as NLF +from ..layers import PosEncoder, RelPosEncoder from .net_arch import NetArch diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index 22f63de6..f5ab74d5 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -4,15 +4,17 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import DC1dDecBlock from ..layers import ActivationFactory as AF +from ..layers import ICNR1d from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import DC1dDecBlock -from ..layers import SubPixelConv1d, ICNR1d +from ..layers import SubPixelConv1d from .net_arch import NetArch diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py index 619851bb..0c331a5e 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -2,15 +2,16 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn +from ..layer_blocks.dc1d_blocks import DC1dEncBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks.dc1d_blocks import DC1dEncBlock from .net_arch import NetArch diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 0166baca..4106cbfd 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -3,16 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn +from ..layer_blocks import DC2dDecBlock from ..layers import ActivationFactory as AF +from ..layers import ICNR2d from ..layers import NormLayer2dFactory as NLF -from 
..layer_blocks import DC2dDecBlock -from ..layers import SubPixelConv2d, ICNR2d +from ..layers import SubPixelConv2d from .net_arch import NetArch diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index e847dbb6..ce7b9677 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -4,14 +4,15 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import DC2dEncBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import DC2dEncBlock from .net_arch import NetArch diff --git a/hyperion/torch/narchs/efficient_net.py b/hyperion/torch/narchs/efficient_net.py index 273fa183..b9efdcef 100644 --- a/hyperion/torch/narchs/efficient_net.py +++ b/hyperion/torch/narchs/efficient_net.py @@ -4,15 +4,16 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn -from torch.nn import Linear, Dropout +from torch.nn import Dropout, Linear +from ..layer_blocks import MBConvBlock, MBConvInOutBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import MBConvBlock, MBConvInOutBlock from .net_arch import NetArch diff --git a/hyperion/torch/narchs/etdnn.py b/hyperion/torch/narchs/etdnn.py index ebc14534..a73439b7 100644 --- a/hyperion/torch/narchs/etdnn.py +++ b/hyperion/torch/narchs/etdnn.py @@ -9,9 +9,9 @@ import torch.nn as nn from torch.nn import Conv1d, Linear +from ..layer_blocks import ETDNNBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import ETDNNBlock from .net_arch import NetArch diff --git a/hyperion/torch/narchs/fcnet.py b/hyperion/torch/narchs/fcnet.py index e0c8afd5..cdbf1940 100644 --- a/hyperion/torch/narchs/fcnet.py +++ b/hyperion/torch/narchs/fcnet.py @@ -4,12 +4,12 @@ """ import torch.nn as nn -from torch.nn import Linear, BatchNorm1d, Dropout +from torch.nn import BatchNorm1d, Dropout, Linear +from ..layer_blocks import FCBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF from .net_arch import NetArch -from ..layer_blocks import FCBlock class FCNetV1(NetArch): diff --git a/hyperion/torch/narchs/resetdnn.py b/hyperion/torch/narchs/resetdnn.py index 2c7f3e00..eb964fa5 100644 --- a/hyperion/torch/narchs/resetdnn.py +++ b/hyperion/torch/narchs/resetdnn.py @@ -7,11 +7,11 @@ import torch import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear +from ..layer_blocks import ETDNNBlock, ResETDNNBlock, TDNNBlock from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import ResETDNNBlock, ETDNNBlock, TDNNBlock from .net_arch import NetArch diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index 34ac9b81..e3264f33 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -8,21 +8,15 @@ import torch import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear -from ..utils import seq_lengths_to_mask, scale_seq_lengths +from ..layer_blocks import 
(Res2NetBasicBlock, Res2NetBNBlock, + ResNetBasicBlock, ResNetBNBlock, + ResNetEndpointBlock, ResNetInputBlock, + SEResNetBasicBlock, SEResNetBNBlock) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import ( - ResNetInputBlock, - ResNetBasicBlock, - ResNetBNBlock, - SEResNetBasicBlock, - SEResNetBNBlock, - Res2NetBasicBlock, - Res2NetBNBlock, -) -from ..layer_blocks import ResNetEndpointBlock +from ..utils import scale_seq_lengths, seq_lengths_to_mask from .net_arch import NetArch diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index 3ab454ae..0c577174 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -2,17 +2,20 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn +from ..layer_blocks import (DC1dDecBlock, ResNet1dBasicDecBlock, + ResNet1dBNDecBlock, SEResNet1dBasicDecBlock, + SEResNet1dBNDecBlock) from ..layers import ActivationFactory as AF +from ..layers import ICNR1d from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import ResNet1dBasicDecBlock, ResNet1dBNDecBlock, DC1dDecBlock -from ..layer_blocks import SEResNet1dBasicDecBlock, SEResNet1dBNDecBlock -from ..layers import SubPixelConv1d, ICNR1d +from ..layers import SubPixelConv1d from .net_arch import NetArch diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index deef9c59..5bdad186 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -3,28 +3,22 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo -import math import logging +import math import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn -from ..utils import seq_lengths_to_mask +from ..layer_blocks import (DC1dEncBlock, Res2Net1dBasicBlock, + Res2Net1dBNBlock, ResNet1dBasicBlock, + ResNet1dBNBlock, ResNet1dEndpoint, + SEResNet1dBasicBlock, SEResNet1dBNBlock) from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF -from ..layer_blocks import ( - ResNet1dBasicBlock, - ResNet1dBNBlock, - DC1dEncBlock, - ResNet1dEndpoint, - SEResNet1dBasicBlock, - SEResNet1dBNBlock, - Res2Net1dBasicBlock, - Res2Net1dBNBlock, -) +from ..utils import seq_lengths_to_mask from .net_arch import NetArch diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 22b1e7a7..426b37f5 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -4,16 +4,19 @@ """ import math -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import (DC2dDecBlock, ResNet2dBasicDecBlock, + ResNet2dBNDecBlock, SEResNet2dBasicDecBlock, + SEResNet2dBNDecBlock) from ..layers import ActivationFactory as AF +from ..layers import ICNR2d from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import ResNet2dBasicDecBlock, ResNet2dBNDecBlock, DC2dDecBlock -from ..layer_blocks 
import SEResNet2dBasicDecBlock, SEResNet2dBNDecBlock -from ..layers import SubPixelConv2d, ICNR2d +from ..layers import SubPixelConv2d from .net_arch import NetArch diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index 3af174cf..84e6599e 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -3,19 +3,21 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import logging +import math + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser import torch import torch.nn as nn -from ..utils import seq_lengths_to_mask +from ..layer_blocks import (DC2dEncBlock, Res2Net2dBasicBlock, + Res2Net2dBNBlock, ResNet2dBasicBlock, + ResNet2dBNBlock, SEResNet2dBasicBlock, + SEResNet2dBNBlock) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import ResNet2dBasicBlock, ResNet2dBNBlock, DC2dEncBlock -from ..layer_blocks import SEResNet2dBasicBlock, SEResNet2dBNBlock -from ..layer_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock +from ..utils import seq_lengths_to_mask from .net_arch import NetArch diff --git a/hyperion/torch/narchs/resnet_factory.py b/hyperion/torch/narchs/resnet_factory.py index c9d5806e..2d17a6d7 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .resnet import * diff --git a/hyperion/torch/narchs/spinenet.py b/hyperion/torch/narchs/spinenet.py index 4185d9c4..117c0733 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -3,17 +3,19 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import numpy as np import logging + +import numpy as np + import torch import torch.nn as nn -from torch.nn import Conv1d, Linear, BatchNorm1d +from torch.nn import BatchNorm1d, Conv1d, Linear +from ..layer_blocks import (BlockSpec, Res2NetBasicBlock, Res2NetBNBlock, + ResNetBasicBlock, ResNetBNBlock, ResNetInputBlock, + SpineConv, SpineEndpoints, SpineResample) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF -from ..layer_blocks import ResNetInputBlock, ResNetBasicBlock, ResNetBNBlock -from ..layer_blocks import Res2NetBNBlock, Res2NetBasicBlock -from ..layer_blocks import BlockSpec, SpineResample, SpineEndpoints, SpineConv from .net_arch import NetArch SPINENET_BLOCK_SPECS = [ diff --git a/hyperion/torch/narchs/spinenet_factory.py b/hyperion/torch/narchs/spinenet_factory.py index 9e94a1be..092cbd0e 100644 --- a/hyperion/torch/narchs/spinenet_factory.py +++ b/hyperion/torch/narchs/spinenet_factory.py @@ -2,7 +2,7 @@ Copyright 2020 Magdalena Rybicka Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .spinenet import * diff --git a/hyperion/torch/narchs/tdnn.py b/hyperion/torch/narchs/tdnn.py index 6cdcbf85..55e47e6a 100644 --- a/hyperion/torch/narchs/tdnn.py +++ b/hyperion/torch/narchs/tdnn.py @@ -9,9 +9,9 @@ import torch.nn as nn from torch.nn import Linear +from ..layer_blocks import TDNNBlock from ..layers import ActivationFactory as AF from 
..layers import NormLayer1dFactory as NLF -from ..layer_blocks import TDNNBlock from .net_arch import NetArch diff --git a/hyperion/torch/narchs/tdnn_factory.py b/hyperion/torch/narchs/tdnn_factory.py index 6a9e6010..901cc9d0 100644 --- a/hyperion/torch/narchs/tdnn_factory.py +++ b/hyperion/torch/narchs/tdnn_factory.py @@ -3,11 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from .tdnn import TDNNV1 from .etdnn import ETDNNV1 from .resetdnn import ResETDNNV1 +from .tdnn import TDNNV1 class TDNNFactory(object): diff --git a/hyperion/torch/narchs/torch_na_loader.py b/hyperion/torch/narchs/torch_na_loader.py index 97bf5fb9..58152fc7 100644 --- a/hyperion/torch/narchs/torch_na_loader.py +++ b/hyperion/torch/narchs/torch_na_loader.py @@ -5,32 +5,24 @@ import torch -from .fcnet import FCNetV1 - -from .tdnn import TDNNV1 -from .etdnn import ETDNNV1 -from .resetdnn import ResETDNNV1 - -from .resnet import * - -from .transformer_encoder_v1 import TransformerEncoderV1 +from .audio_feats_mvn import AudioFeatsMVN +from .classif_head import ClassifHead from .conformer_encoder_v1 import ConformerEncoderV1 - -from .dc1d_encoder import DC1dEncoder from .dc1d_decoder import DC1dDecoder -from .dc2d_encoder import DC2dEncoder +from .dc1d_encoder import DC1dEncoder from .dc2d_decoder import DC2dDecoder - -from .resnet1d_encoder import ResNet1dEncoder +from .dc2d_encoder import DC2dEncoder +from .efficient_net import EfficientNet +from .etdnn import ETDNNV1 +from .fcnet import FCNetV1 +from .resetdnn import ResETDNNV1 +from .resnet import * from .resnet1d_decoder import ResNet1dDecoder -from .resnet2d_encoder import ResNet2dEncoder +from .resnet1d_encoder import ResNet1dEncoder from .resnet2d_decoder import ResNet2dDecoder - -from .efficient_net import EfficientNet - -from .classif_head import ClassifHead - -from .audio_feats_mvn import AudioFeatsMVN +from .resnet2d_encoder import ResNet2dEncoder +from .tdnn import TDNNV1 +from .transformer_encoder_v1 import TransformerEncoderV1 class TorchNALoader(object): diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index d2949c12..4468185e 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -3,15 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser +from jsonargparse import ActionParser, ArgumentParser import torch import torch.nn as nn +from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler +from ..layer_blocks import TransformerEncoderBlockV1 as EBlock from ..layers import ActivationFactory as AF from ..layers import PosEncoder, RelPosEncoder -from ..layer_blocks import TransformerEncoderBlockV1 as EBlock -from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from .net_arch import NetArch diff --git a/hyperion/torch/optim/__init__.py b/hyperion/torch/optim/__init__.py index cba89796..fd05c755 100644 --- a/hyperion/torch/optim/__init__.py +++ b/hyperion/torch/optim/__init__.py @@ -3,6 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .factory import OptimizerFactory from .fgsm import FGSM from .radam import RAdam -from .factory import OptimizerFactory diff --git a/hyperion/torch/optim/factory.py b/hyperion/torch/optim/factory.py index ab350098..95117b05 
100644 --- a/hyperion/torch/optim/factory.py +++ b/hyperion/torch/optim/factory.py @@ -2,13 +2,14 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionParser import logging -from ...utils.misc import filter_args +from jsonargparse import ActionParser, ArgumentParser import torch import torch.optim as optim + +from ...utils.misc import filter_args from .radam import RAdam diff --git a/hyperion/torch/optim/radam.py b/hyperion/torch/optim/radam.py index 459646c1..1b7a588f 100644 --- a/hyperion/torch/optim/radam.py +++ b/hyperion/torch/optim/radam.py @@ -4,6 +4,7 @@ # import math + import torch from torch.optim.optimizer import Optimizer, required diff --git a/hyperion/torch/seq_embed/__init__.py b/hyperion/torch/seq_embed/__init__.py index 24ee9555..8ecc2cf8 100644 --- a/hyperion/torch/seq_embed/__init__.py +++ b/hyperion/torch/seq_embed/__init__.py @@ -6,9 +6,9 @@ # xvectors had been moved to models # we import them here for backwards compatibility -from ..models.xvector import XVector -from ..models.tdnn_xvector import TDNNXVector -from ..models.resnet_xvector import ResNetXVector from ..models.efficient_net_xvector import EfficientNetXVector -from ..models.transformer_xvector_v1 import TransformerXVectorV1 +from ..models.resnet_xvector import ResNetXVector from ..models.spinenet_xvector import SpineNetXVector +from ..models.tdnn_xvector import TDNNXVector +from ..models.transformer_xvector_v1 import TransformerXVectorV1 +from ..models.xvector import XVector diff --git a/hyperion/torch/torch_defs.py b/hyperion/torch/torch_defs.py index a567de50..b08beaeb 100644 --- a/hyperion/torch/torch_defs.py +++ b/hyperion/torch/torch_defs.py @@ -5,7 +5,6 @@ import torch - str2torch_dtype = { "float32": torch.float32, "float64": torch.float64, diff --git a/hyperion/torch/torch_model_loader.py b/hyperion/torch/torch_model_loader.py index c173cd50..2273bee8 100644 --- a/hyperion/torch/torch_model_loader.py +++ b/hyperion/torch/torch_model_loader.py @@ -3,13 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from collections import OrderedDict as ODict import re +from collections import OrderedDict as ODict import torch -from .narchs import * from .models import * +from .narchs import * class TorchModelLoader(object): diff --git a/hyperion/torch/tpm/__init__.py b/hyperion/torch/tpm/__init__.py index dfa5c14b..e3a17e4f 100644 --- a/hyperion/torch/tpm/__init__.py +++ b/hyperion/torch/tpm/__init__.py @@ -3,4 +3,4 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .hf import HFWav2Vec2, HFHubert, HFWavLM +from .hf import HFHubert, HFWav2Vec2, HFWavLM diff --git a/hyperion/torch/tpm/hf/__init__.py b/hyperion/torch/tpm/hf/__init__.py index 4db1c95d..d0f91785 100644 --- a/hyperion/torch/tpm/hf/__init__.py +++ b/hyperion/torch/tpm/hf/__init__.py @@ -3,6 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .hf_wav2vec2 import HFWav2Vec2 from .hf_hubert import HFHubert +from .hf_wav2vec2 import HFWav2Vec2 from .hf_wavlm import HFWavLM diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index ba331573..b2198924 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -2,17 +2,17 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import logging -from jsonargparse import ArgumentParser, 
ActionParser, ActionYesNo -from typing import Optional, Tuple, Union, List, Callable +import os +from typing import Callable, List, Optional, Tuple, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import HubertConfig, HubertModel import torch import torch.nn as nn -from transformers import HubertModel, HubertConfig - -from ...utils.ddp import ddp_wait_for_all_procs, ddp_get_rank +from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 579574a1..e1f21153 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -2,17 +2,17 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import logging -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo -from typing import Optional, Tuple, Union, List, Callable +import os +from typing import Callable, List, Optional, Tuple, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2Config, Wav2Vec2Model import torch import torch.nn as nn -from transformers import Wav2Vec2Model, Wav2Vec2Config - -from ...utils.ddp import ddp_wait_for_all_procs, ddp_get_rank +from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 1dceed1c..1c1b1030 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -3,21 +3,20 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import logging +import os from turtle import right -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +from typing import List, Optional, Tuple, Union -from typing import Optional, Tuple, Union, List +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor import torch import torch.nn as nn -from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor - from ...torch_model import TorchModel -from ...utils import seq_lengths_to_mask, scale_seq_lengths -from ...utils.ddp import ddp_wait_for_all_procs, ddp_get_rank +from ...utils import scale_seq_lengths, seq_lengths_to_mask +from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs class HFWav2VecBase(TorchModel): diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 15b8248d..0d5c5ad3 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -2,17 +2,17 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os import logging -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo -from typing import Optional, Tuple, Union, List, Callable +import os +from typing import Callable, List, Optional, Tuple, Union + +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import WavLMConfig, WavLMModel import torch import torch.nn as nn -from transformers import WavLMModel, WavLMConfig - -from ...utils.ddp import ddp_wait_for_all_procs, ddp_get_rank +from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git 
a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 69e97cc6..9f5fafe6 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index 0523ad44..e2d2d1f6 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 93571acf..72058182 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -11,26 +11,23 @@ from enum import Enum from pathlib import Path +from fairscale.optim.grad_scaler import ShardedGradScaler +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.distributed as dist import torch.nn as nn -from fairscale.optim.grad_scaler import ShardedGradScaler -from jsonargparse import ActionParser, ArgumentParser from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args -from ..loggers import CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, WAndBLogger +from ..loggers import (CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, + WAndBLogger) from ..lr_schedulers import LRScheduler as LRS from ..lr_schedulers import LRSchedulerFactory as LRSF from ..optim import OptimizerFactory as OF -from ..utils import ( - FairFullyShardedDDP, - FairShardedDDP, - MetricAcc, - TorchDDP, - tensors_subset, -) +from ..utils import (FairFullyShardedDDP, FairShardedDDP, MetricAcc, TorchDDP, + tensors_subset) class DDPType(str, Enum): diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index ba401cb7..f4877dc6 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index 03800e0d..fc9d98f1 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/vq_vae_trainer.py 
b/hyperion/torch/trainers/vq_vae_trainer.py index 40b6b10d..35946e96 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index af915d6b..303427de 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -7,10 +7,11 @@ import time from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 1e1b1778..2a012dde 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -7,10 +7,11 @@ import time from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 4e791347..9d04af42 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -6,10 +6,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index da4a3773..0fee1bdb 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -3,17 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .devices import ( - open_device, - tensors_to_device, - tensors_to_cpu, - tensors_to_numpy, - tensors_subset, -) -from .metric_acc import MetricAcc -from .masking import seq_lengths_to_mask, scale_seq_lengths from .collation import collate_seq_1d, collate_seq_2d, collate_seq_nd +from .data_parallel import TorchDataParallel +from .ddp import FairFullyShardedDDP, FairShardedDDP, TorchDDP +from .devices import (open_device, tensors_subset, tensors_to_cpu, + tensors_to_device, tensors_to_numpy) from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add +from .masking import scale_seq_lengths, seq_lengths_to_mask +from .metric_acc import MetricAcc from .vad_utils import remove_silence -from .data_parallel import TorchDataParallel -from .ddp import TorchDDP, FairShardedDDP, FairFullyShardedDDP diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 7038cff3..ea3d18ea 100644 --- 
a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -2,14 +2,17 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os -import logging import datetime +import logging +import os + +from fairscale.nn.data_parallel import \ + FullyShardedDataParallel as FullyShardedDDP +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP + import torch -import torch.nn as nn import torch.distributed as dist -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP -from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP +import torch.nn as nn from .devices import open_device diff --git a/hyperion/torch/utils/devices.py b/hyperion/torch/utils/devices.py index 19c124b2..c0736f2f 100644 --- a/hyperion/torch/utils/devices.py +++ b/hyperion/torch/utils/devices.py @@ -2,9 +2,9 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os import subprocess -import logging import torch diff --git a/hyperion/torch/utils/eval_utils.py b/hyperion/torch/utils/eval_utils.py index f1ae1edb..d74835f6 100644 --- a/hyperion/torch/utils/eval_utils.py +++ b/hyperion/torch/utils/eval_utils.py @@ -4,6 +4,7 @@ """ import math + import torch diff --git a/hyperion/torch/utils/metric_acc.py b/hyperion/torch/utils/metric_acc.py index d635310b..a82c174a 100644 --- a/hyperion/torch/utils/metric_acc.py +++ b/hyperion/torch/utils/metric_acc.py @@ -4,6 +4,7 @@ """ import logging from collections import OrderedDict as ODict + import numpy as np import torch diff --git a/hyperion/torch/utils/misc.py b/hyperion/torch/utils/misc.py index 69d209eb..b2a3810f 100644 --- a/hyperion/torch/utils/misc.py +++ b/hyperion/torch/utils/misc.py @@ -4,8 +4,8 @@ """ import torch -import torch.nn as nn import torch.cuda.amp as amp +import torch.nn as nn def l2_norm(x, dim=1, axis=None): From e26f5b93c6be144c303a5e5c2f7230dde24db9b1 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 26 Jan 2023 07:30:08 -0500 Subject: [PATCH 075/154] new hyperparams for voxceleb --- .../v1.1/conf/fbank80_specaug1_stmn_16k.yaml | 24 + ...rain_ecapatdnn2048x4_xvec_stage1_v2.0.yaml | 91 ++++ ...rain_ecapatdnn2048x4_xvec_stage2_v2.0.yaml | 66 +++ ...train_ecapatdnn512x3_xvec_stage1_v2.0.yaml | 89 ++++ ...train_ecapatdnn512x3_xvec_stage2_v2.0.yaml | 66 +++ ...onfig_fbank80_stmn_ecapatdnn2048x4.v2.0.sh | 45 ++ ...config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | 45 ++ ...fnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | 12 +- ..._eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | 12 +- ...et34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- ...et34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- ...et50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | 10 +- ...et50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- ...et50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- ...w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | 11 +- ..._resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- ...net34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | 11 +- ...4_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh | 11 +- ..._resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | 12 +- ...ine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | 12 +- ...ne2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | 12 +- ...pinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- ...inenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | 12 +- ...et50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- ...eresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- 
...ine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- ...ne2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | 11 +- egs/voxceleb/v1.1/run_001_prepare_data.sh | 27 + egs/voxceleb/v1.1/run_011_train_xvector.sh | 32 +- egs/voxceleb/v1.1/run_030_extract_xvectors.sh | 82 ++- egs/voxceleb/v1.1/run_040_eval_be.sh | 231 +++++--- egs/voxceleb/v1/local/make_vox2_trials.py | 83 +++ egs/voxceleb/v1/local/prepare_voxsrc22_dev.py | 88 ++++ .../v1/local/prepare_voxsrc22_test.py | 73 +++ egs/voxceleb/v1/local/score_voxceleb1.sh | 9 +- egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py | 205 +++++++ egs/voxceleb/v1/steps_be/eval-be-v2-snorm.py | 133 +++++ egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh | 75 +++ egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh | 64 +++ egs/voxceleb/v1/steps_be/train-qmf.py | 123 +++++ egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh | 81 +++ hyperion/bin/apply_mvn_select_frames.py | 173 ++++++ hyperion/bin/copy_feats.py | 40 ++ ...l_xvec_cosine_scoring_from_adv_test_wav.py | 423 +++++++++++++++ ...osine_scoring_from_adv_test_wav_wavegan.py | 498 ++++++++++++++++++ ...l_xvec_cosine_scoring_from_art_test_wav.py | 430 +++++++++++++++ .../eval_xvec_cosine_scoring_from_test_wav.py | 278 ++++++++++ ...sine_scoring_from_transfer_adv_test_wav.py | 439 +++++++++++++++ ...sine_scoring_from_transfer_art_test_wav.py | 464 ++++++++++++++++ hyperion/bin/finetune_xvector_from_wav.py | 22 +- hyperion/np/score_norm/adapt_s_norm.py | 318 ++++++++--- .../data/class_weighted_seg_chunk_sampler.py | 18 +- hyperion/torch/layer_blocks/fc_blocks.py | 3 + 53 files changed, 4752 insertions(+), 311 deletions(-) create mode 100644 egs/voxceleb/v1.1/conf/fbank80_specaug1_stmn_16k.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh create mode 100755 egs/voxceleb/v1/local/make_vox2_trials.py create mode 100755 egs/voxceleb/v1/local/prepare_voxsrc22_dev.py create mode 100755 egs/voxceleb/v1/local/prepare_voxsrc22_test.py create mode 100755 egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py create mode 100755 egs/voxceleb/v1/steps_be/eval-be-v2-snorm.py create mode 100755 egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh create mode 100755 egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh create mode 100755 egs/voxceleb/v1/steps_be/train-qmf.py create mode 100755 egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh create mode 100755 hyperion/bin/apply_mvn_select_frames.py create mode 100755 hyperion/bin/copy_feats.py create mode 100755 hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py create mode 100755 hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py create mode 100755 hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py create mode 100755 hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py create mode 100755 hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py create mode 100755 hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py diff --git a/egs/voxceleb/v1.1/conf/fbank80_specaug1_stmn_16k.yaml b/egs/voxceleb/v1.1/conf/fbank80_specaug1_stmn_16k.yaml new file mode 100644 index 00000000..8df42fc6 --- /dev/null +++ 
b/egs/voxceleb/v1.1/conf/fbank80_specaug1_stmn_16k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml new file mode 100644 index 00000000..4b6fbc77 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml @@ -0,0 +1,91 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml new file mode 100644 index 00000000..4a4a8a88 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml 
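+# The overrides below configure the large-margin fine-tuning stage (stage 2 of
+# run_011_train_xvector.sh), relative to the stage-1 config: the AAM-softmax
+# margin grows from 0.2 to 0.4 with no warm-up plus an intertop margin of 0.1,
+# chunks grow from 2 s to 6 s with class-weighted sampling and hard prototypes
+# (num_hard_prototypes: 8), SpecAugment is dropped from the features, and
+# low-LR SGD with SWA over the final epochs replaces the stage-1 Adam schedule.
+# (Editorial gloss as YAML comments; key names above are taken from this file.)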
+model: + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml new file mode 100644 index 00000000..319ab3ab --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml @@ -0,0 +1,89 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml new file mode 100644 index 00000000..4a4a8a88 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + 
momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh new file mode 100644 index 00000000..14f2cdb4 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh @@ -0,0 +1,45 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn2048x4.v2.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +#do_snorm=true +#do_qmf=true +do_voxsrc22=false + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh new file mode 100644 index 00000000..0e7a3b52 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh @@ -0,0 +1,45 @@ +# ECAPA-TDNN small + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3.v2.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +#do_snorm=true +#do_qmf=true +do_voxsrc22=false + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh index 2806a422..aae5f68e 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh @@ -26,14 +26,12 @@ eff_batch_size=512 # effective batch size lr=0.01 nnet_num_epochs=70 
-xvec_train_base_cfg=conf/train_effnetb4_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" - -nnet_name=${feat_type}_${effnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_effnetb4_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr" +nnet_s1_name=${feat_type}_${effnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh index d83ca483..6ddb2b5e 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh @@ -26,14 +26,12 @@ eff_batch_size=512 # effective batch size lr=0.01 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_effnetb4_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model $PWD/conf/efficientnet_b7.yaml --trainer.optim.lr $lr" - -nnet_name=${feat_type}_${effnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_eina_hln_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_effnetb4_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model $PWD/conf/efficientnet_b7.yaml --trainer.optim.lr $lr" +nnet_s1_name=${feat_type}_${effnet_type}_is1_mbs1122121_ser${se_r}_fixsh_e${embed_dim}_eina_hln_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index f995fc0f..9082799e 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -28,13 +28,12 @@ scale=4 ws_tag=w16s4 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth 
+nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index a2e8cdba..f2e22b45 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -28,13 +28,12 @@ scale=4 ws_tag=w26s4 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh index 6ddb9e2c..bc828375 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -28,13 +28,13 @@ scale=8 ws_tag=w13s8 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index cfec2b09..0c2e825a 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ 
b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -28,13 +28,12 @@ scale=4 ws_tag=w26s4 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh index 3cf18fcf..49fd61fa 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -28,13 +28,12 @@ scale=8 ws_tag=w26s8 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" -nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh index a5767e50..505ed8bc 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh @@ -28,13 +28,12 @@ scale=8 ws_tag=w26s8 nnet_num_epochs=90 -xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --trainer.epochs $nnet_num_epochs --trainer.swa-start 70 --trainer.swa-lr 1e-3 --trainer.swa-anneal-epochs 5" +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu 
--model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --trainer.epochs $nnet_num_epochs --trainer.swa-start 70 --trainer.swa-lr 1e-3 --trainer.swa-anneal-epochs 5" -nnet_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/swa_model_ep0091.pth +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/swa_model_ep0091.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh index b10e5e86..9c787210 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -24,13 +24,12 @@ margin_warmup=20 margin=0.3 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu" +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu" -nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh index 2666b93e..48dc3c90 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh @@ -25,13 +25,12 @@ margin_warmup=20 margin=0.3 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer $PWD/conf/trainer_swa_default.yaml" +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer $PWD/conf/trainer_swa_default.yaml" -nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/swa_model_ep0081.pth +nnet_s1_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp_swa.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/swa_model_ep0081.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh index 0ec34ef1..838a41ae 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh +++ 
b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_sharded_amp.v1.sh @@ -25,13 +25,12 @@ margin_warmup=20 margin=0.3 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.ddp-type oss_sharded_ddp" +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.ddp-type oss_sharded_ddp" -nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_sharded_ddp_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_sharded_ddp_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh index ced8b8d6..003c8aae 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -26,14 +26,12 @@ margin_warmup=20 margin=0.3 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type" - -nnet_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type" +nnet_s1_name=${feat_type}_${resnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh index f3a5ef5a..3a764519 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -27,14 +27,12 @@ scale=4 ws_tag=w26s4 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" - -nnet_name=${feat_type}_${spinenet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" +nnet_s1_name=${feat_type}_${spinenet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh index 40957669..e12ab940 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -27,14 +27,12 @@ scale=4 ws_tag=w26s4 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" - -nnet_name=${feat_type}_${spinenet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale" +nnet_s1_name=${feat_type}_${spinenet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh index 43f539f9..f452baae 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -24,13 +24,12 @@ margin_warmup=20 margin=0.3 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" -nnet_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh index f834b2cb..d17e2862 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -24,14 +24,12 @@ margin_warmup=20 margin=0.3 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" -
-nnet_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type" +nnet_s1_name=${feat_type}_${spinenet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh index 243dab65..547020b1 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -29,13 +29,12 @@ ws_tag=w26s4 nnet_num_epochs=70 se_r=256 -xvec_train_base_cfg=conf/train_res2net50_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" +nnet_s1_base_cfg=conf/train_res2net50_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" -nnet_name=${feat_type}_${resnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh index 749ca557..63cde868 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -27,13 +27,12 @@ margin_warmup=20 margin=0.3 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_resnet34_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.se-r $se_r" +nnet_s1_base_cfg=conf/train_resnet34_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.resnet-type $resnet_type --model.se-r $se_r" -nnet_name=${feat_type}_${resnet_type}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${resnet_type}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh 
b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh index d3a5595c..e465c525 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -28,13 +28,12 @@ ws_tag=w26s4 se_r=256 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" -nnet_name=${feat_type}_${spinenet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${spinenet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh index 4ffdd48b..975e2aba 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tsespine2net49s_arcs30m0.3_adam_lr0.05_amp.v1.sh @@ -28,13 +28,12 @@ ws_tag=w26s4 se_r=256 nnet_num_epochs=70 -xvec_train_base_cfg=conf/train_spinenet49_xvec_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" +nnet_s1_base_cfg=conf/train_spinenet49_xvec_default.yaml +nnet_s1_args="--data.train.sampler.batch-size $batch_size_1gpu --model.spinenet-type $spinenet_type --model.res2net-width-factor $width_factor --model.res2net-scale $scale --model.se-r $se_r" -nnet_name=${feat_type}_${spinenet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth +nnet_s1_name=${feat_type}_${spinenet_type}${ws_tag}_r${se_r}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0070.pth # back-end diff --git a/egs/voxceleb/v1.1/run_001_prepare_data.sh b/egs/voxceleb/v1.1/run_001_prepare_data.sh index 7bf15448..037efda1 100755 --- a/egs/voxceleb/v1.1/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.1/run_001_prepare_data.sh @@ -26,3 +26,30 @@ if [ $stage -le 2 ];then # Use this for the newer version of voxceleb1: local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_dev.py \ + --vox1-corpus-dir $voxceleb1_root \ + --voxsrc22-corpus-dir $voxsrc22_root \ + --output-dir data/voxsrc22_dev +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_test.py \ + --corpus-dir 
$voxsrc22_root \ + --output-dir data/voxsrc22_test +fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # # split vox2 into 2 parts, for cohort and qmf training + # utils/copy_data_dir.sh data/voxceleb2cat_train data/voxceleb2cat_train_odd + # utils/copy_data_dir.sh data/voxceleb2cat_train data/voxceleb2cat_train_even + # awk 'int(substr($2,3)) % 2 == 1' data/voxceleb2cat_train/utt2spk > data/voxceleb2cat_train_odd/utt2spk + # utils/fix_data_dir.sh data/voxceleb2cat_train_odd + # awk 'int(substr($2,3)) % 2 == 0' data/voxceleb2cat_train/utt2spk > data/voxceleb2cat_train_even/utt2spk + # utils/fix_data_dir.sh data/voxceleb2cat_train_even + # # we keep 3 utts per speaker + # utils/subset_data_dir.sh --per-spk data/voxceleb2cat_train_odd 3 data/voxceleb2cat_train_subset_cohort + # utils/subset_data_dir.sh --per-spk data/voxceleb2cat_train_even 3 data/voxceleb2cat_train_subset_qmf + local/make_vox2_trials.py --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index 883c729b..a051c136 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -38,13 +38,12 @@ fi # Network Training if [ $stage -le 1 ]; then - - mkdir -p $nnet_dir/log + mkdir -p $nnet_s1_dir/log $cuda_cmd \ - --gpu $ngpu $nnet_dir/log/train.log \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - train_xvector_from_wav.py $nnet_type --cfg $xvec_train_base_cfg $xvec_train_args $extra_args \ + train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ @@ -52,8 +51,31 @@ if [ $stage -le 1 ]; then --data.val.dataset.audio-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ - --trainer.exp-path $nnet_dir $args \ + --trainer.exp-path $nnet_s1_dir \ --num-gpus $ngpu \ fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh index 3abf2ff6..c42f96bb 100755 --- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh @@ -8,6 +8,7 @@ set -e stage=1 +nnet_stage=1 config_file=default_config.sh use_gpu=false xvec_chunk_length=12800 @@ -21,41 +22,64 @@ else xvec_cmd="$train_cmd --mem 12G" fi +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + 
nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + xvector_dir=exp/xvectors/$nnet_name -if [ $stage -le 1 ]; then - # Extract xvectors for training LDA/PLDA - for name in voxceleb2cat_train - do - if [ $plda_num_augs -eq 0 ]; then - steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ - --random-utt-length true --min-utt-length 400 --max-utt-length 14000 \ - --feat-config $feat_config \ - $nnet data/${name} \ - $xvector_dir/${name} - else - steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ - --random-utt-length true --min-utt-length 400 --max-utt-length 14000 \ - --feat-config $feat_config --aug-config $plda_aug_config --num-augs $plda_num_augs \ - $nnet data/${name} \ - $xvector_dir/${name}_augx${plda_num_augs} \ - data/${name}_augx${plda_num_augs} - fi - done +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" ) ]]; then + # Extract xvectors for training LDA/PLDA + for name in voxceleb2cat_train + do + if [ $plda_num_augs -eq 0 ]; then + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --random-utt-length true --min-utt-length 200 --max-utt-length 14000 \ + --feat-config $feat_config \ + $nnet data/${name} \ + $xvector_dir/${name} + else + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ + --random-utt-length true --min-utt-length 200 --max-utt-length 14000 \ + --feat-config $feat_config --aug-config $plda_aug_config --num-augs $plda_num_augs \ + $nnet data/${name} \ + $xvector_dir/${name}_augx${plda_num_augs} \ + data/${name}_augx${plda_num_augs} + fi + done fi if [ $stage -le 2 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - --feat-config $feat_config \ - $nnet data/$name \ - $xvector_dir/$name - done + # Extracts x-vectors for evaluation + for name in voxceleb1_test + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + --feat-config $feat_config \ + $nnet data/$name \ + $xvector_dir/$name + done fi exit diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index cd168180..49fa68e7 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -8,12 +8,34 @@ set -e stage=1 +nnet_stage=1 config_file=default_config.sh + . parse_options.sh || exit 1; . $config_file . 
datapath.sh +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + plda_label=${plda_type}y${plda_y_dim}_v1 be_name=lda${lda_dim}_${plda_label}_${plda_data} @@ -22,104 +44,179 @@ be_dir=exp/be/$nnet_name/$be_name score_dir=exp/scores/$nnet_name/${be_name} score_plda_dir=$score_dir/plda score_cosine_dir=exp/scores/$nnet_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/cosine_qmf -if [ $stage -le 1 ]; then +if [ "$do_plda" == "true" ];then + if [ $stage -le 1 ]; then echo "Train PLDA on Voxceleb2" - steps_be/train_be_v1.sh --cmd "$train_cmd" \ - --lda_dim $lda_dim \ - --plda_type $plda_type \ - --y_dim $plda_y_dim --z_dim $plda_z_dim \ - $xvector_dir/$plda_data/xvector.scp \ - data/$plda_data \ - $be_dir & - - - wait - -fi - - -if [ $stage -le 2 ];then - + steps_be/train_be_v1.sh \ + --cmd "$train_cmd" \ + --lda_dim $lda_dim \ + --plda_type $plda_type \ + --y_dim $plda_y_dim --z_dim $plda_z_dim \ + $xvector_dir/$plda_data/xvector.scp \ + data/$plda_data \ + $be_dir + + fi + + + if [ $stage -le 2 ];then echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" - steps_be/eval_be_v1.sh --cmd "$train_cmd" --plda_type $plda_type \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $be_dir/lda_lnorm.h5 \ - $be_dir/plda.h5 \ - $score_plda_dir/voxceleb1_scores + steps_be/eval_be_v1.sh \ + --cmd "$train_cmd" --plda_type $plda_type \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $be_dir/lda_lnorm.h5 \ + $be_dir/plda.h5 \ + $score_plda_dir/voxceleb1_scores $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir - + local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + for f in $(ls $score_plda_dir/*_results); do - echo $f - cat $f - echo "" + echo $f + cat $f + echo "" done - + fi fi -score_plda_dir=$score_cosine_dir if [ $stage -le 3 ];then - echo "Eval Voxceleb 1 with Cosine scoring" - steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores - $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done fi -be_dir=exp/be/$nnet_name/cw -score_plda_dir=$score_dir/cw_cosine -if [ $stage -le 4 ]; then - echo "Train centering+whitening on Voxceleb2" - steps_be/train_be_v2.sh 
--cmd "$train_cmd" \ - $xvector_dir/$plda_data/xvector.scp \ - data/$plda_data \ - $be_dir +if [ "$do_snorm" == "true" ];then + if [ $stage -le 4 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi fi -if [ $stage -le 5 ];then +if [ "$do_qmf" == "true" ];then + if [ $stage -le 5 ];then + echo "Train QMF in Vox2" + steps_be/train_be_cos_qmf.sh \ + --cmd "$train_cmd" --coh-nbest 1000 \ + data/voxceleb2cat_train/trials \ + data/voxceleb2cat_train/utt2model \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $xvector_dir/voxceleb2cat_train/utt2num_frames \ + data/voxceleb2cat_train/snorm_utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/voxceleb2_qmf_scores - echo "Eval Voxceleb 1 with CentWhiten + Cosine scoring" - steps_be/eval_be_v2.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $be_dir/cw.h5 \ - $score_plda_dir/voxceleb1_scores + fi - $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir + if [ $stage -le 6 ];then - for f in $(ls $score_plda_dir/*_results); + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 15G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $xvector_dir/voxceleb1_test/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results); do - echo $f - cat $f - echo "" + echo $f + cat $f + echo "" done + fi fi + exit +# be_dir=exp/be/$nnet_name/cw +# score_plda_dir=$score_dir/cw_cosine + +# if [ $stage -le 4 ]; then +# echo "Train centering+whitening on Voxceleb2" +# steps_be/train_be_v2.sh --cmd "$train_cmd" \ +# $xvector_dir/$plda_data/xvector.scp \ +# data/$plda_data \ +# $be_dir +# fi + + +# if [ $stage -le 5 ];then + +# echo "Eval Voxceleb 1 with CentWhiten + Cosine scoring" +# steps_be/eval_be_v2.sh --cmd "$train_cmd" \ +# data/voxceleb1_test/trials \ +# data/voxceleb1_test/utt2model \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $be_dir/cw.h5 \ +# $score_plda_dir/voxceleb1_scores + +# $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ +# 
local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir
+
+#     for f in $(ls $score_plda_dir/*_results);
+#     do
+# 	echo $f
+# 	cat $f
+# 	echo ""
+#     done
+
+# fi
+
+# exit
diff --git a/egs/voxceleb/v1/local/make_vox2_trials.py b/egs/voxceleb/v1/local/make_vox2_trials.py
new file mode 100755
index 00000000..95a69cf1
--- /dev/null
+++ b/egs/voxceleb/v1/local/make_vox2_trials.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+"""
+ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict
+import logging
+from pathlib import Path
+import math
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils.segment_set import SegmentSet
+
+
+def make_trials_single_gender(ft, fm, fs, segments, num_tar_trials, num_spks):
+
+    # select spks
+    rng = np.random.RandomState(seed=1123)
+    spks = segments["class_id"].unique()
+    spks = rng.choice(spks, size=(num_spks,), replace=False)
+    snorm_segments = segments[~segments["class_id"].isin(spks)]
+    for seg, spk in zip(snorm_segments["id"], snorm_segments["class_id"]):
+        fs.write("%s %s\n" % (seg, spk))
+
+    segments = segments[segments["class_id"].isin(spks)]
+    # pick k segments per speaker so that the k*(k-1)/2 same-speaker pairs,
+    # summed over all S speakers, give about num_tar_trials targets:
+    # k = ceil((1 + sqrt(1 + 8*T/S)) / 2) with T=num_tar_trials, S=num_spks
+    num_segs_per_spk = int(
+        math.ceil((1 + math.sqrt(1 + 8 * num_tar_trials // num_spks)) / 2)
+    )
+
+    n = num_spks * num_segs_per_spk
+    logging.info(
+        "num_segs_per_spk=%d n=%d tar-trials-per-spk=%d num_spks=%d",
+        num_segs_per_spk, n, num_tar_trials // num_spks, num_spks,
+    )
+    seg_ids = rng.choice(segments["id"], size=(n,), replace=False)
+    segments = segments[segments["id"].isin(seg_ids)]
+    seg_ids = segments["id"].values
+    class_ids = segments["class_id"].values
+    ntar = 0
+    nnon = 0
+    for i in range(n - 1):
+        for j in range(i + 1, n):
+            t = "target" if class_ids[i] == class_ids[j] else "nontarget"
+            ft.write("%s %s %s\n" % (seg_ids[i], seg_ids[j], t))
+            if t == "target":
+                ntar += 1
+            else:
+                nnon += 1
+
+    logging.info("Got ntar=%d and nnon=%d", ntar, nnon)
+    for i in range(n - 1):
+        fm.write("%s %s\n" % (seg_ids[i], seg_ids[i]))
+
+
+def make_trials(data_dir, num_1k_tar_trials, num_spks):
+    config_logger(1)
+    logging.info("Making trial list for %s", data_dir)
+    data_dir = Path(data_dir)
+    segments = SegmentSet.load(data_dir / "utt2spk")
+    gender = SegmentSet.load(data_dir / "spk2gender")
+    segments["gender"] = gender.loc[segments["class_id"], "class_id"].values
+
+    num_tar_trials = num_1k_tar_trials * 1000 // 2
+    num_spks = num_spks // 2
+    with open(data_dir / "trials", "w") as ft, open(
+        data_dir / "utt2model", "w"
+    ) as fm, open(data_dir / "snorm_utt2spk", "w") as fs:
+        segs_m = SegmentSet(segments.loc[segments["gender"] == "m"])
+        make_trials_single_gender(ft, fm, fs, segs_m, num_tar_trials, num_spks)
+        segs_f = SegmentSet(segments.loc[segments["gender"] == "f"])
+        make_trials_single_gender(ft, fm, fs, segs_f, num_tar_trials, num_spks)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="makes a trial list for vox2 dev")
+
+    parser.add_argument("--data-dir", required=True, help="Path to dataset")
+    parser.add_argument(
+        "--num-1k-tar-trials", type=int, default=30, help="thousands of target trials"
+    )
+    parser.add_argument("--num-spks", type=int, default=1000, help="number of speakers")
+    args = parser.parse_args()
+    make_trials(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_dev.py b/egs/voxceleb/v1/local/prepare_voxsrc22_dev.py
new file mode 100755
index 00000000..915de676
--- /dev/null
+++ b/egs/voxceleb/v1/local/prepare_voxsrc22_dev.py
@@ 
-0,0 +1,88 @@
+#!/usr/bin/env python
+"""
+ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict
+import logging
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import config_logger
+
+
+def prepare_voxsrc22_dev(vox1_corpus_dir, voxsrc22_corpus_dir, output_dir, verbose):
+    config_logger(verbose)
+    logging.info(
+        "Preparing corpus %s + %s -> %s",
+        vox1_corpus_dir,
+        voxsrc22_corpus_dir,
+        output_dir,
+    )
+    vox1_corpus_dir = Path(vox1_corpus_dir)
+    voxsrc22_corpus_dir = Path(voxsrc22_corpus_dir)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    trials_file = voxsrc22_corpus_dir / "voxsrc2022_dev.txt"
+    df_trials = pd.read_csv(
+        trials_file, header=None, names=["target", "enroll", "test"], sep=" ",
+    )
+
+    trials_file = output_dir / "trials"
+    logging.info("creating trials file %s", trials_file)
+    with open(trials_file, "w") as f:
+        for _, row in df_trials.iterrows():
+            t = "target" if row["target"] == 1 else "nontarget"
+            f.write("%s %s %s\n" % (row["enroll"], row["test"], t))
+
+    enroll_file = output_dir / "utt2model"
+    logging.info("creating enrollment file %s", enroll_file)
+    file_ids = df_trials["enroll"].unique()
+    with open(enroll_file, "w") as f:
+        for file_id in file_ids:
+            f.write("%s %s\n" % (file_id, file_id))
+
+    u2s_file = output_dir / "utt2spk"
+    logging.info("creating utt2spk file %s", u2s_file)
+    file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"])))
+    with open(u2s_file, "w") as f:
+        for file_id in file_ids:
+            f.write("%s %s\n" % (file_id, file_id))
+
+    s2u_file = output_dir / "spk2utt"
+    logging.info("creating spk2utt file %s", s2u_file)
+    with open(s2u_file, "w") as f:
+        for file_id in file_ids:
+            f.write("%s %s\n" % (file_id, file_id))
+
+    wav_file = output_dir / "wav.scp"
+    logging.info("creating wav.scp file %s", wav_file)
+    with open(wav_file, "w") as f:
+        for file_id in file_ids:
+            # use a separate variable so the wav.scp path is not shadowed
+            if "VoxSRC2022_dev" in file_id:
+                wav_path = voxsrc22_corpus_dir / file_id
+            else:
+                wav_path = vox1_corpus_dir / "wav" / file_id
+
+            f.write("%s %s\n" % (file_id, wav_path))
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Prepares VoxSRC22 Track1/2 validation data")
+
+    parser.add_argument(
+        "--vox1-corpus-dir", required=True, help="Path to voxceleb1 v2 dataset"
+    )
+    parser.add_argument(
+        "--voxsrc22-corpus-dir", required=True, help="Path to voxsrc22 dataset"
+    )
+
+    parser.add_argument("--output-dir", required=True, help="Output data path prefix")
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+    args = parser.parse_args()
+    prepare_voxsrc22_dev(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_test.py b/egs/voxceleb/v1/local/prepare_voxsrc22_test.py
new file mode 100755
index 00000000..e3421fe1
--- /dev/null
+++ b/egs/voxceleb/v1/local/prepare_voxsrc22_test.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+"""
+ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict
+import logging
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import config_logger
+
+
+def prepare_voxsrc22_test(corpus_dir, output_dir, verbose):
+    config_logger(verbose)
+    logging.info(
+        "Preparing corpus %s -> 
%s", corpus_dir, output_dir, + ) + corpus_dir = Path(corpus_dir) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + trials_file = corpus_dir / "Track12_blind.txt" + df_trials = pd.read_csv( + trials_file, header=None, names=["enroll", "test"], sep=" ", + ) + trials_file = output_dir / "trials" + logging.info("creating trials file %s", trials_file) + with open(trials_file, "w") as f: + for _, row in df_trials.iterrows(): + f.write("%s %s\n" % (row["enroll"], row["test"])) + + enroll_file = output_dir / "utt2model" + logging.info("creating enrollment file %s", enroll_file) + file_ids = df_trials["enroll"].unique() + with open(enroll_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + u2s_file = output_dir / "utt2spk" + logging.info("creating utt2spk file %s", u2s_file) + file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) + with open(u2s_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + s2u_file = output_dir / "spk2utt" + logging.info("creating spk2utt file %s", s2u_file) + with open(s2u_file, "w") as f: + for file_id in file_ids: + f.write("%s %s\n" % (file_id, file_id)) + + wav_file = output_dir / "wav.scp" + logging.info("creating wav.scp file %s", wav_file) + with open(wav_file, "w") as f: + for file_id in file_ids: + wav_file = corpus_dir / "Track12_test_data" / file_id + f.write("%s %s\n" % (file_id, wav_file)) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares VoxSRC22 Track1/2 test data") + + parser.add_argument("--corpus-dir", required=True, help="Path to voxsrc22 dataset") + + parser.add_argument("--output-dir", required=True, help="Ouput data path prefix") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_voxsrc22_test(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/local/score_voxceleb1.sh b/egs/voxceleb/v1/local/score_voxceleb1.sh index 5d11848d..f12b18eb 100755 --- a/egs/voxceleb/v1/local/score_voxceleb1.sh +++ b/egs/voxceleb/v1/local/score_voxceleb1.sh @@ -2,8 +2,8 @@ # Copyright 2020 Johns Hopkins University (Jesus Villalba) # Apache 2.0. 
 #
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <data-dir> <score-dir>"
+if [ $# -ne 2 ] && [ $# -ne 3 ]; then
+  echo "Usage: $0 <data-dir> <score-dir> [suffix]"
   exit 1;
 fi
@@ -11,13 +11,14 @@ set -e
 
 data_dir=$1
 score_dir=$2
+suffix=$3
 
 for cond in o o_clean e e_clean h h_clean
 do
-  echo "Voxceleb $cond"
+  echo "Voxceleb1 $cond"
   key=$data_dir/trials_$cond
   #Compute performance
-  python local/score_dcf.py --key-file $key --score-file $score_dir/voxceleb1_scores --output-path $score_dir/voxceleb1_${cond} &
+  python local/score_dcf.py --key-file $key --score-file $score_dir/voxceleb1_scores$suffix --output-path $score_dir/voxceleb1${suffix}_${cond} &
 done
 wait
diff --git a/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py b/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py
new file mode 100755
index 00000000..78526277
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+import sys
+import os
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import time
+import logging
+
+import numpy as np
+
+from hyperion.hyp_defs import float_cpu, config_logger
+from hyperion.utils import TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.math import cosine_scoring
+from hyperion.np.pdfs import PLDA
+from hyperion.utils.list_utils import ismember
+from hyperion.helpers import TrialDataReader as TDR
+from hyperion.helpers import VectorClassReader as VCR
+from hyperion.np.transforms import TransformList
+from hyperion.np.score_norm import AdaptSNorm as SNorm
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+
+
+def get_score_filename(score_file, q_name, i, j, p):
+    if q_name is not None:
+        score_file = "%s_%s" % (score_file, q_name)
+
+    if p:
+        score_file = "%s-%03d-%03d" % (score_file, i, j)
+
+    return score_file
+
+
+def save_empty(score_file, q_name, i, j, p):
+    score_file = get_score_filename(score_file, q_name, i, j, p)
+    logging.info("saving scores to %s", score_file)
+    with open(score_file, "w") as f:
+        pass
+
+
+def save_scores(s, score_file, q_name, i, j, p):
+    score_file = get_score_filename(score_file, q_name, i, j, p)
+    logging.info("saving scores to %s", score_file)
+    s.save_txt(score_file)
+
+
+def eval_plda(
+    v_file,
+    ndx_file,
+    enroll_file,
+    num_frames_file,
+    coh_file,
+    coh_v_file,
+    score_file,
+    qmf_file,
+    model_part_idx,
+    num_model_parts,
+    seg_part_idx,
+    num_seg_parts,
+    coh_nbest,
+    **kwargs
+):
+
+    logging.info("loading data")
+    tdr = TDR(
+        v_file,
+        ndx_file,
+        enroll_file,
+        None,
+        None,
+        model_part_idx,
+        num_model_parts,
+        seg_part_idx,
+        num_seg_parts,
+    )
+    logging.info("read x-vectors and ndx")
+    x_e, x_t, enroll, ndx = tdr.read()
+    enroll_segs = tdr.enroll.key
+
+    parallel = num_model_parts > 1 or num_seg_parts > 1
+
+    if not np.any(ndx.trial_mask):
+        save_empty(score_file, None, model_part_idx, seg_part_idx, parallel)
+        if qmf_file is None:
+            for q_name in ["snorm", "maxnf", "minnf", "maxcohmu", "mincohmu"]:
+                save_empty(score_file, q_name, model_part_idx, seg_part_idx, parallel)
+        return
+
+    logging.info("read num_frames")
+    u2nf = Utt2Info.load(num_frames_file)
+    # convert num-frames to approximate duration in seconds (100 frames/s),
+    # shift by 2 s and clip to [0.1, 6] s before taking the log; these
+    # log-durations are the QMF quality measures
+    enroll_nf = np.log(
+        np.clip(
+            u2nf.filter(enroll_segs).info.astype(float) / 100 - 2.0,
+            a_min=0.1,
+            a_max=6.0,
+        )
+    )
+    test_nf = np.log(
+        np.clip(
+            u2nf.filter(ndx.seg_set).info.astype(float) / 100 - 2.0,
+            a_min=0.1,
+            a_max=6.0,
+        )
+    )
+    t1 = time.time()
+    logging.info("computing llr")
+    scores = 
cosine_scoring(x_e, x_t) + + logging.info("read cohort x-vectors") + vcr = VCR(coh_v_file, coh_file) + x_coh, ids_coh = vcr.read() + D_coh = PLDA.compute_stats_hard(x_coh, class_ids=ids_coh) + x_coh = D_coh[1] / np.expand_dims(D_coh[0], axis=-1) + + t2 = time.time() + logging.info("score cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t) + logging.info("score enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh) + + dt = time.time() - t2 + logging.info("cohort-scoring elapsed time: %.2f s.", dt) + + t2 = time.time() + logging.info("apply s-norm") + snorm = SNorm(nbest=coh_nbest, nbest_sel_method="highest-other-side") + scores_norm, mu_z, _, mu_t, _ = snorm( + scores, scores_coh_test, scores_enr_coh, return_stats=True + ) + + dt = time.time() - t1 + num_trials = len(enroll) * x_t.shape[0] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) + + q_measures = { + "maxnf": np.maximum(enroll_nf[:, None], test_nf[None, :]), + "minnf": np.minimum(enroll_nf[:, None], test_nf[None, :]), + "maxcohmu": np.maximum(mu_z, mu_t), + "mincohmu": np.minimum(mu_z, mu_t), + } + + f, loc = ismember(enroll, ndx.model_set) + trial_mask = ndx.trial_mask[loc] + s = TrialScores(enroll, ndx.seg_set, scores, score_mask=trial_mask) + save_scores(s, score_file, None, model_part_idx, seg_part_idx, parallel) + s.scores = scores_norm + save_scores(s, score_file, "snorm", model_part_idx, seg_part_idx, parallel) + if qmf_file is None: + for q_name in ["maxnf", "minnf", "maxcohmu", "mincohmu"]: + s.scores = q_measures[q_name] + save_scores(s, score_file, q_name, model_part_idx, seg_part_idx, parallel) + + return + + logging.info("applying qmf") + scores_fus = [scores.ravel()] + for q_name in ["maxnf", "minnf", "maxcohmu", "mincohmu"]: + scores_fus.append(q_measures[q_name].ravel()) + + scores_fus = np.vstack(scores_fus).T + lr = LR.load(qmf_file) + scores_fus = lr.predict(scores_fus) + scores_fus = np.reshape(scores_fus, (s.num_models, s.num_tests)) + s.scores = scores_fus + save_scores(s, score_file, "qmf", model_part_idx, seg_part_idx, parallel) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Eval cosine-scoring with QMF") + + parser.add_argument("--v-file", required=True) + parser.add_argument("--ndx-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--num-frames-file", required=True) + parser.add_argument("--coh-v-file", required=True) + parser.add_argument("--coh-file", required=True) + parser.add_argument("--coh-nbest", type=int, default=400) + parser.add_argument("--qmf-file", default=None) + # parser.add_argument("--preproc-file", dest="preproc_file", default=None) + + TDR.add_argparse_args(parser) + + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_plda(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/eval-be-v2-snorm.py b/egs/voxceleb/v1/steps_be/eval-be-v2-snorm.py new file mode 100755 index 00000000..4ad0a869 --- /dev/null +++ b/egs/voxceleb/v1/steps_be/eval-be-v2-snorm.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +import sys +import os +from jsonargparse import ( + ArgumentParser, + 
ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.list_utils import ismember +from hyperion.utils import TrialNdx, TrialScores +from hyperion.utils.math import cosine_scoring +from hyperion.helpers import TrialDataReader as TDR +from hyperion.helpers import PLDAFactory as F +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF + + +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + score_file, + coh_iv_file, + coh_file, + coh_nbest, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + **kwargs +): + + logging.info("loading data") + if preproc_file is not None: + preproc = TransformList.load(preproc_file) + else: + preproc = None + + tdr = TDR( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + ) + x_e, x_t, enroll, ndx = tdr.read() + + coh_segs = SegmentSet.load(coh_file) + r = DRF.create(coh_iv_file) + x_coh = r.read(coh_segs["id"], squeeze=True) + _, spk_ids = np.unique(coh_segs["class_id"], return_inverse=True) + num_coh_spks = np.max(spk_ids) + 1 + x_coh_spk = np.zeros((num_coh_spks, x_coh.shape[1])) + for i in range(num_coh_spks): + idx = spk_ids == i + x_coh_spk[i] = np.mean(x_coh[idx], axis=0) + + t1 = time.time() + logging.info("computing llr") + scores = cosine_scoring(x_e, x_t) + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh_spk) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh_spk, x_t) + + snorm = AdaptSNorm(coh_nbest) + scores = snorm(scores, scores_coh_test, scores_enr_coh) + + dt = time.time() - t1 + num_trials = len(enroll) * x_t.shape[0] + logging.info( + "scoring elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.",
+        dt,
+        dt / num_trials * 1000,
+    )
+
+    if num_model_parts > 1 or num_seg_parts > 1:
+        score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx)
+    logging.info("saving scores to %s", score_file)
+    f, loc = ismember(enroll, ndx.model_set)
+    s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask[loc])
+    s.save_txt(score_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Eval cosine-scoring with adaptive s-norm")
+
+    parser.add_argument("--iv-file", required=True)
+    parser.add_argument("--ndx-file", default=None)
+    parser.add_argument("--enroll-file", required=True)
+    parser.add_argument("--test-file", default=None)
+    parser.add_argument("--preproc-file", default=None)
+
+    TDR.add_argparse_args(parser)
+
+    parser.add_argument("--coh-iv-file", required=True)
+    parser.add_argument("--coh-file", required=True)
+    parser.add_argument("--coh-nbest", type=int, default=1000)
+
+    parser.add_argument("--score-file", dest="score_file", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    assert args.test_file is not None or args.ndx_file is not None
+    eval_plda(**namespace_to_dict(args))
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh
new file mode 100755
index 00000000..8b69b0d6
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+set -e
+cmd=run.pl
+stage=1
+num_parts=8
+coh_nbest=400
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+set -e
+
+if [ $# -ne 8 ]; then
+  echo "Usage: $0 <ndx-file> <enroll-file> <vector-file> <num-frames-file> <cohort-list> <cohort-vector-file> <qmf-file> <output-scores>"
+  exit 1;
+fi
+
+ndx_file=$1
+enroll_file=$2
+vector_file=$3
+nf_file=$4
+coh_file=$5
+coh_v_file=$6
+qmf_file=$7
+output_file=$8
+
+output_dir=$(dirname $output_file)
+
+mkdir -p $output_dir/log
+name=$(basename $output_file)
+
+echo "$0 score $ndx_file"
+
+if [ $stage -le 1 ];then
+  for((i=1;i<=$num_parts;i++));
+  do
+    for((j=1;j<=$num_parts;j++));
+    do
+      $cmd $output_dir/log/${name}_${i}_${j}.log \
+        hyp_utils/conda_env.sh \
+        steps_be/eval-be-cos-qmf.py \
+        --v-file scp:$vector_file \
+        --ndx-file $ndx_file \
+        --enroll-file $enroll_file \
+        --score-file $output_file \
+        --num-frames-file $nf_file \
+        --coh-v-file scp:$coh_v_file \
+        --coh-file $coh_file \
+        --coh-nbest $coh_nbest \
+        --qmf-file $qmf_file \
+        --model-part-idx $i --num-model-parts $num_parts \
+        --seg-part-idx $j --num-seg-parts $num_parts &
+    done
+  done
+  wait
+fi
+
+
+if [ $stage -le 2 ];then
+  for suffix in "" _snorm _qmf
+  do
+    output_file_k=${output_file}${suffix}
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        cat $output_file_k-$(printf "%03d" $i)-$(printf "%03d" $j)
+      done
+    done | sort -u > $output_file_k
+  done
+fi
+
+
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh
new file mode 100755
index 00000000..4f5e3e76
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+
+cmd=run.pl
+num_parts=16
+coh_nbest=1000
+if [ -f path.sh ]; then . ./path.sh; fi
+. 
parse_options.sh || exit 1;
+set -e
+
+if [ $# -ne 6 ]; then
+  echo "Usage: $0 <ndx-file> <enroll-file> <vector-file> <cohort-list> <cohort-vector-file> <output-scores>"
+  exit 1;
+fi
+
+ndx_file=$1
+enroll_file=$2
+vector_file=$3
+coh_file=$4
+coh_vector_file=$5
+output_file=$6
+
+output_dir=$(dirname $output_file)
+
+mkdir -p $output_dir/log
+name=$(basename $output_file)
+
+echo "$0 score $ndx_file"
+
+
+for((i=1;i<=$num_parts;i++));
+do
+  for((j=1;j<=$num_parts;j++));
+  do
+    $cmd $output_dir/log/${name}_${i}_${j}.log \
+      hyp_utils/conda_env.sh \
+      steps_be/eval-be-v2-snorm.py \
+      --iv-file scp:$vector_file \
+      --ndx-file $ndx_file \
+      --enroll-file $enroll_file \
+      --coh-file $coh_file \
+      --coh-iv-file scp:$coh_vector_file \
+      --score-file $output_file \
+      --coh-nbest $coh_nbest \
+      --model-part-idx $i --num-model-parts $num_parts \
+      --seg-part-idx $j --num-seg-parts $num_parts &
+    sleep 1s
+  done
+done
+wait
+
+
+for((i=1;i<=$num_parts;i++));
+do
+  for((j=1;j<=$num_parts;j++));
+  do
+    cat $output_file-$(printf "%03d" $i)-$(printf "%03d" $j)
+  done
+done | sort -u > $output_file
+
+
+
diff --git a/egs/voxceleb/v1/steps_be/train-qmf.py b/egs/voxceleb/v1/steps_be/train-qmf.py
new file mode 100755
index 00000000..07712221
--- /dev/null
+++ b/egs/voxceleb/v1/steps_be/train-qmf.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+ Trains QMF calibration fusing cosine scores with quality measures
+"""
+
+import sys
+import os
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import time
+import logging
+
+import numpy as np
+
+from hyperion.hyp_defs import float_cpu, config_logger
+from hyperion.utils.trial_scores import TrialScores
+from hyperion.utils.trial_key import TrialKey
+from hyperion.np.metrics import compute_act_dcf, compute_min_dcf
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+
+
+def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose):
+
+    logging.info("load key: %s", key_file)
+    key = TrialKey.load_txt(key_file)
+    logging.info("load scores: %s", score_file)
+    scr = TrialScores.load_txt(score_file)
+    tar, non = scr.get_tar_non(key)
+    ntar = len(tar)
+    nnon = len(non)
+
+    q_file = f"{score_file}_maxnf"
+    logging.info("load max num-frames: %s", q_file)
+    q = TrialScores.load_txt(q_file)
+    maxnf_tar, maxnf_non = q.get_tar_non(key)
+
+    q_file = f"{score_file}_minnf"
+    logging.info("load min num-frames: %s", q_file)
+    q = TrialScores.load_txt(q_file)
+    minnf_tar, minnf_non = q.get_tar_non(key)
+
+    q_file = f"{score_file}_maxcohmu"
+    logging.info("load max cohort mean: %s", q_file)
+    q = TrialScores.load_txt(q_file)
+    maxcohmu_tar, maxcohmu_non = q.get_tar_non(key)
+
+    q_file = f"{score_file}_mincohmu"
+    logging.info("load min cohort mean: %s", q_file)
+    q = TrialScores.load_txt(q_file)
+    mincohmu_tar, mincohmu_non = q.get_tar_non(key)
+
+    min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior)
+    n_miss = p_miss * ntar
+    n_fa = p_fa * nnon
+    logging.info(
+        "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f"
+        % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa)
+    )
+
+    logging.info("train calibration")
+    tar = np.vstack((tar, maxnf_tar, minnf_tar, maxcohmu_tar, mincohmu_tar)).T
+    non = np.vstack((non, maxnf_non, minnf_non, maxcohmu_non, mincohmu_non)).T
+
+    x = np.vstack((tar, non))
+    y = np.concatenate(
+        (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32"))
+    )
+    lr = LR(
+        prior=prior,
+        lambda_reg=lambda_reg,
+        bias_scaling=1,
+        
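+        # prior: effective target prior the calibration is optimized for;
+        # lambda_reg: L2 regularization weight of the logistic regression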
solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + logging.info(f"A={lr.A} b={lr.b}") + logging.info("save calibration at %s", model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + output_file = f"{score_file}_qmf" + scr_out = TrialScores(key.model_set, key.seg_set) + scr_out.scores[key.tar] = tar_cal + scr_out.scores[key.non] = non_cal + scr_out.score_mask = np.logical_or(key.tar, key.non) + scr_out.save(output_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Trains QMF calibration") + + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--model-file", required=True) + parser.add_argument("--prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_calibration(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh b/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh new file mode 100755 index 00000000..7dbfcfb9 --- /dev/null +++ b/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. +# +set -e +cmd=run.pl +stage=1 +num_parts=8 +coh_nbest=400 + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1;
+set -e
+
+if [ $# -ne 7 ]; then
+  echo "Usage: $0 <ndx-file> <enroll-file> <vector-file> <num-frames-file> <cohort-list> <cohort-vector-file> <output-scores>"
+  exit 1;
+fi
+
+ndx_file=$1
+enroll_file=$2
+vector_file=$3
+nf_file=$4
+coh_file=$5
+coh_v_file=$6
+output_file=$7
+
+output_dir=$(dirname $output_file)
+
+mkdir -p $output_dir/log
+name=$(basename $output_file)
+
+echo "$0 score $ndx_file"
+
+if [ $stage -le 1 ];then
+  for((i=1;i<=$num_parts;i++));
+  do
+    for((j=1;j<=$num_parts;j++));
+    do
+      $cmd $output_dir/log/${name}_${i}_${j}.log \
+        hyp_utils/conda_env.sh \
+        steps_be/eval-be-cos-qmf.py \
+        --v-file scp:$vector_file \
+        --ndx-file $ndx_file \
+        --enroll-file $enroll_file \
+        --score-file $output_file \
+        --num-frames-file $nf_file \
+        --coh-v-file scp:$coh_v_file \
+        --coh-file $coh_file \
+        --coh-nbest $coh_nbest \
+        --model-part-idx $i --num-model-parts $num_parts \
+        --seg-part-idx $j --num-seg-parts $num_parts &
+    done
+  done
+  wait
+fi
+
+if [ $stage -le 2 ];then
+  for suffix in "" _maxnf _minnf _maxcohmu _mincohmu _snorm
+  do
+    output_file_k=${output_file}${suffix}
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        cat $output_file_k-$(printf "%03d" $i)-$(printf "%03d" $j)
+      done
+    done | sort -u > $output_file_k
+  done
+fi
+
+if [ $stage -le 3 ];then
+  $cmd $output_dir/log/train_qmf_${name}.log \
+    hyp_utils/conda_env.sh \
+    steps_be/train-qmf.py \
+    --score-file $output_file \
+    --key-file $ndx_file \
+    --model-file $output_dir/qmf.h5
+fi
+
+
diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py
new file mode 100755
index 00000000..a2456dc9
--- /dev/null
+++ b/hyperion/bin/apply_mvn_select_frames.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+"""
+ Copyright 2019 Jesus Villalba (Johns Hopkins University)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+import os
+import sys
+import time
+
+import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
+from hyperion.hyp_defs import config_logger
+from hyperion.io import DataWriterFactory as DWF
+from hyperion.io import RandomAccessDataReaderFactory as RDRF
+from hyperion.io import SequentialDataReaderFactory as DRF
+from hyperion.np.feats import FrameSelector as FSel
+from hyperion.np.feats import MeanVarianceNorm as MVN
+from hyperion.utils import Utt2Info
+from hyperion.utils.kaldi_matrix import compression_methods
+
+
+def process_feats(
+    input_spec,
+    output_spec,
+    vad_spec,
+    write_num_frames_spec,
+    scp_sep,
+    path_prefix,
+    vad_path_prefix,
+    part_idx,
+    num_parts,
+    compress,
+    compression_method,
+    **kwargs
+):
+
+    logging.info("initializing")
+    mvn_args = MVN.filter_args(**kwargs)
+    mvn = MVN(**mvn_args)
+    if vad_spec is not None:
+        fs_args = FSel.filter_args(**kwargs)
+        fs = FSel(**fs_args)
+
+    if write_num_frames_spec is not None:
+        keys = []
+        info = []
+
+    logging.info("opening output stream: %s" % (output_spec))
+    with DWF.create(
+        output_spec,
+        compress=compress,
+        compression_method=compression_method,
+        scp_sep=scp_sep,
+    ) as writer:
+
+        logging.info("opening input stream: %s" % (input_spec))
+        with DRF.create(
+            input_spec,
+            path_prefix=path_prefix,
+            scp_sep=scp_sep,
+            part_idx=part_idx,
+            num_parts=num_parts,
+        ) as reader:
+            if vad_spec is not None:
+                logging.info("opening VAD stream: %s" % (vad_spec))
+                v_reader = RDRF.create(
+                    vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep
+                )
+
+            while not reader.eof():
+                key, data = reader.read(1)
+                if len(key) == 0:
+                    break
+                logging.info("processing feats at %s" % 
(key[0])) + x = mvn.normalize(data[0]) + if vad_spec is not None: + vad = v_reader.read(key)[0].astype("bool") + tot_frames = x.shape[0] + x = fs.select(x, vad) + logging.info( + "for %s detected %d/%d (%.2f %%) speech frames" + % ( + key[0], + x.shape[0], + tot_frames, + x.shape[0] / tot_frames * 100, + ) + ) + if x.shape[0] > 0: + writer.write(key, [x]) + if write_num_frames_spec is not None: + keys += key + info.append(x.shape[0]) + + if write_num_frames_spec is not None: + logging.info("writing num-frames to %s" % (write_num_frames_spec)) + u2nf = Utt2Info.create(keys, info) + u2nf.save(write_num_frames_spec) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Apply CMVN and remove silence") + + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) + parser.add_argument( + "--scp-sep", dest="scp_sep", default=" ", help=("scp file field separator") + ) + parser.add_argument( + "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") + ) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + parser.add_argument( + "--part-idx", + dest="part_idx", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + parser.add_argument( + "--num-parts", + dest="num_parts", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + + parser.add_argument( + "--compress", + dest="compress", + default=False, + action="store_true", + help="Lossy compress the features", + ) + parser.add_argument( + "--compression-method", + dest="compression_method", + default="auto", + choices=compression_methods, + help=( + "Kaldi compression method: " + "{auto (default), speech_feat, " + "2byte-auto, 2byte-signed-integer, " + "1byte-auto, 1byte-unsigned-integer, 1byte-0-1}." 
+ ), + ) + MVN.add_argparse_args(parser) + FSel.add_argparse_args(parser) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + process_feats(**namespace_to_dict(args)) diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py new file mode 100755 index 00000000..0385cc55 --- /dev/null +++ b/hyperion/bin/copy_feats.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +""" + Copyright 2018 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + Copy features/vectors and change format +""" + +import argparse +import logging +import os +import sys +import time + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.io import CopyFeats as CF + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Copy features and change format", + ) + + parser.add_argument("--input", dest="input_spec", nargs="+", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--write-num-frames", dest="write_num_frames", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + CF.add_argparse_args(parser) + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + CF(**vars(args)) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py new file mode 100755 index 00000000..437127b2 --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -0,0 +1,423 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks import AttackFactory +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +class MyModel(nn.Module): + def __init__( + self, feat_extractor, xvector_model, embed_layer=None, calibrator=None, sigma=0 + ): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.x_e = None + self.vad_t = None + self.embed_layer = embed_layer + self.calibrator = calibrator + self.sigma = sigma + + def forward(self, s_t): + # print('sigma0=', self.sigma) + if self.sigma > 0: + s_t = s_t + self.sigma * torch.randn_like(s_t) + # 
print('sigma1=', self.sigma)
+        f_t = self.feat_extractor(s_t)
+        if self.vad_t is not None:
+            n_vad_frames = len(self.vad_t)
+            n_feat_frames = f_t.shape[1]
+            if n_vad_frames > n_feat_frames:
+                self.vad_t = self.vad_t[:n_feat_frames]
+            elif n_vad_frames < n_feat_frames:
+                f_t = f_t[:, :n_vad_frames]
+
+            f_t = f_t[:, self.vad_t]
+
+        f_t = f_t.transpose(1, 2).contiguous()
+        x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer)
+        x_t = l2_norm(x_t)
+        x_e = l2_norm(self.x_e)
+        score = torch.sum(x_e * x_t, dim=-1)
+        if self.calibrator is not None:
+            score = self.calibrator(score)
+
+        return score
+
+
+def init_device(use_gpu):
+    set_float_cpu("float32")
+    num_gpus = 1 if use_gpu else 0
+    logging.info("initializing devices num_gpus={}".format(num_gpus))
+    device = open_device(num_gpus=num_gpus)
+    return device
+
+
+def init_feats(**kwargs):
+    feat_args = AF.filter_args(**kwargs["feats"])
+    logging.info("feat args={}".format(feat_args))
+    logging.info("initializing feature extractor")
+    feat_extractor = AF(trans=False, **feat_args)
+    logging.info("feat-extractor={}".format(feat_extractor))
+    feat_extractor.eval()
+    return feat_extractor
+
+
+def load_model(model_path):
+    logging.info("loading model {}".format(model_path))
+    model = TML.load(model_path)
+    logging.info("xvector-model={}".format(model))
+    model.eval()
+    return model
+
+
+def load_calibrator(cal_file, threshold):
+    logging.info("loading calibration params {}".format(cal_file))
+    lr = LR.load(cal_file)
+    # subtracting the threshold here puts the decision threshold at 0;
+    # some attacks use thr=0 to decide if the attack is successful
+    calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold)
+    calibrator.eval()
+    return calibrator
+
+
+def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts):
+
+    r = DRF.create(v_file)
+    enroll = Utt2Info.load(enroll_file)
+    key = TrialKey.load(key_file)
+    if num_seg_parts > 1:
+        key = key.split(1, 1, seg_part_idx, num_seg_parts)
+
+    x_e = r.read(enroll.key, squeeze=True)
+    f, idx = ismember(key.model_set, enroll.info)
+    assert np.all(f)
+    x_e = x_e[idx]
+    return key, x_e
+
+
+def eval_cosine_scoring(
+    v_file,
+    key_file,
+    enroll_file,
+    test_wav_file,
+    vad_spec,
+    vad_path_prefix,
+    model_path,
+    embed_layer,
+    score_file,
+    stats_file,
+    cal_file,
+    threshold,
+    smooth_sigma,
+    max_test_length,
+    save_adv_wav,
+    save_adv_wav_path,
+    use_gpu,
+    seg_part_idx,
+    num_seg_parts,
+    **kwargs
+):
+
+    device = init_device(use_gpu)
+    feat_extractor = init_feats(**kwargs)
+    xvector_model = load_model(model_path)
+
+    calibrator = None
+    if cal_file is not None:
+        calibrator = load_calibrator(cal_file, threshold)
+
+    tar = torch.as_tensor([1], dtype=torch.float).to(device)
+    non = torch.as_tensor([0], dtype=torch.float).to(device)
+
+    logging.info("loading key and enrollment x-vectors")
+    key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts)
+    x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype())
+
+    audio_args = AR.filter_args(**kwargs)
+    audio_reader = AR(test_wav_file, **audio_args)
+    wav_scale = audio_reader.wav_scale
+
+    if save_adv_wav:
+        tar_audio_writer = AW(save_adv_wav_path + "/tar2non")
+        non_audio_writer = AW(save_adv_wav_path + "/non2tar")
+
+    smooth_sigma *= wav_scale
+    model = MyModel(
+        feat_extractor, xvector_model, embed_layer, calibrator, smooth_sigma
+    )
+    model.to(device)
+    model.eval()
+
+    attack_args = AttackFactory.filter_args(**kwargs["attack"])
+    extra_args = {
+        "eps_scale": wav_scale,
+        "range_min": -wav_scale,
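+        # the attack operates on raw waveforms, so epsilon and the clipping
+        # range are expressed in units of wav_scale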
"range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, + } + attack_args.update(extra_args) + logging.info("attacks args={}".format(attack_args)) + attack = AttackFactory.create(model, **attack_args) + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s", key.seg_set[j]) + s, fs = audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) + + if vad_spec is not None: + vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + device + ) + model.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + with torch.no_grad(): + # we add the threshold back here to make sure the scores are well calibrated + scores[i, j] = model(s_adv) + threshold + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.4f trial-time=%.4f n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s" % (stats_file)) + 
attack_stats.to_csv(stats_file) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Eval cosine-scoring given enroll x-vector and test wave" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", dest="v_file", required=True) + parser.add_argument("--key-file", dest="key_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-wav-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + # parser.add_argument('--save-adv-wav-tar-thr', + # default=0.75, type=float, + # help='min score to save signal from attack that makes non-tar into tar') + + # parser.add_argument('--save-adv-wav-non-thr', + # default=-0.75, type=float, + # help='max score to save signal from attack that makes tar into non-tar') + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--smooth-sigma", default=0, type=float, help="sigma for smoothing" + ) + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py new file mode 100755 index 00000000..aaa91214 --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -0,0 +1,498 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time +# [Added Sonal May21] +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, 
+                          namespace_to_dict)
+
+import torch
+import torch.nn as nn
+from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
+from hyperion.io import AudioWriter as AW
+from hyperion.io import RandomAccessAudioReader as AR
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.io import VADReaderFactory as VRF
+from hyperion.np.classifiers import BinaryLogisticRegression as LR
+from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.adv_attacks import AttackFactory
+from hyperion.torch.adv_defenses.wave_gan_white import WaveGANDefender
+from hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
+
+torch.backends.cudnn.enabled = False
+
+
+class MyModel(nn.Module):
+    def __init__(
+        self,
+        feat_extractor,
+        xvector_model,
+        embed_layer=None,
+        calibrator=None,
+        sigma=0,
+        smoothing_after_wavegan=None,
+        wave_gan_defender=None,
+        wav_scale=2 ** 15 - 1,
+    ):
+        super().__init__()
+        self.feat_extractor = feat_extractor
+        self.xvector_model = xvector_model
+        self.x_e = None
+        self.vad_t = None
+        self.embed_layer = embed_layer
+        self.calibrator = calibrator
+        self.sigma = sigma
+        self.smoothing_after_wavegan = smoothing_after_wavegan
+        self.wave_gan_defender = wave_gan_defender
+        self.wav_scale = wav_scale
+        self.apply_wavegan = wave_gan_defender is not None
+
+    def forward(self, s_t):
+
+        # Pre-processing defense: WaveGAN + smoothing [Added Sonal May21]
+        s_t = s_t / self.wav_scale
+        if self.smoothing_after_wavegan:
+            if self.apply_wavegan:
+                s_t = self.wave_gan_defender(s_t)
+            if self.sigma > 0:
+                s_t = s_t + self.sigma * torch.randn_like(s_t)
+        else:
+            if self.sigma > 0:
+                s_t = s_t + self.sigma * torch.randn_like(s_t)
+            if self.apply_wavegan:
+                s_t = self.wave_gan_defender(s_t)
+
+        s_t = self.wav_scale * s_t
+        # End of pre-processing defense
+
+        f_t = self.feat_extractor(s_t)
+        if self.vad_t is not None:
+            n_vad_frames = len(self.vad_t)
+            n_feat_frames = f_t.shape[1]
+            if n_vad_frames > n_feat_frames:
+                self.vad_t = self.vad_t[:n_feat_frames]
+            elif n_vad_frames < n_feat_frames:
+                f_t = f_t[:, :n_vad_frames]
+
+            f_t = f_t[:, self.vad_t]
+
+        f_t = f_t.transpose(1, 2).contiguous()
+        x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer)
+        x_t = l2_norm(x_t)
+        x_e = l2_norm(self.x_e)
+        score = torch.sum(x_e * x_t, dim=-1)
+        if self.calibrator is not None:
+            score = self.calibrator(score)
+
+        return score
+
+
+def fix_out_of_memory(model, tensors):
+    for p in model.parameters():
+        if p.grad is not None:
+            del p.grad  # free some memory
+
+    for tensor in tensors:
+        if tensor.grad is not None:
+            del tensor.grad
+
+    torch.cuda.empty_cache()
+
+
+def init_device(use_gpu):
+    set_float_cpu("float32")
+    num_gpus = 1 if use_gpu else 0
+    logging.info("initializing devices num_gpus={}".format(num_gpus))
+    device = open_device(num_gpus=num_gpus)
+    return device
+
+
+def init_feats(**kwargs):
+    feat_args = AF.filter_args(**kwargs["feats"])
+    logging.info("feat args={}".format(feat_args))
+    logging.info("initializing feature extractor")
+    feat_extractor = AF(trans=False, **feat_args)
+    logging.info("feat-extractor={}".format(feat_extractor))
+    feat_extractor.eval()
+    return feat_extractor
+
+
+def load_model(model_path):
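+    # TML (TorchModelLoader) restores the serialized x-vector network; it is
+    # put in eval mode so no dropout or batch-norm updates run during attacks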
logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.eval() + return model + + +def load_calibrator(cal_file, threshold): + logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + # subting the threshold here will put the decision threshold in 0 + # some attacks use thr=0 to decide if the attack is succesful + calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) + calibrator.eval() + return calibrator + + +def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): + + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + key = TrialKey.load(key_file) + if num_seg_parts > 1: + key = key.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + f, idx = ismember(key.model_set, enroll.info) + assert np.all(f) + x_e = x_e[idx] + return key, x_e + + +def eval_cosine_scoring_wavegan( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + stats_file, + cal_file, + threshold, + smooth_sigma, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + smoothing_after_wavegan, + wave_gan_root_dir, + wave_gan_model_ckpt, + **kwargs +): + + device = init_device(use_gpu) + feat_extractor = init_feats(**kwargs) + + wave_gan_defender = WaveGANDefender( + Path(wave_gan_root_dir), Path(wave_gan_model_ckpt) + ) + xvector_model = load_model(model_path) + + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file, threshold) + + tar = torch.as_tensor([1], dtype=torch.float).to(device) + non = torch.as_tensor([0], dtype=torch.float).to(device) + + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) + x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file, **audio_args) + wav_scale = audio_reader.wav_scale + + if save_adv_wav: + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") + + model = MyModel( + feat_extractor, + xvector_model, + embed_layer, + calibrator, + smooth_sigma, + smoothing_after_wavegan, + wave_gan_defender, + wav_scale, + ) + model.to(device) + model.eval() + + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = { + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, + } + attack_args.update(extra_args) + logging.info("attacks args={}".format(attack_args)) + attack = AttackFactory.create(model, **attack_args) + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s" % (key.seg_set[j])) + s, fs = audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s_cpu = s[None, :] + s = 
torch.as_tensor(s_cpu, dtype=torch.get_default_dtype(), device=device) + + if vad_spec is not None: + vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.as_tensor( + vad.astype(np.bool, copy=False), dtype=torch.bool, device=device + ) + model.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + with torch.no_grad(): + scores[i, j] = model(s_adv) + threshold + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s" % (stats_file)) + attack_stats.to_csv(stats_file) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Eval cosine-scoring given enroll x-vector and test wave" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", dest="v_file", required=True) + parser.add_argument("--key-file", dest="key_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-wav-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( 
+ "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + # parser.add_argument('--save-adv-wav-tar-thr', + # default=0.75, type=float, + # help='min score to save signal from attack that makes non-tar into tar') + + # parser.add_argument('--save-adv-wav-non-thr', + # default=-0.75, type=float, + # help='max score to save signal from attack that makes tar into non-tar') + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--smooth-sigma", default=0, type=float, help="sigma for smoothing" + ) + parser.add_argument( + "--max-test-length", + default=5, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + # Defense: WaveGAN specific arguments [Added Sonal May21] + parser.add_argument( + "--smoothing-after-wavegan", + default=False, + action="store_true", + help=( + "Smoothing before or after wavegan, if true: " + "smoothing is done after wavegan" + ), + ) + + parser.add_argument( + "--wave-gan-root-dir", default=None, help="WaveGAN model root directory" + ) + parser.add_argument( + "--wave-gan-model-ckpt", default=None, help="WaveGAN model checkpoint" + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring_wavegan(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py new file mode 100755 index 00000000..8d4add76 --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +from art.classifiers import PyTorchClassifier +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks.art_attack_factory import \ + ARTAttackFactory as AttackFactory +from 
hyperion.torch.layers import LinBinCalibrator as Calibrator
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
+from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
+from hyperion.utils.list_utils import ismember
+
+
+def init_device(use_gpu):
+    set_float_cpu("float32")
+    num_gpus = 1 if use_gpu else 0
+    logging.info("initializing devices num_gpus={}".format(num_gpus))
+    device = open_device(num_gpus=num_gpus)
+    return device
+
+
+def init_feats(**kwargs):
+    feat_args = AF.filter_args(**kwargs["feats"])
+    logging.info("feat args={}".format(feat_args))
+    logging.info("initializing feature extractor")
+    feat_extractor = AF(trans=False, **feat_args)
+    logging.info("feat-extractor={}".format(feat_extractor))
+    feat_extractor.eval()
+    return feat_extractor
+
+
+def load_model(model_path):
+    logging.info("loading model {}".format(model_path))
+    model = TML.load(model_path)
+    logging.info("xvector-model={}".format(model))
+    model.eval()
+    return model
+
+
+def load_calibrator(cal_file):
+    logging.info("loading calibration params {}".format(cal_file))
+    lr = LR.load(cal_file)
+    calibrator = Calibrator(lr.A[0, 0], lr.b[0])
+    calibrator.eval()
+    return calibrator
+
+
+def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts):
+
+    r = DRF.create(v_file)
+    enroll = Utt2Info.load(enroll_file)
+    key = TrialKey.load(key_file)
+    if num_seg_parts > 1:
+        key = key.split(1, 1, seg_part_idx, num_seg_parts)
+
+    x_e = r.read(enroll.key, squeeze=True)
+    f, idx = ismember(key.model_set, enroll.info)
+    assert np.all(f)
+    x_e = x_e[idx]
+    return key, x_e
+
+
+class MyModel(nn.Module):
+    def __init__(
+        self,
+        feat_extractor,
+        xvector_model,
+        embed_layer=None,
+        calibrator=None,
+        threshold=0,
+    ):
+        super().__init__()
+        self.feat_extractor = feat_extractor
+        self.xvector_model = xvector_model
+        self.x_e = None
+        self.vad_t = None
+        self.embed_layer = embed_layer
+        self.calibrator = calibrator
+        self.threshold = threshold
+
+    def forward(self, s_t):
+        f_t = self.feat_extractor(s_t)
+        if self.vad_t is not None:
+            n_vad_frames = len(self.vad_t)
+            n_feat_frames = f_t.shape[1]
+            if n_vad_frames > n_feat_frames:
+                self.vad_t = self.vad_t[:n_feat_frames]
+            elif n_vad_frames < n_feat_frames:
+                f_t = f_t[:, :n_vad_frames]
+
+            f_t = f_t[:, self.vad_t]
+
+        f_t = f_t.transpose(1, 2).contiguous()
+        x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer)
+        x_t = l2_norm(x_t)
+        x_e = l2_norm(self.x_e)
+        tar_score = torch.sum(x_e * x_t, dim=-1, keepdim=True)
+        if self.calibrator is not None:
+            # keep the calibrated value; assigning it to a throwaway variable
+            # would silently discard the calibration
+            tar_score = self.calibrator(tar_score)
+
+        non_score = self.threshold + 0 * tar_score
+        score = torch.cat((non_score, tar_score), dim=-1)  # .unsqueeze(0)
+        return score
+
+
+def eval_cosine_scoring(
+    v_file,
+    key_file,
+    enroll_file,
+    test_wav_file,
+    vad_spec,
+    vad_path_prefix,
+    model_path,
+    embed_layer,
+    score_file,
+    stats_file,
+    cal_file,
+    threshold,
+    save_adv_wav,
+    save_adv_wav_path,
+    max_test_length,
+    use_gpu,
+    seg_part_idx,
+    num_seg_parts,
+    **kwargs
+):
+
+    device_type = "gpu" if use_gpu else "cpu"
+    device = init_device(use_gpu)
+    feat_extractor = init_feats(**kwargs)
+    xvector_model = load_model(model_path)
+
+    calibrator = None
+    if cal_file is not None:
+        calibrator = load_calibrator(cal_file)
+
+    model = MyModel(
+        feat_extractor, xvector_model, embed_layer, calibrator, threshold=threshold
+    )
+    model.to(device)
+    model.eval()
+
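+    # The wrapper above recasts verification as a 2-class problem so generic
+    # ART attacks can drive it: forward() returns the pair [threshold, score],
+    # hence class 1 means "trial accepted" and class 0 "trial rejected".
+    # Illustrative sketch (hypothetical tensors):
+    #
+    #   logits = model(s_t)           # shape (1, 2) = [[thr, cos_score]]
+    #   decision = logits.argmax(-1)  # 1 -> accept (score > thr), 0 -> reject
+    #
+    # An attack therefore only needs to flip the argmax, which is why the
+    # tar/non labels below are swapped when attack.targeted is set.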
+    tar = np.asarray([1], dtype=int)
+    non = np.asarray([0], dtype=int)
+
+    logging.info("loading key and enrollment x-vectors")
+    key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts)
+    x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype())
+
+    audio_args = AR.filter_args(**kwargs)
+    audio_reader = AR(test_wav_file, **audio_args)
+    wav_scale = audio_reader.wav_scale
+
+    if save_adv_wav:
+        tar_audio_writer = AW(save_adv_wav_path + "/tar2non")
+        non_audio_writer = AW(save_adv_wav_path + "/non2tar")
+
+    attack_args = AttackFactory.filter_args(**kwargs["attack"])
+    extra_args = {"eps_scale": wav_scale}
+    attack_args.update(extra_args)
+    logging.info("attack-args={}".format(attack_args))
+
+    if vad_spec is not None:
+        logging.info("opening VAD stream: %s" % (vad_spec))
+        v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ")
+
+    scores = np.zeros((key.num_models, key.num_tests), dtype="float32")
+    attack_stats = pd.DataFrame(
+        columns=[
+            "modelid",
+            "segmentid",
+            "snr",
+            "px",
+            "pn",
+            "x_l2",
+            "x_linf",
+            "n_l0",
+            "n_l2",
+            "n_linf",
+            "num_samples",
+        ]
+    )
+
+    for j in range(key.num_tests):
+        t1 = time.time()
+        logging.info("scoring test utt %s" % (key.seg_set[j]))
+        s, fs = audio_reader.read([key.seg_set[j]])
+        s = s[0]
+        fs = fs[0]
+
+        if max_test_length is not None:
+            max_samples = int(fs * max_test_length)
+            if len(s) > max_samples:
+                s = s[:max_samples]
+
+        s = s[None, :].astype("float32", copy=False)
+        s_tensor = torch.as_tensor(s, dtype=torch.get_default_dtype()).to(device)
+
+        if vad_spec is not None:
+            vad = v_reader.read([key.seg_set[j]])[0]
+            tot_frames = len(vad)
+            speech_frames = np.sum(vad)
+            vad = torch.as_tensor(vad.astype(bool, copy=False), dtype=torch.bool).to(
+                device
+            )
+            model.vad_t = vad
+            logging.info(
+                "utt %s detected %d/%d (%.2f %%) speech frames"
+                % (
+                    key.seg_set[j],
+                    speech_frames,
+                    tot_frames,
+                    speech_frames / tot_frames * 100,
+                )
+            )
+
+        t2 = time.time()
+
+        trial_time = 0
+        num_trials = 0
+        model_art = PyTorchClassifier(
+            model=model,
+            loss=nn.CrossEntropyLoss(),
+            optimizer=None,
+            input_shape=[1, s.shape[1]],
+            nb_classes=2,
+            clip_values=(-wav_scale, wav_scale),
+            device_type=device_type,
+        )
+
+        attack_args["num_samples"] = s.shape[-1]
+        attack = AttackFactory.create(model_art, **attack_args)
+        for i in range(key.num_models):
+            if key.tar[i, j] or key.non[i, j]:
+                t3 = time.time()
+                model.x_e = x_e[i].to(device)
+                if key.tar[i, j]:
+                    if attack.targeted:
+                        t = non
+                    else:
+                        t = tar
+                else:
+                    if attack.targeted:
+                        t = tar
+                    else:
+                        t = non
+
+                s_adv = attack.generate(s, t)
+                s_adv = torch.from_numpy(s_adv).to(device)
+                with torch.no_grad():
+                    scores[i, j] = model(s_adv).cpu().numpy()[0, 1]
+
+                t4 = time.time()
+                trial_time += t4 - t3
+                num_trials += 1
+
+                s_adv = s_adv.detach()
+                stats_ij = compute_stats_adv_attack(s_tensor, s_adv)
+                stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij]
+                attack_stats = attack_stats.append(
+                    {
+                        "modelid": key.model_set[i],
+                        "segmentid": key.seg_set[j],
+                        "snr": stats_ij[0],
+                        "px": stats_ij[1],
+                        "pn": stats_ij[2],
+                        "x_l2": stats_ij[3],
+                        "x_linf": stats_ij[4],
+                        "n_l0": stats_ij[5],
+                        "n_l2": stats_ij[6],
+                        "n_linf": stats_ij[7],
+                        "num_samples": s.shape[-1],
+                    },
+                    ignore_index=True,
+                )
+
+                # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s)))
+                if save_adv_wav:
+                    s_adv = s_adv.cpu().numpy()[0]
+                    trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j])
+                    if key.tar[i, j] and scores[i, j] < 
threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + del attack + del model_art + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s" % (stats_file)) + attack_stats.to_csv(stats_file) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + "Eval cosine-scoring given enroll x-vector " + "and adversarial test wave from ART" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", dest="v_file", required=True) + parser.add_argument("--key-file", dest="key_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-wav-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py new file mode 100755 index 00000000..0e9493c0 --- /dev/null +++ 
b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(device, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + feat_extractor.to(device) + return feat_extractor + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def load_calibrator(cal_file, device): + logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + calibrator = Calibrator(lr.A[0, 0], lr.b[0]) + calibrator.to(device) + calibrator.eval() + return calibrator + + +def read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts): + + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_seg_parts > 1: + ndx = ndx.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + + f, idx = ismember(ndx.model_set, enroll.info) + + assert np.all(f) + x_e = x_e[idx] + + return ndx, x_e + + +def eval_cosine_scoring( + v_file, + ndx_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + cal_file, + max_test_length, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + + device = init_device(use_gpu) + feat_extractor = init_feats(device, **kwargs) + model = load_model(model_path, device) + + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file, device) + + logging.info("loading ndx and enrollment x-vectors") + ndx, y_e = read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file, **audio_args) + + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((ndx.num_models, 
ndx.num_tests), dtype="float32") + with torch.no_grad(): + for j in range(ndx.num_tests): + t1 = time.time() + logging.info("scoring test utt %s" % (ndx.seg_set[j])) + s, fs = audio_reader.read([ndx.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + t2 = time.time() + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) + x_t = feat_extractor(s) + t4 = time.time() + tot_frames = x_t.shape[1] + if vad_spec is not None: + vad = torch.as_tensor( + v_reader.read([ndx.seg_set[j]], num_frames=x_t.shape[1])[0].astype( + np.uint8, copy=False + ), + dtype=torch.uint8, + ).to(device) + x_t = x_t[:, vad] + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + ndx.seg_set[j], + x_t.shape[1], + tot_frames, + x_t.shape[1] / tot_frames * 100, + ) + ) + + t5 = time.time() + x_t = x_t.transpose(1, 2).contiguous() + y_t = model.extract_embed(x_t, embed_layer=embed_layer) + y_t = l2_norm(y_t) + t6 = time.time() + + for i in range(ndx.num_models): + if ndx.trial_mask[i, j]: + y_e_i = torch.as_tensor(y_e[i], dtype=torch.get_default_dtype()).to( + device + ) + y_e_i = l2_norm(y_e_i) + scores_ij = torch.sum(y_e_i * y_t, dim=-1) + if calibrator is None: + scores[i, j] = scores_ij + else: + scores[i, j] = calibrator(scores_ij) + + t7 = time.time() + num_trials = np.sum(ndx.trial_mask[:, j]) + trial_time = (t7 - t6) / num_trials + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + ndx.seg_set[j], + t7 - t1, + t2 - t1, + t4 - t2, + t5 - t4, + t6 - t5, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores(ndx.model_set, ndx.seg_set, scores, score_mask=ndx.trial_mask) + s.save_txt(score_file) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Eval cosine-scoring given enroll x-vector and test wave" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", dest="v_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-wav-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", required=True) + parser.add_argument("--cal-file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", 
default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py new file mode 100755 index 00000000..e0754498 --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks import AttackFactory +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +class MyModel(nn.Module): + def __init__( + self, feat_extractor, xvector_model, embed_layer=None, calibrator=None + ): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.x_e = None + self.vad_t = None + self.embed_layer = embed_layer + self.calibrator = calibrator + + def forward(self, s_t): + f_t = s_t + f_t = self.feat_extractor(s_t) + if self.vad_t is not None: + n_vad_frames = len(self.vad_t) + n_feat_frames = f_t.shape[1] + if n_vad_frames > n_feat_frames: + self.vad_t = self.vad_t[:n_feat_frames] + elif n_vad_frames < n_feat_frames: + f_t = f_t[:, :n_vad_frames] + + f_t = f_t[:, self.vad_t] + + f_t = f_t.transpose(1, 2).contiguous() + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) + x_t = l2_norm(x_t) + x_e = l2_norm(self.x_e) + score = torch.sum(x_e * x_t, dim=-1) + if self.calibrator is not None: + score = self.calibrator(score) + + return score + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(**kwargs): + feat_args = AF.filter_args(**kwargs) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + return feat_extractor + + +def load_model(model_path): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + 
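+    # freeze() is deliberate in the transfer setting: the attack only needs
+    # gradients w.r.t. the input waveform, so parameter gradients of the
+    # loaded x-vector models are disabled, which also saves memory.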
model.freeze() + model.eval() + return model + + +def load_calibrator(cal_file, threshold): + logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + # subting the threshold here will put the decision threshold in 0 + # some attacks use thr=0 to decide if the attack is succesful + calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) + calibrator.eval() + return calibrator + + +def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): + + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + key = TrialKey.load(key_file) + + if num_seg_parts > 1: + key = key.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + + f, idx = ismember(key.model_set, enroll.info) + + assert np.all(f) + x_e = x_e[idx] + + return key, x_e + + +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + transfer_v_file, + model_path, + transfer_model_path, + embed_layer, + score_file, + stats_file, + cal_file, + transfer_cal_file, + threshold, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + + device = init_device(use_gpu) + # load victim model + feat_extractor = init_feats(**kwargs["feats"]) + xvector_model = load_model(model_path) + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file, 0) + + model = MyModel(feat_extractor, xvector_model, embed_layer, calibrator) + model.to(device) + model.eval() + + # load white-box model + tfeat_extractor = init_feats(**kwargs["transfer_feats"]) + xvector_tmodel = load_model(transfer_model_path) + tcalibrator = None + if transfer_cal_file is not None: + tcalibrator = load_calibrator(transfer_cal_file, threshold) + + tmodel = MyModel(tfeat_extractor, xvector_tmodel, embed_layer, tcalibrator) + tmodel.to(device) + tmodel.eval() + + tar = torch.as_tensor([1], dtype=torch.float).to(device) + non = torch.as_tensor([0], dtype=torch.float).to(device) + + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) + x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) + + _, t_x_e = read_data( + transfer_v_file, key_file, enroll_file, seg_part_idx, num_seg_parts + ) + t_x_e = torch.as_tensor(t_x_e, dtype=torch.get_default_dtype()) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file) + wav_scale = audio_reader.wav_scale + + if save_adv_wav: + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") + + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = { + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, + } + attack_args.update(extra_args) + logging.info("attacks args={}".format(attack_args)) + attack = AttackFactory.create(model, **attack_args) + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s", key.seg_set[j]) + s, fs = 
audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) + + if vad_spec is not None: + vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + device + ) + model.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i].to(device) + tmodel.x_e = t_x_e[i].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + with torch.no_grad(): + scores[i, j] = model(s_adv) + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s", stats_file) + attack_stats.to_csv(stats_file) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + "Eval cosine-scoring given enroll x-vector and " + "adversarial test wave obtained from a different model" + ) + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + + parser.add_argument("--transfer-v-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + AF.add_class_args(parser, prefix="transfer_feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + 
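+    # Example invocation sketch (hypothetical paths; the attack.* option names
+    # depend on what AttackFactory registers, so they are illustrative only):
+    #
+    #   eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py \
+    #     --v-file scp:exp/xvectors/victim/xvector.scp \
+    #     --transfer-v-file scp:exp/xvectors/surrogate/xvector.scp \
+    #     --key-file data/test/trials --enroll-file data/test/utt2enroll \
+    #     --test-wav-file data/test/wav.scp \
+    #     --model-path exp/models/victim/model.pth \
+    #     --transfer-model-path exp/models/surrogate/model.pth \
+    #     --attack.attack-type fgsm --attack.eps 0.0001 \
+    #     --score-file exp/attacks/transfer_fgsm/scores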
parser.add_argument("--model-path", required=True) + parser.add_argument("--transfer-model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument( + "--transfer-cal-file", + default=None, + help="score calibration file for transfer model", + ) + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py new file mode 100755 index 00000000..0f9f375d --- /dev/null +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +from art.classifiers import PyTorchClassifier +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import AudioWriter as AW +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks.art_attack_factory import \ + ARTAttackFactory as AttackFactory +from hyperion.torch.layers import LinBinCalibrator as Calibrator +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.utils import open_device +from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm +from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info +from hyperion.utils.list_utils import ismember + + +class MyModel(nn.Module): + def __init__( + self, + feat_extractor, + 
xvector_model, + embed_layer=None, + calibrator=None, + threshold=0, + ): + super().__init__() + self.feat_extractor = feat_extractor + self.xvector_model = xvector_model + self.x_e = None + self.vad_t = None + self.embed_layer = embed_layer + self.calibrator = calibrator + self.threshold = threshold + + def forward(self, s_t): + f_t = s_t + f_t = self.feat_extractor(s_t) + if self.vad_t is not None: + n_vad_frames = len(self.vad_t) + n_feat_frames = f_t.shape[1] + if n_vad_frames > n_feat_frames: + self.vad_t = self.vad_t[:n_feat_frames] + elif n_vad_frames < n_feat_frames: + f_t = f_t[:, :n_vad_frames] + + f_t = f_t[:, self.vad_t] + + f_t = f_t.transpose(1, 2).contiguous() + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) + x_t = l2_norm(x_t) + x_e = l2_norm(self.x_e) + tar_score = torch.sum(x_e * x_t, dim=-1, keepdim=True) + if self.calibrator is not None: + score = self.calibrator(tar_score) + + non_score = self.threshold + 0 * tar_score + score = torch.cat((non_score, tar_score), dim=-1) # .unsqueeze(0) + return score + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def init_feats(**kwargs): + feat_args = AF.filter_args(**kwargs) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=False, **feat_args) + logging.info("feat-extractor={}".format(feat_extractor)) + feat_extractor.eval() + return feat_extractor + + +def load_model(model_path): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("xvector-model={}".format(model)) + model.freeze() + model.eval() + return model + + +def load_calibrator(cal_file): + logging.info("loading calibration params {}".format(cal_file)) + lr = LR.load(cal_file) + calibrator = Calibrator(lr.A[0, 0], lr.b[0]) + calibrator.eval() + return calibrator + + +def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): + + r = DRF.create(v_file) + enroll = Utt2Info.load(enroll_file) + key = TrialKey.load(key_file) + + if num_seg_parts > 1: + key = key.split(1, 1, seg_part_idx, num_seg_parts) + + x_e = r.read(enroll.key, squeeze=True) + + f, idx = ismember(key.model_set, enroll.info) + + assert np.all(f) + x_e = x_e[idx] + + return key, x_e + + +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + transfer_v_file, + model_path, + transfer_model_path, + embed_layer, + score_file, + stats_file, + cal_file, + transfer_cal_file, + threshold, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + + device_type = "gpu" if use_gpu else "cpu" + device = init_device(use_gpu) + # load victim model + feat_extractor = init_feats(**kwargs["feats"]) + xvector_model = load_model(model_path) + calibrator = None + if cal_file is not None: + calibrator = load_calibrator(cal_file) + + model = MyModel( + feat_extractor, xvector_model, embed_layer, calibrator, threshold=threshold + ) + model.to(device) + model.eval() + + # load white-box model + tfeat_extractor = init_feats(**kwargs["transfer_feats"]) + xvector_tmodel = load_model(transfer_model_path) + tcalibrator = None + if transfer_cal_file is not None: + tcalibrator = load_calibrator(transfer_cal_file) + + tmodel = MyModel( + tfeat_extractor, xvector_tmodel, embed_layer, 
tcalibrator, threshold=threshold + ) + tmodel.to(device) + tmodel.eval() + + tar = np.asarray([1], dtype=np.int) + non = np.asarray([0], dtype=np.int) + + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) + x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) + + _, t_x_e = read_data( + transfer_v_file, key_file, enroll_file, seg_part_idx, num_seg_parts + ) + t_x_e = torch.as_tensor(t_x_e, dtype=torch.get_default_dtype()) + + audio_args = AR.filter_args(**kwargs) + audio_reader = AR(test_wav_file) + wav_scale = audio_reader.wav_scale + + if save_adv_wav: + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") + + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = {"eps_scale": wav_scale} + attack_args.update(extra_args) + logging.info("attack-args={}".format(attack_args)) + + if vad_spec is not None: + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) + for j in range(key.num_tests): + t1 = time.time() + logging.info("scoring test utt %s" % (key.seg_set[j])) + s, fs = audio_reader.read([key.seg_set[j]]) + s = s[0] + fs = fs[0] + + if max_test_length is not None: + max_samples = int(fs * max_test_length) + if len(s) > max_samples: + s = s[:max_samples] + + s = s[None, :].astype("float32", copy=False) + s_tensor = torch.as_tensor(s, dtype=torch.get_default_dtype()).to(device) + + if vad_spec is not None: + vad = v_reader.read([key.seg_set[j]])[0] + tot_frames = len(vad) + speech_frames = np.sum(vad) + vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + device + ) + model.vad_t = vad + tmodel.vad_t = vad + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) + + t2 = time.time() + + trial_time = 0 + num_trials = 0 + model_art = PyTorchClassifier( + model=tmodel, + loss=nn.CrossEntropyLoss(), + optimizer=None, + input_shape=[1, s.shape[1]], + nb_classes=2, + clip_values=(-wav_scale, wav_scale), + device_type=device_type, + ) + + attack_args["num_samples"] = s.shape[-1] + attack = AttackFactory.create(model_art, **attack_args) + for i in range(key.num_models): + if key.tar[i, j] or key.non[i, j]: + t3 = time.time() + model.x_e = x_e[i].to(device) + tmodel.x_e = t_x_e[i].to(device) + if key.tar[i, j]: + if attack.targeted: + t = non + else: + t = tar + else: + if attack.targeted: + t = tar + else: + t = non + + s_adv = attack.generate(s, t) + s_adv = torch.from_numpy(s_adv).to(device) + with torch.no_grad(): + scores[i, j] = model(s_adv).cpu().numpy()[0, 1] + + t4 = time.time() + trial_time += t4 - t3 + num_trials += 1 + + s_adv = s_adv.detach() + stats_ij = compute_stats_adv_attack(s_tensor, s_adv) + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] + attack_stats = attack_stats.append( + { + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": 
s.shape[-1], + }, + ignore_index=True, + ) + + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + if save_adv_wav: + s_adv = s_adv.cpu().numpy()[0] + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] and scores[i, j] < threshold: + tar_audio_writer.write(trial_name, s_adv, fs) + elif key.non[i, j] and scores[i, j] > threshold: + non_audio_writer.write(trial_name, s_adv, fs) + + del attack + del model_art + trial_time /= num_trials + t7 = time.time() + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) + + if num_seg_parts > 1: + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) + s.save_txt(score_file) + + logging.info("saving stats to %s" % (stats_file)) + attack_stats.to_csv(stats_file) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + "Eval cosine-scoring given enroll x-vector and " + "adversarial test wave obtained from a different model" + "using ART" + ) + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + + parser.add_argument("--transfer-v-file", required=True) + + AR.add_class_args(parser) + AF.add_class_args(parser, prefix="feats") + AF.add_class_args(parser, prefix="transfer_feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument("--transfer-model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument( + "--transfer-cal-file", + default=None, + help="score calibration file for transfer model", + ) + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + 
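+    # Note: --threshold plays two roles in this script: it is baked into the
+    # 2-class wrapper as the class-0 logit, so ART attacks push scores across
+    # the actual operating point, and it is reused above to decide which
+    # adversarial wavs are logged as successful tar2non / non2tar flips.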
parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_cosine_scoring(**namespace_to_dict(args)) diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 7b68b9dd..d7b1f17d 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -10,8 +10,12 @@ import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, set_float_cpu @@ -95,7 +99,12 @@ def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): def init_hard_prototype_mining(model, train_loader, val_loader, rank): - if not train_loader.batch_sampler.hard_prototype_mining: + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: return if rank == 0: @@ -104,7 +113,12 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): affinity_matrix = model.compute_prototype_affinity() train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) - if not val_loader.batch_sampler.hard_prototype_mining: + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: return val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) diff --git a/hyperion/np/score_norm/adapt_s_norm.py b/hyperion/np/score_norm/adapt_s_norm.py index a5ae6f13..944fcad5 100644 --- a/hyperion/np/score_norm/adapt_s_norm.py +++ b/hyperion/np/score_norm/adapt_s_norm.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import math import h5py import numpy as np @@ -33,6 +33,24 @@ def __init__( self.nbest_discard = nbest_discard self.nbest_sel_method = nbest_sel_method + def __call__( + self, + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test=None, + mask_enr_coh=None, + return_stats=False, + ): + return self.predict( + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + ) + def predict( self, scores, @@ -63,9 +81,9 @@ def predict( nbest = self.nbest if mask_coh_test is not None: - scores_coh_test[mask_coh_test == False] = 0 + scores_coh_test[~mask_coh_test] = 0 if mask_enr_coh is not None: - scores_enr_coh[mask_enr_coh == False] = 0 + scores_enr_coh[~mask_enr_coh] = 0 if self.nbest_sel_method == "highest-other-side": return self._norm_highest_other_side( @@ -87,81 +105,10 @@ def predict( return_stats, nbest, ) - # best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[ - # self.nbest_discard : self.nbest_discard + nbest - # ] - # elif self.nbest_sel_method == "highest-same-side": - # best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[ - # :, self.nbest_discard : self.nbest_discard + nbest - # ].T else: raise Exception(f"invalid cohort selection method {self.nbest_sel_method}") - # scores_z_norm = np.zeros_like(scores) - # for i in range(scores.shape[1]): - # best_idx_i = best_idx[:, i] - - # best_scores_i = scores_enr_coh[:, best_idx_i] - # mu_z = np.mean(best_scores_i, axis=1, keepdims=True) - - # if 
mask_enr_coh is None: - # s_z = np.std(best_scores_i, axis=1, keepdims=True) - # else: - # norm = np.mean(mask_enr_coh[:, best_idx_i], axis=1, keepdims=True) - # mu_z /= norm - # s_z = np.sqrt( - # np.mean(best_scores_i ** 2, axis=1, keepdims=True) / norm - # - mu_z ** 2 - # ) - - # s_z = np.clip(s_z, a_min=1e-5, a_max=None) - # if not self.norm_var: - # s_z = 1.0 - - # scores_z_norm[:, i] = (scores[:, i] - mu_z.T) / s_z.T - - # if self.nbest_sel_method == "highest-other-side": - # best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[ - # :, self.nbest_discard : self.nbest_discard + nbest - # ] - # elif self.nbest_sel_method == "highest-same-side": - # best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[ - # self.nbest_discard : self.nbest_discard + nbest - # ].T - # else: - # raise Exception(f"invalid cohort selection method {self.nbest_sel_method}") - - # scores_t_norm = np.zeros_like(scores) - # for i in range(scores.shape[0]): - # best_idx_i = best_idx[i] - # best_scores_i = scores_coh_test[best_idx_i, :] - # mu_t = np.mean(best_scores_i, axis=0, keepdims=True) - - # if mask_coh_test is None: - # s_t = np.std(best_scores_i[best_idx_i, :], axis=0, keepdims=True) - # else: - # norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=True) - # mu_t /= norm - # s_t = np.sqrt( - # np.mean(best_scores_i[best_idx_i, :] ** 2, axis=0, keepdims=True) - # / norm - # - mu_z ** 2 - # ) - - # s_t = np.clip(s_t, a_min=1e-5, a_max=None) - # if not self.norm_var: - # s_t = 1.0 - - # scores_t_norm[i, :] = (scores[i, :] - mu_t) / s_t - - # scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2) - - # if return_stats: - # return scores_norm, mu_z, s_z, mu_t, s_t - # else: - # return scores_norm - - def _norm_highest_other_side( + def _norm_highest_other_side0( self, scores, scores_coh_test, @@ -246,7 +193,117 @@ def _norm_highest_other_side( else: return scores_norm - def _norm_highest_same_side( + def _norm_highest_other_side( + self, + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test, + mask_enr_coh, + return_stats, + nbest, + ): + + # this is very memory intensive, so we pass to f32 + scores_coh_test = scores_coh_test.astype("float32", copy=False) + scores_enr_coh = scores_enr_coh.astype("float32", copy=False) + + best_idx = np.argsort(-scores_coh_test, axis=0)[ + self.nbest_discard : self.nbest_discard + nbest + ].T # (n_test, n_best) + + mem = nbest * scores_enr_coh.shape[0] * scores.shape[1] * 4 / 2 ** 30 + # limit mem to 10 GB + num_groups = math.ceil(mem / 10) + num_el_group = int(math.ceil(scores.shape[1] / num_groups)) + scores_enr_coh = np.expand_dims(scores_enr_coh, 0) + if mask_enr_coh is not None: + mask_enr_coh = np.expand_dims(scores_enr_coh, 0) + + mu_z = [] + s_z = [] + for start in range(0, scores.shape[1], num_el_group): + stop = min(start + num_el_group, scores.shape[1]) + best_idx_i = np.expand_dims(best_idx[start:stop], 1) + best_scores_i = np.take_along_axis(scores_enr_coh, best_idx_i, axis=-1) + mu_z_i = best_scores_i.mean(axis=-1) + + if mask_enr_coh is None: + s_z_i = np.std(best_scores_i, axis=-1) + else: + mask_i = np.take_along_axis(mask_enr_coh, best_idx_i, axis=-1) + norm = mask_i.mean(axis=-1) + mu_z_i /= norm + s_z_i = np.sqrt( + np.mean(best_scores_i ** 2, axis=-1) / norm - mu_z_i ** 2 + ) + + del best_scores_i + mu_z.append(mu_z_i.T) + s_z.append(s_z_i.T) + + mu_z = np.concatenate(mu_z, axis=-1) + s_z = np.concatenate(s_z, axis=-1) + + s_z = np.clip(s_z, a_min=1e-5, a_max=None) + if not self.norm_var: + s_z = 1.0 + + scores_z_norm = (scores 
+
+    def _norm_highest_same_side0(
         self,
         scores,
         scores_coh_test,
         scores_enr_coh,
         mask_coh_test,
         mask_enr_coh,
         return_stats,
         nbest,
     ):
@@ -331,3 +388,112 @@ def _norm_highest_same_side(
             return scores_norm, mu_z, s_z, mu_t, s_t
         else:
             return scores_norm
+
+    def _norm_highest_same_side(
+        self,
+        scores,
+        scores_coh_test,
+        scores_enr_coh,
+        mask_coh_test,
+        mask_enr_coh,
+        return_stats,
+        nbest,
+    ):
+
+        # this is very memory intensive, so we pass to f32
+        scores_coh_test = scores_coh_test.astype("float32", copy=False)
+        scores_enr_coh = scores_enr_coh.astype("float32", copy=False)
+
+        best_idx = np.argsort(-scores_enr_coh, axis=1)[
+            :, self.nbest_discard : self.nbest_discard + nbest
+        ]
+
+        mem = nbest * scores_enr_coh.shape[0] * scores.shape[0] * 4 / 2 ** 30
+        # limit mem to 10 GB
+        num_groups = math.ceil(mem / 10)
+        num_el_group = int(math.ceil(scores.shape[0] / num_groups))
+        scores_enr_coh = np.expand_dims(scores_enr_coh, 0)
+        if mask_enr_coh is not None:
+            mask_enr_coh = np.expand_dims(mask_enr_coh, 0)
+
+        mu_z = []
+        s_z = []
+        for start in range(0, scores.shape[0], num_el_group):
+            stop = min(start + num_el_group, scores.shape[0])
+            best_idx_i = np.expand_dims(best_idx[start:stop], 1)
+            best_scores_i = np.take_along_axis(scores_enr_coh, best_idx_i, axis=-1)
+            mu_z_i = best_scores_i.mean(axis=-1)
+
+            if mask_enr_coh is None:
+                s_z_i = np.std(best_scores_i, axis=-1)
+            else:
+                mask_i = np.take_along_axis(mask_enr_coh, best_idx_i, axis=-1)
+                norm = mask_i.mean(axis=-1)
+                mu_z_i /= norm
+                s_z_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=-1) / norm - mu_z_i ** 2
+                )
+
+            del best_scores_i
+            mu_z.append(mu_z_i.T)
+            s_z.append(s_z_i.T)
+
+        mu_z = np.concatenate(mu_z, axis=-1)
+        s_z = np.concatenate(s_z, axis=-1)
+
+        s_z = np.clip(s_z, a_min=1e-5, a_max=None)
+        if not self.norm_var:
+            s_z = 1.0
+
+        scores_z_norm = (scores - mu_z) / s_z
+
+        best_idx = np.argsort(-scores_coh_test, axis=0)[
+            self.nbest_discard : self.nbest_discard + nbest
+        ]  # (n_best, n_test)
+
+        mem = nbest * scores.shape[1] * scores_coh_test.shape[1] * 4 / 2 ** 30
+        # limit mem to 10 GB
+        num_groups = math.ceil(mem / 10)
+        num_el_group = int(math.ceil(scores.shape[1] / num_groups))
+        scores_coh_test = np.expand_dims(scores_coh_test, -1)
+        if mask_coh_test is not None:
+            mask_coh_test = np.expand_dims(mask_coh_test, -1)
+
+        mu_t = []
+        s_t = []
+        for start in range(0, scores.shape[1], num_el_group):
+            stop = min(start + num_el_group, scores.shape[1])
+            best_idx_i = np.expand_dims(best_idx[:, start:stop], 1)
+            best_scores_i = np.take_along_axis(scores_coh_test, best_idx_i, axis=0)
+            mu_t_i = best_scores_i.mean(axis=0)
+            if mask_coh_test is None:
+                s_t_i = np.std(best_scores_i, axis=0)
+            else:
+                mask_i = np.take_along_axis(mask_coh_test, best_idx_i, axis=0)
+                norm = mask_i.mean(axis=0)
+                mu_t_i /= norm
+                s_t_i = np.sqrt(
+                    np.mean(best_scores_i ** 2, axis=0) / norm - mu_t_i ** 2
+                )
+
+            del best_scores_i
+            mu_t.append(mu_t_i.T)
+            s_t.append(s_t_i.T)
+
+        mu_t = np.concatenate(mu_t, axis=0)
+        s_t = np.concatenate(s_t, axis=0)
+
+        s_t = np.clip(s_t, a_min=1e-5, a_max=None)
+        if not self.norm_var:
+            s_t = 1.0
+
+        scores_t_norm = (scores - mu_t) / s_t
+
+        scores_norm = (scores_z_norm + scores_t_norm) / np.sqrt(2)
+        if return_stats:
+            return scores_norm, mu_z, s_z, mu_t, s_t
+        else:
+            return scores_norm
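Once the cohort statistics are in hand, both selection methods reduce to the standard S-norm combination of a z-normed and a t-normed score. A minimal self-contained illustration, without the n-best selection, chunking, or masking above:

    import numpy as np

    rng = np.random.default_rng(0)
    scores = rng.normal(size=(4, 6))           # (n_enr, n_test) trial scores
    scores_enr_coh = rng.normal(size=(4, 50))  # enrollment vs. cohort
    scores_coh_test = rng.normal(size=(50, 6)) # cohort vs. test

    # z-norm: normalize each enrollment row by its cohort statistics.
    mu_z = scores_enr_coh.mean(axis=1, keepdims=True)
    s_z = scores_enr_coh.std(axis=1, keepdims=True)
    # t-norm: normalize each test column by its cohort statistics.
    mu_t = scores_coh_test.mean(axis=0, keepdims=True)
    s_t = scores_coh_test.std(axis=0, keepdims=True)

    # Symmetric S-norm combination, as in the methods above.
    scores_norm = ((scores - mu_z) / s_z + (scores - mu_t) / s_t) / np.sqrt(2)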
diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py
index 81e9082f..7fbfbd71 100644
--- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py
+++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py
@@ -116,6 +116,8 @@ def __init__(
             self.num_chunks_per_seg_epoch,
         )
 
+        self.counts = {}
+
     def _set_seed(self):
         if self.shuffle:
             self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.rank)
@@ -208,7 +210,7 @@ def _set_class_weights(self):
         if self.weight_exponent != 1.0:
             self.class_info.exp_weights(self.weight_exponent)
 
-        zero_weight = self.class_info["min_seg_duration"] < self.min_chunk_length
+        zero_weight = self.class_info["max_seg_duration"] < self.min_chunk_length
         if np.any(zero_weight):
             self.class_info.set_zero_weight(zero_weight)
 
@@ -374,6 +376,20 @@ def __next__(self):
         num_classes = self._compute_num_classes_per_batch(batch_size)
         # t4 = time.time()
         class_ids = self._sample_classes(num_classes, chunk_length)
+        # for i in class_ids:
+        #     if i in self.counts:
+        #         self.counts[i] += 1
+        #     else:
+        #         self.counts[i] = 1
+
+        # mx = 0
+        # mn = 1000000000
+        # for k, v in self.counts.items():
+        #     if v > mx:
+        #         mx = v
+        #     if v < mn:
+        #         mn = v
+
         # t5 = time.time()
         seg_ids = self._sample_segs(class_ids, chunk_length)
         # t6 = time.time()
diff --git a/hyperion/torch/layer_blocks/fc_blocks.py b/hyperion/torch/layer_blocks/fc_blocks.py
index e56ab83e..49bf12db 100644
--- a/hyperion/torch/layer_blocks/fc_blocks.py
+++ b/hyperion/torch/layer_blocks/fc_blocks.py
@@ -84,4 +84,7 @@ def forward_linear(self, x):
         if self.norm_before:
             x = self.bn1(x)
 
+        if self.activation is None and self.norm_after:
+            x = self.bn1(x)
+
         return x

From 5f0ac9936784f3581304152911be0093b25b44c2 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Fri, 27 Jan 2023 14:02:21 -0500
Subject: [PATCH 076/154] updated qmf script

---
 .../conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml     | 2 +-
 .../v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml | 2 +-
 .../config_fbank80_stmn_ecapatdnn512x3.v2.0.sh           | 2 +-
egs/voxceleb/v1.1/run_040_eval_be.sh | 2 +- egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py | 9 ++++++--- egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh | 4 ++-- egs/voxceleb/v1/steps_be/train-qmf.py | 5 +++-- 7 files changed, 15 insertions(+), 11 deletions(-) diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml index 4b6fbc77..c4de614e 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml @@ -87,5 +87,5 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 30 + epochs: 40 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml index 319ab3ab..f5a7dcb1 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml @@ -85,5 +85,5 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 30 + epochs: 40 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh index 0e7a3b52..68990732 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh @@ -17,7 +17,7 @@ nnet_name=${feat_type}_ecapatdnn512x3.v2.0 nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v2.0.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0040.pth nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml nnet_s2_name=${nnet_name}.s2 diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index 49fa68e7..18c5eeeb 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -155,7 +155,7 @@ if [ "$do_qmf" == "true" ];then echo "Eval Voxceleb 1 with Cosine scoring" steps_be/eval_be_cos_qmf.sh \ - --cmd "$train_cmd --mem 15G" --coh-nbest 1000 \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ data/voxceleb1_test/trials \ data/voxceleb1_test/utt2model \ $xvector_dir/voxceleb1_test/xvector.scp \ diff --git a/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py b/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py index 78526277..90650941 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py +++ b/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py @@ -100,14 +100,14 @@ def eval_plda( np.clip( u2nf.filter(enroll_segs).info.astype(float) / 100 - 2.0, a_min=0.1, - a_max=6.0, + a_max=12.0, # 6.0, ) ) test_nf = np.log( np.clip( u2nf.filter(ndx.seg_set).info.astype(float) / 100 - 2.0, a_min=0.1, - a_max=6.0, + a_max=12.0, # 6.0, ) ) t1 = time.time() @@ -132,9 +132,11 @@ def eval_plda( t2 = time.time() logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest, nbest_sel_method="highest-other-side") - scores_norm, mu_z, _, mu_t, _ = snorm( + scores_norm, mu_z, s_z, mu_t, s_t = snorm( scores, scores_coh_test, scores_enr_coh, return_stats=True ) + mu_z = mu_z / s_z + mu_t = mu_t / s_t dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] @@ -165,6 +167,7 @@ def eval_plda( logging.info("applying qmf") scores_fus = [scores.ravel()] + scores_fus = [scores_norm.ravel()] for q_name in ["maxnf", "minnf", "maxcohmu", "mincohmu"]: 
scores_fus.append(q_measures[q_name].ravel()) diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh index 8b69b0d6..a8ad0178 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh @@ -5,8 +5,8 @@ set -e cmd=run.pl stage=1 -num_parts=8 -coh_nbest=400 +num_parts=16 +coh_nbest=1000 if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; diff --git a/egs/voxceleb/v1/steps_be/train-qmf.py b/egs/voxceleb/v1/steps_be/train-qmf.py index 07712221..afd9d218 100755 --- a/egs/voxceleb/v1/steps_be/train-qmf.py +++ b/egs/voxceleb/v1/steps_be/train-qmf.py @@ -30,8 +30,9 @@ def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbo logging.info("load key: %s", key_file) key = TrialKey.load_txt(key_file) - logging.info("load scores: %s", score_file) - scr = TrialScores.load_txt(score_file) + score_snorm_file = f"{score_file}_snorm" + logging.info("load scores: %s", score_snorm_file) + scr = TrialScores.load_txt(score_snorm_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) From b3647987d1cfeca2f419e422ea3b8a45b90bddb0 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 27 Jan 2023 14:19:26 -0500 Subject: [PATCH 077/154] xxx --- .../v1/global_conf/config_transducer_v4.3.sh | 2 +- .../v1/global_conf/config_transducer_v4.4.sh | 4 +++- egs/librispeech/v1/run_030_inference.sh | 13 +++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/egs/librispeech/v1/global_conf/config_transducer_v4.3.sh b/egs/librispeech/v1/global_conf/config_transducer_v4.3.sh index de00c55a..f51f1213 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v4.3.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v4.3.sh @@ -23,7 +23,7 @@ nnet_name=${hf_model_name}_transducer_v4.3 nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0060.pth +nnet_s1=$nnet_s1_dir/model_ep0030.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s2_args="" diff --git a/egs/librispeech/v1/global_conf/config_transducer_v4.4.sh b/egs/librispeech/v1/global_conf/config_transducer_v4.4.sh index 3114af61..d09c197b 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v4.4.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v4.4.sh @@ -23,7 +23,9 @@ nnet_name=${hf_model_name}_transducer_v4.4 nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0060.pth +nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0075.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml nnet_s2_args="" diff --git a/egs/librispeech/v1/run_030_inference.sh b/egs/librispeech/v1/run_030_inference.sh index 73ac2b8f..02b97001 100755 --- a/egs/librispeech/v1/run_030_inference.sh +++ b/egs/librispeech/v1/run_030_inference.sh @@ -38,11 +38,12 @@ test_data=test_clean # Extracts x-vectors for evaluation -for name in $test_data - do - nj=16 - steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ +for name in dev_clean dev_other test_clean test_other #$test_data +do + nj=40 + steps_transducer/decode_wav2vec2transducer.sh \ + --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ $transducer_dir/$name $bpe_model - done -exit +done + From f15905ba2e0e0e43b76190728885ca9d648d6c8f Mon Sep 17 00:00:00 2001 From: 
Jesus Villalba Date: Mon, 30 Jan 2023 04:38:56 -0500 Subject: [PATCH 078/154] new results voxceleb --- egs/voxceleb/v1.1/README.md | 34 ++++++ ...train_ecapatdnn512x3_xvec_stage2_v2.0.yaml | 4 +- ...rain_idrnd_resnet100_xvec_stage1_v2.0.yaml | 70 ++++++++++++ ...rain_idrnd_resnet100_xvec_stage2_v2.0.yaml | 66 +++++++++++ ...rain_idrnd_resnet202_xvec_stage1_v2.0.yaml | 70 ++++++++++++ ...rain_idrnd_resnet202_xvec_stage2_v2.0.yaml | 66 +++++++++++ egs/voxceleb/v1.1/datapath.sh | 1 + ...onfig_fbank80_stmn_ecapatdnn2048x4.v2.0.sh | 8 +- ...config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | 6 +- ...onfig_fbank80_stmn_idrnd_resnet100.v2.0.sh | 45 ++++++++ ...onfig_fbank80_stmn_idrnd_resnet202.v2.0.sh | 45 ++++++++ egs/voxceleb/v1.1/run_001_prepare_data.sh | 21 +--- egs/voxceleb/v1.1/run_002_compute_evad.sh | 62 ++++++----- egs/voxceleb/v1.1/run_030_extract_xvectors.sh | 5 +- egs/voxceleb/v1.1/run_040_eval_be.sh | 105 +++++++++++++++++- egs/voxceleb/v1/local/score_voxsrc22_dev.sh | 21 ++++ hyperion/torch/layer_blocks/se_blocks.py | 2 +- hyperion/torch/narchs/resnet.py | 28 +++-- 18 files changed, 592 insertions(+), 67 deletions(-) create mode 100644 egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet202.v2.0.sh create mode 100755 egs/voxceleb/v1/local/score_voxsrc22_dev.sh diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 5b5b93e5..83027e16 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -87,6 +87,40 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr ### VoxCeleb 1 Original-Clean trial list +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.09 | 0.068 | 0.121 | +| | | | Cosine + AS-Norm | 1.0 | 0.064 | 0.110 | +| | | | Cosine + QMF | 0.87 | 0.059 | 0.076 | + +### VoxCeleb 1 Entire-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.21 | 0.075 | 0.129 | +| | | | Cosine + AS-Norm | 1.15 | 0.069 | 0.113 | +| | | | Cosine + QMF | 1.12 | 0.067 | 0.111 | + +### VoxCeleb 1 Hard-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.17 | 0.129 | 0.212 | +| | | | Cosine + AS-Norm | 1.98 | 0.116 | 0.190 | +| | | | Cosine + QMF | 1.88 | 0.112 | 0.181 | + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | 
------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.85 | 0.187 | 0.310 | +| | | | Cosine + AS-Norm | 2.69 | 0.182 | 0.310 | +| | | | Cosine + QMF | 2.80 | 0.196 | 0.338 | + +## Results before 2023 + +### VoxCeleb 1 Original-Clean trial list + | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | | config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml index 4a4a8a88..b6163f14 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml @@ -14,7 +14,7 @@ data: min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 class_name: class_id - seg_weight_mode: uniform + seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: num_workers: 8 @@ -33,7 +33,7 @@ data: min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 class_name: class_id - seg_weight_mode: uniform + seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: num_workers: 8 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml new file mode 100644 index 00000000..fba4ce80 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + se_r: 4 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml new file mode 100644 index 00000000..6c209a9f --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + 
min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml new file mode 100644 index 00000000..bff34263 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + se_r: 4 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml new file mode 100644 index 00000000..e4e6d97a --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + 
max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/datapath.sh b/egs/voxceleb/v1.1/datapath.sh index 9a2f7529..a7eb575c 100644 --- a/egs/voxceleb/v1.1/datapath.sh +++ b/egs/voxceleb/v1.1/datapath.sh @@ -13,6 +13,7 @@ elif [ "$(hostname --domain)" == "cm.gemini" ];then # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 musan_root=/expscratch/dgromero/corpora-open/musan else echo "Put your database paths here" diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh index 14f2cdb4..e9c634a3 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh @@ -17,7 +17,7 @@ nnet_name=${feat_type}_ecapatdnn2048x4.v2.0 nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0040.pth nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml nnet_s2_name=${nnet_name}.s2 @@ -27,9 +27,9 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth # back-end do_plda=false -#do_snorm=true -#do_qmf=true -do_voxsrc22=false +do_snorm=true +do_qmf=true +do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh index 68990732..1f6eb371 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v2.0.sh @@ -27,9 +27,9 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth # back-end do_plda=false -#do_snorm=true -#do_qmf=true -do_voxsrc22=false +do_snorm=true +do_qmf=true +do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh new file mode 100644 index 00000000..b9363c3f --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh @@ -0,0 +1,45 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet100.v2.0 + +nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet202.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet202.v2.0.sh new file mode 100644 index 00000000..3de2f432 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet202.v2.0.sh @@ -0,0 +1,45 @@ +# Voxsrc22 Ravana ResNet202 network + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet202.v2.0 + +nnet_s1_base_cfg=conf/train_idrnd_resnet202_xvec_stage1_v2.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/run_001_prepare_data.sh b/egs/voxceleb/v1.1/run_001_prepare_data.sh index 037efda1..44385610 100755 --- a/egs/voxceleb/v1.1/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.1/run_001_prepare_data.sh @@ -12,7 +12,7 @@ config_file=default_config.sh . parse_options.sh || exit 1; . datapath.sh - +. $config_file if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. 
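The next hunk below drops the commented-out odd/even speaker split in favor of local/make_vox2_trials.py. For reference, a minimal Python equivalent of that removed split; this is a hypothetical helper, assuming Kaldi-style utt2spk files and VoxCeleb2 speaker ids of the form idNNNNN:

    # Split utterances by speaker-id parity: odd speakers for the cohort,
    # even speakers for QMF training, as the removed shell code did.
    def split_by_spk_parity(utt2spk_path):
        odd, even = [], []
        with open(utt2spk_path) as f:
            for line in f:
                utt, spk = line.split()
                (odd if int(spk[2:]) % 2 == 1 else even).append((utt, spk))
        return odd, even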
@@ -34,22 +34,13 @@ if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then --output-dir data/voxsrc22_dev fi -if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then - local/prepare_voxsrc22_test.py \ - --corpus-dir $voxsrc22_root \ - --output-dir data/voxsrc22_test -fi +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then +# local/prepare_voxsrc22_test.py \ +# --corpus-dir $voxsrc22_root \ +# --output-dir data/voxsrc22_test +# fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then # # split vox2 into 2 parts, for cohort and qmf training - # utils/copy_data_dir.sh data/voxceleb2cat_train data/voxceleb2cat_train_odd - # utils/copy_data_dir.sh data/voxceleb2cat_train data/voxceleb2cat_train_even - # awk 'int(substr($2,3)) % 2 == 1' data/voxceleb2cat_train/utt2spk > data/voxceleb2cat_train_odd/utt2spk - # utils/fix_data_dir.sh data/voxceleb2cat_train_odd - # awk 'int(substr($2,3)) % 2 == 0' data/voxceleb2cat_train/utt2spk > data/voxceleb2cat_train_even/utt2spk - # utils/fix_data_dir.sh data/voxceleb2cat_train_even - # # we keep 3 utts per speaker - # utils/subset_data_dir.sh --per-spk data/voxceleb2cat_train_odd 3 data/voxceleb2cat_train_subset_cohort - # utils/subset_data_dir.sh --per-spk data/voxceleb2cat_train_even 3 data/voxceleb2cat_train_subset_qmf local/make_vox2_trials.py --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index eeae00ac..7a2a9be5 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -19,39 +19,43 @@ config_file=default_config.sh if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $vaddir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $vaddir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $vaddir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $vaddir/storage - else - echo "we don't distribute data between multiple machines" - fi + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then + dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" fi + fi fi #Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? 
$num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done +if [ $stage -le 2 ];then + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? $num_spk:40)) + hyp_utils/feats/make_evad.sh \ + --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done fi diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh index c42f96bb..4e6a8790 100755 --- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh @@ -70,7 +70,10 @@ fi if [ $stage -le 2 ]; then # Extracts x-vectors for evaluation - for name in voxceleb1_test + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data do num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') nj=$(($num_spk < 100 ? $num_spk:100)) diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index 18c5eeeb..abbdb20c 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -110,9 +110,37 @@ if [ $stage -le 3 ];then fi +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $score_cosine_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $score_cosine_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir + + for f in $(ls $score_cosine_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + +fi + if [ "$do_snorm" == "true" ];then - if [ $stage -le 4 ];then + if [ $stage -le 5 ];then echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" steps_be/eval_be_cos_snorm.sh \ --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ @@ -133,11 +161,42 @@ if [ "$do_snorm" == "true" ];then echo "" done fi + + if [ $stage -le 6 ];then + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_snorm_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done 
+ fi fi if [ "$do_qmf" == "true" ];then - if [ $stage -le 5 ];then + if [ $stage -le 7 ];then echo "Train QMF in Vox2" steps_be/train_be_cos_qmf.sh \ --cmd "$train_cmd" --coh-nbest 1000 \ @@ -151,7 +210,7 @@ if [ "$do_qmf" == "true" ];then fi - if [ $stage -le 6 ];then + if [ $stage -le 8 ];then echo "Eval Voxceleb 1 with Cosine scoring" steps_be/eval_be_cos_qmf.sh \ @@ -180,6 +239,46 @@ if [ "$do_qmf" == "true" ];then done fi + + if [ $stage -le 9 ];then + echo "Eval voxsrc2 with Cosine scoring" + # steps_be/eval_be_cos_qmf.sh \ + # --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + # data/voxsrc22_dev/trials \ + # data/voxsrc22_dev/utt2model \ + # $xvector_dir/voxsrc22_dev/xvector.scp \ + # $xvector_dir/voxsrc22_dev/utt2num_frames \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_qmf_dir/qmf.h5 \ + # $score_cosine_qmf_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $xvector_dir/voxsrc22_test/utt2num_frames \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_qmf_dir/qmf.h5 \ + # $score_cosine_qmf_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results); + do + echo $f + cat $f + echo "" + done + fi + fi diff --git a/egs/voxceleb/v1/local/score_voxsrc22_dev.sh b/egs/voxceleb/v1/local/score_voxsrc22_dev.sh new file mode 100755 index 00000000..f4649fb7 --- /dev/null +++ b/egs/voxceleb/v1/local/score_voxsrc22_dev.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# Apache 2.0. 
+#
+if [ $# -ne 2 ] && [ $# -ne 3 ]; then
+    echo "Usage: $0 <data-dir> <score-dir> [suffix]"
+    exit 1;
+fi
+
+set -e
+
+data_dir=$1
+score_dir=$2
+suffix=$3
+
+echo "Score voxsrc22 dev"
+key=$data_dir/trials
+#Compute performance
+python local/score_dcf.py --key-file $key --score-file $score_dir/voxsrc22_dev_scores$suffix --output-path $score_dir/voxsrc22_dev$suffix
+
+
diff --git a/hyperion/torch/layer_blocks/se_blocks.py b/hyperion/torch/layer_blocks/se_blocks.py
index c53d5ecc..b14c2b60 100644
--- a/hyperion/torch/layer_blocks/se_blocks.py
+++ b/hyperion/torch/layer_blocks/se_blocks.py
@@ -177,7 +177,7 @@ def forward(self, x, x_mask=None):
         """
         x = x.transpose(1, 2)
         y = super().forward(x, x_mask)
-        y = y.tranpose(1, 2).continous()
+        y = y.transpose(1, 2).contiguous()
         return y
 
diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py
index e3264f33..59143c2e 100644
--- a/hyperion/torch/narchs/resnet.py
+++ b/hyperion/torch/narchs/resnet.py
@@ -10,10 +10,16 @@
 import torch.nn as nn
 from torch.nn import BatchNorm1d, Conv1d, Linear
 
-from ..layer_blocks import (Res2NetBasicBlock, Res2NetBNBlock,
-                            ResNetBasicBlock, ResNetBNBlock,
-                            ResNetEndpointBlock, ResNetInputBlock,
-                            SEResNetBasicBlock, SEResNetBNBlock)
+from ..layer_blocks import (
+    Res2NetBasicBlock,
+    Res2NetBNBlock,
+    ResNetBasicBlock,
+    ResNetBNBlock,
+    ResNetEndpointBlock,
+    ResNetInputBlock,
+    SEResNetBasicBlock,
+    SEResNetBNBlock,
+)
 from ..layers import ActivationFactory as AF
 from ..layers import NormLayer2dFactory as NLF
 from ..utils import scale_seq_lengths, seq_lengths_to_mask
@@ -296,7 +302,11 @@ def _make_layer(self, block, channels, num_blocks, stride=1, dilate=False):
                 kwargs = {"se_r": self.se_r}
             else:
                 num_feats = int(self.in_feats / (self._downsample_factor * stride))
-                kwargs = {"se_r": self.se_r, "time_se": True, "num_feats": num_feats}
+                kwargs = {
+                    "se_r": self.se_r,
+                    "se_type": self.se_type,
+                    "num_feats": num_feats,
+                }
 
         if self.is_res2net:
             kwargs["scale"] = self.res2net_scale
@@ -972,7 +982,7 @@ def __init__(self, in_channels, **kwargs):
         kwargs["base_channels"] = 128
         kwargs["resb_channels"] = [128, 128, 256, 256]
         kwargs["se_type"] = "fw-se"
-        super().__init__("basic", [6, 16, 24, 3], in_channels, **kwargs)
+        super().__init__("sebasic", [6, 16, 24, 3], in_channels, **kwargs)
 
 
 class FwSEIdRndResNet202(ResNet):
@@ -980,7 +990,7 @@ def __init__(self, in_channels, **kwargs):
         kwargs["base_channels"] = 128
         kwargs["resb_channels"] = [128, 128, 256, 256]
         kwargs["se_type"] = "fw-se"
-        super().__init__("basic", [6, 16, 75, 3], in_channels, **kwargs)
+        super().__init__("sebasic", [6, 16, 75, 3], in_channels, **kwargs)
 
 
 # Channel-Freq-wise Squezee-Excitation ResNets
@@ -1083,7 +1093,7 @@ def __init__(self, in_channels, **kwargs):
         kwargs["base_channels"] = 128
         kwargs["resb_channels"] = [128, 128, 256, 256]
         kwargs["se_type"] = "cfw-se"
-        super().__init__("basic", [6, 16, 24, 3], in_channels, **kwargs)
+        super().__init__("sebasic", [6, 16, 24, 3], in_channels, **kwargs)
 
 
 class CFwSEIdRndResNet202(ResNet):
@@ -1091,7 +1101,7 @@ def __init__(self, in_channels, **kwargs):
         kwargs["base_channels"] = 128
         kwargs["resb_channels"] = [128, 128, 256, 256]
         kwargs["se_type"] = "cfw-se"
-        super().__init__("basic", [6, 16, 75, 3], in_channels, **kwargs)
+        super().__init__("sebasic", [6, 16, 75, 3], in_channels, **kwargs)
 
 
 #################### Res2Net variants ########################

From 21a47643c17d5ec3e7f739c1371470a0c2df1db7 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Mon, 30 Jan 2023 07:16:55 -0500
Subject: [PATCH 079/154] uncomment script

---
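Note: this patch re-enables the voxsrc22 QMF evaluation prepared in PATCH 076. As a rough sketch of what the QMF stage computes at eval time, assuming the trained model is a linear fusion over the s-normed score and the quality measures built in eval-be-cos-qmf.py (w and b stand for the trained fusion weights and bias; all names here are illustrative, not the hyperion API):

    import numpy as np

    # Stack the s-normed scores and the four quality measures into a feature
    # matrix with one row per trial, then apply the linear fusion.
    def apply_qmf(scores_norm, maxnf, minnf, maxcohmu, mincohmu, w, b):
        feats = np.stack(
            [scores_norm.ravel(), maxnf.ravel(), minnf.ravel(),
             maxcohmu.ravel(), mincohmu.ravel()], axis=1)
        return (feats @ w + b).reshape(scores_norm.shape)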
egs/voxceleb/v1.1/run_040_eval_be.sh | 58 +++++----------------------- 1 file changed, 10 insertions(+), 48 deletions(-) diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index abbdb20c..358e2acf 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -242,16 +242,16 @@ if [ "$do_qmf" == "true" ];then if [ $stage -le 9 ];then echo "Eval voxsrc2 with Cosine scoring" - # steps_be/eval_be_cos_qmf.sh \ - # --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ - # data/voxsrc22_dev/trials \ - # data/voxsrc22_dev/utt2model \ - # $xvector_dir/voxsrc22_dev/xvector.scp \ - # $xvector_dir/voxsrc22_dev/utt2num_frames \ - # data/voxceleb2cat_train/utt2spk \ - # $xvector_dir/voxceleb2cat_train/xvector.scp \ - # $score_cosine_qmf_dir/qmf.h5 \ - # $score_cosine_qmf_dir/voxsrc22_dev_scores & + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $xvector_dir/voxsrc22_dev/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxsrc22_dev_scores & # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \ # data/voxsrc22_test/trials \ @@ -281,41 +281,3 @@ if [ "$do_qmf" == "true" ];then fi - -exit -# be_dir=exp/be/$nnet_name/cw -# score_plda_dir=$score_dir/cw_cosine - -# if [ $stage -le 4 ]; then -# echo "Train centering+whitening on Voxceleb2" -# steps_be/train_be_v2.sh --cmd "$train_cmd" \ -# $xvector_dir/$plda_data/xvector.scp \ -# data/$plda_data \ -# $be_dir -# fi - - -# if [ $stage -le 5 ];then - -# echo "Eval Voxceleb 1 with CentWhiten + Cosine scoring" -# steps_be/eval_be_v2.sh --cmd "$train_cmd" \ -# data/voxceleb1_test/trials \ -# data/voxceleb1_test/utt2model \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $be_dir/cw.h5 \ -# $score_plda_dir/voxceleb1_scores - -# $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# fi - -# exit - From 16b1316ab478a0001d21f57389ed8af30107ce2f Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 30 Jan 2023 10:45:01 -0500 Subject: [PATCH 080/154] fix bug --- egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py b/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py index 90650941..82050ed1 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py +++ b/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py @@ -89,9 +89,12 @@ def eval_plda( if not np.any(ndx.trial_mask): save_empty(score_file, None, model_part_idx, seg_part_idx, parallel) + save_empty(score_file, "snorm", model_part_idx, seg_part_idx, parallel) if qmf_file is None: for q_name in ["snorm", "maxnf", "minnf", "maxcohmu", "mincohmu"]: save_empty(score_file, q_name, model_part_idx, seg_part_idx, parallel) + else: + save_empty(score_file, "qmf", model_part_idx, seg_part_idx, parallel) return logging.info("read num_frames") From 76ac6f3c2363bcd6583bbc3058bb3e8e9fc6dd5c Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 9 Feb 2023 12:28:27 -0500 Subject: [PATCH 081/154] audio dataset with dictionary for asr --- ...n_wav2vec2base_transducer_stage1_v5.0.yaml | 53 +++++++ .../conf/wav2vec2base_transducer_do0.4.yaml | 13 ++ 
.../v1/global_conf/config_transducer_v5.0.sh | 32 +++++ hyperion/bin/train_wav2vec2transducer.py | 11 +- hyperion/torch/data/audio_dataset.py | 2 +- hyperion/torch/trainers/transducer_trainer.py | 136 +++++++++++------- 6 files changed, 195 insertions(+), 52 deletions(-) create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml create mode 100644 egs/librispeech/v1/conf/wav2vec2base_transducer_do0.4.yaml create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v5.0.sh diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml new file mode 100644 index 00000000..c23a4f11 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml @@ -0,0 +1,53 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 75. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 75. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2base_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 42000 + hold_steps: 15000 + min_lr: 4e-5 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/wav2vec2base_transducer_do0.4.yaml b/egs/librispeech/v1/conf/wav2vec2base_transducer_do0.4.yaml new file mode 100644 index 00000000..3707672a --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2base_transducer_do0.4.yaml @@ -0,0 +1,13 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +transducer: + decoder: + embedding_dim: 1024 + num_layers: 2 + hidden_dim: 512 + embedding_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + joiner: + num_layers: 1 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v1/global_conf/config_transducer_v5.0.sh b/egs/librispeech/v1/global_conf/config_transducer_v5.0.sh new file mode 100644 index 00000000..b1da75b7 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v5.0.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_transducer_stage1_v5.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v5.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0075.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git 
a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index ee60080a..8b945217 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -41,14 +41,19 @@ def transducer_collate(batch): audio_length = [] target = [] for record in batch: - wav = torch.as_tensor(record[0]) + wav = torch.as_tensor(record["x"]) audio.append(wav) audio_length.append(wav.shape[0]) - target.append(record[1]) + target.append(record["text"]) audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) - return torch.transpose(audio, 0, 1), audio_length, target + batch = { + "x": torch.transpose(audio, 0, 1), + "x_lengths": audio_length, + "text": target, + } + return batch def init_data(partition, rank, num_gpus, **kwargs): diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 20deb039..b352f94d 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -259,7 +259,7 @@ def _get_segment_info(self, seg_id): seg_info_i = class_info.loc[seg_info_i, "class_idx"] if info_name == "text": - seg_info = self.sp.encode(seg_info, out_type=int) + seg_info_i = self.sp.encode(seg_info_i, out_type=int) seg_info[info_name] = seg_info_i diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 932c3ed4..cbf94bc0 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -7,11 +7,14 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torchaudio import torch.nn as nn -from ..utils import MetricAcc +from ...utils.misc import filter_func_args +from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer from torch.distributed.elastic.multiprocessing.errors import record @@ -47,6 +50,7 @@ class TransducerTrainer(TorchTrainer): swa_anneal_epochs: SWA learning rate anneal epochs cpu_offload: CPU offload of gradients when using fully sharded ddp """ + def __init__( self, model, @@ -75,39 +79,42 @@ def __init__( swa_lr=1e-3, swa_anneal_epochs=10, cpu_offload=False, + input_key="x", + target_key="text", ): - if loss is None: - # TODO: Check and Modify loss - loss = nn.CrossEntropyLoss() - super().__init__( - model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - eff_batch_size=eff_batch_size, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload, - ) + loss = None + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + + # super().__init__( + # model, + # None, + # optim, + # epochs, + # exp_path, + # cur_epoch=cur_epoch, + # grad_acc_steps=grad_acc_steps, + # eff_batch_size=eff_batch_size, + # device=device, + # metrics=metrics, + # lrsched=lrsched, + # loggers=loggers, + # ddp=ddp, + # ddp_type=ddp_type, + # train_mode=train_mode, + # use_amp=use_amp, + # log_interval=log_interval, + # use_tensorboard=use_tensorboard, + # use_wandb=use_wandb, + # wandb=wandb, + # grad_clip=grad_clip, + # grad_clip_norm=grad_clip_norm, + # 
swa_start=swa_start, + # swa_lr=swa_lr, + # swa_anneal_epochs=swa_anneal_epochs, + # cpu_offload=cpu_offload, + # ) @record def train_epoch(self, data_loader): @@ -116,29 +123,35 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels. """ - + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key + ] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() self.sp = data_loader.dataset.sp - for batch, (data, audio_length, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - # TODO: Check and Modify data, target - data, audio_length, target = data.to(self.device), audio_length.to( - self.device), target.to(self.device) - batch_size = data.shape[0] + + # # TODO: Check and Modify data, target + # data, audio_length, target = data.to(self.device), audio_length.to( + # self.device), target.to(self.device) + #print(data.keys(), batch_keys, flush=True) + input_data, input_lengths, target = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] with self.amp_autocast(): # print("xx", data.shape, data.shape[0] * data.shape[1] / 16000, # torch.sum(audio_length).item() / 16000, # torch.min(audio_length).item() / 16000, # torch.max(audio_length).item() / 16000) - output, loss = self.model(data, - x_lengths=audio_length, + output, loss = self.model(input_data, + x_lengths=input_lengths, y=target) loss = loss.mean() / self.grad_acc_steps @@ -173,7 +186,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): data_loader: PyTorch data loader return input/output pairs. sw_update_bn: wheter or not, update batch-norm layers in SWA. """ - + batch_keys = [ + self.input_key, f"{self.input_key}_lengths", self.target_key + ] metric_acc = MetricAcc(self.device) batch_metrics = ODict() with torch.no_grad(): @@ -184,17 +199,22 @@ def validation_epoch(self, data_loader, swa_update_bn=False): log_tag = "val_" self.model.eval() - for batch, (data, audio_length, target) in enumerate(data_loader): - data, audio_length, target = data.to( - self.device), audio_length.to(self.device), target.to( - self.device) - batch_size = data.shape[0] + for batch, data in enumerate(data_loader): + + input_data, input_lengths, target = tensors_subset( + data, batch_keys, self.device) + batch_size = input_data.shape[0] + + # data, audio_length, target = data.to( + # self.device), audio_length.to(self.device), target.to( + # self.device) + # batch_size = data.shape[0] # data, target = data.to(self.device), target.to(self.device) # batch_size = data.shape[0] with self.amp_autocast(): - output, loss = self.model(data, - x_lengths=audio_length, + output, loss = self.model(input_data, + x_lengths=input_lengths, y=target) # output = self.model(data) # loss = self.loss(output, target) @@ -208,3 +228,23 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = metric_acc.metrics logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + super_skip = skip.copy() + super_skip.add("target_key") + TorchTrainer.add_class_args(parser, + train_modes=train_modes, + skip=super_skip) + if "target_key" not in skip: + parser.add_argument("--target-key", + default="text", + help="dict. 
key for nnet targets") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) From aabbef1ae71e3580f8582058b523041a3108474a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 10 Feb 2023 14:54:47 -0500 Subject: [PATCH 082/154] start refactorizing rnn-t --- .../torch/layer_blocks/transducer_joiner.py | 99 +++++++ .../layer_blocks/transducer_predictor.py | 224 ++++++++++++++++ .../torch/models/transducer/rnn_transducer.py | 250 ++++++++++++++++++ .../torch/models/transducer/transducer.py | 24 +- .../torch/narchs/rnn_transducer_decoder.py | 95 +++++++ 5 files changed, 679 insertions(+), 13 deletions(-) create mode 100644 hyperion/torch/layer_blocks/transducer_joiner.py create mode 100644 hyperion/torch/layer_blocks/transducer_predictor.py create mode 100644 hyperion/torch/models/transducer/rnn_transducer.py create mode 100644 hyperion/torch/narchs/rnn_transducer_decoder.py diff --git a/hyperion/torch/layer_blocks/transducer_joiner.py b/hyperion/torch/layer_blocks/transducer_joiner.py new file mode 100644 index 00000000..ee7a667b --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_joiner.py @@ -0,0 +1,99 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import logging +from typing import Optional, Tuple + +import torch +import torch.nn as nn + + +class TransducerJoiner(nn.Module): + """ RNN-T Joiner network. + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/joiner.py + + Attributes: + in_feats: input feature dimension. + vocab_size: vocabulary size + """ + + def __init__(self, in_feats: int, vocab_size: int): + super().__init__() + self.in_feats = in_feats + self.vocab_size = vocab_size + + self.output = nn.Linear(in_feats, out_dims) + + def forward(self, encoder_out: torch.Tensor, + pred_out: torch.Tensor) -> torch.Tensor: + """ + Args: + encoder_out: Output from the encoder with shape = (N, T, C). + pred_out: Output from the predictor with shape = (N, U, C). + Returns: + Return a tensor of shape (N, T, U, C). 
+ """ + assert encoder_out.ndim == pred_out.ndim == 3 + assert encoder_out.size(0) == pred_out.size(0) + assert encoder_out.size(2) == pred_out.size(2) + + encoder_out = encoder_out.unsqueeze(2) + # Now encoder_out is (N, T, 1, C) + pred_out = pred_out.unsqueeze(1) + # Now decoder_out is (N, 1, U, C) + x = torch.tanh(encoder_out + pred_out) + + logits = self.output(x) + return logits + + def get_config(self): + config = { + "in_feats": self.in_feats, + "out_dims": self.out_dims, + "num_layers": self.num_layers, + } + + # base_config = super().get_config() + return dict(list(config.items())) + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "in_feats", + "out_dims", + "num_layers", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + @staticmethod + def add_class_args(parser, + prefix=None, + skip=set(["in_feats", "out_dims"])): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument("--in-feats", + type=int, + required=True, + help=("input feature dimension")) + + if "out_dims" not in skip: + parser.add_argument("--out-dims", + type=int, + required=True, + help=("output feature dimension (vocab size)")) + parser.add_argument("--num-layers", + default=1, + type=int, + help=("layers of the joiner")) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/layer_blocks/transducer_predictor.py b/hyperion/torch/layer_blocks/transducer_predictor.py new file mode 100644 index 00000000..178c423a --- /dev/null +++ b/hyperion/torch/layer_blocks/transducer_predictor.py @@ -0,0 +1,224 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +import logging +from typing import Optional, Tuple + +import torch +import torch.nn as nn + + +class TransducerPredictor(nn.Module): + """ RNN-T prediction network. + Implmentation based on: + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/decoder.py + + Attributes: + vocab_size: Number of tokens of the modeling unit including blank. + embed_dim: Dimension of the input embedding. + blank_id: The ID of the blank symbol. + num_layers: Number of LSTM layers. + hid_feats: Hidden dimension of LSTM layers. + out_feats: Output dimension of the predictor. + embed_dropout_rate: Dropout rate for the embedding layer. + rnn_dropout_rate: Dropout for LSTM layers. 
+ + """ + + def __init__(self, + vocab_size: int, + embed_dim: int, + num_layers: int, + hid_feats: int, + out_feats: int, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + rnn_type: str = "lstm", + blank_id: int = 0): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embed_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + if rnn_type == "lstm": + self.rnn = nn.LSTM( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + batch_first=True, + dropout=rnn_dropout_rate, + ) + elif rnn_type == "gru": + self.rnn = nn.GRU( + input_size=embed_dim, + hidden_size=hid_feats, + num_layers=num_layers, + batch_first=True, + dropout=rnn_dropout_rate, + ) + else: + raise Exception(f"Unknown RNN type {rnn_type}") + + self.out_feats = out_feats + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.num_layers = num_layers + self.hid_feats = hid_feats + self.embed_dropout_rate = embed_dropout_rate + self.rnn_dropout_rate = rnn_dropout_rate + self.output = nn.Linear(hid_feats, in_feats) + + def forward( + self, + y: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Args: + y: previous y_{ prepended. + states: tuple of tensors containing RNN layers states + Returns: + - rnn_output, a tensor of shape (N, U, C) + - (h, c), containing the states i for RNN layers with shape (num_layers, N, C). + """ + embed = self.embedding(y) + embed = self.embed_dropout(embed) + rnn_out, (h, c) = self.rnn(embed, states) + out = self.output(rnn_out) + + return out, (h, c) + + def get_config(self): + config = { + "in_feats": self.in_feats, + "blank_id": self.blank_id, + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "num_layers": self.num_layers, + "hid_feats": self.hid_feats, + "embed_dropout_rate": self.embed_dropout_rate, + "rnn_dropout_rate": self.rnn_dropout_rate, + } + + # base_config = super().get_config() + return dict(list(config.items())) + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "in_feats", + "blank_id", + "vocab_size", + "embed_dim", + "num_layers", + "hid_feats", + "embed_dropout_rate", + "rnn_dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + @staticmethod + def filter_finetune_args(**kwargs): + valid_args = ( + "embed_dropout_rate", + "rnn_dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + return args + + @staticmethod + def add_class_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument("--in-feats", + type=int, + required=True, + help=("input feature dimension")) + if "blank_id" not in skip: + parser.add_argument("--blank-id", + type=int, + required=True, + help=("blank id from sp model")) + if "vocab_size" not in skip: + parser.add_argument("--vocab-size", + type=int, + required=True, + help=("output prediction dimension")) + parser.add_argument("--embedding-dim", + default=1024, + type=int, + help=("feature dimension")) + parser.add_argument("--embedding-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN ")) + 
+ parser.add_argument("--num-layers", default=2, type=int, help=("")) + + parser.add_argument("--hidden-dim", default=512, type=int, help=("")) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + ): + logging.info("changing decoder config") + + if override_dropouts: + logging.info("overriding decoder dropouts") + self.rnn_dropout_rate = rnn_dropout_rate + self.rnn.p = self.rnn_dropout_rate + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) + + @staticmethod + def add_finetune_args(parser, + prefix=None, + skip=set(["in_feats", "blank_id", "vocab_size"])): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model.")) + parser.add_argument("--embedding-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings")) + parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN ")) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py new file mode 100644 index 00000000..dd91da5f --- /dev/null +++ b/hyperion/torch/models/transducer/rnn_transducer.py @@ -0,0 +1,250 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Note we use `rnnt_loss` from torchaudio, which exists only in +torchaudio >= v0.10.0. It also means you have to use torch >= v1.10.0 +""" +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +import logging +import torch +import torch.nn as nn +import torchaudio +import torchaudio.functional +#from .encoder_interface import EncoderInterface + +from ...torch_model import TorchModel +from hyperion.utils.text import add_sos +# from .conformer import Conformer +from .decoder import Decoder +from .joiner import Joiner + + +class Transducer(TorchModel): + """It implements https://arxiv.org/pdf/1211.3711.pdf + "Sequence Transduction with Recurrent Neural Networks" + """ + + def __init__( + self, + encoder, + # conformer_enc, + decoder, + joiner, + vocab_size, + blank_id, + ): + """ + Args: + encoder: + It is the transcription network in the paper. Its accepts + two inputs: `x` of (N, T, C) and `x_lengths` of shape (N,). + It returns two tensors: `logits` of shape (N, T, C) and + `logit_lens` of shape (N,). 
+ decoder: + It is the prediction network in the paper. Its input shape + is (N, U) and its output shape is (N, U, C). It should contain + one attribute: `blank_id`. + joiner: + It has two inputs with shapes: (N, T, C) and (N, U, C). Its + output shape is (N, T, U, C). Note that its output contains + unnormalized probs, i.e., not processed by log-softmax. + """ + super().__init__() + decoder["blank_id"] = blank_id + decoder["vocab_size"] = vocab_size + joiner["out_dims"] = vocab_size + + self.vocab_size = vocab_size + self.blank_id = blank_id + self.encoder = encoder + self.decoder = Decoder(**decoder) + self.joiner = Joiner(**joiner) + + def forward( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: k2.RaggedTensor, + ) -> torch.Tensor: + """ + Args: + x: + A 3-D tensor of shape (N, T, C). + x_lengths: + A 1-D tensor of shape (N,). It contains the number of frames in `x` + before padding. + y: + A ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + Returns: + Return the transducer loss. + """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lengths.size(0) == y.dim0 + + # wav2vec2 works as encoder + # encoder_out, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) + + encoder_out = x + # Now for the decoder, i.e., the prediction network + row_splits = y.shape.row_splits(1) + y_lens = row_splits[1:] - row_splits[:-1] + + blank_id = self.decoder.blank_id + sos_y = add_sos(y, sos_id=blank_id) + + sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) + sos_y_padded = sos_y_padded.to(torch.int64) + + decoder_out, _ = self.decoder(sos_y_padded) + + logits = self.joiner(encoder_out, decoder_out) + + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + + assert hasattr(torchaudio.functional, "rnnt_loss"), ( + f"Current torchaudio version: {torchaudio.__version__}\n" + "Please install a version >= 0.10.0") + + x_lengths = x_lengths.to(torch.int32) + + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded.to(torch.int32), + logit_lengths=x_lengths, + target_lengths=y_lens, + blank=blank_id, + reduction="sum", + ) + + return logits, loss + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode == "ft-embed-affine": + self.unfreeze() + self.freeze_preembed_layers() + else: + raise ValueError(f"invalid train_mode={mode}") + + self._train_mode = mode + + def _train(self, train_mode: str): + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return ["full", "frozen", "ft-embed-affine"] + + def get_config(self): + dec_cfg = self.decoder.get_config() + join_cfg = self.joiner.get_config() + + config = { + "blank_id": self.blank_id, + "vocab_size": self.vocab_size, + "decoder": dec_cfg, + "joiner": join_cfg, + } + + # base_config = super().get_config() + return dict(list(config.items())) + + @staticmethod + def filter_args(**kwargs): + + # get arguments for pooling + decoder_args = Decoder.filter_args(**kwargs["decoder"]) + joiner_args = Joiner.filter_args(**kwargs["joiner"]) + + valid_args = () + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + args["decoder"] = decoder_args + args["joiner"] = 
joiner_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Decoder.add_class_args(parser, prefix="decoder") + Joiner.add_class_args(parser, prefix="joiner") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + def change_config( + self, + decoder, + # joiner, + ): + logging.info("changing transducer config") + self.decoder.change_config(**decoder) + # self.joiner.change_config(**joiner) + + @staticmethod + def filter_finetune_args(**kwargs): + # get arguments for pooling + decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) + # joiner_args = Joiner.filter_finetune_args(**kwargs["joiner"]) + + valid_args = () + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + args["decoder"] = decoder_args + # args["joiner"] = joiner_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Decoder.add_finetune_args(parser, prefix="decoder") + # Joiner.add_finetune_args(parser, prefix="joiner") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + add_argparse_args = add_class_args + add_argparse_finetune_args = add_finetune_args diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index 8d2a09e8..855e1590 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -28,7 +28,7 @@ import torch.nn as nn import torchaudio import torchaudio.functional -from .encoder_interface import EncoderInterface +#from .encoder_interface import EncoderInterface from ...torch_model import TorchModel from hyperion.utils.text import add_sos @@ -41,13 +41,15 @@ class Transducer(TorchModel): """It implements https://arxiv.org/pdf/1211.3711.pdf "Sequence Transduction with Recurrent Neural Networks" """ + def __init__( self, - vocab_size, - blank_id, + encoder_net, # conformer_enc, decoder, joiner, + vocab_size, + blank_id, ): """ Args: @@ -66,9 +68,6 @@ def __init__( unnormalized probs, i.e., not processed by log-softmax. 
""" super().__init__() - # assert isinstance(encoder, EncoderInterface) - # assert hasattr(decoder, "blank_id") - decoder["blank_id"] = blank_id decoder["vocab_size"] = vocab_size joiner["out_dims"] = vocab_size @@ -211,10 +210,11 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - def change_config(self, + def change_config( + self, decoder, # joiner, - ): + ): logging.info("changing transducer config") self.decoder.change_config(**decoder) # self.joiner.change_config(**joiner) @@ -225,8 +225,7 @@ def filter_finetune_args(**kwargs): decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) # joiner_args = Joiner.filter_finetune_args(**kwargs["joiner"]) - valid_args = ( - ) + valid_args = () args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) args["decoder"] = decoder_args @@ -243,9 +242,8 @@ def add_finetune_args(parser, prefix=None): # Joiner.add_finetune_args(parser, prefix="joiner") if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) add_argparse_args = add_class_args add_argparse_finetune_args = add_finetune_args - - diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py new file mode 100644 index 00000000..ef153776 --- /dev/null +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -0,0 +1,95 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from jsonargparse import ActionParser, ArgumentParser + +import torch +import torch.nn as nn + +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +from ...utils import filter_func_args +from ..layer_blocks import TransducerPredictor as Predictor, TransducerJoiner as Joiner +from .net_arch import NetArch + + +class RNNTransducerDecoder(NetArch): + """ RNN-T Decoder composed of Predictor and Joiner networks + Implementation based on + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/transducer.py + + Attributes: + in_feats: input features dimension (encoder output) + vocab_size: Number of tokens of the modeling unit including blank. + embed_dim: Dimension of the predictor input embedding. + blank_id: The ID of the blank symbol. + num_layers: Number of LSTM layers. + hid_feats: Hidden dimension for predictor layers. + embed_dropout_rate: Dropout rate for the embedding layer. + rnn_dropout_rate: Dropout for LSTM layers. 
+ + """ + + def __init__(self, + in_feats: int, + vocab_size: int, + embed_dim: int, + num_pred_layers: int, + pred_hid_feats: int, + embed_dropout_rate: float = 0.0, + rnn_dropout_rate: float = 0.0, + rnn_type: str = "lstm", + blank_id: int = 0): + + super().__init__() + self.in_feats = in_feats + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.num_pred_layers = num_pred_layers + self.pred_hid_feats = pred_hid_feats + self.embed_dropout_rate = embed_dropout_rate + self.rnn_dropout_rate = rnn_dropout_rate + self.rnn_type = rnn_type + self.blank_id = blank_id + + pred_args = filter_func_args(Predictor.__init__, locals()) + pred_args["num_layers"] = num_pred_layers + pred_args["hid_feats"] = pred_hid_feats + pred_args["out_feats"] = in_feats + self.predictor = Predictor(**pred_args) + self.joiner = Joiner(in_feats, vocab_size) + + def forward(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: k2.RaggedTensor) -> torch.Tensor: + + # get y_lengths + row_splits = y.shape.row_splits(1) + y_lengths = row_splits[1:] - row_splits[:-1] + + # shift y adding token + sos_y = add_sos(y, sos_id=self.blank_id) + sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id) + sos_y_padded = sos_y_padded.to(torch.int64) + + # apply predictor and joiner + pred_out, _ = self.predictor(sos_y_padded) + logits = self.joiner(x, pred_out) + + # rnnt_loss requires 0 padded targets + # Note: y does not start with SOS + y_padded = y.pad(mode="constant", padding_value=0) + x_lengths = x_lengths.to(torch.int32) + loss = torchaudio.functional.rnnt_loss( + logits=logits, + targets=y_padded.to(torch.int32), + logit_lengths=x_lengths, + target_lengths=y_lengths, + blank=blank_id, + reduction="sum", + ) + return loss From 9a619a5f4808d318cb488504011dfa700ad44423 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 16 Feb 2023 08:58:02 -0500 Subject: [PATCH 083/154] got results with ravana resnet100 --- egs/voxceleb/v1.1/README.md | 12 ++++++++++++ .../train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml | 15 +++++++++++---- .../train_idrnd_resnet100_xvec_stage1_v2.0.yaml | 2 +- .../train_idrnd_resnet100_xvec_stage2_v2.0.yaml | 2 +- .../v1.1/conf/train_res2net50_xvec_default.yaml | 2 +- .../config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh | 12 ++++++++++-- .../config_fbank80_stmn_idrnd_resnet100.v2.0.sh | 5 ++--- 7 files changed, 38 insertions(+), 12 deletions(-) diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 83027e16..57a09ad8 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -92,6 +92,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.09 | 0.068 | 0.121 | | | | | Cosine + AS-Norm | 1.0 | 0.064 | 0.110 | | | | | Cosine + QMF | 0.87 | 0.059 | 0.076 | +| config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.053 | 0.083 | +| | | | Cosine + AS-Norm | 0.78 | 0.046 | 0.078 | +| | | | Cosine + QMF | 0.74 | 0.046 | 0.077 | ### VoxCeleb 1 Entire-Clean trial list @@ -100,6 +103,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.21 | 0.075 | 0.129 | | | | | Cosine + AS-Norm | 1.15 | 0.069 | 0.113 | | | | | Cosine + QMF | 1.12 | 0.067 | 0.111 | +| config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.92 | 0.058 | 0.104 | +| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.089 | +| | | | Cosine + QMF | 0.88 | 0.054 | 0.092 | ### VoxCeleb 1 Hard-Clean trial list @@ -108,6 +114,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.17 | 0.129 | 0.212 | | | | | Cosine + AS-Norm | 1.98 | 0.116 | 0.190 | | | | | Cosine + QMF | 1.88 | 0.112 | 0.181 | +| config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.80 | 0.106 | 0.171 | +| | | | Cosine + AS-Norm | 1.59 | 0.091 | 0.146 | +| | | | Cosine + QMF | 1.59 | 0.092 | 0.151 | ### VoxSRC2022 dev @@ -116,6 +125,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.85 | 0.187 | 0.310 | | | | | Cosine + AS-Norm | 2.69 | 0.182 | 0.310 | | | | | Cosine + QMF | 2.80 | 0.196 | 0.338 | +| config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.50 | 0.160 | 0.270 | +| | | | Cosine + AS-Norm | 2.31 | 0.139 | 0.240 | +| | | | Cosine + QMF | 2.54 | 0.153 | 0.248 | ## Results before 2023 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml index 4a4a8a88..e7a94225 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml @@ -10,11 +10,13 @@ data: sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 - max_chunk_length: 6.0 - min_chunk_length: 6.0 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + # max_chunk_length: 6.0 + # min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 class_name: class_id - seg_weight_mode: uniform + seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: num_workers: 8 @@ -33,7 +35,7 @@ data: min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 class_name: class_id - seg_weight_mode: uniform + seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: num_workers: 8 @@ -43,6 +45,11 @@ model: margin: 0.4 margin_warmup_epochs: 0 intertop_margin: 0.1 + # override_dropouts: false + # dropout_rate: 0.1 + # resnet_enc: + # override_dropouts: true + # dropout_rate: 0.1 trainer: optim: opt_type: sgd diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml index fba4ce80..b7f02a47 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml @@ -66,5 +66,5 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 40 + epochs: 20 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml index 6c209a9f..7e62ec72 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml @@ -59,7 +59,7 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 35 + epochs: 5 eff_batch_size: 256 swa_start: 31 swa_lr: 1e-4 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml b/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml index 1d387790..c7eb6ee1 100644 --- a/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml +++ b/egs/voxceleb/v1.1/conf/train_res2net50_xvec_default.yaml @@ -2,6 +2,6 @@ data: train: train_data_default.yaml val: val_data_default.yaml feats: fbank80_stmn_16k.yaml -model: resnet34.yaml +model: res2net50.yaml trainer: trainer_default.yaml \ No newline at end of file diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh index e9c634a3..0532754f 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v2.0.sh @@ -18,12 +18,20 @@ nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v2.0.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/model_ep0040.pth +nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0020.pth +#nnet_s1=$nnet_s1_dir/model_ep0010.pth nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml 
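+# NOTE: consecutive assignments to the same nnet_s1/nnet_s2 variable override
+# each other; bash keeps the last uncommented value, so stage 1 resumes from
+# model_ep0020.pth here.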
nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0030.pth -nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth +nnet_s2=$nnet_s2_dir/model_ep0010.pth +#nnet_s2=$nnet_s2_dir/model_ep0020.pth +#nnet_s2=$nnet_s2_dir/model_ep0010.pth +#nnet_s2=$nnet_s2_dir/model_ep0005.pth +#nnet_s2=$nnet_s2_dir/model_ep0002.pth +#nnet_s2=$nnet_s2_dir/model_ep0001.pth +#nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth # back-end do_plda=false diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh index b9363c3f..f71545b7 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v2.0.sh @@ -17,13 +17,12 @@ nnet_name=${feat_type}_idrnd_resnet100.v2.0 nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v2.0.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0040.pth +nnet_s1=$nnet_s1_dir/model_ep0020.pth nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0030.pth -nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth +nnet_s2=$nnet_s2_dir/model_ep0005.pth # back-end do_plda=false From 42f1ebdc8167a20238a82e7be0ee1cf19d89c5bd Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 16 Feb 2023 17:53:58 -0500 Subject: [PATCH 084/154] xxx --- .../v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml index 7e62ec72..2311b07b 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml @@ -10,8 +10,8 @@ data: sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 16 - max_chunk_length: 6.0 - min_chunk_length: 6.0 + max_chunk_length: 4.0 + min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 class_name: class_id seg_weight_mode: data-prior @@ -29,8 +29,8 @@ data: sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 16 - max_chunk_length: 6.0 - min_chunk_length: 6.0 + max_chunk_length: 4.0 + min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 class_name: class_id seg_weight_mode: data-prior From c2bd3dbbb79b8ce66a45c13935d63520be952c75 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 2 Mar 2023 10:56:05 -0500 Subject: [PATCH 085/154] ecapa v3 recipe --- egs/voxceleb/v1.1/README.md | 16 ++ ...rain_ecapatdnn2048x4_xvec_stage1_v3.0.yaml | 93 ++++++++ ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 71 ++++++ ...onfig_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | 44 ++++ egs/voxceleb/v1.1/run_030_extract_xvectors.sh | 2 +- egs/voxceleb/v1.1/run_040_eval_be.sh | 225 ++++++++++++++++++ .../{eval-be-v2.py => eval_be_cos.py} | 22 +- egs/voxceleb/v1/steps_be/eval_be_cos.sh | 11 +- ...{eval-be-cos-qmf.py => eval_be_cos_qmf.py} | 12 +- egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh | 7 +- ...al-be-v2-snorm.py => eval_be_cos_snorm.py} | 16 +- egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh | 11 +- .../{eval-be-v1.py => eval_be_plda_v1.py} | 0 .../{eval_be_v1.sh => eval_be_plda_v1.sh} | 4 +- egs/voxceleb/v1/steps_be/eval_be_v2.sh | 2 +- egs/voxceleb/v1/steps_be/train-be-v2.py | 82 ------- 
egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh | 4 +- .../{train-be-v1.py => train_be_plda_v1.py} | 0 .../{train_be_v1.sh => train_be_plda_v1.sh} | 2 +- egs/voxceleb/v1/steps_be/train_be_proj_v1.py | 95 ++++++++ .../{train_be_v2.sh => train_be_proj_v1.sh} | 3 +- .../steps_be/{train-qmf.py => train_qmf.py} | 0 hyperion/np/np_model.py | 2 + hyperion/np/transforms/pca.py | 6 + 24 files changed, 611 insertions(+), 119 deletions(-) create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh rename egs/voxceleb/v1/steps_be/{eval-be-v2.py => eval_be_cos.py} (80%) rename egs/voxceleb/v1/steps_be/{eval-be-cos-qmf.py => eval_be_cos_qmf.py} (96%) rename egs/voxceleb/v1/steps_be/{eval-be-v2-snorm.py => eval_be_cos_snorm.py} (92%) rename egs/voxceleb/v1/steps_be/{eval-be-v1.py => eval_be_plda_v1.py} (100%) rename egs/voxceleb/v1/steps_be/{eval_be_v1.sh => eval_be_plda_v1.sh} (94%) delete mode 100755 egs/voxceleb/v1/steps_be/train-be-v2.py rename egs/voxceleb/v1/steps_be/{train-be-v1.py => train_be_plda_v1.py} (100%) rename egs/voxceleb/v1/steps_be/{train_be_v1.sh => train_be_plda_v1.sh} (96%) create mode 100755 egs/voxceleb/v1/steps_be/train_be_proj_v1.py rename egs/voxceleb/v1/steps_be/{train_be_v2.sh => train_be_proj_v1.sh} (94%) rename egs/voxceleb/v1/steps_be/{train-qmf.py => train_qmf.py} (100%) diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 57a09ad8..7b6b278f 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -85,6 +85,8 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr ## Results + + ### VoxCeleb 1 Original-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -95,6 +97,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.053 | 0.083 | | | | | Cosine + AS-Norm | 0.78 | 0.046 | 0.078 | | | | | Cosine + QMF | 0.74 | 0.046 | 0.077 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.78 | 0.061 | 0.110 | +| | | | Cosine + AS-Norm | 0.70 | 0.054 | 0.102 | +| | | | Cosine + QMF | 0.66 | 0.047 | 0.090 | ### VoxCeleb 1 Entire-Clean trial list @@ -106,6 +111,10 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.92 | 0.058 | 0.104 | | | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.089 | | | | | Cosine + QMF | 0.88 | 0.054 | 0.092 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.93 | 0.058 | 0.103 | +| | | | Cosine + AS-Norm | 0.88 | 0.052 | 0.092 | +| | | | Cosine + QMF | 0.90 | 0.053 | 0.090 | + ### VoxCeleb 1 Hard-Clean trial list @@ -117,6 +126,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.80 | 0.106 | 0.171 | | | | | Cosine + AS-Norm | 1.59 | 0.091 | 0.146 | | | | | Cosine + QMF | 1.59 | 0.092 | 0.151 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.78 | 0.110 | 0.180 | +| | | | Cosine + AS-Norm | 1.61 | 0.097 | 0.159 | +| | | | Cosine + QMF | 1.62 | 0.096 | 0.158 | ### VoxSRC2022 dev @@ -128,6 +140,10 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.50 | 0.160 | 0.270 | | | | | Cosine + AS-Norm | 2.31 | 0.139 | 0.240 | | | | | Cosine + QMF | 2.54 | 0.153 | 0.248 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.42 | 0.160 | 0.265 | +| | | | Cosine + AS-Norm | 2.32 | 0.152 | 0.273 | +| | | | Cosine + QMF | 2.54 | 0.179 | 0.304 | + ## Results before 2023 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..408bad1a --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -0,0 +1,93 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.2 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + #min_lr: 1.0e-05 + min_lr: 1.0e-06 + warmup_steps: 15000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..91a7d0b8 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + # max_chunk_length: 6.0 + # 
min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh new file mode 100644 index 00000000..5f7ed094 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh @@ -0,0 +1,44 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn2048x4.v3.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh index 4e6a8790..5bd2c17d 100755 --- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh @@ -44,7 +44,7 @@ fi xvector_dir=exp/xvectors/$nnet_name -if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" ) ]]; then +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then # Extract xvectors for training LDA/PLDA for name in voxceleb2cat_train do diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index 358e2acf..37a344b6 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -281,3 +281,228 @@ if [ "$do_qmf" == "true" ];then fi +if [ "$do_pca" != "true" ];then + exit 0 +fi + + +be_name=pca_r${pca_var_r} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name/${be_name} +score_cosine_dir=exp/scores/$nnet_name/$be_name/cosine 
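+# The PCA back-end below (stages 10-11) first trains the projection with
+# train_be_proj_v1.py and then scores with plain cosine similarity; the
+# S-Norm/QMF variants reuse the same preprocessor via --preproc-file.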
+score_cosine_snorm_dir=exp/scores/$nnet_name/$be_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/$be_name/cosine_qmf + +be_dir=exp/be/$nnet_name/ +score_be_dir=$score_dir/pca_r${pca_var_r} + +if [ $stage -le 10 ]; then + echo "Train projection on Voxceleb2" + $train_cmd $be_dir/log/train_be.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_proj_v1.py \ + --v-file scp:$xvector_dir/$plda_data/xvector.scp \ + --train-list data/$plda_data/utt2spk \ + --output-path $be_dir \ + --pca.pca-var-r $pca_var_r + +fi + + +if [ $stage -le 11 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + --preproc-file $be_dir/preproc.h5 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi +exit +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $score_cosine_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $score_cosine_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir + + for f in $(ls $score_cosine_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + +fi + + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi + + if [ $stage -le 6 ];then + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_snorm_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + fi +fi + 
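+# A sketch of the adaptive S-Norm applied above: each raw cosine score s(e,t)
+# is z-normalized with statistics of the coh_nbest best-scoring cohort
+# speakers,
+#   s'(e,t) = 0.5 * ((s(e,t) - mu_e) / sigma_e + (s(e,t) - mu_t) / sigma_t),
+# where (mu_e, sigma_e) come from scoring enrollment e against the cohort
+# and (mu_t, sigma_t) from scoring test t against it.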
+ +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + echo "Train QMF in Vox2" + steps_be/train_be_cos_qmf.sh \ + --cmd "$train_cmd" --coh-nbest 1000 \ + data/voxceleb2cat_train/trials \ + data/voxceleb2cat_train/utt2model \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $xvector_dir/voxceleb2cat_train/utt2num_frames \ + data/voxceleb2cat_train/snorm_utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/voxceleb2_qmf_scores + + fi + + if [ $stage -le 8 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $xvector_dir/voxceleb1_test/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results); + do + echo $f + cat $f + echo "" + done + + fi + + if [ $stage -le 9 ];then + echo "Eval voxsrc2 with Cosine scoring" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $xvector_dir/voxsrc22_dev/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $xvector_dir/voxsrc22_test/utt2num_frames \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_qmf_dir/qmf.h5 \ + # $score_cosine_qmf_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results); + do + echo $f + cat $f + echo "" + done + fi + +fi + + diff --git a/egs/voxceleb/v1/steps_be/eval-be-v2.py b/egs/voxceleb/v1/steps_be/eval_be_cos.py similarity index 80% rename from egs/voxceleb/v1/steps_be/eval-be-v2.py rename to egs/voxceleb/v1/steps_be/eval_be_cos.py index 413ca313..1f9978ee 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-v2.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos.py @@ -26,8 +26,8 @@ from hyperion.np.transforms import TransformList -def eval_plda( - iv_file, +def eval_cos( + v_file, ndx_file, enroll_file, 
test_file, @@ -47,7 +47,7 @@ def eval_plda( preproc = None tdr = TDR( - iv_file, + v_file, ndx_file, enroll_file, test_file, @@ -60,7 +60,7 @@ def eval_plda( x_e, x_t, enroll, ndx = tdr.read() t1 = time.time() - logging.info("computing llr") + logging.info("computing llr %d", x_e.shape[1]) scores = cosine_scoring(x_e, x_t) dt = time.time() - t1 @@ -82,15 +82,15 @@ def eval_plda( parser = ArgumentParser(description="Eval cosine-scoring") - parser.add_argument("--iv-file", dest="iv_file", required=True) - parser.add_argument("--ndx-file", dest="ndx_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) - parser.add_argument("--test-file", dest="test_file", default=None) - parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--v-file", required=True) + parser.add_argument("--ndx-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-file", default=None) + parser.add_argument("--preproc-file", default=None) TDR.add_argparse_args(parser) - parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--score-file", required=True) parser.add_argument( "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int ) @@ -101,4 +101,4 @@ def eval_plda( logging.debug(args) assert args.test_file is not None or args.ndx_file is not None - eval_plda(**namespace_to_dict(args)) + eval_cos(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos.sh b/egs/voxceleb/v1/steps_be/eval_be_cos.sh index 90f118af..434732d6 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos.sh +++ b/egs/voxceleb/v1/steps_be/eval_be_cos.sh @@ -2,13 +2,13 @@ # Copyright 2020 Johns Hopkins University (Jesus Villalba) # Apache 2.0. # - +set -e cmd=run.pl num_parts=8 +preproc_file="" if [ -f path.sh ]; then . ./path.sh; fi . 
parse_options.sh || exit 1; -set -e if [ $# -ne 4 ]; then echo "Usage: $0 " @@ -27,6 +27,9 @@ name=$(basename $output_file) echo "$0 score $ndx_file" +if [ -n "$preproc_file" ];then + extra_args="--preproc-file $preproc_file" +fi for((i=1;i<=$num_parts;i++)); do @@ -34,8 +37,8 @@ do do $cmd $output_dir/log/${name}_${i}_${j}.log \ hyp_utils/conda_env.sh \ - steps_be/eval-be-v2.py \ - --iv-file scp:$vector_file \ + steps_be/eval_be_cos.py $extra_args \ + --v-file scp:$vector_file \ --ndx-file $ndx_file \ --enroll-file $enroll_file \ --score-file $output_file \ diff --git a/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py similarity index 96% rename from egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py rename to egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py index 82050ed1..e0e1c2da 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-cos-qmf.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py @@ -61,6 +61,7 @@ def eval_plda( coh_v_file, score_file, qmf_file, + preproc_file, model_part_idx, num_model_parts, seg_part_idx, @@ -69,13 +70,18 @@ def eval_plda( **kwargs ): + if preproc_file is not None: + preproc = TransformList.load(preproc_file) + else: + preproc = None + logging.info("loading data") tdr = TDR( v_file, ndx_file, enroll_file, None, - None, + preproc, model_part_idx, num_model_parts, seg_part_idx, @@ -118,7 +124,7 @@ def eval_plda( scores = cosine_scoring(x_e, x_t) logging.info("read cohort x-vectors") - vcr = VCR(coh_v_file, coh_file) + vcr = VCR(coh_v_file, coh_file, preproc=preproc) x_coh, ids_coh = vcr.read() D_coh = PLDA.compute_stats_hard(x_coh, class_ids=ids_coh) x_coh = D_coh[1] / np.expand_dims(D_coh[0], axis=-1) @@ -194,7 +200,7 @@ def eval_plda( parser.add_argument("--coh-file", required=True) parser.add_argument("--coh-nbest", type=int, default=400) parser.add_argument("--qmf-file", default=None) - # parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--preproc-file", default=None) TDR.add_argparse_args(parser) diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh index a8ad0178..a0712304 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.sh @@ -7,6 +7,7 @@ cmd=run.pl stage=1 num_parts=16 coh_nbest=1000 +preproc_file="" if [ -f path.sh ]; then . ./path.sh; fi . 
parse_options.sh || exit 1; @@ -33,6 +34,10 @@ name=$(basename $output_file) echo "$0 score $ndx_file" +if [ -n "$preproc_file" ];then + extra_args="--preproc-file $preproc_file" +fi + if [ $stage -le 1 ];then for((i=1;i<=$num_parts;i++)); do @@ -40,7 +45,7 @@ if [ $stage -le 1 ];then do $cmd $output_dir/log/${name}_${i}_${j}.log \ hyp_utils/conda_env.sh \ - steps_be/eval-be-cos-qmf.py \ + steps_be/eval_be_cos_qmf.py $extra_args \ --v-file scp:$vector_file \ --ndx-file $ndx_file \ --enroll-file $enroll_file \ diff --git a/egs/voxceleb/v1/steps_be/eval-be-v2-snorm.py b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py similarity index 92% rename from egs/voxceleb/v1/steps_be/eval-be-v2-snorm.py rename to egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py index 4ad0a869..dad89ced 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-v2-snorm.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py @@ -30,13 +30,13 @@ def eval_plda( - iv_file, + v_file, ndx_file, enroll_file, test_file, preproc_file, score_file, - coh_iv_file, + coh_v_file, coh_file, coh_nbest, model_part_idx, @@ -53,7 +53,7 @@ def eval_plda( preproc = None tdr = TDR( - iv_file, + v_file, ndx_file, enroll_file, test_file, @@ -66,8 +66,10 @@ def eval_plda( x_e, x_t, enroll, ndx = tdr.read() coh_segs = SegmentSet.load(coh_file) - r = DRF.create(coh_iv_file) + r = DRF.create(coh_v_file) x_coh = r.read(coh_segs["id"], squeeze=True) + if preproc is not None: + x_coh = preproc(x_coh) _, spk_ids = np.unique(coh_segs["class_id"], return_inverse=True) num_coh_spks = np.max(spk_ids) + 1 x_coh_spk = np.zeros((num_coh_spks, x_coh.shape[1])) @@ -107,7 +109,7 @@ def eval_plda( parser = ArgumentParser(description="Eval cosine-scoring with adaptive s-norm") - parser.add_argument("--iv-file", required=True) + parser.add_argument("--v-file", required=True) parser.add_argument("--ndx-file", default=None) parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-file", default=None) @@ -115,11 +117,11 @@ def eval_plda( TDR.add_argparse_args(parser) - parser.add_argument("--coh-iv-file", required=True) + parser.add_argument("--coh-v-file", required=True) parser.add_argument("--coh-file", required=True) parser.add_argument("--coh-nbest", type=int, default=1000) - parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--score-file", required=True) parser.add_argument( "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int ) diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh index 4f5e3e76..b64d80a3 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.sh @@ -6,6 +6,8 @@ cmd=run.pl num_parts=16 coh_nbest=1000 +preproc_file="" + if [ -f path.sh ]; then . ./path.sh; fi . 
parse_options.sh || exit 1; set -e @@ -29,6 +31,9 @@ name=$(basename $output_file) echo "$0 score $ndx_file" +if [ -n "$preproc_file" ];then + extra_args="--preproc-file $preproc_file" +fi for((i=1;i<=$num_parts;i++)); do @@ -36,12 +41,12 @@ do do $cmd $output_dir/log/${name}_${i}_${j}.log \ hyp_utils/conda_env.sh \ - steps_be/eval-be-v2-snorm.py \ - --iv-file scp:$vector_file \ + steps_be/eval_be_cos_snorm.py $extra_args \ + --v-file scp:$vector_file \ --ndx-file $ndx_file \ --enroll-file $enroll_file \ --coh-file $coh_file \ - --coh-iv-file scp:$coh_vector_file \ + --coh-v-file scp:$coh_vector_file \ --score-file $output_file \ --coh-nbest $coh_nbest \ --model-part-idx $i --num-model-parts $num_parts \ diff --git a/egs/voxceleb/v1/steps_be/eval-be-v1.py b/egs/voxceleb/v1/steps_be/eval_be_plda_v1.py similarity index 100% rename from egs/voxceleb/v1/steps_be/eval-be-v1.py rename to egs/voxceleb/v1/steps_be/eval_be_plda_v1.py diff --git a/egs/voxceleb/v1/steps_be/eval_be_v1.sh b/egs/voxceleb/v1/steps_be/eval_be_plda_v1.sh similarity index 94% rename from egs/voxceleb/v1/steps_be/eval_be_v1.sh rename to egs/voxceleb/v1/steps_be/eval_be_plda_v1.sh index eefc989f..69d6ace1 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_v1.sh +++ b/egs/voxceleb/v1/steps_be/eval_be_plda_v1.sh @@ -36,8 +36,8 @@ do do $cmd $output_dir/log/${name}_${i}_${j}.log \ hyp_utils/conda_env.sh \ - steps_be/eval-be-v1.py \ - --iv-file scp:$vector_file \ + steps_be/eval_be_plda_v1.py \ + --v-file scp:$vector_file \ --ndx-file $ndx_file \ --enroll-file $enroll_file \ --preproc-file $preproc_file \ diff --git a/egs/voxceleb/v1/steps_be/eval_be_v2.sh b/egs/voxceleb/v1/steps_be/eval_be_v2.sh index 7389bf2c..bb58872e 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_v2.sh +++ b/egs/voxceleb/v1/steps_be/eval_be_v2.sh @@ -36,7 +36,7 @@ do $cmd $output_dir/log/${name}_${i}_${j}.log \ hyp_utils/conda_env.sh \ steps_be/eval-be-v2.py \ - --iv-file scp:$vector_file \ + --v-file scp:$vector_file \ --ndx-file $ndx_file \ --enroll-file $enroll_file \ --preproc-file $preproc_file \ diff --git a/egs/voxceleb/v1/steps_be/train-be-v2.py b/egs/voxceleb/v1/steps_be/train-be-v2.py deleted file mode 100755 index 4e3d7542..00000000 --- a/egs/voxceleb/v1/steps_be/train-be-v2.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" -import logging -import sys -import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time - -import numpy as np - -from hyperion.hyp_defs import config_logger -from hyperion.helpers import VectorReader as VR -from hyperion.np.transforms import TransformList, CentWhiten, PCA - -from numpy.linalg import matrix_rank - - -def train_be(iv_file, train_list, output_path, **kwargs): - - # Read data - vr_args = VR.filter_args(**kwargs) - vr_train = VR(iv_file, train_list, None, **vr_args) - x = vr_train.read() - del vr_train - - t1 = time.time() - rank = matrix_rank(x) - pca = None - if rank < x.shape[1]: - # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name="pca") - pca.fit(x) - x = pca.predict(x) - logging.info("PCA rank=%d" % (rank)) - - # Train centering and whitening - t1 = time.time() - cw = CentWhiten(name="cw") - cw.fit(x) - - logging.info("PCA-CW Elapsed time: %.2f s." 
% (time.time() - t1)) - - # Save models - if pca is None: - preproc = TransformList([cw]) - else: - preproc = TransformList([pca, cw]) - - if not os.path.exists(output_path): - os.makedirs(ouput_path) - - preproc.save(output_path + "/cw.h5") - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Train Back-end") - - parser.add_argument("--iv-file", dest="iv_file", required=True) - parser.add_argument("--train-list", dest="train_list", required=True) - - VR.add_argparse_args(parser) - - parser.add_argument("--output-path", dest="output_path", required=True) - parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int - ) - - args = parser.parse_args() - config_logger(args.verbose) - del args.verbose - logging.debug(args) - - train_be(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh b/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh index 7dbfcfb9..267466ae 100755 --- a/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh +++ b/egs/voxceleb/v1/steps_be/train_be_cos_qmf.sh @@ -39,7 +39,7 @@ if [ $stage -le 1 ];then do $cmd $output_dir/log/${name}_${i}_${j}.log \ hyp_utils/conda_env.sh \ - steps_be/eval-be-cos-qmf.py \ + steps_be/eval_be_cos_qmf.py \ --v-file scp:$vector_file \ --ndx-file $ndx_file \ --enroll-file $enroll_file \ @@ -72,7 +72,7 @@ fi if [ $stage -le 3 ];then $cmd $output_dir/log/train_qmf_${name}.log \ hyp_utils/conda_env.sh \ - steps_be/train-qmf.py \ + steps_be/train_qmf.py \ --score-file $output_file \ --key-file $ndx_file \ --model-file $output_dir/qmf.h5 diff --git a/egs/voxceleb/v1/steps_be/train-be-v1.py b/egs/voxceleb/v1/steps_be/train_be_plda_v1.py similarity index 100% rename from egs/voxceleb/v1/steps_be/train-be-v1.py rename to egs/voxceleb/v1/steps_be/train_be_plda_v1.py diff --git a/egs/voxceleb/v1/steps_be/train_be_v1.sh b/egs/voxceleb/v1/steps_be/train_be_plda_v1.sh similarity index 96% rename from egs/voxceleb/v1/steps_be/train_be_v1.sh rename to egs/voxceleb/v1/steps_be/train_be_plda_v1.sh index 68e470ff..ee5f8163 100755 --- a/egs/voxceleb/v1/steps_be/train_be_v1.sh +++ b/egs/voxceleb/v1/steps_be/train_be_plda_v1.sh @@ -44,7 +44,7 @@ while(getline < fv) $cmd $output_dir/log/train_be.log \ hyp_utils/conda_env.sh \ - steps_be/train-be-v1.py \ + steps_be/train_be_plda_v1.py \ --iv-file scp:$vector_file \ --train-list $train_list \ --lda-dim $lda_dim \ diff --git a/egs/voxceleb/v1/steps_be/train_be_proj_v1.py b/egs/voxceleb/v1/steps_be/train_be_proj_v1.py new file mode 100755 index 00000000..24a2a33b --- /dev/null +++ b/egs/voxceleb/v1/steps_be/train_be_proj_v1.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.helpers import VectorReader as VR +from hyperion.np.transforms import TransformList, CentWhiten, PCA, LNorm + +# from numpy.linalg import matrix_rank + + +def train_be_lda(v_file, train_list, output_path, pca, **kwargs): + from hyperion.helpers import VectorClassReader as VCR + from hyperion.np.transforms import LDA, LNorm + from sklearn.discriminant_analysis import LinearDiscriminantAnalysis + + # Read data + vr_args = VCR.filter_args(**kwargs) + vr_train = VCR(v_file, train_list, None, **vr_args) + x, ids = 
vr_train.read() + del vr_train + + t1 = time.time() + lnorm = LNorm() + x = lnorm(x) + _, ids = np.unique(ids, return_inverse=True) + pca = LDA(lda_dim=90) + pca.fit(x, ids) + logging.info("LDA elapsed time: %.2f s." % (time.time() - t1)) + + # Save models + preproc = TransformList([lnorm, pca]) + + if not os.path.exists(output_path): + os.makedirs(output_path) + + preproc.save(output_path + "/preproc.h5") + + +def train_be(v_file, train_list, output_path, pca, **kwargs): + + # Read data + vr_args = VR.filter_args(**kwargs) + vr_train = VR(v_file, train_list, None, **vr_args) + x = vr_train.read() + del vr_train + + t1 = time.time() + pca = PCA(**pca) + pca.fit(x) + logging.info("PCA dimension=%d", pca.pca_dim) + logging.info("PCA elapsed time: %.2f s." % (time.time() - t1)) + + # Save models + preproc = TransformList([pca]) + if not os.path.exists(output_path): + os.makedirs(output_path) + + preproc.save(output_path + "/preproc.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train Back-end") + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + + VR.add_argparse_args(parser) + PCA.add_class_args(parser, prefix="pca") + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + train_be(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/train_be_v2.sh b/egs/voxceleb/v1/steps_be/train_be_proj_v1.sh similarity index 94% rename from egs/voxceleb/v1/steps_be/train_be_v2.sh rename to egs/voxceleb/v1/steps_be/train_be_proj_v1.sh index 487c9b1b..7d1be89d 100755 --- a/egs/voxceleb/v1/steps_be/train_be_v2.sh +++ b/egs/voxceleb/v1/steps_be/train_be_proj_v1.sh @@ -3,6 +3,7 @@ # Apache 2.0. # cmd=run.pl +pca_var_r=0.90 if [ -f path.sh ]; then . ./path.sh; fi . 
parse_options.sh || exit 1; @@ -40,7 +41,7 @@ while(getline < fv) $cmd $output_dir/log/train_be.log \ hyp_utils/conda_env.sh \ - steps_be/train-be-v2.py \ + steps_be/train_be_proj_v1.py \ --iv-file scp:$vector_file \ --train-list $train_list \ --output-path $output_dir diff --git a/egs/voxceleb/v1/steps_be/train-qmf.py b/egs/voxceleb/v1/steps_be/train_qmf.py similarity index 100% rename from egs/voxceleb/v1/steps_be/train-qmf.py rename to egs/voxceleb/v1/steps_be/train_qmf.py diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py index 8ee84ee8..ee464161 100644 --- a/hyperion/np/np_model.py +++ b/hyperion/np/np_model.py @@ -20,6 +20,8 @@ class NPModel(object): """ def __init__(self, name=None, **kwargs): + if name is None: + name = self.__class__.__name__ self.name = name self._is_init = False diff --git a/hyperion/np/transforms/pca.py b/hyperion/np/transforms/pca.py index eabb200d..aa25d8e9 100644 --- a/hyperion/np/transforms/pca.py +++ b/hyperion/np/transforms/pca.py @@ -224,6 +224,12 @@ def add_class_args(parser, prefix=None): action=ActionYesNo, help=("updates whitening parameter"), ) + parser.add_argument( + "--whiten", + default=False, + action=ActionYesNo, + help=("whitens the data after projection"), + ) parser.add_argument( "--pca-dim", default=None, type=int, help=("output dimension of PCA") From 1d78ea3596bd1c0c88623b1e3571c2af39c162ea Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 10 Mar 2023 11:23:58 -0500 Subject: [PATCH 086/154] refactored beam search --- egs/librispeech/v1/conf/infer.yaml | 1 + ...n_wav2vec2base_transducer_stage1_v6.1.yaml | 53 ++ .../wav2vec2base_rnn_transducer_do0.4.yaml | 12 + .../v1/global_conf/config_transducer_v5.0.sh | 1 + .../v1/global_conf/config_transducer_v6.1.sh | 34 ++ egs/librispeech/v1/run_011_train_asr2.sh | 119 +++++ egs/librispeech/v1/run_030_inference2.sh | 49 ++ .../decode_wav2vec2rnn_transducer.sh | 69 +++ .../decode_wav2vec2transducer.sh | 5 +- hyperion/bin/decode_wav2vec2rnn_transducer.py | 231 +++++++++ hyperion/bin/train_wav2rnn_transducer.py | 240 +++++++++ hyperion/bin/train_wav2vec2rnn_transducer.py | 259 ++++++++++ hyperion/torch/layer_blocks/__init__.py | 2 + .../torch/layer_blocks/transducer_joiner.py | 100 ++-- .../layer_blocks/transducer_predictor.py | 218 ++++----- .../transformer_conv2d_subsampler.py | 15 +- hyperion/torch/models/__init__.py | 1 + hyperion/torch/models/transducer/__init__.py | 9 +- .../models/transducer/lstm_rnn_transducer.py | 149 ++++++ .../models/transducer/rnn_rnn_transducer.py | 84 ++++ .../torch/models/transducer/rnn_transducer.py | 258 +++++----- .../torch/models/transducer/transducer.py | 10 +- .../torch/models/wav2transducer/__init__.py | 3 + .../wav2transducer/hf_wav2rnn_transducer.py | 375 +++++++++++++++ .../hf_wav2vec2rnn_rnn_transducer.py | 103 ++++ .../hf_wav2vec2rnn_transducer.py | 103 ++++ .../wav2transducer/wav2rnn_transducer.py | 103 ++++ .../hf_wav2vec2resnet1d_xvector.py | 10 +- .../models/wav2xvectors/hf_wav2xvector.py | 86 ++-- .../torch/models/wav2xvectors/wav2xvector.py | 17 +- .../torch/models/xvectors/resnet1d_xvector.py | 28 +- hyperion/torch/narchs/__init__.py | 2 + hyperion/torch/narchs/rnn_encoder.py | 281 +++++++++++ .../torch/narchs/rnn_transducer_decoder.py | 454 +++++++++++++++++- hyperion/torch/torch_model.py | 28 +- 35 files changed, 3109 insertions(+), 403 deletions(-) create mode 100644 egs/librispeech/v1/conf/infer.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml create mode 100644 
egs/librispeech/v1/conf/wav2vec2base_rnn_transducer_do0.4.yaml create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v6.1.sh create mode 100755 egs/librispeech/v1/run_011_train_asr2.sh create mode 100755 egs/librispeech/v1/run_030_inference2.sh create mode 100755 egs/librispeech/v1/steps_transducer/decode_wav2vec2rnn_transducer.sh create mode 100755 hyperion/bin/decode_wav2vec2rnn_transducer.py create mode 100755 hyperion/bin/train_wav2rnn_transducer.py create mode 100755 hyperion/bin/train_wav2vec2rnn_transducer.py create mode 100644 hyperion/torch/models/transducer/lstm_rnn_transducer.py create mode 100644 hyperion/torch/models/transducer/rnn_rnn_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/wav2rnn_transducer.py create mode 100644 hyperion/torch/narchs/rnn_encoder.py diff --git a/egs/librispeech/v1/conf/infer.yaml b/egs/librispeech/v1/conf/infer.yaml new file mode 100644 index 00000000..ddfd25e2 --- /dev/null +++ b/egs/librispeech/v1/conf/infer.yaml @@ -0,0 +1 @@ +beam_width: 5 diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml new file mode 100644 index 00000000..c1490295 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml @@ -0,0 +1,53 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 75. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: 'bucketing_seg_sampler' + max_batch_length: 75. 
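Both sampler blocks in this config use bucketing_seg_sampler with max_batch_length: 75., which reads as a cap on the total seconds of audio per batch rather than a fixed batch size. A sketch of that batching rule under this assumption:

    def bucket_batches(durations, max_batch_length=75.0):
        # sort utterances by duration so batches have similar lengths, then
        # cut a new batch whenever the duration budget would be exceeded
        order = sorted(range(len(durations)), key=durations.__getitem__)
        batch, total = [], 0.0
        for i in order:
            if batch and total + durations[i] > max_batch_length:
                yield batch
                batch, total = [], 0.0
            batch.append(i)
            total += durations[i]
        if batch:
            yield batch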
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2base_rnn_transducer_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 1200 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/wav2vec2base_rnn_transducer_do0.4.yaml b/egs/librispeech/v1/conf/wav2vec2base_rnn_transducer_do0.4.yaml new file mode 100644 index 00000000..6ddc7259 --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2base_rnn_transducer_do0.4.yaml @@ -0,0 +1,12 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +transducer: + decoder: + embed_dim: 1024 + num_pred_layers: 2 + pred_hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v1/global_conf/config_transducer_v5.0.sh b/egs/librispeech/v1/global_conf/config_transducer_v5.0.sh index b1da75b7..2aaeed2b 100644 --- a/egs/librispeech/v1/global_conf/config_transducer_v5.0.sh +++ b/egs/librispeech/v1/global_conf/config_transducer_v5.0.sh @@ -24,6 +24,7 @@ nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/model_ep0030.pth nnet_s1=$nnet_s1_dir/model_ep0050.pth nnet_s1=$nnet_s1_dir/model_ep0075.pth +nnet_s1=$nnet_s1_dir/model_ep0106.pth nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml nnet_s2_args="" diff --git a/egs/librispeech/v1/global_conf/config_transducer_v6.1.sh b/egs/librispeech/v1/global_conf/config_transducer_v6.1.sh new file mode 100644 index 00000000..f67b0a88 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v6.1.sh @@ -0,0 +1,34 @@ +# Wav2Vec2 base pretrained on LibriSpeech 960h + RNN-T transducer + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# asr training data +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# network cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_transducer_stage1_v6.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v6.1 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0075.pth +nnet_s1=$nnet_s1_dir/model_ep0106.pth +nnet_s1=$nnet_s1_dir/model_ep0646.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/run_011_train_asr2.sh b/egs/librispeech/v1/run_011_train_asr2.sh new file mode 100755 index 00000000..99b0065e --- /dev/null +++ b/egs/librispeech/v1/run_011_train_asr2.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=2 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_data}/ +val_dir=data/${dev_data}/ + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" + extra_args="$extra_args --data.val.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v2 --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ + train_wav2vec2rnn_transducer.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s1_dir $args \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s2_dir $args \ + --in-model-file $nnet_s1 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu + +fi + +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2transducer.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.segments-file $train_dir/utt2spk \ + --data.train.dataset.bpe-model $bpe_model \ + --data.train.dataset.text-file $train_dir/text \ + --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.segments-file $val_dir/utt2spk \ + --data.val.dataset.text-file $val_dir/text \ + --trainer.exp-path $nnet_s3_dir $args \ + --in-model-file $nnet_s2 \ + --data.train.dataset.time-durs-file $train_dir/utt2dur \ + --data.val.dataset.time-durs-file $val_dir/utt2dur \ + --num-gpus $ngpu +fi + diff --git a/egs/librispeech/v1/run_030_inference2.sh b/egs/librispeech/v1/run_030_inference2.sh new file 
mode 100755 index 00000000..7ed9567a --- /dev/null +++ b/egs/librispeech/v1/run_030_inference2.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +use_gpu=false +nnet_stage=1 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + transducer_args="--use-gpu true" + transducer_cmd="$cuda_eval_cmd --mem 6G" +else + transducer_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +fi + +transducer_dir=exp/transducer/$nnet_name + + +test_data=test_clean + + +# Decode dev and test sets with the transducer +for name in dev_clean dev_other test_clean test_other +do + nj=40 + steps_transducer/decode_wav2vec2rnn_transducer.sh \ + --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ + $nnet data/$name \ + $transducer_dir/$name $bpe_model +done + diff --git a/egs/librispeech/v1/steps_transducer/decode_wav2vec2rnn_transducer.sh b/egs/librispeech/v1/steps_transducer/decode_wav2vec2rnn_transducer.sh new file mode 100755 index 00000000..470b92b1 --- /dev/null +++ b/egs/librispeech/v1/steps_transducer/decode_wav2vec2rnn_transducer.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +nj=30 +cmd="run.pl" +set -e +use_gpu=false +#write_utt2num_frames=true # If true writes utt2num_frames. +stage=0 +extra_args="" +infer_cfg=conf/infer.yaml +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ] && [ $# != 4 ]; then + echo "Usage: $0 [options] <nnet-file> <data-dir> <output-dir> [<bpe-model>]" + echo " e.g.: $0 --infer-cfg conf/infer.yaml exp/transducer_nnets/model.pth data/test_clean exp/transducer/test_clean data/lang_bpe_1000/bpe.model" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --infer-cfg # decoding configuration" + echo " --use-gpu # If true, use GPU." + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + + exit 1 +fi + +nnet_file=$1 +data_dir=$2 +output_dir=$3 +bpe_model=$4 + +for f in $data_dir/wav.scp ; do + [ ! 
-f $f ] && echo "No such file $f" && exit 1; +done + +log_dir=$output_dir/log +mkdir -p $log_dir + +num_gpus=0 +if [ "$use_gpu" == "true" ];then + cmd="$cmd --gpu 1" + num_gpus=1 + extra_args="${extra_args} --use-gpu" +fi + +# if [ "$write_utt2num_frames" == "true" ];then +# write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB" +# fi + +if [ $stage -le 0 ];then + $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + decode_wav2vec2rnn_transducer.py \ + --infer-args $infer_cfg \ + --part-idx JOB --num-parts $nj \ + --input $data_dir/wav.scp \ + --model-path $nnet_file \ + --bpe-model $bpe_model \ + --output $output_dir/transducer.JOB.text $extra_args +fi + +if [ $stage -le 1 ];then + echo "compute wer" + cat $output_dir/transducer.*.text > $output_dir/transducer.text + compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text +fi diff --git a/egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh b/egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh index 143087a5..67fc7081 100755 --- a/egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh +++ b/egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh @@ -1,6 +1,7 @@ #!/bin/bash # 2022 Johns Hopkins University (Author: Yen-Ju Lu) # Apache 2.0. +set -e nj=30 cmd="run.pl" @@ -61,7 +62,7 @@ if [ "$write_utt2num_frames" == "true" ];then fi if [ $stage -le 0 ];then - set +e + #set +e $cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ decode_wav2transducer.py \ @@ -70,7 +71,7 @@ if [ $stage -le 0 ];then --model-path $nnet_file \ --bpe-model $bpe_model \ --output $output_dir/transducer.JOB.text - set -e + # set -e fi if [ $stage -le 1 ];then diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py new file mode 100755 index 00000000..cc612628 --- /dev/null +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Dict, List, Tuple + +import sentencepiece as spm +import torch.nn as nn + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np +import pandas as pd + +import torch + +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.utils import Utt2Info +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.augment import SpeechAugment + +from hyperion.torch.utils import open_device +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.models import HFWav2Vec2RNNTransducer +from hyperion.torch import TorchModelLoader as TML + +from hyperion.torch.models.wav2transducer.beam_search import greedy_search, beam_search + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus={}".format(num_gpus)) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model {}".format(model_path)) + model = TML.load(model_path) + logging.info("transducer-model={}".format(model)) + model.to(device) + model.eval() + return model + + +def 
decode_one_batch( + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search") -> List[str]: + """Decode one utterance and return the hypothesis. + Args: + model: + The transducer model. + sp: + The BPE model. + x: + Waveform tensor for a single utterance with shape = (1, num_samples). + decoding_method: + Either "greedy_search" or "beam_search". + Returns: + The decoding result for the utterance as a list of word strings. + """ + device = model.device + feature = x + assert x.shape[0] == 1 + assert feature.ndim == 2 + + feature = feature.to(device) + # at entry, feature is (1, num_samples) + + feature_lens = torch.Tensor([x.shape[1]]).int() + + encoder_out, hid_feats, encoder_out_lens = model.forward_feats( + x=feature, x_lengths=feature_lens) + + hyps = [] + batch_size = encoder_out.size(0) + + encoder_out = encoder_out.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]] + # fmt: on + if decoding_method == "greedy_search": + hyp = greedy_search(model=model, encoder_out=encoder_out_i) + elif decoding_method == "beam_search": + hyp = beam_search(model=model, encoder_out=encoder_out_i, beam=5) + else: + raise ValueError(f"Unsupported decoding method: {decoding_method}") + hyps.append(sp.decode(hyp).split()) + + logging.info("hyps:{}".format(" ".join(hyps[0]))) + + return hyps[0] + + +def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, + infer_args, use_gpu, **kwargs): + + device = init_device(use_gpu) + model = load_model(model_path, device) + + logging.info("bpe-model=%s", bpe_model) + sp = spm.SentencePieceProcessor() + sp.load(bpe_model) + + infer_args = HFWav2Vec2RNNTransducer.filter_infer_args(**infer_args) + logging.info(f"infer-args={infer_args}") + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output: %s", output_spec) + with open(output_spec, "w") as writer: + logging.info(f"opening input stream: {input_spec} with args={ar_args}") + with AR(input_spec, **ar_args) as reader: + while not reader.eof(): + t1 = time.time() + key, x, fs = reader.read(1) + if len(key) == 0: + break + + x, key, fs = x[0], key[0], fs[0] + t2 = time.time() + logging.info("processing utt %s", key) + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype()).to(device) + + tot_frames = x.shape[1] + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + + if x.shape[1] == 0: + y = [""] + else: + #y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor((x.shape[1], ), + dtype=torch.long, + device=device) + y = model.infer(x, x_lengths, **infer_args) + + y = sp.decode(y[0]) + logging.info(f"utt: {key} hyps: {y}") + t3 = time.time() + writer.write(f"{key} {y}\n") + + t4 = time.time() + tot_time = t4 - t1 + infer_time = t3 - t2 + logging.info( 
("utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f"), + key, + tot_time, + t2 - t1, + infer_time, + t4 - t3, + x.shape[1] / fs / infer_time, + x.shape[1] / fs / tot_time, + ) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description=("ASR decoding for RNN-T with Wav2vec features")) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--scp-sep", + default=" ", + help=("scp file field separator")) + + AR.add_class_args(parser) + parser.add_argument("--model-path", required=True) + parser.add_argument("--bpe-model", required=True) + + HFWav2Vec2RNNTransducer.add_infer_args(parser, "infer-args") + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--use-gpu", + default=False, + action="store_true", + help="extract xvectors in gpu") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + decode_transducer(**namespace_to_dict(args)) diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py new file mode 100755 index 00000000..026c9330 --- /dev/null +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import k2 +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import Wav2RNNRNNTransducer +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "rnn_rnn_transducer": Wav2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio) + audio_length = torch.as_tensor(audio_length) + target = k2.RaggedTensor(target) + batch = { + "x": torch.transpose(audio, 0, 1), + "x_lengths": audio_length, + "text": target, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = 
data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_collate) + return data_loader + + +def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + 
dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train RNN Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + # multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py new file mode 100755 index 00000000..a2d75ba9 --- /dev/null +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import sys +import os +from pathlib import Path +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import k2 +import time +import logging +import multiprocessing + +import numpy as np + +import torch +import torch.nn as nn + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.utils import ddp +from hyperion.torch.trainers import TransducerTrainer as Trainer +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.models import HFWav2Vec2RNNTransducer +from hyperion.torch.models import HFWav2Vec2RNNRNNTransducer +from torch.nn.utils.rnn import pad_sequence + +model_dict = { + "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, + "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, + # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, + # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, + # "hf_wavlm2rnn_rnn_transducer": HFWavLM2RNNRNNTransducer, +} + + +def transducer_collate(batch): + audio = [] + audio_length = [] + target = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio).transpose(0, 1) + audio_length = torch.as_tensor(audio_length) + + # sort audios by length + sort_idx = torch.argsort(audio_length, descending=True) + audio = audio[sort_idx] + audio_length = audio_length[sort_idx] + target = [target[k] for k in sort_idx] + target = k2.RaggedTensor(target) + + batch = { + "x": audio, + "x_lengths": audio_length, + "text": target, + } + return batch + + +def init_data(partition, rank, num_gpus, **kwargs): + data_kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**data_kwargs["dataset"]) + sampler_args = data_kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not 
is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = data_kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ({ + "num_workers": num_workers_per_gpu, + "pin_memory": True + } if num_gpus > 0 else {}) + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=sampler, + **largs, + collate_fn=transducer_collate) + return data_loader + + +def init_model(blank_id, vocab_size, rank, model_class, **kwargs): + model_args = model_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("model network args={}".format(model_args)) + # TODO: check model_args + model_args["transducer"]["decoder"]["blank_id"] = blank_id + model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + model = model_class(**model_args) + if rank == 0: + logging.info("model={}".format(model)) + return model + + +def train_model(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + #torch.backends.cudnn.deterministic = True + #torch.backends.cudnn.benchmark = False + torch.backends.cudnn.enabled = False + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + # # for Debug + # rank = 0 + # kwargs["rank"] = 0 + # device = "cpu" + # world_size=1 + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_model(train_loader.dataset.sp.piece_to_id("<blk>"), + train_loader.dataset.sp.get_piece_size(), **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {} #{"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(model_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + train_parser = ArgumentParser(prog="") + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", + action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + + parser.add_argument( + "--data.train.dataset.text_file", + type=str, + ) + + parser.add_argument("--data.val.dataset.text_file", type=str) + + parser.add_argument( + "--data.train.dataset.bpe_model", + type=str, + ) + + parser.link_arguments("data.train.data_loader.num_workers", + "data.val.data_loader.num_workers") + + 
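Both training entry points derive the decoder's blank id and vocabulary size from the BPE model given by data.train.dataset.bpe_model (shared with the val set via the link_arguments call below). Roughly, assuming the standard sentencepiece API and a <blk> blank symbol:

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load("data/lang_bpe_1000/bpe.model")  # path used in the global configs
    blank_id = sp.piece_to_id("<blk>")       # assumed blank-symbol name
    vocab_size = sp.get_piece_size()
    # init_model() injects both into model_args["transducer"]["decoder"]
    # before constructing the model, as in the scripts above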
parser.link_arguments("data.train.dataset.bpe_model", + "data.val.dataset.bpe_model") + + model_class.add_class_args(parser, prefix="model") + Trainer.add_class_args(parser, + prefix="trainer", + train_modes=model_class.valid_train_modes()) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", + type=int, + default=1123581321, + help="random seed") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Train Wav2Vec2Transducer model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + + for k, v in model_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + model_type = args.subcommand + args_sc = vars(args)[model_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.model_class = model_dict[model_type] + # torch docs recommend using forkserver + # multiprocessing.set_start_method("forkserver") + train_model(gpu_id, args_sc) diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 7ec806a5..7a738bca 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -34,3 +34,5 @@ from .transformer_encoder_v1 import TransformerEncoderBlockV1 from .transformer_feedforward import (Conv1dLinear, Conv1dx2, PositionwiseFeedForward) +from .transducer_predictor import TransducerPredictor +from .transducer_joiner import TransducerJoiner diff --git a/hyperion/torch/layer_blocks/transducer_joiner.py b/hyperion/torch/layer_blocks/transducer_joiner.py index ee7a667b..482b5aa6 100644 --- a/hyperion/torch/layer_blocks/transducer_joiner.py +++ b/hyperion/torch/layer_blocks/transducer_joiner.py @@ -25,7 +25,7 @@ def __init__(self, in_feats: int, vocab_size: int): self.in_feats = in_feats self.vocab_size = vocab_size - self.output = nn.Linear(in_feats, out_dims) + self.output = nn.Linear(in_feats, vocab_size) def forward(self, encoder_out: torch.Tensor, pred_out: torch.Tensor) -> torch.Tensor: @@ -43,57 +43,57 @@ def forward(self, encoder_out: torch.Tensor, encoder_out = encoder_out.unsqueeze(2) # Now encoder_out is (N, T, 1, C) pred_out = pred_out.unsqueeze(1) - # Now decoder_out is (N, 1, U, C) + # Now pred_out is (N, 1, U, C) x = torch.tanh(encoder_out + pred_out) logits = self.output(x) return logits - def get_config(self): - config = { - "in_feats": self.in_feats, - "out_dims": self.out_dims, - "num_layers": self.num_layers, - } - - # base_config = super().get_config() - return dict(list(config.items())) - - @staticmethod - def filter_args(**kwargs): - valid_args = ( - "in_feats", - "out_dims", - "num_layers", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - return args - - @staticmethod - def add_class_args(parser, - prefix=None, - skip=set(["in_feats", "out_dims"])): - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - if "in_feats" not in skip: - parser.add_argument("--in-feats", - type=int, - required=True, - help=("input feature dimension")) - - if "out_dims" not in skip: - parser.add_argument("--out-dims", - type=int, - required=True, - help=("output feature dimension (vocab 
size)")) - parser.add_argument("--num-layers", - default=1, - type=int, - help=("layers of the joiner")) - - if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + # def get_config(self): + # config = { + # "in_feats": self.in_feats, + # "out_dims": self.out_dims, + # "num_layers": self.num_layers, + # } + + # # base_config = super().get_config() + # return dict(list(config.items())) + + # @staticmethod + # def filter_args(**kwargs): + # valid_args = ( + # "in_feats", + # "out_dims", + # "num_layers", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + # return args + + # @staticmethod + # def add_class_args(parser, + # prefix=None, + # skip=set(["in_feats", "out_dims"])): + # if prefix is not None: + # outer_parser = parser + # parser = ArgumentParser(prog="") + + # if "in_feats" not in skip: + # parser.add_argument("--in-feats", + # type=int, + # required=True, + # help=("input feature dimension")) + + # if "out_dims" not in skip: + # parser.add_argument("--out-dims", + # type=int, + # required=True, + # help=("output feature dimension (vocab size)")) + # parser.add_argument("--num-layers", + # default=1, + # type=int, + # help=("layers of the joiner")) + + # if prefix is not None: + # outer_parser.add_argument("--" + prefix, + # action=ActionParser(parser=parser)) diff --git a/hyperion/torch/layer_blocks/transducer_predictor.py b/hyperion/torch/layer_blocks/transducer_predictor.py index 178c423a..ae354359 100644 --- a/hyperion/torch/layer_blocks/transducer_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_predictor.py @@ -9,6 +9,8 @@ import torch import torch.nn as nn +from ...utils.misc import filter_func_args + class TransducerPredictor(nn.Module): """ RNN-T prediction network. 
@@ -40,7 +42,7 @@ def __init__(self, super().__init__() self.embedding = nn.Embedding( num_embeddings=vocab_size, - embed_dim=embed_dim, + embedding_dim=embed_dim, padding_idx=blank_id, ) self.embed_dropout = nn.Dropout(embed_dropout_rate) @@ -71,7 +73,7 @@ def __init__(self, self.hid_feats = hid_feats self.embed_dropout_rate = embed_dropout_rate self.rnn_dropout_rate = rnn_dropout_rate - self.output = nn.Linear(hid_feats, in_feats) + self.output = nn.Linear(hid_feats, out_feats) def forward( self, @@ -93,92 +95,6 @@ def forward( return out, (h, c) - def get_config(self): - config = { - "in_feats": self.in_feats, - "blank_id": self.blank_id, - "vocab_size": self.vocab_size, - "embed_dim": self.embed_dim, - "num_layers": self.num_layers, - "hid_feats": self.hid_feats, - "embed_dropout_rate": self.embed_dropout_rate, - "rnn_dropout_rate": self.rnn_dropout_rate, - } - - # base_config = super().get_config() - return dict(list(config.items())) - - @staticmethod - def filter_args(**kwargs): - valid_args = ( - "in_feats", - "blank_id", - "vocab_size", - "embed_dim", - "num_layers", - "hid_feats", - "embed_dropout_rate", - "rnn_dropout_rate", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - return args - - @staticmethod - def filter_finetune_args(**kwargs): - valid_args = ( - "embed_dropout_rate", - "rnn_dropout_rate", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - return args - - @staticmethod - def add_class_args(parser, - prefix=None, - skip=set(["in_feats", "blank_id", "vocab_size"])): - - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - if "in_feats" not in skip: - parser.add_argument("--in-feats", - type=int, - required=True, - help=("input feature dimension")) - if "blank_id" not in skip: - parser.add_argument("--blank-id", - type=int, - required=True, - help=("blank id from sp model")) - if "vocab_size" not in skip: - parser.add_argument("--vocab-size", - type=int, - required=True, - help=("output prediction dimension")) - parser.add_argument("--embedding-dim", - default=1024, - type=int, - help=("feature dimension")) - parser.add_argument("--embedding-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder input embeddings")) - parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder RNN ")) - - parser.add_argument("--num-layers", default=2, type=int, help=("")) - - parser.add_argument("--hidden-dim", default=512, type=int, help=("")) - - if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) - def change_config( self, override_dropouts=False, @@ -194,31 +110,101 @@ def change_config( self.embed_dropout_rate = embed_dropout_rate self.embed_dropout = nn.Dropout(self.embed_dropout_rate) - @staticmethod - def add_finetune_args(parser, - prefix=None, - skip=set(["in_feats", "blank_id", "vocab_size"])): - - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - parser.add_argument( - "--override-dropouts", - default=False, - action=ActionYesNo, - help=( - "whether to use the dropout probabilities passed in the " - "arguments instead of the defaults in the pretrained model.")) - parser.add_argument("--embedding-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder input embeddings")) - parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder RNN ")) - - if prefix is not None: - 
outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + # def get_config(self): + # config = { + # "in_feats": self.in_feats, + # "blank_id": self.blank_id, + # "vocab_size": self.vocab_size, + # "embed_dim": self.embed_dim, + # "num_layers": self.num_layers, + # "hid_feats": self.hid_feats, + # "embed_dropout_rate": self.embed_dropout_rate, + # "rnn_dropout_rate": self.rnn_dropout_rate, + # } + + # # base_config = super().get_config() + # return dict(list(config.items())) + + # @staticmethod + # def filter_args(**kwargs): + # args = filter_func_args(TransducerPredictor.__init__, kwargs) + # return args + + # @staticmethod + # def filter_finetune_args(**kwargs): + # args = filter_func_args(TransducerPredictor.change_config, kwargs) + # return args + + # @staticmethod + # def add_class_args(parser, + # prefix=None, + # skip=set(["in_feats", "blank_id", "vocab_size"])): + + # if prefix is not None: + # outer_parser = parser + # parser = ArgumentParser(prog="") + + # if "in_feats" not in skip: + # parser.add_argument("--in-feats", + # type=int, + # required=True, + # help=("input feature dimension")) + # if "blank_id" not in skip: + # parser.add_argument("--blank-id", + # type=int, + # required=True, + # help=("blank id from sp model")) + # if "vocab_size" not in skip: + # parser.add_argument("--vocab-size", + # type=int, + # required=True, + # help=("output prediction dimension")) + # parser.add_argument("--embedding-dim", + # default=1024, + # type=int, + # help=("feature dimension")) + # parser.add_argument("--embedding-dropout-rate", + # default=0.0, + # type=float, + # help=("dropout prob for decoder input embeddings")) + # parser.add_argument("--rnn-dropout-rate", + # default=0.0, + # type=float, + # help=("dropout prob for decoder RNN ")) + + # parser.add_argument("--num-layers", default=2, type=int, help=("")) + + # parser.add_argument("--hidden-dim", default=512, type=int, help=("")) + + # if prefix is not None: + # outer_parser.add_argument("--" + prefix, + # action=ActionParser(parser=parser)) + + # @staticmethod + # def add_finetune_args(parser, + # prefix=None, + # skip=set(["in_feats", "blank_id", "vocab_size"])): + + # if prefix is not None: + # outer_parser = parser + # parser = ArgumentParser(prog="") + + # parser.add_argument( + # "--override-dropouts", + # default=False, + # action=ActionYesNo, + # help=( + # "whether to use the dropout probabilities passed in the " + # "arguments instead of the defaults in the pretrained model.")) + # parser.add_argument("--embedding-dropout-rate", + # default=0.0, + # type=float, + # help=("dropout prob for decoder input embeddings")) + # parser.add_argument("--rnn-dropout-rate", + # default=0.0, + # type=float, + # help=("dropout prob for decoder RNN ")) + + # if prefix is not None: + # outer_parser.add_argument("--" + prefix, + # action=ActionParser(parser=parser)) diff --git a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py b/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py index bdd9b707..942e1313 100644 --- a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py +++ b/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py @@ -6,6 +6,8 @@ import torch import torch.nn as nn +from ..layers import ActivationFactory as AF + class TransformerConv2dSubsampler(nn.Module): """Convolutional 2D subsampling (to 1/4 length) Tor transformer @@ -18,18 +20,23 @@ class TransformerConv2dSubsampler(nn.Module): time_dim: indicates which is the time dimension in the input tensor """ - def 
__init__(self, in_feats, out_feats, hid_act, pos_enc, time_dim=1): + def __init__(self, in_feats, out_feats, hid_act, pos_enc=None, time_dim=1): super().__init__() self.time_dim = time_dim + hid_act = AF.create(hid_act) self.conv = nn.Sequential( nn.Conv2d(1, out_feats, 3, 2, padding=(0, 1)), hid_act, nn.Conv2d(out_feats, out_feats, 3, 2, padding=(0, 1)), hid_act, ) - self.out = nn.Sequential( - nn.Linear(out_feats * (((in_feats - 1) // 2 - 1) // 2), out_feats), pos_enc - ) + + linear = nn.Linear(out_feats * (((in_feats - 1) // 2 - 1) // 2), + out_feats) + if pos_enc is None: + self.out = linear + else: + self.out = nn.Sequential(linear, pos_enc) def forward(self, x, x_mask=None): """Forward function. diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 3e8347ee..95042aed 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -7,6 +7,7 @@ from .vae.vae import VAE from .vae.vq_vae import VQVAE from .wav2transducer import HFWav2Vec2Transducer +from .wav2transducer import HFWav2Vec2RNNTransducer, HFWav2Vec2RNNRNNTransducer from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) from .xvectors.efficient_net_xvector import EfficientNetXVector diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py index ee711a8d..fe55e34d 100644 --- a/hyperion/torch/models/transducer/__init__.py +++ b/hyperion/torch/models/transducer/__init__.py @@ -4,7 +4,10 @@ """ +from .rnn_transducer import RNNTransducer +from .rnn_rnn_transducer import RNNRNNTransducer + from .transducer import Transducer -from .conformer import Conformer -from .decoder import Decoder -from .joiner import Joiner \ No newline at end of file +#from .conformer import Conformer +#from .decoder import Decoder +#from .joiner import Joiner diff --git a/hyperion/torch/models/transducer/lstm_rnn_transducer.py b/hyperion/torch/models/transducer/lstm_rnn_transducer.py new file mode 100644 index 00000000..5ab74483 --- /dev/null +++ b/hyperion/torch/models/transducer/lstm_rnn_transducer.py @@ -0,0 +1,149 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Dict, Optional, Union +from jsonargparse import ArgumentParser, ActionParser, ActionYesNo +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 + +import torch + +from ...torch_model import TorchModel +from ..narchs import RNNTransducerDecoder + + +class RNNTransducer(TorchModel): + """ Base-class for RNN-T in + "Sequence Transduction with Recurrent Neural Networks" + https://arxiv.org/pdf/1211.3711.pdf + + Attributes: + encoder: Encoder network module + decoder: RNN-T Decoder config. dictionary or module. + """ + + def __init__( + self, + encoder: TorchModel, + decoder: Union[Dict, RNNTransducerDecoder], + ): + super().__init__() + assert isinstance(encoder, TorchModel) + if isinstance(decoder, dict): + decoder = RNNTransducerDecoder(**decoder) + else: + assert isinstance(decoder, RNNTransducerDecoder) + + self.encoder = encoder + self.decoder = decoder + + def forward( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: k2.RaggedTensor, + ) -> torch.Tensor: + """ + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + y: ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. 
+        Returns:
+          - Token logits with shape = (N, vocab_size)
+          - RNN-T loss.
+        """
+        assert x.ndim == 3, x.shape
+        assert x_lengths.ndim == 1, x_lengths.shape
+        assert y.num_axes == 2, y.num_axes
+
+        assert x.size(0) == x_lengths.size(0) == y.dim0
+
+        x, x_lengths = self.encoder(x, x_lengths)
+        assert torch.all(x_lengths > 0)
+
+        logits, loss = self.decoder(x, x_lengths, y)
+        return logits, loss
+
+    def set_train_mode(self, mode):
+        if mode == self._train_mode:
+            return
+
+        if mode == "full":
+            self.unfreeze()
+        elif mode == "frozen":
+            self.freeze()
+        else:
+            raise ValueError(f"invalid train_mode={mode}")
+
+        self._train_mode = mode
+
+    def _train(self, train_mode: str):
+        if train_mode in ["full", "frozen"]:
+            super()._train(train_mode)
+        else:
+            raise ValueError(f"invalid train_mode={train_mode}")
+
+    @staticmethod
+    def valid_train_modes():
+        return ["full", "frozen"]
+
+    def get_config(self):
+        dec_cfg = self.decoder.get_config()
+        config = {
+            "decoder": dec_cfg,
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @staticmethod
+    def filter_args(**kwargs):
+        # get the decoder arguments
+        args = {}
+        decoder_args = RNNTransducerDecoder.filter_args(**kwargs["decoder"])
+        args["decoder"] = decoder_args
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        RNNTransducerDecoder.add_class_args(parser, prefix="decoder")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))
+
+    def change_config(
+        self,
+        decoder,
+    ):
+        logging.info("changing transducer config")
+        self.decoder.change_config(**decoder)
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        # get the decoder arguments
+        args = {}
+        decoder_args = RNNTransducerDecoder.filter_finetune_args(
+            **kwargs["decoder"])
+        args["decoder"] = decoder_args
+        return args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        RNNTransducerDecoder.add_finetune_args(parser, prefix="decoder")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/transducer/rnn_rnn_transducer.py b/hyperion/torch/models/transducer/rnn_rnn_transducer.py
new file mode 100644
index 00000000..0e1c7a85
--- /dev/null
+++ b/hyperion/torch/models/transducer/rnn_rnn_transducer.py
@@ -0,0 +1,84 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+from typing import Dict, Optional, Union, Tuple
+from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+try:
+    import k2
+except ModuleNotFoundError:
+    from ...utils import dummy_k2 as k2
+
+import torch
+
+from .rnn_transducer import RNNTransducer
+from ...narchs import RNNEncoder
+
+
+class RNNRNNTransducer(RNNTransducer):
+    """RNN-T with RNN encoder.
+
+    Attributes:
+      encoder: dictionary of options to initialize RNNEncoder class or RNNEncoder object
+      decoder: RNN-T Decoder config. dictionary or module.
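+
+    Example (illustrative sketch; the option dictionaries below only show a
+    subset of the RNNEncoder and RNNTransducerDecoder arguments):
+
+      >>> model = RNNRNNTransducer(
+      ...     encoder={"in_feats": 80, "hid_feats": 512, "out_feats": 512,
+      ...              "num_layers": 5},
+      ...     decoder={"in_feats": 512, "vocab_size": 1000, "blank_id": 0})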
+ + """ + + def __init__(self, encoder, decoder): + if isinstance(encoder, dict): + encoder = RNNEncoder(**encoder) + else: + assert isinstance(encoder, RNNEncoder) + + super().__init__(encoder, decoder) + + @staticmethod + def filter_args(**kwargs): + args = RNNTransducer.filter_args(**kwargs) + encoder_args = RNNEncoder.filter_args(**kwargs["encoder"]) + args["encoder"] = encoder_args + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNEncoder.add_class_args(parser, prefix="encoder") + RNNTransducer.add_class_args(parser) + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + def change_config( + self, + encoder, + decoder, + ): + logging.info("changing transducer encoder config") + self.encoder.change_config(**encoder) + super().chage_config(**decoder) + + @staticmethod + def filter_finetune_args(**kwargs): + args = RNNTransducer.filter_finetune_args(**kwargs) + encoder_args = RNNEncoder.filter_finetune_args(**kwargs["encoder"]) + args["encoder"] = encoder_args + return args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNEncoder.add_finetune_args(parser, prefix="encoder") + RNNTransducer.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py index dd91da5f..ef54a5eb 100644 --- a/hyperion/torch/models/transducer/rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_transducer.py @@ -1,101 +1,64 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """ -Note we use `rnnt_loss` from torchaudio, which exists only in -torchaudio >= v0.10.0. 
It also means you have to use torch >= v1.10.0 + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + +import logging +from typing import Dict, Optional, Union, Tuple, List from jsonargparse import ArgumentParser, ActionParser, ActionYesNo try: import k2 except ModuleNotFoundError: from ...utils import dummy_k2 as k2 -import logging import torch -import torch.nn as nn -import torchaudio -import torchaudio.functional -#from .encoder_interface import EncoderInterface +from ....utils.misc import filter_func_args from ...torch_model import TorchModel -from hyperion.utils.text import add_sos -# from .conformer import Conformer -from .decoder import Decoder -from .joiner import Joiner +from ...narchs import RNNTransducerDecoder -class Transducer(TorchModel): - """It implements https://arxiv.org/pdf/1211.3711.pdf +class RNNTransducer(TorchModel): + """ Base-class for RNN-T in "Sequence Transduction with Recurrent Neural Networks" + https://arxiv.org/pdf/1211.3711.pdf + + Attributes: + encoder: Encoder network module + decoder: RNN-T Decoder config. dictionary or module. """ def __init__( self, - encoder, - # conformer_enc, - decoder, - joiner, - vocab_size, - blank_id, + encoder: Union[TorchModel, None], + decoder: Union[Dict, RNNTransducerDecoder], ): - """ - Args: - encoder: - It is the transcription network in the paper. Its accepts - two inputs: `x` of (N, T, C) and `x_lengths` of shape (N,). - It returns two tensors: `logits` of shape (N, T, C) and - `logit_lens` of shape (N,). - decoder: - It is the prediction network in the paper. Its input shape - is (N, U) and its output shape is (N, U, C). It should contain - one attribute: `blank_id`. - joiner: - It has two inputs with shapes: (N, T, C) and (N, U, C). Its - output shape is (N, T, U, C). Note that its output contains - unnormalized probs, i.e., not processed by log-softmax. - """ super().__init__() - decoder["blank_id"] = blank_id - decoder["vocab_size"] = vocab_size - joiner["out_dims"] = vocab_size + if encoder is not None: + assert isinstance(encoder, TorchModel) + if isinstance(decoder, dict): + decoder = RNNTransducerDecoder(**decoder) + else: + assert isinstance(decoder, RNNTransducerDecoder) - self.vocab_size = vocab_size - self.blank_id = blank_id self.encoder = encoder - self.decoder = Decoder(**decoder) - self.joiner = Joiner(**joiner) + self.decoder = decoder def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: """ Args: - x: - A 3-D tensor of shape (N, T, C). - x_lengths: - A 1-D tensor of shape (N,). It contains the number of frames in `x` - before padding. - y: - A ragged tensor with 2 axes [utt][label]. It contains labels of each + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + y: ragged tensor with 2 axes [utt][label]. It contains labels of each utterance. Returns: - Return the transducer loss. + - Token logits with shape = (N, vocab_size) + - RNN-T loss. 
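+
+        Example (illustrative sketch; assumes k2 is installed):
+
+          >>> y = k2.RaggedTensor([[3, 5, 2], [1, 4]])
+          >>> logits, loss = model(x, x_lengths, y)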
""" assert x.ndim == 3, x.shape assert x_lengths.ndim == 1, x_lengths.shape @@ -103,45 +66,52 @@ def forward( assert x.size(0) == x_lengths.size(0) == y.dim0 - # wav2vec2 works as encoder - # encoder_out, x_lengths = self.encoder(x, x_lengths) - assert torch.all(x_lengths > 0) - - encoder_out = x - # Now for the decoder, i.e., the prediction network - row_splits = y.shape.row_splits(1) - y_lens = row_splits[1:] - row_splits[:-1] - - blank_id = self.decoder.blank_id - sos_y = add_sos(y, sos_id=blank_id) - - sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) - sos_y_padded = sos_y_padded.to(torch.int64) + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) - decoder_out, _ = self.decoder(sos_y_padded) - - logits = self.joiner(encoder_out, decoder_out) - - # rnnt_loss requires 0 padded targets - # Note: y does not start with SOS - y_padded = y.pad(mode="constant", padding_value=0) + print("zz", x.shape, x_lengths, y, flush=True) + logits, loss = self.decoder(x, x_lengths, y) + return logits, loss - assert hasattr(torchaudio.functional, "rnnt_loss"), ( - f"Current torchaudio version: {torchaudio.__version__}\n" - "Please install a version >= 0.10.0") + def infer(self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000) -> List[List[int]]: + """ + ASR tokens inference + Args: + x: input features with shape = (N, T, C) + x_lengths: feature number for frames with shape = (N,) + decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search + max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame. + max_sym_per_utt: maximimum number of symbols in a single utterance. + Returns: + List of list of integer indexes of the recognizer's symbols. 
+ """ + assert x.ndim == 3, x.shape + assert x_lengths.ndim == 1, x_lengths.shape + assert x.size(0) == x_lengths.size(0) - x_lengths = x_lengths.to(torch.int32) + if self.encoder is not None: + x, x_lengths = self.encoder(x, x_lengths) + assert torch.all(x_lengths > 0) - loss = torchaudio.functional.rnnt_loss( - logits=logits, - targets=y_padded.to(torch.int32), - logit_lengths=x_lengths, - target_lengths=y_lens, - blank=blank_id, - reduction="sum", - ) + batch_size = x.size(0) + y = [] + for i in range(batch_size): + x_i = x[i:i + 1, :x_lengths[i]] + y_i = self.decoder.decode(x_i, + method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + y.append(y_i) - return logits, loss + return y def set_train_mode(self, mode): if mode == self._train_mode: @@ -151,9 +121,6 @@ def set_train_mode(self, mode): self.unfreeze() elif mode == "frozen": self.freeze() - elif mode == "ft-embed-affine": - self.unfreeze() - self.freeze_preembed_layers() else: raise ValueError(f"invalid train_mode={mode}") @@ -167,34 +134,30 @@ def _train(self, train_mode: str): @staticmethod def valid_train_modes(): - return ["full", "frozen", "ft-embed-affine"] + return ["full", "frozen"] def get_config(self): - dec_cfg = self.decoder.get_config() - join_cfg = self.joiner.get_config() + if self.encoder is None: + enc_cfg = None + else: + enc_cfg = self.encoder.get_config() + del enc_cfg["class_name"] + dec_cfg = self.decoder.get_config() + del dec_cfg["class_name"] config = { - "blank_id": self.blank_id, - "vocab_size": self.vocab_size, + "encoder": enc_cfg, "decoder": dec_cfg, - "joiner": join_cfg, } - - # base_config = super().get_config() - return dict(list(config.items())) + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) @staticmethod def filter_args(**kwargs): - # get arguments for pooling - decoder_args = Decoder.filter_args(**kwargs["decoder"]) - joiner_args = Joiner.filter_args(**kwargs["joiner"]) - - valid_args = () - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - + args = {} + decoder_args = RNNTransducerDecoder.filter_args(**kwargs["decoder"]) args["decoder"] = decoder_args - args["joiner"] = joiner_args return args @staticmethod @@ -204,8 +167,7 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - Decoder.add_class_args(parser, prefix="decoder") - Joiner.add_class_args(parser, prefix="joiner") + RNNTransducerDecoder.add_class_args(parser, prefix="decoder") if prefix is not None: outer_parser.add_argument("--" + prefix, @@ -213,24 +175,16 @@ def add_class_args(parser, prefix=None, skip=set()): def change_config( self, - decoder, - # joiner, + decoder: Dict, ): - logging.info("changing transducer config") + logging.info("changing decoder config") self.decoder.change_config(**decoder) - # self.joiner.change_config(**joiner) @staticmethod def filter_finetune_args(**kwargs): - # get arguments for pooling + args = {} decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) - # joiner_args = Joiner.filter_finetune_args(**kwargs["joiner"]) - - valid_args = () - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - args["decoder"] = decoder_args - # args["joiner"] = joiner_args return args @staticmethod @@ -239,12 +193,42 @@ def add_finetune_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") - Decoder.add_finetune_args(parser, prefix="decoder") - # Joiner.add_finetune_args(parser, 
prefix="joiner") + RNNTransducerDecoder.add_finetune_args(parser, prefix="decoder") if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - add_argparse_args = add_class_args - add_argparse_finetune_args = add_finetune_args + @staticmethod + def add_infer_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument("--decoding-method", + default="time_sync_beam_search", + choices=[ + "greedy", "time_sync_beam_search", + "align_length_sync_beam_search" + ]) + + parser.add_argument("--beam-width", + default=5, + type=int, + help="beam width for beam search") + parser.add_argument("--max-sym-per-frame", + default=3, + type=int, + help="max symbols RNN-T can emit in 1 frame") + parser.add_argument("--max-sym-per-utt", + default=1000, + type=int, + help="max symbols RNN-T can emit in 1 frame") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_infer_args(**kwargs): + return filter_func_args(RNNTransducer.infer, kwargs) diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index 855e1590..bae35e0e 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -28,7 +28,7 @@ import torch.nn as nn import torchaudio import torchaudio.functional -#from .encoder_interface import EncoderInterface +from .encoder_interface import EncoderInterface from ...torch_model import TorchModel from hyperion.utils.text import add_sos @@ -44,12 +44,11 @@ class Transducer(TorchModel): def __init__( self, - encoder_net, + vocab_size, + blank_id, # conformer_enc, decoder, joiner, - vocab_size, - blank_id, ): """ Args: @@ -68,6 +67,9 @@ def __init__( unnormalized probs, i.e., not processed by log-softmax. """ super().__init__() + # assert isinstance(encoder, EncoderInterface) + # assert hasattr(decoder, "blank_id") + decoder["blank_id"] = blank_id decoder["vocab_size"] = vocab_size joiner["out_dims"] = vocab_size diff --git a/hyperion/torch/models/wav2transducer/__init__.py b/hyperion/torch/models/wav2transducer/__init__.py index 5346bc78..de4879a5 100644 --- a/hyperion/torch/models/wav2transducer/__init__.py +++ b/hyperion/torch/models/wav2transducer/__init__.py @@ -5,3 +5,6 @@ """ from .hf_wav2vec2_transducer import HFWav2Vec2Transducer + +from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer +from .hf_wav2vec2rnn_rnn_transducer import HFWav2Vec2RNNRNNTransducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py new file mode 100644 index 00000000..922996f6 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py @@ -0,0 +1,375 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import contextlib +from typing import Union, Dict, List +from jsonargparse import ArgumentParser, ActionParser + +import torch +import torch.nn as nn + +from ...utils import remove_silence +from ...torch_model import TorchModel +from ..transducer import RNNTransducer + + +class HFWav2RNNTransducer(TorchModel): + """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + + Attributes: + hf_feats: hugging face model wrapper object. + transducer: transducer model object. 
+      feat_fusion_start: the input to the transducer will fuse the wav2vec layers from "feat_fusion_start" to
+                         the wav2vec "num_layers".
+      feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more
+                          than one layer is used.
+    """
+
+    def __init__(self,
+                 hf_feats: TorchModel,
+                 transducer: Union[Dict, TorchModel],
+                 feat_fusion_start: int = 0,
+                 feat_fusion_method: str = "weighted-avg"):
+
+        super().__init__()
+        self.hf_feats = hf_feats
+        if isinstance(transducer, dict):
+            transducer["decoder"]["in_feats"] = hf_feats.hidden_size
+            #transducer["joiner"]["in_feats"] = hf_feats.hidden_size
+            if "class_name" in transducer:
+                del transducer["class_name"]
+
+            transducer["encoder"] = None
+            transducer = RNNTransducer(**transducer)
+        else:
+            assert isinstance(transducer, RNNTransducer)
+            if transducer.encoder is None:
+                assert transducer.decoder.in_feats == hf_feats.hidden_size
+                #assert transducer.joiner.in_feats == hf_feats.hidden_size
+
+        self.transducer = transducer
+        self.feat_fusion_start = feat_fusion_start
+        self.feat_fusion_method = feat_fusion_method
+        self._hf_context = contextlib.nullcontext()
+        self._make_fuser()
+
+    def _make_fuser(self):
+        if self.feat_fusion_method == "last":
+            self.feat_fuser = None
+            return
+
+        num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start
+        layer_dim = self.hf_feats.hidden_size
+        if self.feat_fusion_method == "weighted-avg":
+            self.feat_fuser = nn.Parameter(torch.zeros(num_layers))
+        elif self.feat_fusion_method == "linear":
+            self.feat_fuser = nn.Linear(num_layers, 1, bias=False)
+            self.feat_fuser.weight.data = torch.ones(1,
+                                                     num_layers) / num_layers
+        elif self.feat_fusion_method == "cat":
+            self.feat_fuser = nn.Linear(num_layers * layer_dim,
+                                        layer_dim,
+                                        bias=False)
+
+    def _fuse_hid_feats(self, hid_feats):
+        """Fuses the hidden features from the Wav2Vec model.
+
+        Args:
+          hid_feats: list of hidden features Tensors from Wav2Vec model.
+
+        Returns:
+          Tensor of fused features (batch, channels, time)
+        """
+        if len(hid_feats) == 1:
+            # There is only one layer of features
+            return hid_feats[0]
+
+        hid_feats = hid_feats[self.feat_fusion_start:]
+        if self.feat_fusion_method == "weighted-avg":
+            hid_feats = torch.stack(hid_feats, dim=-1)
+            norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1)
+            feats = torch.sum(hid_feats * norm_weights, dim=-1)
+        elif self.feat_fusion_method == "linear":
+            hid_feats = torch.stack(hid_feats, dim=-1)
+            feats = self.feat_fuser(hid_feats).squeeze(dim=-1)
+        elif self.feat_fusion_method == "cat":
+            hid_feats = torch.cat(hid_feats, dim=-1)
+            feats = self.feat_fuser(hid_feats)
+        elif self.feat_fusion_method == "last":
+            feats = hid_feats[-1]
+
+        return feats
+
+    def forward_feats(self,
+                      x,
+                      x_lengths,
+                      return_feat_layers=None,
+                      chunk_length=0,
+                      detach_chunks=False):
+        return_hid_states = (False if return_feat_layers is None
+                             and self.feat_fusion_method == "last" else True)
+        with self._hf_context:
+            hf_output = self.hf_feats(
+                x,
+                x_lengths,
+                return_hid_states=return_hid_states,
+                chunk_length=chunk_length,
+                detach_chunks=detach_chunks,
+            )
+        feat_lengths = hf_output["hidden_states_lengths"]
+        if return_hid_states:
+            hid_feats = hf_output["hidden_states"]
+            feats = self._fuse_hid_feats(hid_feats)
+        else:
+            hid_feats = None
+            feats = hf_output["last_hidden_state"]
+
+        feats = feats.transpose(1, 2)
+        if return_feat_layers is not None:
+            # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time)
+            # as the hidden features of the x-vector encoder.
+            hid_feats = [
+                f.transpose(1, 2) for i, f in enumerate(hid_feats)
+                if i in return_feat_layers
+            ]
+        else:
+            hid_feats = None
+
+        return feats, hid_feats, feat_lengths
+
+    def forward(
+        self,
+        x,
+        x_lengths=None,
+        y=None,
+        return_feat_layers=None,
+        # return_enc_layers=None,
+        return_logits=True,
+    ):
+        """Forward function. It returns the token logits and the RNN-T loss.
+        It can also return the hidden representations from the wav2vec
+        feature extractor; in that case the output variable is a dictionary.
+
+        Args:
+          x: input features tensor with shape=(batch, in_feats, time)
+          x_lengths: time lengths of the features with shape=(batch,)
+          y: ragged tensor with 2 axes [utt][label] containing the token
+            labels of each utterance (k2.RaggedTensor)
+          return_feat_layers: list of integers indicating, which wav2vec layers
+            we should return. If None, no wav2vec layers are returned.
+          return_enc_layers: list of integers indicating, which encoder layers
+            we should return. If None, no encoder layers are returned.
+          return_logits: if True, it adds the logits to the output dictionary.
+        Returns:
+          Token logits and RNN-T loss, or a dictionary with "logits" and
+          "h_feats" (wav2vec features) when return_feat_layers is given.
+        """
+        feats, hid_feats, feat_lengths = self.forward_feats(
+            x, x_lengths, return_feat_layers)
+
+        feats = feats.permute(0, 2, 1)  # (N, C, T) ->(N, T, C)
+
+        output, loss = self.transducer(
+            feats,
+            feat_lengths,
+            y,
+        )
+
+        if not return_feat_layers:
+            return output, loss
+
+        if not isinstance(output, dict):
+            # if the transducer just returned the logits, we put them into a
+            # dictionary to append the hid feats later.
+            output = {"logits": output}
+
+        output["h_feats"] = hid_feats
+        return output, loss
+
+    def infer(self,
+              x: torch.Tensor,
+              x_lengths: torch.Tensor,
+              decoding_method="time_sync_beam_search",
+              beam_width: int = 5,
+              max_sym_per_frame: int = 3,
+              max_sym_per_utt: int = 1000):
+        """
+        ASR tokens inference
+        Args:
+          x: input features with shape = (N, T, C)
+          x_lengths: feature number for frames with shape = (N,)
+          decoding_method: greedy, time_sync_beam_search or align_length_sync_beam_search
+          max_sym_per_frame: maximum number of symbols RNN-T can emit in 1 frame.
+          max_sym_per_utt: maximum number of symbols in a single utterance.
+        Returns:
+          List of list of integer indexes of the recognizer's symbols.
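+
+        Example (sketch; here x is the raw waveform, since hf_feats extracts
+        the features internally):
+
+          >>> tokens = model.infer(wav, wav_lengths,
+          ...                      decoding_method="time_sync_beam_search",
+          ...                      beam_width=5)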
+ """ + + feats, _, feat_lengths = self.forward_feats(x, x_lengths) + + feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) + + y = self.transducer.infer(feats, + feat_lengths, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt) + return y + + def freeze_feat_fuser(self): + if self.feat_fuser is None: + return + + if self.feat_fusion_method == "weighted-avg": + self.feat_fuser.requires_grad = False + return + + for param in self.feat_fuser.parameters(): + param.requires_grad = False + + def freeze_hf_feats(self): + self.hf_feats.freeze() + + def freeze_hf_feature_encoder(self): + self.hf_feats.freeze_feature_encoder() + + def set_train_mode(self, mode): + if mode == self._train_mode: + return + + if mode == "full": + self.unfreeze() + elif mode == "frozen": + self.freeze() + elif mode in ["ft-transducer", "ft-transducer-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + self.freeze_feat_fuser() + elif mode in ["hf-feats-frozen", "hf-feats-frozen-nograd"]: + self.unfreeze() + self.freeze_hf_feats() + elif mode == "hf-feat-extractor-frozen": + self.unfreeze() + self.freeze_hf_feature_encoder() + else: + raise ValueError(f"invalid train_mode={mode}") + + logging.info("train mode set to %s", mode) + + if "nograd" in mode: + logging.info("using torch.no_grad for hf_feats") + self._hf_context = torch.no_grad() + else: + self._hf_context = contextlib.nullcontext() + + self._train_mode = mode + + def _train(self, train_mode: str): + + if train_mode in ["full", "frozen"]: + super()._train(train_mode) + elif train_mode in [ + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ]: + self.hf_feats.train() + self.transducer._train("full") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", + ] + + @staticmethod + def filter_args(**kwargs): + valid_args = ( + "hf_feats", + "transducer", + "feat_fusion_start", + "feat_fusion_method", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + def get_config(self): + hf_cfg = self.hf_feats.get_config() + tran_cfg = self.transducer.get_config() + del hf_cfg["class_name"] + del tran_cfg["class_name"] + config = { + "hf_feats": hf_cfg, + "transducer": tran_cfg, + "feat_fusion_start": self.feat_fusion_start, + "feat_fusion_method": self.feat_fusion_method, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def change_config(self, hf_feats, transducer): + logging.info("changing hf wav2transducer config") + self.hf_feats.change_config(**hf_feats) + self.transducer.change_config(**transducer) + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--feat-fusion-start", + default=0, + type=int, + help=""" + the input to x-vector model will fuse the wav2vec + layers from feat_fusion_start to + the wav2vec num_layers""", + ) + parser.add_argument( + "--feat-fusion-method", + default="weighted-avg", + choices=["weighted-avg", "linear", "cat", "last"], + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]"), + ) + 
+        if prefix is not None:
+            outer_parser.add_argument(
+                "--" + prefix,
+                action=ActionParser(parser=parser),
+            )
+
+    @staticmethod
+    def add_infer_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        RNNTransducer.add_infer_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_infer_args(**kwargs):
+        return RNNTransducer.filter_infer_args(**kwargs)
diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py
new file mode 100644
index 00000000..412a182b
--- /dev/null
+++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py
@@ -0,0 +1,103 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from jsonargparse import ArgumentParser, ActionParser
+from typing import Union, Dict, Optional
+
+import torch
+import torch.nn as nn
+
+from ..transducer import RNNRNNTransducer
+from ...tpm import HFWav2Vec2
+from .hf_wav2rnn_transducer import HFWav2RNNTransducer
+
+
+class HFWav2Vec2RNNRNNTransducer(HFWav2RNNTransducer):
+    """Class for RNN-T with Wav2Vec2 features
+
+    Attributes:
+      hf_feats: HFWav2Vec configuration dictionary or object.
+                This is a wrapper over Hugging Face Wav2Vec model.
+      transducer: Transducer configuration dictionary or object.
+      feat_fusion_start: the input to the transducer will fuse the wav2vec layers from "feat_fusion_start" to
+                         the wav2vec "num_layers".
+      feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more
+                          than one layer is used.
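+
+    Example (illustrative sketch; the hf_feats options are placeholders for
+    the HFWav2Vec2 constructor arguments):
+
+      >>> model = HFWav2Vec2RNNRNNTransducer(
+      ...     hf_feats={...},
+      ...     transducer={"encoder": {...}, "decoder": {...}},
+      ...     feat_fusion_method="weighted-avg")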
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNRNNTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(transducer, dict): + transducer["decoder"]["in_feats"] = hf_feats.hidden_size + #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer = RNNRNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNRNNTransducer) + + super().__init__(hf_feats, transducer, feat_fusion_start, + feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNRNNTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNRNNTransducer.add_class_args(parser, prefix="transducer") + HFWav2RNNTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNRNNTransducer.filter_finetune_args( + **kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNRNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py new file mode 100644 index 00000000..d89953b2 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py @@ -0,0 +1,103 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from jsonargparse import ArgumentParser, ActionParser +from typing import Union, Dict, Optional + +import torch +import torch.nn as nn + +from ..transducer import RNNTransducer +from ...tpm import HFWav2Vec2 +from .hf_wav2rnn_transducer import HFWav2RNNTransducer + + +class HFWav2Vec2RNNTransducer(HFWav2RNNTransducer): + """Class for RNN-T with Wav2Vec2 features + + Attributes: + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a warpper over Hugging Face Wav2Vec model. + transducer: Transducer configuration dictionary or object. + feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. 
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + # if isinstance(transducer, dict): + # transducer["decoder"]["in_feats"] = hf_feats.hidden_size + # transducer["joiner"]["in_feats"] = hf_feats.hidden_size + # if "class_name" in transducer: + # del transducer["class_name"] + # transducer = Transducer(**transducer) + # else: + # assert isinstance(transducer, Transducer) + # assert transducer.decoder.in_feats == hf_feats.hidden_size + # assert transducer.joiner.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, transducer, feat_fusion_start, + feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + RNNTransducer.add_class_args(parser, prefix="transducer") + HFWav2RNNTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = RNNTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + RNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py new file mode 100644 index 00000000..a5df4b8a --- /dev/null +++ b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py @@ -0,0 +1,103 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union, Tuple +from jsonargparse import ActionParser, ArgumentParser + +import torch +import torch.nn as nn + +from ...narchs import AudioFeatsMVN +from ...torch_model import TorchModel +from ...utils import remove_silence + + +class Wav2RNNTransducer(TorchModel): + """Base class for models that integrate the acoustic feature extractor and and x-vector model that takes acoustic features as input. + + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: x-vector model object. 
+ """ + + def __init__(self, feats, transducer): + + super().__init__() + + if isinstance(feats, dict): + feats = AudioFeatsMVN.filter_args(**feats) + feats["trans"] = True + feats = AudioFeatsMVN(**feats) + else: + assert isinstance(feats, AudioFeatsMVN) + + self.feats = feats + self.transducer = transducer + + def forward( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: k2.RaggedTensor, + vad_samples: Optional[torch.Tensor] = None, + vad_feats: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if vad_samples is not None: + x, x_lengths = remove_silence(x, x_lengths) + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, feat_lengths) + + return self.transducer(feats, feat_lengths, y) + + def set_train_mode(self, mode): + self.transducer.set_train_mode(mode) + + def get_config(self): + feat_cfg = self.feats.get_config() + xvector_cfg = self.xvector.get_config() + config = { + "feats": feat_cfg, + "xvector": xvector_cfg, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(*kwargs): + """Filters Wav2XVector class arguments from arguments dictionary. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with SpecAugment options. + """ + valid_args = ( + "feats", + "xvector", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2XVector options common to all child classes to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + AudioFeatsMVN.add_class_args(parser, prefix="feats") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py index 43ab2382..8a17379c 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py @@ -19,7 +19,6 @@ class HFWav2Vec2ResNet1dXVector(HFWav2XVector): """Class extracting Wav2Vec2 + ResNet1d x-vectors from waveform. Attributes: - Attributes: hf_feats: HFWav2Vec configuration dictionary or object. This is a warpper over Hugging Face Wav2Vec model. xvector: ResNet1dXVector configuration dictionary or object. 
@@ -53,7 +52,8 @@ def __init__( assert isinstance(xvector, ResNet1dXVector) assert xvector.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + super().__init__(hf_feats, xvector, feat_fusion_start, + feat_fusion_method) @staticmethod def filter_args(**kwargs): @@ -76,7 +76,8 @@ def add_class_args(parser, prefix=None): HFWav2XVector.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) @staticmethod def filter_finetune_args(**kwargs): @@ -97,4 +98,5 @@ def add_finetune_args(parser, prefix=None): ResNet1dXVector.add_finetune_args(parser, prefix="xvector") if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 8a65f12e..5599fa1e 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -13,9 +13,6 @@ from ...torch_model import TorchModel from ...utils import remove_silence -# import torch.nn.functional as nnf - - class HFWav2XVector(TorchModel): """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. @@ -29,9 +26,11 @@ class HFWav2XVector(TorchModel): than one layer is used. """ - def __init__( - self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" - ): + def __init__(self, + hf_feats, + xvector, + feat_fusion_start=0, + feat_fusion_method="weighted-avg"): super().__init__() self.hf_feats = hf_feats @@ -52,9 +51,12 @@ def _make_fuser(self): self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + self.feat_fuser.weight.data = torch.ones(1, + num_layers) / num_layers elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + self.feat_fuser = nn.Linear(num_layers * layer_dim, + layer_dim, + bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. @@ -69,7 +71,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start :] + hid_feats = hid_feats[self.feat_fusion_start:] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) @@ -123,14 +125,14 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def forward_feats( - self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False - ): - return_hid_states = ( - False - if return_feat_layers is None and self.feat_fusion_method == "last" - else True - ) + def forward_feats(self, + x, + x_lengths, + return_feat_layers=None, + chunk_length=0, + detach_chunks=False): + return_hid_states = (False if return_feat_layers is None + and self.feat_fusion_method == "last" else True) with self._hf_context: hf_output = self.hf_feats( x, @@ -152,8 +154,7 @@ def forward_feats( # add hidden feats from wav2vec to the output. 
We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. hid_feats = [ - f.transpose(1, 2) - for i, f in enumerate(hid_feats) + f.transpose(1, 2) for i, f in enumerate(hid_feats) if i in return_feat_layers ] else: @@ -193,8 +194,7 @@ def forward( "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) """ feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers - ) + x, x_lengths, return_feat_layers) output = self.xvector( feats, feat_lengths, @@ -230,17 +230,16 @@ def extract_embed( x, x_lengths = remove_silence(x, x_lengths) feats, _, feat_lengths = self.forward_feats( - x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks - ) - xvec_chunk_length = int( - xvec_chunk_length - * self.hf_feats.sample_frequency - * feats.size(-1) - // x.size(-1) - ) - return self.xvector.extract_embed( - feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks - ) + x, + x_lengths, + chunk_length=hf_chunk_length, + detach_chunks=detach_chunks) + xvec_chunk_length = int(xvec_chunk_length * + self.hf_feats.sample_frequency * + feats.size(-1) // x.size(-1)) + return self.xvector.extract_embed(feats, feat_lengths, + xvec_chunk_length, embed_layer, + detach_chunks) def freeze_feat_fuser(self): if self.feat_fuser is None: @@ -303,11 +302,11 @@ def _train(self, train_mode: str): self.hf_feats.train() self.xvector._train("ft-embed_affine") elif train_mode in [ - "ft-xvector", - "hf-feats-frozen", - "ft-xvector-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", ]: self.hf_feats.train() self.xvector._train("full") @@ -370,19 +369,16 @@ def add_class_args(parser, prefix=None, skip=set()): "--feat-fusion-start", default=0, type=int, - help=( - "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers" - ), + help= + ("the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers"), ) parser.add_argument( "--feat-fusion-method", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], - help=( - "method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, cat]" - ), + help=("method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]"), ) if prefix is not None: diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 824b5830..4c21f478 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -80,9 +80,8 @@ def forward( feats, feat_lengths = remove_silence(feats, feat_lengths) # feat_lengths = torch.div(x_lengths * feats.size(-1), x.size(-1)) - return self.xvector( - feats, feat_lengths, y, enc_layers, classif_layers, return_output - ) + return self.xvector(feats, feat_lengths, y, enc_layers, classif_layers, + return_output) def extract_embed( self, @@ -102,12 +101,11 @@ def extract_embed( feats, feat_lengths = remove_silence(feats, feat_lengths) feats = feats.transpose(1, 2) - return self.xvector.extract_embed( - feats, feat_lengths, chunk_length, embed_layer, detach_chunks - ) + return self.xvector.extract_embed(feats, feat_lengths, chunk_length, + embed_layer, detach_chunks) - def train_mode(self, mode="ft-embed-affine"): - self.xvector.train_mode(mode) + def set_train_mode(self, mode): + 
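        """Sets the train mode of the wrapped x-vector model, delegating to XVector.set_train_mode."""
+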
self.xvector.set_train_mode(mode) def get_config(self): feat_cfg = self.feats.get_config() @@ -152,4 +150,5 @@ def add_class_args(parser, prefix=None): AudioFeatsMVN.add_class_args(parser, prefix="feats") if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index 1bce0f87..09136b7d 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -15,6 +15,7 @@ class ResNet1dXVector(XVector): + def __init__( self, resnet_enc, @@ -22,7 +23,10 @@ def __init__( pool_net="mean+stddev", embed_dim=256, num_embed_layers=1, - hid_act={"name": "relu", "inplace": True}, + hid_act={ + "name": "relu", + "inplace": True + }, loss_type="arc-softmax", cos_scale=64, margin=0.3, @@ -41,7 +45,8 @@ def __init__( ): if isinstance(resnet_enc, dict): - logging.info("making %s resnet1d encoder network", resnet_enc["resb_type"]) + logging.info("making %s resnet1d encoder network", + resnet_enc["resb_type"]) resnet_enc = Encoder(**resnet_enc) super().__init__( @@ -145,12 +150,12 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") XVector.add_class_args(parser, skip=set(["in_feats"])) - Encoder.add_class_args(parser, prefix="resnet_enc", skip=set(["head_channels"])) - # parser.link_arguments("in_feats", "resnet_enc.in_feats", apply_on="parse") - # parser.link_arguments("norm_layer", "encoder_net.norm_layer", apply_on="parse") - + Encoder.add_class_args(parser, + prefix="resnet_enc", + skip=set(["head_channels"])) if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) add_argparse_args = add_class_args @@ -168,9 +173,10 @@ def add_finetune_args(parser, prefix=None): parser = ArgumentParser(prog="") XVector.add_finetune_args(parser) - Encoder.add_finetune_args( - parser, prefix="resnet_enc", skip=set(["head_channels"]) - ) + Encoder.add_finetune_args(parser, + prefix="resnet_enc", + skip=set(["head_channels"])) if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index c8504425..5f333fc8 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -26,3 +26,5 @@ from .tdnn_factory import TDNNFactory from .torch_na_loader import TorchNALoader from .transformer_encoder_v1 import TransformerEncoderV1 +from .rnn_transducer_decoder import RNNTransducerDecoder +from .rnn_encoder import RNNEncoder diff --git a/hyperion/torch/narchs/rnn_encoder.py b/hyperion/torch/narchs/rnn_encoder.py new file mode 100644 index 00000000..dcf02564 --- /dev/null +++ b/hyperion/torch/narchs/rnn_encoder.py @@ -0,0 +1,281 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math +from typing import Dict, Optional, Union, Tuple + +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +import torch +import torch.nn as nn +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + +from ..layer_blocks 
import TransformerConv2dSubsampler as Subsampler
+from ..layers import ActivationFactory as AF
+#from ..layers import NormLayer1dFactory as NLF
+from ..utils import seq_lengths_to_mask
+from ...utils.misc import filter_func_args
+from .net_arch import NetArch
+
+
+class RNNEncoder(NetArch):
+    """ RNN Encoder network
+
+    Attributes:
+      in_feats: input features
+      hid_feats: hidden features in RNN layers
+      out_feats: output features, if 0 we remove last projection layer
+      num_layers: number of RNN layers
+      proj_feats: projection features in LSTM layers
+      rnn_type: type of RNN in [lstm, gru]
+      bidirectional: whether RNN layers are bidirectional
+      dropout_rate: dropout rate
+      subsample_input: whether to subsample the input features time dimension x4
+      subsampling_act: activation function of the subsampling block
+    """
+
+    def __init__(self,
+                 in_feats: int,
+                 hid_feats: int,
+                 out_feats: int,
+                 num_layers: int,
+                 proj_feats: int = 0,
+                 rnn_type: str = "lstm",
+                 bidirectional: bool = False,
+                 dropout_rate: float = 0.0,
+                 subsample_input: bool = False,
+                 subsampling_act: str = "relu6"):
+        super().__init__()
+        if rnn_type != "lstm":
+            proj_feats = 0
+
+        self.in_feats = in_feats
+        self.hid_feats = hid_feats
+        self.out_feats = out_feats
+        self.num_layers = num_layers
+        self.proj_feats = proj_feats
+        self.rnn_type = rnn_type
+        self.bidirectional = bidirectional
+        self.subsample_input = subsample_input
+        self.subsampling_act = subsampling_act
+
+        rnn_feats = hid_feats if proj_feats == 0 else proj_feats
+        if subsample_input:
+            # the subsampler creates its hidden activation internally from the
+            # activation name
+            self.subsampler = Subsampler(in_feats,
+                                         hid_feats,
+                                         hid_act=subsampling_act)
+            lstm_in_dim = hid_feats
+        else:
+            self.subsampler = None
+            lstm_in_dim = in_feats
+
+        if rnn_type == "lstm":
+            self.rnn = nn.LSTM(
+                input_size=lstm_in_dim,
+                hidden_size=hid_feats,
+                num_layers=num_layers,
+                bias=True,
+                proj_size=proj_feats,
+                batch_first=True,
+                dropout=dropout_rate,
+                bidirectional=bidirectional,
+            )
+        else:
+            self.rnn = nn.GRU(
+                input_size=lstm_in_dim,
+                hidden_size=hid_feats,
+                num_layers=num_layers,
+                bias=True,
+                batch_first=True,
+                dropout=dropout_rate,
+                bidirectional=bidirectional,
+            )
+
+        if out_feats > 0:
+            self.output = nn.Sequential(
+                nn.Dropout(p=dropout_rate),
+                nn.Linear(rnn_feats, out_feats),
+            )
+
+    def forward(self, x: torch.Tensor,
+                x_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.subsample_input:
+            t1 = x.size(1)
+            x = self.subsampler(x)
+            t2 = x.size(1)
+            x_lengths = torch.div(t2 * x_lengths, t1, rounding_mode="floor")
+
+        x = pack_padded_sequence(input=x,
+                                 lengths=x_lengths.cpu(),
+                                 batch_first=True,
+                                 enforce_sorted=True)
+        x, _ = self.rnn(x)
+        x, _ = pad_packed_sequence(x, batch_first=True)
+        if self.out_feats > 0:
+            x = self.output(x)
+
+        return x, x_lengths
+
+    def in_context(self):
+        return (self._context, self._context)
+
+    def in_shape(self):
+        return (None, None, self.in_feats)
+
+    def out_shape(self, in_shape=None):
+        out_feats = self.out_feats if self.out_feats > 0 else (
+            self.proj_feats if self.proj_feats > 0 else self.hid_feats)
+
+        if in_shape is None:
+            return (None, None, out_feats)
+
+        assert len(in_shape) == 3
+        return (*in_shape[:2], out_feats)
+
+    def get_config(self):
+        config = filter_func_args(RNNEncoder.__init__, self.__dict__)
+        base_config = super().get_config()
+        base_config.update(config)
+        return base_config
+        #return dict(list(base_config.items()) + list(config.items()))
+
+    def change_config(self, override_dropouts, dropout_rate):
+        if override_dropouts:
+            logging.info("changing RNNEncoder dropouts")
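+            # change_dropouts is assumed to come from the NetArch/TorchModel
+            # base class; it resets the dropout probabilities of the module.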
+            self.change_dropouts(dropout_rate)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args = filter_func_args(RNNEncoder.__init__, kwargs)
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        if "in_feats" not in skip:
+            parser.add_argument("--in-feats",
+                                type=int,
+                                required=True,
+                                help=("input feature dimension"))
+
+        parser.add_argument(
+            "--hid-feats",
+            default=512,
+            type=int,
+            help=("num of hidden dimensions of RNN layers"),
+        )
+
+        parser.add_argument(
+            "--out-feats",
+            default=512,
+            type=int,
+            help=
+            ("number of output dimensions of the encoder, if 0 output projection is removed"
+             ),
+        )
+
+        parser.add_argument(
+            "--proj-feats",
+            default=512,
+            type=int,
+            help=("projection features of LSTM layers"),
+        )
+
+        parser.add_argument(
+            "--num-layers",
+            default=5,
+            type=int,
+            help=("number of RNN layers"),
+        )
+
+        parser.add_argument(
+            "--in-kernel-size",
+            default=3,
+            type=int,
+            help=("kernel size of input convolution"),
+        )
+
+        parser.add_argument(
+            "--rnn-type",
+            default="lstm",
+            choices=[
+                "lstm",
+                "gru",
+            ],
+            help=("RNN type in [lstm, gru]"),
+        )
+
+        parser.add_argument(
+            "--bidirectional",
+            default=False,
+            action=ActionYesNo,
+            help="whether to use bidirectional RNN",
+        )
+
+        parser.add_argument(
+            "--subsample-input",
+            default=False,
+            action=ActionYesNo,
+            help="whether to subsample input features x4",
+        )
+        parser.add_argument("--subsampling-act",
+                            default="relu6",
+                            help="activation for subsampler block")
+
+        if "dropout_rate" not in skip:
+            parser.add_argument("--dropout-rate",
+                                default=0,
+                                type=float,
+                                help="dropout probability")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+
+        valid_args = (
+            "override_dropouts",
+            "dropout_rate",
+        )
+        args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
+        return args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None, skip=set([])):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        try:
+            parser.add_argument(
+                "--override-dropouts",
+                default=False,
+                action=ActionYesNo,
+                help=(
+                    "whether to use the dropout probabilities passed in the "
+                    "arguments instead of the defaults in the pretrained model."
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None, skip=set([])):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        try:
+            parser.add_argument(
+                "--override-dropouts",
+                default=False,
+                action=ActionYesNo,
+                help=(
+                    "whether to use the dropout probabilities passed in the "
+                    "arguments instead of the defaults in the pretrained model."
+                ),
+            )
+        except Exception:
+            # the argument may already exist in the parser
+            pass
+
+        try:
+            parser.add_argument("--dropout-rate",
+                                default=0,
+                                type=float,
+                                help="dropout probability")
+        except Exception:
+            # the argument may already exist in the parser
+            pass
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))

diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py
index ef153776..8f1a60be 100644
--- a/hyperion/torch/narchs/rnn_transducer_decoder.py
+++ b/hyperion/torch/narchs/rnn_transducer_decoder.py
@@ -4,20 +4,34 @@
 """

 from jsonargparse import ActionParser, ArgumentParser
+from typing import Optional, Dict, List, Tuple
+from dataclasses import dataclass

 import torch
 import torch.nn as nn
+import torchaudio
+import torchaudio.functional

 try:
     import k2
 except ModuleNotFoundError:
     from ...utils import dummy_k2 as k2

-from ...utils import filter_func_args
+from ...utils.misc import filter_func_args
+from ...utils.text import add_sos
 from ..layer_blocks import TransducerPredictor as Predictor, TransducerJoiner as Joiner
 from .net_arch import NetArch


+@dataclass
+class Hypothesis:
+    ys: List[int]  # predicted sequences
+    log_prob: float  # log prob of ys
+
+    # Optional LSTM predictor state.
+    pred_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
+
+
 class RNNTransducerDecoder(NetArch):
     """ RNN-T Decoder composed of Predictor and Joiner networks
     Implementation based on
@@ -70,16 +84,13 @@ def forward(self, x: torch.Tensor, x_lengths: torch.Tensor,
         # get y_lengths
         row_splits = y.shape.row_splits(1)
         y_lengths = row_splits[1:] - row_splits[:-1]
-
         # shift y adding <sos> token
         sos_y = add_sos(y, sos_id=self.blank_id)
         sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id)
         sos_y_padded = sos_y_padded.to(torch.int64)
-
         # apply predictor and joiner
         pred_out, _ = self.predictor(sos_y_padded)
         logits = self.joiner(x, pred_out)
-
         # rnnt_loss requires 0 padded targets
         # Note: y does not start with SOS
         y_padded = y.pad(mode="constant", padding_value=0)
@@ -89,7 +100,438 @@ def forward(self, x: torch.Tensor, x_lengths: torch.Tensor,
             targets=y_padded.to(torch.int32),
             logit_lengths=x_lengths,
             target_lengths=y_lengths,
-            blank=blank_id,
+            blank=self.blank_id,
             reduction="sum",
         )
-        return loss
+        return logits, loss
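
The loss call above appears to follow torchaudio's RNN-T loss interface (the call site itself sits above this hunk, and the later configs select rnnt_loss: torchaudio). As a reference for the expected shapes, an illustrative, self-contained call with made-up sizes:

    import torch
    import torchaudio.functional as taF

    N, T, U, V = 2, 10, 4, 32                 # batch, frames, target len, vocab
    logits = torch.randn(N, T, U + 1, V)      # joiner output over (t, u) lattice
    targets = torch.randint(1, V, (N, U), dtype=torch.int32)
    logit_lengths = torch.tensor([T, T - 2], dtype=torch.int32)
    target_lengths = torch.tensor([U, U - 1], dtype=torch.int32)
    loss = taF.rnnt_loss(logits, targets, logit_lengths, target_lengths,
                         blank=0, reduction="sum")
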
+
+    def decode(self,
+               x: torch.Tensor,
+               x_lengths: torch.Tensor = None,
+               method="time_sync_beam_search",
+               beam_width: int = 5,
+               max_sym_per_frame: int = 3,
+               max_sym_per_utt: int = 1000) -> List[int]:
+        if method == "time_sync_beam_search":
+            return self.decode_time_sync_beam_search(x,
+                                                     x_lengths,
+                                                     beam_width=beam_width)
+        elif method == "align_length_sync_beam_search":
+            return self.decode_align_length_sync_beam_search(
+                x,
+                x_lengths,
+                beam_width=beam_width,
+                max_sym_per_utt=max_sym_per_utt)
+        elif method == "greedy":
+            return self.decode_greedy(x,
+                                      x_lengths,
+                                      max_sym_per_frame=max_sym_per_frame,
+                                      max_sym_per_utt=max_sym_per_utt)
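
The dispatcher exposes three search strategies: greedy emits the single best label per step, time-synchronous beam search keeps a beam per input frame, and align-length-synchronous beam search advances frame index and label count together. A hedged sketch of driving it (decoder and x_enc are assumed to exist; batch size must be 1):

    tokens = decoder.decode(x_enc, method="greedy")
    tokens = decoder.decode(x_enc, method="time_sync_beam_search",
                            beam_width=5)
    tokens = decoder.decode(x_enc, method="align_length_sync_beam_search",
                            beam_width=5, max_sym_per_utt=1000)
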
+
+    def decode_greedy(self,
+                      x: torch.Tensor,
+                      x_lengths: torch.Tensor = None,
+                      max_sym_per_frame: int = 3,
+                      max_sym_per_utt: int = 1000) -> List[int]:
+        """
+        Args:
+          x: encoder embeddings with shape = (N, T, C)
+        Returns:
+          Decoded tokens
+        """
+        assert x.ndim == 3
+
+        # support only batch_size == 1 for now
+        assert x.size(0) == 1, x.size(0)
+        blank_id = self.blank_id
+        device = x.device
+
+        sos = torch.tensor([blank_id], device=device,
+                           dtype=torch.int64).reshape(1, 1)
+        pred_out, (h, c) = self.predictor(sos)
+        T = x.size(1)
+        t = 0
+        hyp = []
+
+        sym_per_frame = 0
+        sym_per_utt = 0
+
+        while t < T and sym_per_utt < max_sym_per_utt:
+            x_t = x[:, t:t + 1, :]
+            logits = self.joiner(x_t, pred_out)
+            # logits is (1, 1, 1, vocab_size)
+
+            log_prob = logits.log_softmax(dim=-1)  # (1, 1, 1, vocab_size)
+            # TODO: Use logits.argmax()
+            y = log_prob.argmax()
+            if y != blank_id:
+                hyp.append(y.item())
+                y = y.reshape(1, 1)
+                pred_out, (h, c) = self.predictor(y, (h, c))
+
+                sym_per_utt += 1
+                sym_per_frame += 1
+
+            if y == blank_id or sym_per_frame > max_sym_per_frame:
+                sym_per_frame = 0
+                t += 1
+
+        return hyp
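
To make the two counters concrete, an invented trace of the greedy loop (token ids are arbitrary; a blank advances the frame, and max_sym_per_frame caps emissions before a forced advance):

    # t=0: argmax -> 7      emit 7, stay on frame 0, sym_per_frame=1
    # t=0: argmax -> blank  advance to frame 1, sym_per_frame resets
    # t=1: argmax -> 9      emit 9, sym_per_frame=1
    # t=1: argmax -> 4      emit 4, sym_per_frame=2
    # t=1: argmax -> blank  advance to frame 2
    # result so far: hyp == [7, 9, 4]
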
+
+    def decode_time_sync_beam_search(self,
+                                     x: torch.Tensor,
+                                     x_lengths: torch.Tensor = None,
+                                     beam_width: int = 5) -> List[int]:
+        assert x.ndim == 3
+        assert x.size(0) == 1, x.size(0)
+
+        blank_id = self.blank_id
+        device = x.device
+
+        sos = torch.tensor([blank_id], device=device).reshape(1, 1)
+        pred_out, (h, c) = self.predictor(sos)
+        T = x.size(1)
+        t = 0
+        B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)]
+        max_u = 20000  # terminate after this number of steps
+        u = 0
+
+        cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor,
+                                                   torch.Tensor]]] = {}
+
+        while t < T and u < max_u:
+            x_t = x[:, t:t + 1, :]
+            A = B
+            B = []
+
+            while u < max_u:
+                y_star = max(A, key=lambda hyp: hyp.log_prob)
+                A.remove(y_star)
+
+                # Note: y_star.ys is unhashable, i.e., cannot be used
+                # as a key into a dict
+                cached_key = "_".join(map(str, y_star.ys))
+
+                if cached_key not in cache:
+                    pred_in = torch.tensor([y_star.ys[-1]],
+                                           device=device).reshape(1, 1)
+
+                    pred_out, pred_state = self.predictor(
+                        pred_in,
+                        y_star.pred_state,
+                    )
+                    cache[cached_key] = (pred_out, pred_state)
+                else:
+                    pred_out, pred_state = cache[cached_key]
+
+                logits = self.joiner(x_t, pred_out)
+                log_prob = logits.log_softmax(dim=-1)
+                # log_prob is (1, 1, 1, vocab_size)
+                log_prob = log_prob.squeeze()
+                # Now log_prob is (vocab_size,)
+
+                # If we choose blank here, add the new hypothesis to B.
+                # Otherwise, add the new hypothesis to A
+
+                # First, choose blank
+                skip_log_prob = log_prob[blank_id]
+                new_y_star_log_prob = y_star.log_prob + skip_log_prob.item()
+                # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob,
+                #       skip_log_prob.item(), new_y_star_log_prob)
+                # ys[:] returns a copy of ys
+                new_y_star = Hypothesis(
+                    ys=y_star.ys[:],
+                    log_prob=new_y_star_log_prob,
+                    # Caution: Use y_star.pred_state here
+                    pred_state=y_star.pred_state,
+                )
+                B.append(new_y_star)
+
+                topk_log_prob = log_prob.topk(beam_width, dim=-1)
+
+                # Second, choose other labels
+                #for i, v in enumerate(log_prob.tolist()):
+                for v, i in zip(*topk_log_prob):
+                    v = v.item()
+                    i = i.item()
+                    if i == blank_id:
+                        continue
+                    new_ys = y_star.ys + [i]
+                    new_log_prob = y_star.log_prob + v
+                    new_hyp = Hypothesis(
+                        ys=new_ys,
+                        log_prob=new_log_prob,
+                        pred_state=pred_state,
+                    )
+                    A.append(new_hyp)
+
+                u += 1
+                # check whether B contains more than beam_width elements that
+                # are more probable than the most probable in A
+                A_most_probable = max(A, key=lambda hyp: hyp.log_prob)
+                #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B))
+                B = sorted(
+                    [
+                        hyp
+                        for hyp in B if hyp.log_prob > A_most_probable.log_prob
+                    ],
+                    key=lambda hyp: hyp.log_prob,
+                    reverse=True,
+                )
+                # print("tuAB2",
+                #       t,
+                #       u,
+                #       len(A),
+                #       A_most_probable.log_prob,
+                #       len(B),
+                #       flush=True)
+                if len(B) >= beam_width:
+                    B = B[:beam_width]
+                    break
+            t += 1
+
+        best_hyp = max(B,
+                       key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:])))
+        ys = best_hyp.ys[1:]  # [1:] to remove the blank
+        return ys
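
The next method implements an align-length-synchronous search: hypotheses are grouped by the sum i = t + u, so at outer step i a hypothesis that has already emitted u labels reads encoder frame t = i - u, and hypotheses that consume the last frame are collected in F as finalists. A toy illustration of the diagonal indexing (values invented; <b> stands for the leading blank):

    # i = 5, beam contains:
    #   ys=[<b>, 7, 9]        -> u=2, reads frame t=3
    #   ys=[<b>, 7]           -> u=1, reads frame t=4
    #   ys=[<b>, 7, 9, 4, 2]  -> u=4, reads frame t=1
    # every hypothesis therefore moves through the (t, u) lattice on the
    # same anti-diagonal, which is what makes the search "length sync".
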
+
+    def decode_align_length_sync_beam_search(
+            self,
+            x: torch.Tensor,
+            x_lengths: torch.Tensor,
+            beam_width: int = 5,
+            max_sym_per_utt: int = 1000) -> List[int]:
+        assert x.ndim == 3
+        assert x.size(0) == 1, x.size(0)
+
+        blank_id = self.blank_id
+        device = x.device
+
+        sos = torch.tensor([blank_id], device=device).reshape(1, 1)
+        pred_out, (h, c) = self.predictor(sos)
+        T = x.size(1)
+        #t = 0
+        B = [Hypothesis(ys=[blank_id], log_prob=0.0, decoder_state=None)]
+        #max_u = 20000 # terminate after this number of steps
+        #u = 0
+
+        cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor,
+                                                   torch.Tensor]]] = {}
+        F = []
+        #for t < T and u < max_u:
+        for i in range(T + max_sym_per_utt):
+            A = []
+            for y_star in B:
+                #while u < max_u:
+                u = len(y_star.ys) - 1
+                t = i - u
+                if t >= T:
+                    continue
+
+                #y_star = max(A, key=lambda hyp: hyp.log_prob)
+                #A.remove(y_star)
+                x_t = x[:, t:t + 1, :]
+                # Note: y_star.ys is unhashable, i.e., cannot be used
+                # as a key into a dict
+                cached_key = "_".join(map(str, y_star.ys))
+
+                if cached_key not in cache:
+                    pred_in = torch.tensor([y_star.ys[-1]],
+                                           device=device).reshape(1, 1)
+
+                    pred_out, pred_state = self.predictor(
+                        pred_in,
+                        y_star.pred_state,
+                    )
+                    cache[cached_key] = (pred_out, pred_state)
+                else:
+                    pred_out, pred_state = cache[cached_key]
+
+                logits = self.joiner(x_t, pred_out)
+                log_prob = logits.log_softmax(dim=-1)  # (1, 1, 1, vocab_size)
+                log_prob = log_prob.squeeze()  # (vocab_size,)
+
+                # First, choose blank
+                skip_log_prob = log_prob[blank_id]
+                new_y_star_log_prob = y_star.log_prob + skip_log_prob.item()
+                # print("tuAB0", t, u, len(y_star.ys), y_star.log_prob,
+                #       skip_log_prob.item(), new_y_star_log_prob)
+                # ys[:] returns a copy of ys
+                new_y_star = Hypothesis(
+                    ys=y_star.ys[:],
+                    log_prob=new_y_star_log_prob,
+                    # Caution: Use y_star.decoder_state here
+                    pred_state=y_star.pred_state,
+                )
+                A.append(new_y_star)
+                if t == T - 1:
+                    F.append(y_star)
+
+                topk_log_prob = log_prob.topk(beam_width, dim=-1)
+
+                # Second, choose other labels
+                #for i, v in enumerate(log_prob.tolist()):
+                for v, i in zip(*topk_log_prob):
+                    v = v.item()
+                    i = i.item()
+                    if i == blank_id:
+                        continue
+                    new_ys = y_star.ys + [i]
+                    new_log_prob = y_star.log_prob + v
+                    new_hyp = Hypothesis(
+                        ys=new_ys,
+                        log_prob=new_log_prob,
+                        pred_state=pred_state,
+                    )
+                    A.append(new_hyp)
+
+            # check whether B contains more than beam_width elements that are
+            # more probable than the most probable in A
+            #A_most_probable = max(A, key=lambda hyp: hyp.log_prob)
+            #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B))
+            B0 = sorted(
+                [hyp for hyp in A],
+                key=lambda hyp: hyp.log_prob,
+                reverse=True,
+            )
+            B = []
+            B_ys = set()
+            for hyp in B0:
+                if hyp.ys not in B_ys:
+                    B.append(hyp)
+                    B_ys.add(hyp.ys)
+            # print("tuAB2",
+            #       t,
+            #       u,
+            #       len(A),
+            #       A_most_probable.log_prob,
+            #       len(B),
+            #       flush=True)
+            if len(B) >= beam_width:
+                B = B[:beam_width]
+                break
+
+        best_hyp = max(F,
+                       key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:])))
+        ys = best_hyp.ys[1:]  # [1:] to remove the blank
+        return ys
+
+    def change_config(
+        self,
+        override_dropouts=False,
+        embed_dropout_rate: float = 0.0,
+        rnn_dropout_rate: float = 0.0,
+    ):
+        logging.info("changing decoder config")
+        self.predictor.change_config(override_dropouts, embed_dropout_rate,
+                                     rnn_dropout_rate)
+
+    def get_config(self):
+
+        config = {
+            "in_feats": self.in_feats,
+            "vocab_size": self.vocab_size,
+            "embed_dim": self.embed_dim,
+            "num_pred_layers": self.num_pred_layers,
+            "pred_hid_feats": self.pred_hid_feats,
+            "embed_dropout_rate": self.embed_dropout_rate,
+            "rnn_dropout_rate": self.rnn_dropout_rate,
+            "blank_id": self.blank_id,
+        }
+        base_config = super().get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args = filter_func_args(RNNTransducerDecoder.__init__, kwargs)
+        return args
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        args = filter_func_args(RNNTransducerDecoder.change_config, kwargs)
+        return args
+
+    @staticmethod
+    def add_class_args(parser,
+                       prefix=None,
+                       skip=set(["in_feats", "blank_id", "vocab_size"])):
+
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        if "in_feats" not in skip:
+            parser.add_argument("--in-feats",
+                                type=int,
+                                required=True,
+                                help=("input feature dimension"))
+        if "blank_id" not in skip:
+            parser.add_argument("--blank-id",
+                                type=int,
+                                default=0,
+                                help=("blank id from tokenizer model"))
+        if "vocab_size" not in skip:
+            parser.add_argument("--vocab-size",
+                                type=int,
+                                required=True,
+                                help=("output prediction dimension"))
+        parser.add_argument("--embed-dim",
+                            default=1024,
+                            type=int,
+                            help=("token embedding dimension"))
+        parser.add_argument(
+            "--embed-dropout-rate",
+            default=0.0,
+            type=float,
+            help=("dropout prob for predictor input embeddings"))
+        parser.add_argument("--rnn-dropout-rate",
+                            default=0.0,
+                            type=float,
+                            help=("dropout prob for decoder RNN"))
+        parser.add_argument(
+            "--rnn-type",
+            default="lstm",
+            choices=["lstm", "gru"],
+            help=(
+                "type of recurrent network for the predictor in [lstm, gru]"))
+
+        parser.add_argument("--num-pred-layers",
+                            default=2,
+                            type=int,
+                            help="""number of layers of the predictor""")
+
+        parser.add_argument("--pred-hid-feats",
+                            default=512,
+                            type=int,
+                            help="""hidden features of the predictor""")
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None, skip=set()):
+
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        parser.add_argument(
+            "--override-dropouts",
+            default=False,
+            action=ActionYesNo,
+            help=(
+                "whether to use the dropout probabilities passed in the "
+                "arguments instead of the defaults in the pretrained model."))
+        parser.add_argument("--embed-dropout-rate",
+                            default=0.0,
+                            type=float,
+                            help=("dropout prob for decoder input embeddings"))
+        parser.add_argument("--rnn-dropout-rate",
+                            default=0.0,
+                            type=float,
+                            help=("dropout prob for decoder RNN"))
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))

diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py
index 1d01e02b..540697f7 100644
--- a/hyperion/torch/torch_model.py
+++ b/hyperion/torch/torch_model.py
@@ -15,6 +15,7 @@


 class TorchModel(nn.Module):
+
     def __init_subclass__(cls, **kwargs):
         super().__init_subclass__(**kwargs)
         torch_model_registry[cls.__name__] = cls
@@ -56,6 +57,8 @@ def change_dropouts(self, dropout_rate):
         for module in self.modules():
             if isinstance(module, nn.modules.dropout._DropoutNd):
                 module.p = dropout_rate
+            if isinstance(module, nn.RNNBase):
+                module.dropout = dropout_rate

         if hasattr(self, "dropout_rate"):
             assert dropout_rate == 0 or self.dropout_rate > 0
@@ -67,7 +70,6 @@ def train_mode(self):

     @train_mode.setter
     def train_mode(self, mode):
-        print("hola3", mode, flush=True)
         self.set_train_mode(mode)

     def set_train_mode(self, mode):
@@ -106,9 +108,10 @@ def save(self, file_path):
             os.makedirs(file_dir, exist_ok=True)

         config = self.get_config()
-        torch.save(
-            {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}
-        )
+        torch.save({
+            "model_cfg": self.get_config(),
+            "model_state_dict": self.state_dict()
+        }, file_path)

     @staticmethod
     def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None):
@@ -128,7 +131,8 @@ def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None):

     @classmethod
     def load(cls, file_path=None, cfg=None, state_dict=None):
-        cfg, state_dict = TorchModel._load_cfg_state_dict(file_path, cfg, state_dict)
+        cfg, state_dict = TorchModel._load_cfg_state_dict(
+            file_path, cfg, state_dict)

         model = cls(**cfg)
         if state_dict is not None:
@@ -143,15 +147,14 @@ def get_loss(self):

     @property
     def device(self):
-        devices = {param.device for param in self.parameters()} | {
-            buf.device for buf in self.buffers()
-        }
+        devices = {param.device
+                   for param in self.parameters()
+                   } | {buf.device
+                        for buf in self.buffers()}
         if len(devices) != 1:
             raise RuntimeError(
                 "Cannot determine device: {} different devices found".format(
-                    len(devices)
-                )
-            )
+                    len(devices)))
         return next(iter(devices))

@@ -213,4 +216,5 @@ def auto_load(file_path, extra_objs={}, map_location=None):
         # if it failed the 3 trials raise exception
         raise err
     # remove module prefix when is trained with dataparallel
-    state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items())
+    state_dict = ODict(
+        (p.sub("", k), v) for k, v in state_dict.items())
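
One detail of the TorchModel change above: nn.RNNBase modules (LSTM, GRU) expose their inter-layer dropout as a plain float attribute rather than as a child nn.Dropout module, so they need the separate isinstance branch. An illustrative comparison (toy modules, not part of the patch):

    import torch.nn as nn

    rnn = nn.LSTM(input_size=80, hidden_size=64, num_layers=2, dropout=0.2)
    drop = nn.Dropout(p=0.2)
    # after change_dropouts(0.0) both end up updated:
    #   drop.p      -> 0.0  (an nn.modules.dropout._DropoutNd instance)
    #   rnn.dropout -> 0.0  (a float attribute on nn.RNNBase)
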
From 97050f91772146db8aaa30733561951efa295a52 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Fri, 10 Mar 2023 16:58:46 -0500
Subject: [PATCH 087/154] working align length sync dec

---
 egs/librispeech/v1/conf/infer.yaml              | 3 +++
 hyperion/torch/narchs/rnn_transducer_decoder.py | 9 +++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/v1/conf/infer.yaml b/egs/librispeech/v1/conf/infer.yaml
index ddfd25e2..cd50a2cb 100644
--- a/egs/librispeech/v1/conf/infer.yaml
+++ b/egs/librispeech/v1/conf/infer.yaml
@@ -1 +1,4 @@
 beam_width: 5
+decoding_method: time_sync_beam_search
+#decoding_method: greedy
+#decoding_method: align_length_sync_beam_search
\ No newline at end of file
diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py
index 8f1a60be..64c71dcd 100644
--- a/hyperion/torch/narchs/rnn_transducer_decoder.py
+++ b/hyperion/torch/narchs/rnn_transducer_decoder.py
@@ -311,7 +311,7 @@ def decode_align_length_sync_beam_search(
         pred_out, (h, c) = self.predictor(sos)
         T = x.size(1)
         #t = 0
-        B = [Hypothesis(ys=[blank_id], log_prob=0.0, decoder_state=None)]
+        B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)]
         #max_u = 20000 # terminate after this number of steps
         #u = 0

@@ -360,7 +360,7 @@ def decode_align_length_sync_beam_search(
                 new_y_star = Hypothesis(
                     ys=y_star.ys[:],
                     log_prob=new_y_star_log_prob,
-                    # Caution: Use y_star.decoder_state here
+                    # Caution: Use y_star.pred_state here
                     pred_state=y_star.pred_state,
                 )
                 A.append(new_y_star)
@@ -397,9 +397,10 @@ def decode_align_length_sync_beam_search(
             B = []
             B_ys = set()
             for hyp in B0:
-                if hyp.ys not in B_ys:
+                hyp_ys = tuple(hyp.ys)  # to make ys hashable
+                if hyp_ys not in B_ys:
                     B.append(hyp)
-                    B_ys.add(hyp.ys)
+                    B_ys.add(hyp_ys)
             # print("tuAB2",
             #       t,
             #       u,

From db14742c3a6fc01c789aa3f53dfe149b3da5d88a Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Tue, 21 Mar 2023 20:55:17 -0400
Subject: [PATCH 088/154] finished refactoring rnn transducer

---
 ...v2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml |  69 +++
 ...v2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml |  69 +++
 ...v2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml |  70 +++
 ...rain_wav2vec2base_rnnt_k2_stage1_v1.0.yaml |  69 +++
 ...n_wav2vec2base_transducer_stage1_v7.1.yaml |  53 +++
 .../v1/conf/wav2vec2base_rnnt_ta_do0.4.yaml   |  16 +
 .../v1/global_conf/config_transducer_v7.1.sh  |  33 ++
 ...config_wav2vec2base_rnnt_k2_pruned_v1.0.sh |  31 ++
 ...config_wav2vec2base_rnnt_k2_pruned_v1.2.sh |  30 ++
 ...config_wav2vec2base_rnnt_k2_pruned_v1.3.sh |  30 ++
 .../config_wav2vec2base_rnnt_k2_v1.0.sh       |  36 ++
 egs/librispeech/v1/run_011_train_asr.sh       |   2 +-
 ...train_asr2.sh => run_011_train_asr_old.sh} |   2 +-
 egs/librispeech/v1/run_030_inference.sh       |   4 +-
 ...inference2.sh => run_030_inference_old.sh} |   4 +-
 hyperion/bin/apply_mvn_select_frames.py       |   5 +-
 hyperion/bin/audio_to_duration.py             |  15 +-
 hyperion/bin/compute_energy_vad.py            |   5 +-
 hyperion/bin/compute_mfcc_feats.py            |   5 +-
 hyperion/bin/copy_feats.py                    |   1 -
 hyperion/bin/decode_wav2transducer.py         |  33 +-
 hyperion/bin/decode_wav2vec2rnn_transducer.py |  35 +-
 ...l_xvec_cosine_scoring_from_adv_test_wav.py |   5 +-
 ...osine_scoring_from_adv_test_wav_wavegan.py |   5 +-
 ...l_xvec_cosine_scoring_from_art_test_wav.py |   7 +-
 ...eval_xvec_cosine_scoring_from_test_wav.py  |   5 +-
 ...sine_scoring_from_transfer_adv_test_wav.py |   5 +-
 ...sine_scoring_from_transfer_art_test_wav.py |   7 +-
 hyperion/bin/extract_wav2vec2xvectors.py      |   7 +-
 hyperion/bin/extract_xvectors_from_feats.py   |   5 +-
 hyperion/bin/extract_xvectors_from_wav.py     |   5 +-
 ...extract_xvectors_slidwin_from_feats.py     |   7 +-
 .../bin/extract_xvectors_slidwin_from_wav.py  |   7 +-
 hyperion/bin/finetune_wav2vec2transducer.py   |  28 +-
 hyperion/bin/finetune_wav2vec2xvector.py      |   5 +-
 .../bin/finetune_xvector_dfr_from_feats.py    |   5 +-
 hyperion/bin/finetune_xvector_dfr_from_wav.py |   5 +-
 hyperion/bin/finetune_xvector_from_feats.py   |   5 +-
 hyperion/bin/finetune_xvector_from_wav.py     |   9 +-
hyperion/bin/make_babble_noise_audio_files.py | 7 +- hyperion/bin/pack_wav_rirs.py | 5 +- hyperion/bin/plot_embedding_tsne.py | 5 +- hyperion/bin/plot_embedding_tsne_per_class.py | 5 +- hyperion/bin/preprocess_audio_files.py | 7 +- hyperion/bin/train_wav2rnn_transducer.py | 24 +- hyperion/bin/train_wav2vec2rnn_transducer.py | 28 +- hyperion/bin/train_wav2vec2transducer.py | 24 +- hyperion/bin/train_wav2vec2xvector.py | 5 +- hyperion/bin/train_xvector_from_feats.py | 5 +- hyperion/bin/train_xvector_from_wav.py | 5 +- hyperion/bin_deprec/ark2hyp.py | 1 - hyperion/bin_deprec/arkvad2nist.py | 1 - hyperion/bin_deprec/compute-gmm-post.py | 3 +- .../bin_deprec/eval-2class-performance.py | 1 - hyperion/bin_deprec/eval-elbo-ubm.py | 1 - .../bin_deprec/eval-q-scoring-homo-gbe.py | 1 - hyperion/bin_deprec/eval-score-norm.py | 1 - hyperion/bin_deprec/h5vad2nist.py | 1 - hyperion/bin_deprec/init-ubm.py | 3 +- hyperion/bin_deprec/scores2lre_format.py | 1 - .../torch-train-conformer-enc-v1-vq-dvae.py | 1 - .../torch-train-conformer-enc-v1-vq-vae.py | 1 - hyperion/bin_deprec/torch-train-dc1d-dvae.py | 1 - hyperion/bin_deprec/torch-train-dc1d-vae.py | 1 - hyperion/bin_deprec/torch-train-dc2d-dvae.py | 1 - hyperion/bin_deprec/torch-train-dc2d-vae.py | 1 - .../bin_deprec/torch-train-resnet1d-dvae.py | 1 - .../bin_deprec/torch-train-resnet1d-vae.py | 1 - .../torch-train-resnet1d-vq-dvae.py | 1 - .../bin_deprec/torch-train-resnet1d-vq-vae.py | 1 - .../bin_deprec/torch-train-resnet2d-dvae.py | 1 - .../bin_deprec/torch-train-resnet2d-vae.py | 5 +- .../torch-train-resnet2d-vq-dvae.py | 1 - .../bin_deprec/torch-train-resnet2d-vq-vae.py | 1 - .../torch-train-transformer-enc-v1-dvae.py | 1 - .../torch-train-transformer-enc-v1-vae.py | 1 - .../torch-train-transformer-enc-v1-vq-dvae.py | 1 - .../torch-train-transformer-enc-v1-vq-vae.py | 1 - hyperion/bin_deprec/torch-train-xvector.py | 1 - .../bin_deprec/train-q-scoring-homo-gbe.py | 1 - hyperion/bin_deprec/vectors2scores.py | 1 - .../bin_deprec2/apply-mvn-select-frames.py | 5 +- hyperion/bin_deprec2/compute-mfcc-feats.py | 5 +- hyperion/bin_deprec2/copy-feats.py | 1 - hyperion/bin_deprec2/eval-cos-1vs1.py | 1 - hyperion/bin_deprec2/eval-linear-gbe-up.py | 1 - hyperion/bin_deprec2/eval-linear-gbe.py | 1 - hyperion/bin_deprec2/eval-linear-svmc.py | 1 - .../bin_deprec2/eval-logistic-regression.py | 1 - hyperion/bin_deprec2/eval-plda-1vs1.py | 1 - hyperion/bin_deprec2/eval-plda-nvs1.py | 1 - hyperion/bin_deprec2/merge-h5-files.py | 1 - hyperion/bin_deprec2/pack-audio-files.py | 3 +- hyperion/bin_deprec2/plot-vector-hist.py | 1 - hyperion/bin_deprec2/rttm-to-bin-vad.py | 1 - hyperion/bin_deprec2/segments-to-bin-vad.py | 5 +- .../torch-adv-finetune-xvec-from-wav.py | 5 +- .../bin_deprec2/torch-adv-finetune-xvec.py | 5 +- .../bin_deprec2/torch-compute-mfcc-feats.py | 5 +- hyperion/bin_deprec2/torch-eval-vae.py | 1 - ...osine-scoring-from-adv-test-wav-wavegan.py | 5 +- ...l-xvec-cosine-scoring-from-adv-test-wav.py | 5 +- ...l-xvec-cosine-scoring-from-art-test-wav.py | 7 +- ...-eval-xvec-cosine-scoring-from-test-wav.py | 5 +- ...sine-scoring-from-transfer-adv-test-wav.py | 5 +- ...sine-scoring-from-transfer-art-test-wav.py | 7 +- .../torch-eval-xvec-logits-from-wav.py | 5 +- ...rch-extract-xvectors-from-wav-with-rttm.py | 5 +- ...torch-extract-xvectors-slidwin-from-wav.py | 7 +- .../torch-extract-xvectors-slidwin.py | 5 +- .../torch-extract-xvectors-vae-preproc.py | 5 +- .../bin_deprec2/torch-extract-xvectors.py | 5 +- ...ch-generate-adv-attacks-xvector-classif.py | 7 +- 
...orch-generate-adv-attacks-xvector-verif.py | 7 +- hyperion/bin_deprec2/torch-train-dc1d-ae.py | 1 - hyperion/bin_deprec2/torch-train-dvae.py | 5 +- .../torch-train-efficientnet-xvec-from-wav.py | 5 +- .../torch-train-efficientnet-xvec.py | 5 +- .../torch-train-resnet-xvec-from-wav.py | 5 +- .../bin_deprec2/torch-train-resnet-xvec.py | 5 +- .../torch-train-resnet1d-xvec-from-wav.py | 5 +- .../torch-train-spinenet-xvec-from-wav.py | 5 +- .../torch-train-tdnn-xvec-from-wav.py | 5 +- hyperion/bin_deprec2/torch-train-tdnn-xvec.py | 5 +- ...orch-train-transformer-xvec-v1-from-wav.py | 5 +- .../torch-train-transformer-xvec-v1.py | 5 +- hyperion/bin_deprec2/torch-train-vae.py | 5 +- hyperion/bin_deprec2/torch-train-vq-dvae.py | 5 +- hyperion/bin_deprec2/torch-train-vq-vae.py | 5 +- hyperion/bin_deprec2/train-cw-up.py | 1 - hyperion/bin_deprec2/train-cw.py | 1 - hyperion/bin_deprec2/train-gaussianizer.py | 1 - hyperion/bin_deprec2/train-lda.py | 1 - hyperion/bin_deprec2/train-linear-gbe-up.py | 1 - hyperion/bin_deprec2/train-linear-gbe.py | 1 - hyperion/bin_deprec2/train-linear-svmc.py | 1 - .../bin_deprec2/train-logistic-regression.py | 1 - hyperion/bin_deprec2/train-mvn.py | 1 - hyperion/bin_deprec2/train-nda.py | 1 - hyperion/bin_deprec2/train-pca.py | 1 - hyperion/bin_deprec2/train-plda.py | 1 - hyperion/np/score_norm/adapt_s_norm.py | 1 + .../adv_attacks/random_attack_factory.py | 3 +- hyperion/torch/adv_defenses/wave_gan_white.py | 4 +- hyperion/torch/data/__init__.py | 4 +- hyperion/torch/data/audio_dataset.py | 19 +- .../data/class_weighted_embed_sampler.py | 3 +- .../data/class_weighted_seg_chunk_sampler.py | 3 +- hyperion/torch/data/embed_dataset.py | 3 +- hyperion/torch/data/embed_sampler.py | 3 +- hyperion/torch/data/feat_seq_dataset.py | 3 +- hyperion/torch/data/hyp_sampler.py | 3 +- .../torch/data/paired_feat_seq_dataset.py | 1 - hyperion/torch/data/seg_chunk_sampler.py | 3 +- hyperion/torch/data/seg_sampler.py | 3 +- hyperion/torch/data/weighted_embed_sampler.py | 1 - hyperion/torch/data/weighted_seq_sampler.py | 3 +- hyperion/torch/layer_blocks/__init__.py | 4 +- hyperion/torch/layer_blocks/etdnn_blocks.py | 1 - .../torch/layer_blocks/resetdnn_blocks.py | 1 - .../torch/layer_blocks/transducer_joiner.py | 106 ++--- .../layer_blocks/transducer_predictor.py | 274 ++++++----- hyperion/torch/layers/global_pool.py | 1 - hyperion/torch/layers/mvn.py | 3 +- hyperion/torch/layers/pool_factory.py | 3 +- hyperion/torch/layers/spec_augment.py | 3 +- hyperion/torch/loggers/logger.py | 1 - hyperion/torch/loggers/logger_list.py | 1 - hyperion/torch/lr_schedulers/factory.py | 3 +- hyperion/torch/models/__init__.py | 5 +- hyperion/torch/models/transducer/__init__.py | 4 +- hyperion/torch/models/transducer/conformer.py | 6 +- hyperion/torch/models/transducer/decoder.py | 3 +- hyperion/torch/models/transducer/joiner.py | 2 +- .../models/transducer/lstm_rnn_transducer.py | 4 +- .../models/transducer/rnn_rnn_transducer.py | 8 +- .../torch/models/transducer/rnn_transducer.py | 30 +- .../torch/models/transducer/transducer.py | 8 +- .../torch/models/transducer/transformer.py | 4 +- hyperion/torch/models/tvector/tvector.py | 3 +- .../torch/models/wav2transducer/__init__.py | 3 +- .../wav2transducer/hf_wav2rnn_transducer.py | 31 +- .../wav2transducer/hf_wav2transducer.py | 9 +- .../wav2transducer/hf_wav2vec2_transducer.py | 7 +- .../hf_wav2vec2rnn_rnn_transducer.py | 6 +- .../hf_wav2vec2rnn_transducer.py | 6 +- .../wav2transducer/wav2rnn_transducer.py | 4 +- .../hf_hubert2resnet1d_xvector.py | 3 +- 
.../hf_wav2vec2resnet1d_xvector.py | 3 +- .../models/wav2xvectors/hf_wav2xvector.py | 3 +- .../wav2xvectors/hf_wavlm2resnet1d_xvector.py | 3 +- .../wav2xvectors/wav2resnet1d_xvector.py | 3 +- .../models/wav2xvectors/wav2resnet_xvector.py | 3 +- .../torch/models/wav2xvectors/wav2xvector.py | 3 +- .../models/xvectors/efficient_net_xvector.py | 3 +- .../torch/models/xvectors/resnet1d_xvector.py | 3 +- .../torch/models/xvectors/resnet_xvector.py | 3 +- .../torch/models/xvectors/spinenet_xvector.py | 3 +- .../torch/models/xvectors/tdnn_xvector.py | 3 +- .../models/xvectors/transformer_xvector_v1.py | 3 +- hyperion/torch/models/xvectors/xvector.py | 3 +- hyperion/torch/narchs/__init__.py | 4 +- hyperion/torch/narchs/audio_feats_mvn.py | 3 +- hyperion/torch/narchs/classif_head.py | 3 +- hyperion/torch/narchs/conformer_encoder_v1.py | 3 +- hyperion/torch/narchs/dc1d_decoder.py | 3 +- hyperion/torch/narchs/dc1d_encoder.py | 3 +- hyperion/torch/narchs/dc2d_decoder.py | 3 +- hyperion/torch/narchs/dc2d_encoder.py | 3 +- hyperion/torch/narchs/efficient_net.py | 3 +- hyperion/torch/narchs/etdnn.py | 1 - hyperion/torch/narchs/net_arch.py | 1 - hyperion/torch/narchs/resetdnn.py | 1 - hyperion/torch/narchs/resnet.py | 1 - hyperion/torch/narchs/resnet1d_decoder.py | 3 +- hyperion/torch/narchs/resnet1d_encoder.py | 3 +- hyperion/torch/narchs/resnet2d_decoder.py | 3 +- hyperion/torch/narchs/resnet2d_encoder.py | 3 +- hyperion/torch/narchs/rnn_encoder.py | 5 +- .../torch/narchs/rnn_transducer_decoder.py | 442 ++++++++++++++---- hyperion/torch/narchs/spinenet.py | 1 - hyperion/torch/narchs/tdnn.py | 1 - .../torch/narchs/transformer_encoder_v1.py | 3 +- hyperion/torch/optim/factory.py | 3 +- hyperion/torch/tpm/hf/hf_hubert.py | 5 +- hyperion/torch/tpm/hf/hf_wav2vec2.py | 5 +- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 5 +- hyperion/torch/tpm/hf/hf_wavlm.py | 5 +- hyperion/torch/trainers/ae_trainer.py | 3 +- hyperion/torch/trainers/dvae_trainer.py | 3 +- hyperion/torch/trainers/torch_trainer.py | 5 +- hyperion/torch/trainers/transducer_trainer.py | 68 +-- hyperion/torch/trainers/vae_trainer.py | 3 +- hyperion/torch/trainers/vq_dvae_trainer.py | 3 +- hyperion/torch/trainers/vq_vae_trainer.py | 3 +- .../torch/trainers/xvector_adv_trainer.py | 3 +- .../trainers/xvector_adv_trainer_from_wav.py | 3 +- .../trainers/xvector_trainer_deep_feat_reg.py | 3 +- hyperion/torch/utils/ddp.py | 7 +- hyperion/torch/utils/metric_acc.py | 1 - hyperion/utils/__init__.py | 1 + hyperion/utils/hyp_dataclass.py | 31 ++ 242 files changed, 1490 insertions(+), 918 deletions(-) create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml create mode 100644 egs/librispeech/v1/conf/wav2vec2base_rnnt_ta_do0.4.yaml create mode 100644 egs/librispeech/v1/global_conf/config_transducer_v7.1.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh rename 
egs/librispeech/v1/{run_011_train_asr2.sh => run_011_train_asr_old.sh} (98%) rename egs/librispeech/v1/{run_030_inference2.sh => run_030_inference_old.sh} (88%) create mode 100644 hyperion/utils/hyp_dataclass.py diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml new file mode 100644 index 00000000..bdb33845 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + decoder: + rnnt_loss: k2_pruned + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml new file mode 100644 index 00000000..cfd41553 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + decoder: + rnnt_loss: k2_pruned + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml new file mode 100644 index 00000000..2cf2d04c --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.005 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml new file mode 100644 index 00000000..c66a1ca4 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 70. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + decoder: + rnnt_loss: k2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml new file mode 100644 index 00000000..7381bb01 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml @@ -0,0 +1,53 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 75. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 75. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: wav2vec2base_rnnt_ta_do0.4.yaml +trainer: + optim: + opt_type: sgd + lr: 0.003 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/wav2vec2base_rnnt_ta_do0.4.yaml b/egs/librispeech/v1/conf/wav2vec2base_rnnt_ta_do0.4.yaml new file mode 100644 index 00000000..cfab3fb9 --- /dev/null +++ b/egs/librispeech/v1/conf/wav2vec2base_rnnt_ta_do0.4.yaml @@ -0,0 +1,16 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h +transducer: + decoder: + rnnt_loss: torchaudio + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/librispeech/v1/global_conf/config_transducer_v7.1.sh b/egs/librispeech/v1/global_conf/config_transducer_v7.1.sh new file mode 100644 index 00000000..48f0d363 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_transducer_v7.1.sh @@ -0,0 +1,33 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_transducer_stage1_v7.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_transducer_v7.1 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0056.pth +nnet_s1=$nnet_s1_dir/model_ep0068.pth +nnet_s1=$nnet_s1_dir/model_ep0090.pth +nnet_s1=$nnet_s1_dir/model_ep0094.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh new file mode 100644 index 00000000..1fd43d23 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh @@ -0,0 +1,31 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0081.pth +nnet_s1=$nnet_s1_dir/model_ep0120.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh new file mode 100644 index 00000000..7cd22d2d --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh @@ -0,0 +1,30 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh new file mode 100644 index 00000000..18875086 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh @@ -0,0 +1,30 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2_pruned.v1.3 +nnet_s1_name=$nnet_name.s1 + 
+nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0100.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh new file mode 100644 index 00000000..ed274e91 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh @@ -0,0 +1,36 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_rnnt_k2.v1.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0041.pth +nnet_s1=$nnet_s1_dir/model_ep0048.pth +nnet_s1=$nnet_s1_dir/model_ep0066.pth +nnet_s1=$nnet_s1_dir/model_ep0106.pth +# nnet_s1=$nnet_s1_dir/model_ep0075.pth +# nnet_s1=$nnet_s1_dir/model_ep0106.pth +# nnet_s1=$nnet_s1_dir/model_ep0646.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh index 3d0e6eb1..99b0065e 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -47,7 +47,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ - train_wav2vec2transducer.py $nnet_type \ + train_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ diff --git a/egs/librispeech/v1/run_011_train_asr2.sh b/egs/librispeech/v1/run_011_train_asr_old.sh similarity index 98% rename from egs/librispeech/v1/run_011_train_asr2.sh rename to egs/librispeech/v1/run_011_train_asr_old.sh index 99b0065e..3d0e6eb1 100755 --- a/egs/librispeech/v1/run_011_train_asr2.sh +++ b/egs/librispeech/v1/run_011_train_asr_old.sh @@ -47,7 +47,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ - train_wav2vec2rnn_transducer.py $nnet_type \ + train_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.audio-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ diff --git a/egs/librispeech/v1/run_030_inference.sh b/egs/librispeech/v1/run_030_inference.sh index 02b97001..7ed9567a 100755 --- a/egs/librispeech/v1/run_030_inference.sh +++ b/egs/librispeech/v1/run_030_inference.sh @@ -38,10 +38,10 @@ test_data=test_clean # Extracts x-vectors for evaluation -for name in dev_clean dev_other test_clean test_other #$test_data +for name in dev_clean dev_other test_clean test_other do nj=40 - steps_transducer/decode_wav2vec2transducer.sh 
\ + steps_transducer/decode_wav2vec2rnn_transducer.sh \ --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ $transducer_dir/$name $bpe_model diff --git a/egs/librispeech/v1/run_030_inference2.sh b/egs/librispeech/v1/run_030_inference_old.sh similarity index 88% rename from egs/librispeech/v1/run_030_inference2.sh rename to egs/librispeech/v1/run_030_inference_old.sh index 7ed9567a..02b97001 100755 --- a/egs/librispeech/v1/run_030_inference2.sh +++ b/egs/librispeech/v1/run_030_inference_old.sh @@ -38,10 +38,10 @@ test_data=test_clean # Extracts x-vectors for evaluation -for name in dev_clean dev_other test_clean test_other +for name in dev_clean dev_other test_clean test_other #$test_data do nj=40 - steps_transducer/decode_wav2vec2rnn_transducer.sh \ + steps_transducer/decode_wav2vec2transducer.sh \ --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \ $nnet data/$name \ $transducer_dir/$name $bpe_model diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index a2456dc9..53a01d6d 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,9 +10,6 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import RandomAccessDataReaderFactory as RDRF @@ -21,6 +18,8 @@ from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.utils import Utt2Info from hyperion.utils.kaldi_matrix import compression_methods +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def process_feats( diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index 04acb76c..ac8852a4 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -3,22 +3,17 @@ Copyright 2022 Jesus Villalba (Johns Hopkins University) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +import logging import os -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) +import sys import time -import logging import numpy as np - from hyperion.hyp_defs import config_logger -from hyperion.utils import SegmentSet from hyperion.io import SequentialAudioReader as AR +from hyperion.utils import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def audio_to_duration(audio_file, output_file, **kwargs): diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 15d74f3a..e4d47ef0 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,13 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def compute_vad(input_path, output_path, write_num_frames, **kwargs): diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index a83f95d1..c8193e5c 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,15 +9,14 @@ import time 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import compression_methods
 from hyperion.np.feats import MFCC
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def compute_mfcc_feats(
diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py
index 0385cc55..4549caec 100755
--- a/hyperion/bin/copy_feats.py
+++ b/hyperion/bin/copy_feats.py
@@ -12,7 +12,6 @@
 import time
 
 import numpy as np
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import CopyFeats as CF
diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py
index bbcd0dc7..420f8a9f 100755
--- a/hyperion/bin/decode_wav2transducer.py
+++ b/hyperion/bin/decode_wav2transducer.py
@@ -4,38 +4,29 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-from typing import Dict, List, Tuple
-
-import sentencepiece as spm
-import torch.nn as nn
-
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
+from typing import Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
-
+import sentencepiece as spm
 import torch
-
+import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.np.augment import SpeechAugment
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch import TorchModelLoader as TML
-
-from hyperion.torch.models.wav2transducer.beam_search import greedy_search, beam_search
+from hyperion.torch.models.wav2transducer.beam_search import (beam_search,
+                                                              greedy_search)
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py
index cc612628..4fdc3140 100755
--- a/hyperion/bin/decode_wav2vec2rnn_transducer.py
+++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py
@@ -4,39 +4,30 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-from typing import Dict, List, Tuple
-
-import sentencepiece as spm
-import torch.nn as nn
-
-import sys
+import logging
 import os
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
+import sys
 import time
-import logging
+from typing import Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
-
+import sentencepiece as spm
 import torch
-
+import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
-from hyperion.utils import Utt2Info
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.np.augment import SpeechAugment
-
-from hyperion.torch.utils import open_device
-from hyperion.torch.narchs import AudioFeatsMVN as AF
-from hyperion.torch.models import HFWav2Vec2RNNTransducer
 from hyperion.torch import TorchModelLoader as TML
-
-from hyperion.torch.models.wav2transducer.beam_search import greedy_search, beam_search
+from hyperion.torch.models import HFWav2Vec2RNNTransducer
+from hyperion.torch.models.wav2transducer.beam_search import (beam_search,
+                                                              greedy_search)
+from hyperion.torch.narchs import AudioFeatsMVN as AF
+from hyperion.torch.utils import open_device
+from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py
index 437127b2..27d36d6f 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py
@@ -10,9 +10,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -29,6 +26,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py
index aaa91214..1c00ed2a 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py
@@ -12,9 +12,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -32,6 +29,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 torch.backends.cudnn.enabled = False
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py
index 8d4add76..f9b77f11 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py
@@ -11,12 +11,9 @@
 import numpy as np
 import pandas as pd
-from art.classifiers import PyTorchClassifier
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
+from art.classifiers import PyTorchClassifier
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -32,6 +29,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
index 0e9493c0..9f6801ef 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -27,6 +24,8 @@
 from hyperion.torch.utils.misc import l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
index e0754498..6fdca983 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
@@ -10,9 +10,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -29,6 +26,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
index 0f9f375d..7ef4815c 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
@@ -11,12 +11,9 @@
 import numpy as np
 import pandas as pd
-from art.classifiers import PyTorchClassifier
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
+from art.classifiers import PyTorchClassifier
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -32,6 +29,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py
index 2a92a83e..70b838e6 100755
--- a/hyperion/bin/extract_wav2vec2xvectors.py
+++ b/hyperion/bin/extract_wav2vec2xvectors.py
@@ -11,11 +11,8 @@
 import numpy as np
 import pandas as pd
-import torchaudio.transforms as tat
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
+import torchaudio.transforms as tat
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
@@ -24,6 +21,8 @@
 from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 resamplers = {}
diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py
index 926e0bcc..13ad4277 100755
--- a/hyperion/bin/extract_xvectors_from_feats.py
+++ b/hyperion/bin/extract_xvectors_from_feats.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -22,6 +19,8 @@
 from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py
index addabbcf..4f48bbdc 100755
--- a/hyperion/bin/extract_xvectors_from_wav.py
+++ b/hyperion/bin/extract_xvectors_from_wav.py
@@ -11,9 +11,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -24,6 +21,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py
index e3d2fcbb..fb6583e2 100755
--- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py
+++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py
@@ -10,11 +10,8 @@
 import time
 
 import numpy as np
-import yaml
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
+import yaml
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialDataReaderFactory as DRF
@@ -23,6 +20,8 @@
 from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py
index 2b1bba3b..9f1728eb 100755
--- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py
+++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py
@@ -11,11 +11,8 @@
 import numpy as np
 import pandas as pd
-import yaml
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
+import yaml
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
@@ -25,6 +22,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py
index b940c024..6f17f800 100755
--- a/hyperion/bin/finetune_wav2vec2transducer.py
+++ b/hyperion/bin/finetune_wav2vec2transducer.py
@@ -3,37 +3,29 @@
 Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import k2
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 
+import k2
 import numpy as np
-
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import TransducerTrainer as Trainer
-from hyperion.torch.data import AudioDataset as AD
-
 from hyperion.torch import TorchModelLoader as TML
+from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import SegSamplerFactory
 from hyperion.torch.metrics import CategoricalAccuracy
 from hyperion.torch.models import HFWav2Vec2Transducer
+from hyperion.torch.trainers import TransducerTrainer as Trainer
+from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 from torch.nn.utils.rnn import pad_sequence
 
-
 model_dict = {
     "hf_wav2vec2transducer": HFWav2Vec2Transducer,
 }
diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py
index b3edd9b5..d9d9c281 100755
--- a/hyperion/bin/finetune_wav2vec2xvector.py
+++ b/hyperion/bin/finetune_wav2vec2xvector.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -26,6 +23,8 @@
                                    HFWavLM2ResNet1dXVector)
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 model_dict = {
     "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector,
diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py
index 2ac01025..17cafb85 100755
--- a/hyperion/bin/finetune_xvector_dfr_from_feats.py
+++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py
@@ -12,9 +12,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -25,6 +22,8 @@
 from hyperion.torch.models import XVector as XVec
 from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py
index ff97d3ca..f7832a47 100755
--- a/hyperion/bin/finetune_xvector_dfr_from_wav.py
+++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -24,6 +21,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py
index 7a1fb5a9..ac9c2d0b 100755
--- a/hyperion/bin/finetune_xvector_from_feats.py
+++ b/hyperion/bin/finetune_xvector_from_feats.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch import TorchModelLoader as TML
@@ -23,6 +20,8 @@
 from hyperion.torch.models import XVector as XVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py
index d7b1f17d..2e120815 100755
--- a/hyperion/bin/finetune_xvector_from_wav.py
+++ b/hyperion/bin/finetune_xvector_from_wav.py
@@ -10,13 +10,6 @@
 import time
 from pathlib import Path
 
-from jsonargparse import (
-    ActionConfigFile,
-    ActionParser,
-    ArgumentParser,
-    namespace_to_dict,
-)
-
 import torch
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch import TorchModelLoader as TML
@@ -32,6 +25,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 xvec_dict = {
     "resnet": RXVec,
diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py
index 972ff01f..4a356037 100755
--- a/hyperion/bin/make_babble_noise_audio_files.py
+++ b/hyperion/bin/make_babble_noise_audio_files.py
@@ -10,15 +10,14 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-from scipy import ndimage, signal
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import AudioWriter as Writer
 from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+from scipy import ndimage, signal
 
 
 def make_noise(xs):
diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py
index dccf58da..c5ddd25c 100755
--- a/hyperion/bin/pack_wav_rirs.py
+++ b/hyperion/bin/pack_wav_rirs.py
@@ -10,12 +10,11 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def pack_wav_rirs(input_path, output_spec, **kwargs):
diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py
index e011dfe8..e2157e3e 100755
--- a/hyperion/bin/plot_embedding_tsne.py
+++ b/hyperion/bin/plot_embedding_tsne.py
@@ -13,13 +13,12 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo,
-                          ArgumentParser, namespace_to_dict)
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.np.transforms import PCA, LNorm, SklTSNE
 from hyperion.utils import SegmentSet
+from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo,
+                          ArgumentParser, namespace_to_dict)
 
 matplotlib.use("Agg")
 colors = ["b", "g", "r", "c", "m", "y", "k"]
diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py
index 6f35f074..6af0202c 100755
--- a/hyperion/bin/plot_embedding_tsne_per_class.py
+++ b/hyperion/bin/plot_embedding_tsne_per_class.py
@@ -13,15 +13,14 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo,
-                          ArgumentParser, namespace_to_dict)
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.np.clustering import AHC
 from hyperion.np.transforms import PCA, LNorm, SklTSNE
 from hyperion.utils import SegmentSet
 from hyperion.utils.math import cosine_scoring
+from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo,
+                          ArgumentParser, namespace_to_dict)
 
 matplotlib.use("Agg")
 colors = ["b", "g", "r", "c", "m", "y", "k"]
diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py
index 2f4e5cbc..e8adfd16 100755
--- a/hyperion/bin/preprocess_audio_files.py
+++ b/hyperion/bin/preprocess_audio_files.py
@@ -10,15 +10,14 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-from scipy import ndimage, signal
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import AudioWriter as Writer
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+from scipy import ndimage, signal
 
 
 def process_vad(vad, length, fs, dilation, erosion):
diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py
index 026c9330..8930b299 100755
--- a/hyperion/bin/train_wav2rnn_transducer.py
+++ b/hyperion/bin/train_wav2rnn_transducer.py
@@ -3,31 +3,25 @@
 Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import k2
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 
+import k2
 import numpy as np
-
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import TransducerTrainer as Trainer
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import SegSamplerFactory
 from hyperion.torch.models import Wav2RNNRNNTransducer
+from hyperion.torch.trainers import TransducerTrainer as Trainer
+from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 from torch.nn.utils.rnn import pad_sequence
 
 model_dict = {
diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py
index a2d75ba9..67f5c6ba 100755
--- a/hyperion/bin/train_wav2vec2rnn_transducer.py
+++ b/hyperion/bin/train_wav2vec2rnn_transducer.py
@@ -3,32 +3,26 @@
 Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import k2
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 
+import k2
 import numpy as np
-
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import TransducerTrainer as Trainer
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import SegSamplerFactory
-from hyperion.torch.models import HFWav2Vec2RNNTransducer
-from hyperion.torch.models import HFWav2Vec2RNNRNNTransducer
+from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer,
+                                   HFWav2Vec2RNNTransducer)
+from hyperion.torch.trainers import TransducerTrainer as Trainer
+from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 from torch.nn.utils.rnn import pad_sequence
 
 model_dict = {
diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py
index 8b945217..55f3b996 100755
--- a/hyperion/bin/train_wav2vec2transducer.py
+++ b/hyperion/bin/train_wav2vec2transducer.py
@@ -3,32 +3,26 @@
 Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu, Jesus Villalba)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import sys
-import os
-from pathlib import Path
-from jsonargparse import (
-    ArgumentParser,
-    ActionConfigFile,
-    ActionParser,
-    namespace_to_dict,
-)
-import k2
-import time
 import logging
 import multiprocessing
+import os
+import sys
+import time
+from pathlib import Path
 
+import k2
 import numpy as np
-
 import torch
 import torch.nn as nn
-
 from hyperion.hyp_defs import config_logger, set_float_cpu
-from hyperion.torch.utils import ddp
-from hyperion.torch.trainers import TransducerTrainer as Trainer
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import SegSamplerFactory
 from hyperion.torch.metrics import CategoricalAccuracy
 from hyperion.torch.models import HFWav2Vec2Transducer
+from hyperion.torch.trainers import TransducerTrainer as Trainer
+from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 from torch.nn.utils.rnn import pad_sequence
 
 model_dict = {
diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py
index 5e7ecafa..8e1653b1 100755
--- a/hyperion/bin/train_wav2vec2xvector.py
+++ b/hyperion/bin/train_wav2vec2xvector.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -25,6 +22,8 @@
                                    HFWavLM2ResNet1dXVector)
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 model_dict = {
     "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector,
diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py
index 7f4ab0fa..71bba080 100755
--- a/hyperion/bin/train_xvector_from_feats.py
+++ b/hyperion/bin/train_xvector_from_feats.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -28,6 +25,8 @@
 from hyperion.torch.models import TransformerXVectorV1 as TFXVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 xvec_dict = {
     "resnet": RXVec,
diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py
index 57a33b56..6fef9bbe 100755
--- a/hyperion/bin/train_xvector_from_wav.py
+++ b/hyperion/bin/train_xvector_from_wav.py
@@ -10,9 +10,6 @@
 import time
 from pathlib import Path
 
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, set_float_cpu
 # from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
@@ -28,6 +25,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 xvec_dict = {
     "resnet": RXVec,
diff --git a/hyperion/bin_deprec/ark2hyp.py b/hyperion/bin_deprec/ark2hyp.py
index abcb4457..a25c561b 100755
--- a/hyperion/bin_deprec/ark2hyp.py
+++ b/hyperion/bin_deprec/ark2hyp.py
@@ -13,7 +13,6 @@
 import time
 
 import numpy as np
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import HypDataWriter, KaldiDataReader
diff --git a/hyperion/bin_deprec/arkvad2nist.py b/hyperion/bin_deprec/arkvad2nist.py
index 559371be..15a04f67 100755
--- a/hyperion/bin_deprec/arkvad2nist.py
+++ b/hyperion/bin_deprec/arkvad2nist.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.io import KaldiDataReader
diff --git a/hyperion/bin_deprec/compute-gmm-post.py b/hyperion/bin_deprec/compute-gmm-post.py
index 58675336..45d17623 100755
--- a/hyperion/bin_deprec/compute-gmm-post.py
+++ b/hyperion/bin_deprec/compute-gmm-post.py
@@ -14,13 +14,12 @@
 import time
 
 import numpy as np
-from keras import backend as K
-
 from hyperion.helpers import SequenceReader as SR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter
 from hyperion.pdfs import DiagGMM
 from hyperion.transforms import TransformList
+from keras import backend as K
 
 
 def to_sparse(r, num_comp):
diff --git a/hyperion/bin_deprec/eval-2class-performance.py b/hyperion/bin_deprec/eval-2class-performance.py
index eff16830..d149deb2 100755
--- a/hyperion/bin_deprec/eval-2class-performance.py
+++ b/hyperion/bin_deprec/eval-2class-performance.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.hyp_defs import config_logger
 from hyperion.metrics import compute_eer
 from hyperion.utils.trial_key import TrialKey
diff --git a/hyperion/bin_deprec/eval-elbo-ubm.py b/hyperion/bin_deprec/eval-elbo-ubm.py
index bf4839db..5f2eab28 100755
--- a/hyperion/bin_deprec/eval-elbo-ubm.py
+++ b/hyperion/bin_deprec/eval-elbo-ubm.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.helpers import SequenceReader as SR
 from hyperion.hyp_defs import config_logger, float_cpu
 from hyperion.pdfs import DiagGMM
diff --git a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py
index 4548e49b..7817b570 100755
--- a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py
+++ b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.classifiers import QScoringHomoGBE as GBE
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger
diff --git a/hyperion/bin_deprec/eval-score-norm.py b/hyperion/bin_deprec/eval-score-norm.py
index 4b620518..4f66a8e4 100755
--- a/hyperion/bin_deprec/eval-score-norm.py
+++ b/hyperion/bin_deprec/eval-score-norm.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.hyp_defs import config_logger
 from hyperion.score_norm import *
 from hyperion.utils.trial_ndx import TrialNdx
diff --git a/hyperion/bin_deprec/h5vad2nist.py b/hyperion/bin_deprec/h5vad2nist.py
index fb45c22b..21d61d3a 100755
--- a/hyperion/bin_deprec/h5vad2nist.py
+++ b/hyperion/bin_deprec/h5vad2nist.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import HypDataReader
diff --git a/hyperion/bin_deprec/init-ubm.py b/hyperion/bin_deprec/init-ubm.py
index 204ca855..77aed464 100755
--- a/hyperion/bin_deprec/init-ubm.py
+++ b/hyperion/bin_deprec/init-ubm.py
@@ -15,12 +15,11 @@
 import time
 
 import numpy as np
-from keras import backend as K
-
 from hyperion.helpers import SequenceReader as SR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.pdfs import DiagGMM
 from hyperion.utils.multithreading import threadsafe_generator
+from keras import backend as K
 
 
 @threadsafe_generator
diff --git a/hyperion/bin_deprec/scores2lre_format.py b/hyperion/bin_deprec/scores2lre_format.py
index 717c1535..fcba8804 100755
--- a/hyperion/bin_deprec/scores2lre_format.py
+++ b/hyperion/bin_deprec/scores2lre_format.py
@@ -12,7 +12,6 @@
 import time
 
 import numpy as np
-
 from hyperion.hyp_defs import config_logger
 from hyperion.utils.trial_ndx import TrialNdx
 from hyperion.utils.trial_scores import TrialScores
diff --git a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py
index 608a5271..5c1b19fc 100755
--- a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py
index a4cc54e6..326175ab 100755
--- a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-dc1d-dvae.py b/hyperion/bin_deprec/torch-train-dc1d-dvae.py
index 1b88beba..7a4f9634 100755
--- a/hyperion/bin_deprec/torch-train-dc1d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-dc1d-dvae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-dc1d-vae.py b/hyperion/bin_deprec/torch-train-dc1d-vae.py
index dd5d2e72..1de4560a 100755
--- a/hyperion/bin_deprec/torch-train-dc1d-vae.py
+++ b/hyperion/bin_deprec/torch-train-dc1d-vae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-dc2d-dvae.py b/hyperion/bin_deprec/torch-train-dc2d-dvae.py
index 3f7cb17d..5bbc53bf 100755
--- a/hyperion/bin_deprec/torch-train-dc2d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-dc2d-dvae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-dc2d-vae.py b/hyperion/bin_deprec/torch-train-dc2d-vae.py
index 5b97f55c..b073c4c0 100755
--- a/hyperion/bin_deprec/torch-train-dc2d-vae.py
+++ b/hyperion/bin_deprec/torch-train-dc2d-vae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-dvae.py b/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
index ca6f6996..c10c6fe7 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vae.py b/hyperion/bin_deprec/torch-train-resnet1d-vae.py
index a6218567..cf460a0a 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py b/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
index 89448754..a1b13d95 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py b/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
index 4a84bbff..a773d9aa 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-dvae.py b/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
index 3f6cd6ba..a3857701 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vae.py b/hyperion/bin_deprec/torch-train-resnet2d-vae.py
index 4e853230..695472cb 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vae.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -24,6 +21,8 @@
 from hyperion.torch.narchs import ResNet2dEncoder as Encoder
 from hyperion.torch.trainers import VAETrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py b/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
index 5e0add50..fdcc0c47 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py b/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
index 6398d959..17d4c474 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
index 0137e101..ff8ef4dc 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
@@ -11,7 +11,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
index 71021825..92dad725 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
index a6908c4f..18888706 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
index b3b07682..566ea106 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-xvector.py b/hyperion/bin_deprec/torch-train-xvector.py
index 4c69eb25..88147d37 100755
--- a/hyperion/bin_deprec/torch-train-xvector.py
+++ b/hyperion/bin_deprec/torch-train-xvector.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
diff --git a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
index 8a348728..48967f58 100755
--- a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
+++ b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.classifiers import QScoringHomoGBE as GBE
 from hyperion.helpers import VectorClassReader as VCR
 from hyperion.hyp_defs import config_logger
diff --git a/hyperion/bin_deprec/vectors2scores.py b/hyperion/bin_deprec/vectors2scores.py
index ab4be8ac..2ff635c2 100755
--- a/hyperion/bin_deprec/vectors2scores.py
+++ b/hyperion/bin_deprec/vectors2scores.py
@@ -11,7 +11,6 @@
 import time
 
 import numpy as np
-
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.utils.trial_scores import TrialScores
diff --git a/hyperion/bin_deprec2/apply-mvn-select-frames.py b/hyperion/bin_deprec2/apply-mvn-select-frames.py
index a2456dc9..53a01d6d 100755
--- a/hyperion/bin_deprec2/apply-mvn-select-frames.py
+++ b/hyperion/bin_deprec2/apply-mvn-select-frames.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import RandomAccessDataReaderFactory as RDRF
@@ -21,6 +18,8 @@
 from hyperion.np.feats import MeanVarianceNorm as MVN
 from hyperion.utils import Utt2Info
 from hyperion.utils.kaldi_matrix import compression_methods
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def process_feats(
diff --git a/hyperion/bin_deprec2/compute-mfcc-feats.py b/hyperion/bin_deprec2/compute-mfcc-feats.py
index a83f95d1..c8193e5c 100755
--- a/hyperion/bin_deprec2/compute-mfcc-feats.py
+++ b/hyperion/bin_deprec2/compute-mfcc-feats.py
@@ -9,15 +9,14 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import compression_methods
 from hyperion.np.feats import MFCC
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def compute_mfcc_feats(
diff --git a/hyperion/bin_deprec2/copy-feats.py b/hyperion/bin_deprec2/copy-feats.py
index 0385cc55..4549caec 100755
--- a/hyperion/bin_deprec2/copy-feats.py
+++ b/hyperion/bin_deprec2/copy-feats.py
@@ -12,7 +12,6 @@
 import time
 
 import numpy as np
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import CopyFeats as CF
diff --git a/hyperion/bin_deprec2/eval-cos-1vs1.py b/hyperion/bin_deprec2/eval-cos-1vs1.py
index de508333..f60fdd4b 100755
--- a/hyperion/bin_deprec2/eval-cos-1vs1.py
+++ b/hyperion/bin_deprec2/eval-cos-1vs1.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.helpers import TrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.np.transforms import LNorm, TransformList
diff --git a/hyperion/bin_deprec2/eval-linear-gbe-up.py b/hyperion/bin_deprec2/eval-linear-gbe-up.py
index d82bf967..ba646498 100755
--- a/hyperion/bin_deprec2/eval-linear-gbe-up.py
+++ b/hyperion/bin_deprec2/eval-linear-gbe-up.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter as HDW
diff --git a/hyperion/bin_deprec2/eval-linear-gbe.py b/hyperion/bin_deprec2/eval-linear-gbe.py
index cf788392..9828944d 100755
--- a/hyperion/bin_deprec2/eval-linear-gbe.py
+++ b/hyperion/bin_deprec2/eval-linear-gbe.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter as HDW
diff --git a/hyperion/bin_deprec2/eval-linear-svmc.py b/hyperion/bin_deprec2/eval-linear-svmc.py
index ba4c5e81..3b8b644b 100755
--- a/hyperion/bin_deprec2/eval-linear-svmc.py
+++ b/hyperion/bin_deprec2/eval-linear-svmc.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter as HDW
diff --git a/hyperion/bin_deprec2/eval-logistic-regression.py b/hyperion/bin_deprec2/eval-logistic-regression.py
index 992ca7b8..56507a9a 100755
--- a/hyperion/bin_deprec2/eval-logistic-regression.py
+++ b/hyperion/bin_deprec2/eval-logistic-regression.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter as HDW
diff --git a/hyperion/bin_deprec2/eval-plda-1vs1.py b/hyperion/bin_deprec2/eval-plda-1vs1.py
index 5a810cf7..1a966f57 100755
--- a/hyperion/bin_deprec2/eval-plda-1vs1.py
+++ b/hyperion/bin_deprec2/eval-plda-1vs1.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.helpers import PLDAFactory as F
 from hyperion.helpers import TrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
diff --git a/hyperion/bin_deprec2/eval-plda-nvs1.py b/hyperion/bin_deprec2/eval-plda-nvs1.py
index 5c5d200c..5ead954a 100755
--- a/hyperion/bin_deprec2/eval-plda-nvs1.py
+++ b/hyperion/bin_deprec2/eval-plda-nvs1.py
@@ -14,7 +14,6 @@
 import time
 
 import numpy as np
-
 from hyperion.helpers import PLDAFactory as F
 from hyperion.helpers import TrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
diff --git a/hyperion/bin_deprec2/merge-h5-files.py b/hyperion/bin_deprec2/merge-h5-files.py
index aeda3bab..51207343 100755
--- a/hyperion/bin_deprec2/merge-h5-files.py
+++ b/hyperion/bin_deprec2/merge-h5-files.py
@@ -12,7 +12,6 @@
 import time
 
 import numpy as np
-
 from hyperion.io import H5Merger
diff --git a/hyperion/bin_deprec2/pack-audio-files.py b/hyperion/bin_deprec2/pack-audio-files.py
index 5d544df4..a843825a 100755
--- a/hyperion/bin_deprec2/pack-audio-files.py
+++ b/hyperion/bin_deprec2/pack-audio-files.py
@@ -11,13 +11,12 @@
 import time
 
 import numpy as np
-from scipy import ndimage, signal
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import PackedAudioWriter as Writer
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.io import WSpecifier as WS
+from scipy import ndimage, signal
 
 
 def process_vad(vad, length, fs, dilation, erosion):
diff --git a/hyperion/bin_deprec2/plot-vector-hist.py b/hyperion/bin_deprec2/plot-vector-hist.py
index 75236726..a4d842c0 100755
--- a/hyperion/bin_deprec2/plot-vector-hist.py
+++ b/hyperion/bin_deprec2/plot-vector-hist.py
@@ -15,7 +15,6 @@
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
-
 from hyperion.helpers import VectorReader as VR
 from hyperion.hyp_defs import config_logger
 from hyperion.np.transforms import TransformList
diff --git a/hyperion/bin_deprec2/rttm-to-bin-vad.py b/hyperion/bin_deprec2/rttm-to-bin-vad.py
index 19e98d8f..610a0019 100755
--- a/hyperion/bin_deprec2/rttm-to-bin-vad.py
+++ b/hyperion/bin_deprec2/rttm-to-bin-vad.py
@@ -11,7 +11,6 @@
 import numpy as np
 import pandas as pd
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.utils import RTTM, SegmentList
diff --git a/hyperion/bin_deprec2/segments-to-bin-vad.py b/hyperion/bin_deprec2/segments-to-bin-vad.py
index 24021a4b..56e6bf9f 100755
--- a/hyperion/bin_deprec2/segments-to-bin-vad.py
+++ b/hyperion/bin_deprec2/segments-to-bin-vad.py
@@ -10,12 +10,11 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.utils import SegmentList
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def segments_to_bin_vad(
diff --git a/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
index ad33515c..9dde434d 100755
--- a/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -26,6 +23,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-adv-finetune-xvec.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
index 850233e2..88d21cdb 100755
--- a/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
+++ b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
@@ -12,9 +12,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch import TorchModelLoader as TML
@@ -25,6 +22,8 @@
 from hyperion.torch.models import XVector as XVec
 from hyperion.torch.trainers import XVectorAdvTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec2/torch-compute-mfcc-feats.py b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
index 07f71bfb..17565a3c 100755
--- a/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
+++ b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
@@ -8,9 +8,6 @@
 import sys
 import time
 
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
@@ -18,6 +15,8 @@
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import compression_methods
 from hyperion.torch.layers import AudioFeatsFactory as AFF
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def compute_mfcc_feats(
diff --git a/hyperion/bin_deprec2/torch-eval-vae.py b/hyperion/bin_deprec2/torch-eval-vae.py
index d676b0f1..bf99dddd 100755
--- a/hyperion/bin_deprec2/torch-eval-vae.py
+++ b/hyperion/bin_deprec2/torch-eval-vae.py
@@ -16,7 +16,6 @@
 matplotlib.use("Agg")
 # matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
 import matplotlib.pyplot as plt
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
index aaa91214..1c00ed2a 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
@@ -12,9 +12,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -32,6 +29,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 torch.backends.cudnn.enabled = False
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
index 437127b2..27d36d6f 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
@@ -10,9 +10,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -29,6 +26,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
index 8d4add76..f9b77f11 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
@@ -11,12 +11,9 @@
 import numpy as np
 import pandas as pd
-from art.classifiers import PyTorchClassifier
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
+from art.classifiers import PyTorchClassifier
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -32,6 +29,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
index 0e9493c0..9f6801ef 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -27,6 +24,8 @@
 from hyperion.torch.utils.misc import l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
index e0754498..6fdca983 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
@@ -10,9 +10,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -29,6 +26,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
index 0f9f375d..7ef4815c 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
@@ -11,12 +11,9 @@
 import numpy as np
 import pandas as pd
-from art.classifiers import PyTorchClassifier
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
+from art.classifiers import PyTorchClassifier
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -32,6 +29,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py
index da6389fb..b95b2a7c 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py
@@ -11,9 +11,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -24,6 +21,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
index fc494448..a9785a61 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
@@ -11,9 +11,6 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -24,6 +21,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import RTTM, Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
index c85fe4c9..7453e0ba 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
@@ -11,11 +11,8 @@
 import numpy as np
 import pandas as pd
-import yaml
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
+import yaml
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
@@ -25,6 +22,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
index 6da57e16..3153b312 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -22,6 +19,8 @@
 from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
index 6edf60ed..347c80f8 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -22,6 +19,8 @@
 from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors.py b/hyperion/bin_deprec2/torch-extract-xvectors.py
index 76d941e0..83d21692 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -22,6 +19,8 @@
 from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py
index 88b0b1d9..077da270 100755
--- a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py
+++ b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-classif.py
@@ -11,12 +11,9 @@
 import numpy as np
 import pandas as pd
-import yaml
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
+import yaml
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -27,6 +24,8 @@
 from hyperion.torch.utils import open_device
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialNdx, Utt2Info
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def read_utt_list(list_file, class2int_file, part_idx, num_parts):
diff --git a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py
index a4df5091..54d217e2 100755
--- a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py
+++ b/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py
@@ -11,12 +11,9 @@
 import numpy as np
 import pandas as pd
-import yaml
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
+import yaml
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -31,6 +28,8 @@
 from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-train-dc1d-ae.py b/hyperion/bin_deprec2/torch-train-dc1d-ae.py
index 50ac7d42..cdba46b3 100755
--- a/hyperion/bin_deprec2/torch-train-dc1d-ae.py
+++ b/hyperion/bin_deprec2/torch-train-dc1d-ae.py
@@ -10,7 +10,6 @@
 import time
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec2/torch-train-dvae.py b/hyperion/bin_deprec2/torch-train-dvae.py
index 808bfbba..6c21bbcf 100755
--- a/hyperion/bin_deprec2/torch-train-dvae.py
+++ b/hyperion/bin_deprec2/torch-train-dvae.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -27,6 +24,8 @@
                                    TransformerEncoderV1)
 from hyperion.torch.trainers import DVAETrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 enc_dict = {
     "dc1d": DC1dEncoder,
diff --git a/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
index f256f735..9db86225 100755
--- a/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -23,6 +20,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
index 622ac62e..124e9cb3 100755
--- a/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
+++ b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
@@ -12,9 +12,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -24,6 +21,8 @@
 from hyperion.torch.models import EfficientNetXVector as XVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
index 3d135b18..6b9455df 100755
--- a/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -28,6 +25,8 @@
 # from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 # import torch.multiprocessing as mp
diff --git a/hyperion/bin_deprec2/torch-train-resnet-xvec.py b/hyperion/bin_deprec2/torch-train-resnet-xvec.py
index f976cc6e..f035032a 100755
--- a/hyperion/bin_deprec2/torch-train-resnet-xvec.py
+++ b/hyperion/bin_deprec2/torch-train-resnet-xvec.py
@@ -12,9 +12,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -24,6 +21,8 @@
 from hyperion.torch.models import ResNetXVector as XVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
index 3ee6bf18..616e2cd3 100755
--- a/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
@@ -11,9 +11,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -24,6 +21,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
index 0857ce5c..f579a807 100755
--- a/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
@@ -12,9 +12,6 @@
 from pathlib import Path
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -25,6 +22,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
index 7bbbff03..486b1d92 100755
--- a/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
@@ -10,9 +10,6 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -23,6 +20,8 @@
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from
hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data( diff --git a/hyperion/bin_deprec2/torch-train-tdnn-xvec.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py index 5614f1b9..be429344 100755 --- a/hyperion/bin_deprec2/torch-train-tdnn-xvec.py +++ b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py @@ -12,9 +12,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,6 +21,8 @@ from hyperion.torch.models import TDNNXVector as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py index 6b361583..3e91da90 100755 --- a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py +++ b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py @@ -10,9 +10,6 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -23,6 +20,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data( diff --git a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py index 62164f15..d08a58a3 100755 --- a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py +++ b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py @@ -12,9 +12,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,6 +21,8 @@ from hyperion.torch.models import TransformerXVectorV1 as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin_deprec2/torch-train-vae.py b/hyperion/bin_deprec2/torch-train-vae.py index 4c41d49c..6f545795 100755 --- a/hyperion/bin_deprec2/torch-train-vae.py +++ b/hyperion/bin_deprec2/torch-train-vae.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -27,6 +24,8 @@ TransformerEncoderV1) from hyperion.torch.trainers import VAETrainer as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, 
ArgumentParser, + namespace_to_dict) enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/torch-train-vq-dvae.py b/hyperion/bin_deprec2/torch-train-vq-dvae.py index 5de1bbd4..449c3b49 100755 --- a/hyperion/bin_deprec2/torch-train-vq-dvae.py +++ b/hyperion/bin_deprec2/torch-train-vq-dvae.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -27,6 +24,8 @@ TransformerEncoderV1) from hyperion.torch.trainers import VQDVAETrainer as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/torch-train-vq-vae.py b/hyperion/bin_deprec2/torch-train-vq-vae.py index 2a95f853..17dea6aa 100755 --- a/hyperion/bin_deprec2/torch-train-vq-vae.py +++ b/hyperion/bin_deprec2/torch-train-vq-vae.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -27,6 +24,8 @@ TransformerEncoderV1) from hyperion.torch.trainers import VQVAETrainer as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/train-cw-up.py b/hyperion/bin_deprec2/train-cw-up.py index c1c372ad..bab22ce7 100755 --- a/hyperion/bin_deprec2/train-cw-up.py +++ b/hyperion/bin_deprec2/train-cw-up.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal diff --git a/hyperion/bin_deprec2/train-cw.py b/hyperion/bin_deprec2/train-cw.py index cabca7c2..e8c693c1 100755 --- a/hyperion/bin_deprec2/train-cw.py +++ b/hyperion/bin_deprec2/train-cw.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal diff --git a/hyperion/bin_deprec2/train-gaussianizer.py b/hyperion/bin_deprec2/train-gaussianizer.py index aeb51e46..4718d3df 100755 --- a/hyperion/bin_deprec2/train-gaussianizer.py +++ b/hyperion/bin_deprec2/train-gaussianizer.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal diff --git a/hyperion/bin_deprec2/train-lda.py b/hyperion/bin_deprec2/train-lda.py index 1887a72f..9dfe394f 100755 --- a/hyperion/bin_deprec2/train-lda.py +++ b/hyperion/bin_deprec2/train-lda.py @@ -13,7 +13,6 @@ import time import numpy as np - from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.transforms import LDA, SbSw, TransformList diff --git a/hyperion/bin_deprec2/train-linear-gbe-up.py b/hyperion/bin_deprec2/train-linear-gbe-up.py index 9986b6bc..9435d0ad 100755 --- a/hyperion/bin_deprec2/train-linear-gbe-up.py +++ b/hyperion/bin_deprec2/train-linear-gbe-up.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs 
import config_logger from hyperion.np.classifiers import LinearGBEUP as GBE diff --git a/hyperion/bin_deprec2/train-linear-gbe.py b/hyperion/bin_deprec2/train-linear-gbe.py index e9455cb8..75fe0b67 100755 --- a/hyperion/bin_deprec2/train-linear-gbe.py +++ b/hyperion/bin_deprec2/train-linear-gbe.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LinearGBE as GBE diff --git a/hyperion/bin_deprec2/train-linear-svmc.py b/hyperion/bin_deprec2/train-linear-svmc.py index 90ff8768..f48a573e 100755 --- a/hyperion/bin_deprec2/train-linear-svmc.py +++ b/hyperion/bin_deprec2/train-linear-svmc.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LinearSVMC as SVM diff --git a/hyperion/bin_deprec2/train-logistic-regression.py b/hyperion/bin_deprec2/train-logistic-regression.py index 1aa128a3..f7036879 100755 --- a/hyperion/bin_deprec2/train-logistic-regression.py +++ b/hyperion/bin_deprec2/train-logistic-regression.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LogisticRegression as LR diff --git a/hyperion/bin_deprec2/train-mvn.py b/hyperion/bin_deprec2/train-mvn.py index 2d10b116..ff03175b 100755 --- a/hyperion/bin_deprec2/train-mvn.py +++ b/hyperion/bin_deprec2/train-mvn.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal diff --git a/hyperion/bin_deprec2/train-nda.py b/hyperion/bin_deprec2/train-nda.py index 946a8baa..ec73db2a 100755 --- a/hyperion/bin_deprec2/train-nda.py +++ b/hyperion/bin_deprec2/train-nda.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.transforms import NDA, NSbSw, TransformList diff --git a/hyperion/bin_deprec2/train-pca.py b/hyperion/bin_deprec2/train-pca.py index 25dcb366..9d9ae7a9 100755 --- a/hyperion/bin_deprec2/train-pca.py +++ b/hyperion/bin_deprec2/train-pca.py @@ -13,7 +13,6 @@ import time import numpy as np - from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.transforms import PCA, TransformList diff --git a/hyperion/bin_deprec2/train-plda.py b/hyperion/bin_deprec2/train-plda.py index 520f4cd7..f8d24366 100755 --- a/hyperion/bin_deprec2/train-plda.py +++ b/hyperion/bin_deprec2/train-plda.py @@ -14,7 +14,6 @@ import time import numpy as np - from hyperion.helpers import PLDAFactory as F from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger diff --git a/hyperion/np/score_norm/adapt_s_norm.py b/hyperion/np/score_norm/adapt_s_norm.py index 944fcad5..294893ae 100644 --- a/hyperion/np/score_norm/adapt_s_norm.py +++ b/hyperion/np/score_norm/adapt_s_norm.py @@ -4,6 +4,7 @@ """ import math + import h5py import numpy as np diff --git a/hyperion/torch/adv_attacks/random_attack_factory.py b/hyperion/torch/adv_attacks/random_attack_factory.py index 0c83bc56..2f7cc7f3 100644 --- a/hyperion/torch/adv_attacks/random_attack_factory.py +++ b/hyperion/torch/adv_attacks/random_attack_factory.py @@ -5,9 +5,8 @@ import math -from 
jsonargparse import ActionParser, ArgumentParser - import torch +from jsonargparse import ActionParser, ArgumentParser from .attack_factory import AttackFactory as AF diff --git a/hyperion/torch/adv_defenses/wave_gan_white.py b/hyperion/torch/adv_defenses/wave_gan_white.py index 5d045f08..af51dc00 100644 --- a/hyperion/torch/adv_defenses/wave_gan_white.py +++ b/hyperion/torch/adv_defenses/wave_gan_white.py @@ -8,9 +8,8 @@ import librosa import numpy as np -import yaml - import torch +import yaml try: # import parallel_wavegan.models @@ -21,7 +20,6 @@ pass from sklearn.preprocessing import StandardScaler - from torch import nn diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index 252ac160..959a635a 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -4,11 +4,11 @@ """ from .audio_dataset import AudioDataset +# samplers +from .bucketing_seg_sampler import BucketingSegSampler from .embed_sampler_factory import EmbedSamplerFactory # datasets from .feat_seq_dataset import FeatSeqDataset from .paired_feat_seq_dataset import PairedFeatSeqDataset -# samplers -from .bucketing_seg_sampler import BucketingSegSampler # from .weighted_seq_sampler import ClassWeightedSeqSampler from .seg_sampler_factory import SegSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index b352f94d..7a69c45f 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -9,28 +9,25 @@ import numpy as np import pandas as pd -import torchaudio.transforms as tat -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - +#import k2 +import sentencepiece as spm import torch import torch.distributed as dist +import torchaudio.transforms as tat +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.utils.data import Dataset from ...io import RandomAccessAudioReader as AR from ...np.augment import SpeechAugment - -#import k2 -import sentencepiece as spm -#from torch.nn.utils.rnn import pad_sequence - -from torch.utils.data import Dataset -import torch.distributed as dist - from ...utils.class_info import ClassInfo from ...utils.segment_set import SegmentSet from ...utils.text import read_text from ..torch_defs import floatstr_torch +#from torch.nn.utils.rnn import pad_sequence + + + class AudioDataset(Dataset): diff --git a/hyperion/torch/data/class_weighted_embed_sampler.py b/hyperion/torch/data/class_weighted_embed_sampler.py index edf1c00d..aed9105d 100644 --- a/hyperion/torch/data/class_weighted_embed_sampler.py +++ b/hyperion/torch/data/class_weighted_embed_sampler.py @@ -9,9 +9,8 @@ import numpy as np import pandas as pd -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 7fbfbd71..b551f342 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -9,9 +9,8 @@ import numpy as np import pandas as pd -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/embed_dataset.py b/hyperion/torch/data/embed_dataset.py index 519f498d..31fd00fd 100644 --- 
a/hyperion/torch/data/embed_dataset.py +++ b/hyperion/torch/data/embed_dataset.py @@ -10,10 +10,9 @@ import numpy as np import pandas as pd -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.distributed as dist +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.utils.data import Dataset from ...io import RandomAccessDataReaderFactory as RF diff --git a/hyperion/torch/data/embed_sampler.py b/hyperion/torch/data/embed_sampler.py index 65adcba6..8836fe2a 100644 --- a/hyperion/torch/data/embed_sampler.py +++ b/hyperion/torch/data/embed_sampler.py @@ -7,9 +7,8 @@ import math import numpy as np -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/feat_seq_dataset.py b/hyperion/torch/data/feat_seq_dataset.py index bb487dda..68dea5c3 100644 --- a/hyperion/torch/data/feat_seq_dataset.py +++ b/hyperion/torch/data/feat_seq_dataset.py @@ -12,10 +12,9 @@ import numpy as np import pandas as pd -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.distributed as dist +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.utils.data import Dataset from ...io import RandomAccessDataReaderFactory as RF diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py index d1bcb0a8..c5097723 100644 --- a/hyperion/torch/data/hyp_sampler.py +++ b/hyperion/torch/data/hyp_sampler.py @@ -2,10 +2,9 @@ import math import numpy as np -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.distributed as dist +from jsonargparse import ActionParser, ArgumentParser from torch.utils.data import Sampler diff --git a/hyperion/torch/data/paired_feat_seq_dataset.py b/hyperion/torch/data/paired_feat_seq_dataset.py index eff2ed58..fc17593e 100644 --- a/hyperion/torch/data/paired_feat_seq_dataset.py +++ b/hyperion/torch/data/paired_feat_seq_dataset.py @@ -6,7 +6,6 @@ import logging import numpy as np - import torch from ...utils.utt2info import Utt2Info diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py index 2933dcc6..76054cd8 100644 --- a/hyperion/torch/data/seg_chunk_sampler.py +++ b/hyperion/torch/data/seg_chunk_sampler.py @@ -8,10 +8,9 @@ import numpy as np import pandas as pd -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.distributed as dist +from jsonargparse import ActionParser, ArgumentParser from ...utils.segment_set import SegmentSet from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 39d1eed2..74726f63 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -7,9 +7,8 @@ import math import numpy as np -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/weighted_embed_sampler.py b/hyperion/torch/data/weighted_embed_sampler.py index 5870512a..22da93f9 100644 --- a/hyperion/torch/data/weighted_embed_sampler.py +++ b/hyperion/torch/data/weighted_embed_sampler.py @@ -7,7 +7,6 @@ import math import numpy as np - import torch from torch.utils.data import Sampler diff --git a/hyperion/torch/data/weighted_seq_sampler.py 
b/hyperion/torch/data/weighted_seq_sampler.py index b6f0b670..345c2429 100644
--- a/hyperion/torch/data/weighted_seq_sampler.py
+++ b/hyperion/torch/data/weighted_seq_sampler.py
@@ -7,10 +7,9 @@
 import math
 
 import numpy as np
-from jsonargparse import ActionParser, ArgumentParser
-
 import torch
 import torch.distributed as dist
+from jsonargparse import ActionParser, ArgumentParser
 from torch.utils.data import Sampler
diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py
index 7a738bca..0487ae4f 100644
--- a/hyperion/torch/layer_blocks/__init__.py
+++ b/hyperion/torch/layer_blocks/__init__.py
@@ -30,9 +30,9 @@
 from .seresnet_blocks import SEResNetBasicBlock, SEResNetBNBlock
 from .spine_blocks import BlockSpec, SpineConv, SpineEndpoints, SpineResample
 from .tdnn_blocks import TDNNBlock
+from .transducer_joiner import TransducerJoiner
+from .transducer_predictor import TransducerRNNPredictor, TransducerConvPredictor
 from .transformer_conv2d_subsampler import TransformerConv2dSubsampler
 from .transformer_encoder_v1 import TransformerEncoderBlockV1
 from .transformer_feedforward import (Conv1dLinear, Conv1dx2,
                                       PositionwiseFeedForward)
-from .transducer_predictor import TransducerPredictor
-from .transducer_joiner import TransducerJoiner
diff --git a/hyperion/torch/layer_blocks/etdnn_blocks.py b/hyperion/torch/layer_blocks/etdnn_blocks.py
index b6afdd29..10fd09b3 100644
--- a/hyperion/torch/layer_blocks/etdnn_blocks.py
+++ b/hyperion/torch/layer_blocks/etdnn_blocks.py
@@ -4,7 +4,6 @@
 """
 
 import numpy as np
-
 import torch.nn as nn
 from torch.nn import BatchNorm1d, Conv1d, Linear
diff --git a/hyperion/torch/layer_blocks/resetdnn_blocks.py b/hyperion/torch/layer_blocks/resetdnn_blocks.py
index dfea3720..1af632fb 100644
--- a/hyperion/torch/layer_blocks/resetdnn_blocks.py
+++ b/hyperion/torch/layer_blocks/resetdnn_blocks.py
@@ -5,7 +5,6 @@
 #
 
 import numpy as np
-
 import torch.nn as nn
 from torch.nn import BatchNorm1d, Conv1d, Linear
diff --git a/hyperion/torch/layer_blocks/transducer_joiner.py b/hyperion/torch/layer_blocks/transducer_joiner.py
index 482b5aa6..738c0cda 100644
--- a/hyperion/torch/layer_blocks/transducer_joiner.py
+++ b/hyperion/torch/layer_blocks/transducer_joiner.py
@@ -2,98 +2,68 @@
 Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
 import logging
 from typing import Optional, Tuple
 
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
 
 
 class TransducerJoiner(nn.Module):
     """ RNN-T Joiner network.
     Implementation based on
-    https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/joiner.py
+    https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer_stateless7/joiner.py
 
     Attributes:
-      in_feats: input feature dimension.
+      enc_feats: encoder output feature dimension.
+      pred_feats: predictor output feature dimension.
+      hid_feats: hidden projection feature dimension.
       vocab_size: vocabulary size
     """
 
-    def __init__(self, in_feats: int, vocab_size: int):
+    def __init__(self, enc_feats: int, pred_feats: int, hid_feats: int,
+                 vocab_size: int):
         super().__init__()
-        self.in_feats = in_feats
+        self.enc_feats = enc_feats
+        self.pred_feats = pred_feats
+        self.hid_feats = hid_feats
         self.vocab_size = vocab_size
-        self.output = nn.Linear(in_feats, vocab_size)
-
-    def forward(self, encoder_out: torch.Tensor,
-                pred_out: torch.Tensor) -> torch.Tensor:
+        self.enc_proj = nn.Linear(enc_feats, hid_feats)
+        self.pred_proj = nn.Linear(pred_feats, hid_feats)
+        self.output = nn.Linear(hid_feats, vocab_size)
+
+    def get_config(self):
+        config = {
+            "joiner_type": "basic",
+            "hid_feats": self.hid_feats,
+        }
+        return config
+
+    def forward(self,
+                enc_out: torch.Tensor,
+                pred_out: torch.Tensor,
+                project_input: bool = True) -> torch.Tensor:
         """
         Args:
-          encoder_out: Output from the encoder with shape = (N, T, C).
-          pred_out: Output from the predictor with shape = (N, U, C).
+          enc_out: output from the encoder with shape = (N, T, C) or (N, T, s_range, C)
+          pred_out: output from the predictor with shape = (N, U, C) or (N, T, s_range, C)
+          project_input: if True, it projects the encoder and predictor features
+            inside the forward function; if False, it expects the inputs to be
+            projected beforehand.
         Returns:
-          Return a tensor of shape (N, T, U, C).
+          Symbols' logits of shape (N, T, U, C).
         """
-        assert encoder_out.ndim == pred_out.ndim == 3
-        assert encoder_out.size(0) == pred_out.size(0)
-        assert encoder_out.size(2) == pred_out.size(2)
+        assert enc_out.ndim == pred_out.ndim
+        assert enc_out.ndim in (3, 4)
+
+        if enc_out.ndim == 3:
+            enc_out = enc_out.unsqueeze(2)  # (N, T, 1, C)
+            pred_out = pred_out.unsqueeze(1)  # (N, 1, U, C)
 
-        encoder_out = encoder_out.unsqueeze(2)
-        # Now encoder_out is (N, T, 1, C)
-        pred_out = pred_out.unsqueeze(1)
-        # Now pred_out is (N, 1, U, C)
-        x = torch.tanh(encoder_out + pred_out)
+        if project_input:
+            x = self.enc_proj(enc_out) + self.pred_proj(pred_out)
+        else:
+            x = enc_out + pred_out
 
+        x = torch.tanh(x)
         logits = self.output(x)
         return logits
-
-    # def get_config(self):
-    #     config = {
-    #         "in_feats": self.in_feats,
-    #         "out_dims": self.out_dims,
-    #         "num_layers": self.num_layers,
-    #     }
-
-    #     # base_config = super().get_config()
-    #     return dict(list(config.items()))
-
-    # @staticmethod
-    # def filter_args(**kwargs):
-    #     valid_args = (
-    #         "in_feats",
-    #         "out_dims",
-    #         "num_layers",
-    #     )
-    #     args = dict((k, kwargs[k]) for k in valid_args if k in kwargs)
-
-    #     return args
-
-    # @staticmethod
-    # def add_class_args(parser,
-    #                    prefix=None,
-    #                    skip=set(["in_feats", "out_dims"])):
-    #     if prefix is not None:
-    #         outer_parser = parser
-    #         parser = ArgumentParser(prog="")
-
-    #     if "in_feats" not in skip:
-    #         parser.add_argument("--in-feats",
-    #                             type=int,
-    #                             required=True,
-    #                             help=("input feature dimension"))
-
-    #     if "out_dims" not in skip:
-    #         parser.add_argument("--out-dims",
-    #                             type=int,
-    #                             required=True,
-    #                             help=("output feature dimension (vocab size)"))
-    #     parser.add_argument("--num-layers",
-    #                         default=1,
-    #                         type=int,
-    #                         help=("layers of the joiner"))
-
-    #     if prefix is not None:
-    #         outer_parser.add_argument("--" + prefix,
-    #                                   action=ActionParser(parser=parser))
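For reference, a minimal shape-check sketch of the reworked joiner API above (illustrative only, not part of the patch; the sizes are arbitrary and the import path relies on the layer_blocks exports added in this patch):

import torch
from hyperion.torch.layer_blocks import TransducerJoiner

# 4 utterances, 100 encoder frames, 20 predictor steps, 1000-token vocab
joiner = TransducerJoiner(enc_feats=512, pred_feats=512, hid_feats=256,
                          vocab_size=1000)
enc_out = torch.randn(4, 100, 512)   # (N, T, enc_feats)
pred_out = torch.randn(4, 20, 512)   # (N, U, pred_feats)
logits = joiner(enc_out, pred_out)   # project_input=True applies the projections inside
assert logits.shape == (4, 100, 20, 1000)  # (N, T, U, vocab_size)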
diff --git a/hyperion/torch/layer_blocks/transducer_predictor.py b/hyperion/torch/layer_blocks/transducer_predictor.py
index ae354359..00339fe7 100644
--- a/hyperion/torch/layer_blocks/transducer_predictor.py
+++ b/hyperion/torch/layer_blocks/transducer_predictor.py
@@ -2,31 +2,32 @@
 Copyright 2023 Johns Hopkins University (Author: Jesus Villalba, Yen-Ju Lu)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
 import logging
 from typing import Optional, Tuple
 
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
 
 from ...utils.misc import filter_func_args
+from ..layers import ActivationFactory as AF
 
 
-class TransducerPredictor(nn.Module):
-    """ RNN-T prediction network.
+class TransducerRNNPredictor(nn.Module):
+    """ RNN-T prediction network with LSTM or GRU.
     Implementation based on:
     https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/decoder.py
 
     Attributes:
       vocab_size: Number of tokens of the modeling unit including blank.
       embed_dim: Dimension of the input embedding.
-      blank_id: The ID of the blank symbol.
       num_layers: Number of LSTM layers.
       hid_feats: Hidden dimension of LSTM layers.
       out_feats: Output dimension of the predictor.
       embed_dropout_rate: Dropout rate for the embedding layer.
       rnn_dropout_rate: Dropout for LSTM layers.
-
+      rnn_type: RNN type, lstm or gru.
+      blank_id: The ID of the blank symbol.
     """
 
     def __init__(self,
@@ -34,7 +35,7 @@ def __init__(self,
                  embed_dim: int,
                  num_layers: int,
                  hid_feats: int,
-                 out_feats: int,
+                 out_feats: Optional[int] = None,
                  embed_dropout_rate: float = 0.0,
                  rnn_dropout_rate: float = 0.0,
                  rnn_type: str = "lstm",
@@ -73,7 +74,29 @@ def __init__(self,
         self.hid_feats = hid_feats
         self.embed_dropout_rate = embed_dropout_rate
         self.rnn_dropout_rate = rnn_dropout_rate
-        self.output = nn.Linear(hid_feats, out_feats)
+        if out_feats is None:
+            out_feats = hid_feats
+
+        self.out_feats = out_feats
+        if out_feats != hid_feats:
+            self.output_proj = nn.Linear(hid_feats, out_feats)
+        else:
+            self.output_proj = None
+
+    def get_config(self):
+        config = {
+            "pred_type": "rnn",
+            "vocab_size": self.vocab_size,
+            "embed_dim": self.embed_dim,
+            "num_layers": self.num_layers,
+            "hid_feats": self.hid_feats,
+            "out_feats": self.out_feats,
+            "embed_dropout_rate": self.embed_dropout_rate,
+            "rnn_dropout_rate": self.rnn_dropout_rate,
+            "rnn_type": self.rnn_type,
+            "blank_id": self.blank_id,
+        }
+        return config
 
     def forward(
         self,
@@ -90,8 +113,9 @@
         """
         embed = self.embedding(y)
         embed = self.embed_dropout(embed)
-        rnn_out, (h, c) = self.rnn(embed, states)
-        out = self.output(rnn_out)
+        out, (h, c) = self.rnn(embed, states)
+        if self.output_proj:
+            out = self.output_proj(out)
 
         return out, (h, c)
 
@@ -110,101 +134,137 @@ def change_config(
         self.embed_dropout_rate = embed_dropout_rate
         self.embed_dropout = nn.Dropout(self.embed_dropout_rate)
 
-    # def get_config(self):
-    #     config = {
-    #         "in_feats": self.in_feats,
-    #         "blank_id": self.blank_id,
-    #         "vocab_size": self.vocab_size,
-    #         "embed_dim": self.embed_dim,
-    #         "num_layers": self.num_layers,
-    #         "hid_feats": self.hid_feats,
-    #         "embed_dropout_rate": self.embed_dropout_rate,
-    #         "rnn_dropout_rate": self.rnn_dropout_rate,
-    #     }
-
-    #     # base_config = super().get_config()
-    #     return dict(list(config.items()))
-
-    # @staticmethod
-    # def filter_args(**kwargs):
-    #     args = filter_func_args(TransducerPredictor.__init__, kwargs)
-    #     return args
-
-    # @staticmethod
-    # def filter_finetune_args(**kwargs):
-    #     args = filter_func_args(TransducerPredictor.change_config, kwargs)
-    #     return args
-
-    # @staticmethod
-    # def add_class_args(parser,
-    #                    prefix=None,
-    #                    skip=set(["in_feats", "blank_id", "vocab_size"])):
-
-    #     if prefix is not None:
-    #         outer_parser = parser
-    #         parser = ArgumentParser(prog="")
-
-    #     if "in_feats" not in skip:
-    #         parser.add_argument("--in-feats",
-    #                             type=int,
-    #                             required=True,
-    #                             help=("input feature dimension"))
-    #     if "blank_id" not in skip:
-    #         parser.add_argument("--blank-id",
-    #                             type=int,
-    #                             required=True,
-    #                             help=("blank id from sp model"))
-    #     if "vocab_size" not in skip:
-    #         parser.add_argument("--vocab-size",
-    #                             type=int,
-    #                             required=True,
-    #                             help=("output prediction dimension"))
-    #     parser.add_argument("--embedding-dim",
-    #                         default=1024,
-    #                         type=int,
-    #                         help=("feature dimension"))
-    #     parser.add_argument("--embedding-dropout-rate",
-    #                         default=0.0,
-    #                         type=float,
-    #                         help=("dropout prob for decoder input embeddings"))
-    #     parser.add_argument("--rnn-dropout-rate",
-    #                         default=0.0,
-    #                         type=float,
-    #                         help=("dropout prob for decoder RNN "))
-
-    #     parser.add_argument("--num-layers", default=2, type=int, help=(""))
-
-    #     parser.add_argument("--hidden-dim", default=512, type=int, help=(""))
-
-    #     if prefix is not None:
-    #         outer_parser.add_argument("--" + prefix,
-    #                                   action=ActionParser(parser=parser))
-
-    # @staticmethod
-    # def add_finetune_args(parser,
-    #                       prefix=None,
-    #                       skip=set(["in_feats", "blank_id", "vocab_size"])):
-
-    #     if prefix is not None:
-    #         outer_parser = parser
-    #         parser = ArgumentParser(prog="")
-
-    #     parser.add_argument(
-    #         "--override-dropouts",
-    #         default=False,
-    #         action=ActionYesNo,
-    #         help=(
-    #             "whether to use the dropout probabilities passed in the "
-    #             "arguments instead of the defaults in the pretrained model."))
-    #     parser.add_argument("--embedding-dropout-rate",
-    #                         default=0.0,
-    #                         type=float,
-    #                         help=("dropout prob for decoder input embeddings"))
-    #     parser.add_argument("--rnn-dropout-rate",
-    #                         default=0.0,
-    #                         type=float,
-    #                         help=("dropout prob for decoder RNN "))
-
-    #     if prefix is not None:
-    #         outer_parser.add_argument("--" + prefix,
-    #                                   action=ActionParser(parser=parser))
+
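For reference, a minimal usage sketch of the RNN predictor above (illustrative only, not part of the patch; the sizes are arbitrary, the constructor signature is the one shown in this hunk, and the import relies on the layer_blocks exports added in this patch):

import torch
from hyperion.torch.layer_blocks import TransducerRNNPredictor

predictor = TransducerRNNPredictor(vocab_size=1000, embed_dim=256,
                                   num_layers=2, hid_feats=512)
y = torch.randint(1, 1000, (4, 20))  # (N, U) label ids; 0 is the blank/padding id
out, (h, c) = predictor(y)           # out_feats defaults to hid_feats, so no output projection
print(out.shape)                     # expected: torch.Size([4, 20, 512])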
+ """ + + def __init__( + self, + vocab_size: int, + embed_dim: int, + out_feats: Optional[int] = None, + context_size: int = 2, + embed_dropout_rate: float = 0.0, + hid_act: str = "relu", + blank_id: int = 0, + ): + super().__init__() + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=embed_dim, + padding_idx=blank_id, + ) + self.embed_dropout = nn.Dropout(embed_dropout_rate) + assert context_size >= 1, context_size + if context_size > 1: + self.conv = nn.Conv1d( + in_channels=embed_dim, + out_channels=embed_dim, + kernel_size=context_size, + padding=0, + groups=out_feats // 4, + bias=False, + ) + + self.blank_id = blank_id + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.embed_dropout_rate = embed_dropout_rate + self.context_size = context_size + self.hid_act = AF.create(hid_act) + + if out_feats is None: + out_feats = embed_dim + + self.out_feats = out_feats + if out_feats != embed_feats: + self.output_proj = nn.Linear(embed_dim, out_feats) + else: + self.output_proj = None + + def get_config(self): + hid_act = AF.get_config(self.hid_act) + config = { + "pred_type": "conv", + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "out_feats": self.out_feats, + "context_size": self.context_size, + "embed_dropout_rate": self.embed_dropout_rate, + "blank_id": self.blank_id, + "hid_act": hid_act, + } + return config + + def forward( + self, + y: torch.Tensor, + states: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, None]: + """ + Args: + y: + A 2-D tensor of shape (N, U). + # need_pad: + # True to left pad the input. Should be True during training. + # False to not pad the input. Should be False during inference. + Returns: + Return a tensor of shape (N, U, decoder_dim). + """ + y = y.to(torch.int64) + embed = self.embedding(y) + if self.context > 1: + embed = embed.transpose(1, 2) + if states is None: + embed = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + else: + raise NotImplementedError() + embed = self.conv(embed).transpose(1, 2) + + out = self.hid_act(embed) + if self.output_proj: + out = self.output_proj(out) + + return out, None + + # # this stuff about clamp() is a temporary fix for a mismatch + # # at utterance start, we use negative ids in beam_search.py + # if torch.jit.is_tracing(): + # # This is for exporting to PNNX via ONNX + # embedding_out = self.embedding(y) + # else: + # embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1) + # if self.context_size > 1: + # embedding_out = embedding_out.permute(0, 2, 1) + # if need_pad is True: + # embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + # else: + # # During inference time, there is no need to do extra padding + # # as we only need one output + # assert embedding_out.size(-1) == self.context_size + # embedding_out = self.conv(embedding_out) + # embedding_out = embedding_out.permute(0, 2, 1) + # embedding_out = F.relu(embedding_out) + # return embedding_out + + def change_config( + self, + override_dropouts=False, + embed_dropout_rate: float = 0.0, + ): + logging.info("changing predictor config") + + if override_dropouts: + logging.info("overriding predictor dropouts") + self.embed_dropout_rate = embed_dropout_rate + self.embed_dropout = nn.Dropout(self.embed_dropout_rate) diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 5e38494f..85ba92f6 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -6,7 +6,6 @@ import math import numpy 
as np - import torch import torch.nn as nn import torch.nn.functional as nnf diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py index 4b4c5927..b90a65c8 100644 --- a/hyperion/torch/layers/mvn.py +++ b/hyperion/torch/layers/mvn.py @@ -2,10 +2,9 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser class MeanVarianceNorm(nn.Module): diff --git a/hyperion/torch/layers/pool_factory.py b/hyperion/torch/layers/pool_factory.py index 84d0cbf1..c0e573af 100644 --- a/hyperion/torch/layers/pool_factory.py +++ b/hyperion/torch/layers/pool_factory.py @@ -2,9 +2,8 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from .global_pool import * diff --git a/hyperion/torch/layers/spec_augment.py b/hyperion/torch/layers/spec_augment.py index f4e03842..761a4e31 100644 --- a/hyperion/torch/layers/spec_augment.py +++ b/hyperion/torch/layers/spec_augment.py @@ -4,11 +4,10 @@ """ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn import torch.nn.functional as nnf +from jsonargparse import ActionParser, ArgumentParser count = 0 diff --git a/hyperion/torch/loggers/logger.py b/hyperion/torch/loggers/logger.py index 7e9c91f2..46c1130d 100644 --- a/hyperion/torch/loggers/logger.py +++ b/hyperion/torch/loggers/logger.py @@ -4,7 +4,6 @@ """ import numpy as np - import torch.distributed as dist diff --git a/hyperion/torch/loggers/logger_list.py b/hyperion/torch/loggers/logger_list.py index 0291a01f..20ae58ec 100644 --- a/hyperion/torch/loggers/logger_list.py +++ b/hyperion/torch/loggers/logger_list.py @@ -4,7 +4,6 @@ """ import numpy as np - import torch.distributed as dist from .tensorboard_logger import TensorBoardLogger as TBL diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py index d3111140..4bd086ad 100644 --- a/hyperion/torch/lr_schedulers/factory.py +++ b/hyperion/torch/lr_schedulers/factory.py @@ -2,9 +2,8 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch +from jsonargparse import ActionParser, ArgumentParser from .cos_lr import AdamCosineLR, CosineLR from .exp_lr import ExponentialLR diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 95042aed..2df4e047 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -6,8 +6,9 @@ from .vae.vae import VAE from .vae.vq_vae import VQVAE -from .wav2transducer import HFWav2Vec2Transducer -from .wav2transducer import HFWav2Vec2RNNTransducer, HFWav2Vec2RNNRNNTransducer +from .transducer import RNNTransducer, RNNRNNTransducer +from .wav2transducer import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) from .xvectors.efficient_net_xvector import EfficientNetXVector diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py index 
fe55e34d..ee3c85f5 100644 --- a/hyperion/torch/models/transducer/__init__.py +++ b/hyperion/torch/models/transducer/__init__.py @@ -4,10 +4,10 @@ """ -from .rnn_transducer import RNNTransducer from .rnn_rnn_transducer import RNNRNNTransducer - +from .rnn_transducer import RNNTransducer, RNNTransducerOutput from .transducer import Transducer + #from .conformer import Conformer #from .decoder import Decoder #from .joiner import Joiner diff --git a/hyperion/torch/models/transducer/conformer.py b/hyperion/torch/models/transducer/conformer.py index a350d579..f62621af 100644 --- a/hyperion/torch/models/transducer/conformer.py +++ b/hyperion/torch/models/transducer/conformer.py @@ -20,12 +20,12 @@ import warnings from typing import List, Optional, Tuple -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import torch +from hyperion.utils.text import make_pad_mask, subsequent_chunk_mask +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch import Tensor, nn -from .transformer import Transformer -from hyperion.utils.text import make_pad_mask, subsequent_chunk_mask +from .transformer import Transformer class Conformer(Transformer): diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py index e7a40ec0..7f3698d7 100644 --- a/hyperion/torch/models/transducer/decoder.py +++ b/hyperion/torch/models/transducer/decoder.py @@ -14,13 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from jsonargparse import ArgumentParser, ActionParser, ActionYesNo import logging from typing import Optional, Tuple -import logging import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser # TODO(fangjun): Support switching between LSTM and GRU diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py index 0fc1fe51..983f064a 100644 --- a/hyperion/torch/models/transducer/joiner.py +++ b/hyperion/torch/models/transducer/joiner.py @@ -14,9 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
 
 
 class Joiner(nn.Module):
diff --git a/hyperion/torch/models/transducer/lstm_rnn_transducer.py b/hyperion/torch/models/transducer/lstm_rnn_transducer.py
index 5ab74483..983334d4 100644
--- a/hyperion/torch/models/transducer/lstm_rnn_transducer.py
+++ b/hyperion/torch/models/transducer/lstm_rnn_transducer.py
@@ -5,7 +5,9 @@
 import logging
 from typing import Dict, Optional, Union
-from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
 try:
     import k2
 except ModuleNotFoundError:
diff --git a/hyperion/torch/models/transducer/rnn_rnn_transducer.py b/hyperion/torch/models/transducer/rnn_rnn_transducer.py
index 0e1c7a85..1c0704f5 100644
--- a/hyperion/torch/models/transducer/rnn_rnn_transducer.py
+++ b/hyperion/torch/models/transducer/rnn_rnn_transducer.py
@@ -4,8 +4,10 @@
 import logging
-from typing import Dict, Optional, Union, Tuple
-from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+from typing import Dict, Optional, Tuple, Union
+
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
 try:
     import k2
 except ModuleNotFoundError:
@@ -13,8 +15,8 @@
 
 import torch
 
-from .rnn_transducer import RNNTransducer
 from ...narchs import RNNEncoder
+from .rnn_transducer import RNNTransducer
 
 
 class RNNRNNTransducer(RNNTransducer):
diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py
index ef54a5eb..0b886fdf 100644
--- a/hyperion/torch/models/transducer/rnn_transducer.py
+++ b/hyperion/torch/models/transducer/rnn_transducer.py
@@ -4,8 +4,11 @@
 import logging
-from typing import Dict, Optional, Union, Tuple, List
-from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union
+
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
 try:
     import k2
 except ModuleNotFoundError:
@@ -13,9 +16,19 @@
 
 import torch
 
+from ....utils import HypDataClass
 from ....utils.misc import filter_func_args
-from ...torch_model import TorchModel
 from ...narchs import RNNTransducerDecoder
+from ...torch_model import TorchModel
+
+
+@dataclass
+class RNNTransducerOutput(HypDataClass):
+
+    loss: torch.Tensor
+    loss_simple: Optional[torch.Tensor] = None
+    loss_pruned: Optional[torch.Tensor] = None
+    h_feats: Optional[List[torch.Tensor]] = None
 
 
 class RNNTransducer(TorchModel):
@@ -49,7 +62,7 @@ def forward(
         x: torch.Tensor,
         x_lengths: torch.Tensor,
         y: k2.RaggedTensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> RNNTransducerOutput:
         """
         Args:
           x: input features with shape = (N, T, C)
@@ -65,14 +78,17 @@
         assert y.num_axes == 2, y.num_axes
         assert x.size(0) == x_lengths.size(0) == y.dim0
+        assert torch.all(
+            x_lengths[:-1] >= x_lengths[1:]
+        ), f"x_lengths={x_lengths}"  # check x_lengths are sorted in decreasing order
 
         if self.encoder is not None:
             x, x_lengths = self.encoder(x, x_lengths)
         assert torch.all(x_lengths > 0)
-        print("zz", x.shape, x_lengths, y, flush=True)
-        logits, loss = self.decoder(x, x_lengths, y)
-        return logits, loss
+        dec_output = self.decoder(x, x_lengths, y)
+        output = RNNTransducerOutput(*dec_output)
+        return output
 
     def infer(self,
               x: torch.Tensor,
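For reference, the new calling contract of RNNTransducer.forward in a short sketch (illustrative only, not part of the patch; `model` is a hypothetical instance and `y` must be a k2.RaggedTensor, so the forward call is left commented):

import torch

# inputs must now be sorted by decreasing length to satisfy the new assert
x = torch.randn(4, 120, 80)                   # (N, T, C) input features
x_lengths = torch.tensor([100, 120, 90, 60])
order = torch.argsort(x_lengths, descending=True)
x, x_lengths = x[order], x_lengths[order]     # now non-increasing

# output = model(x, x_lengths, y)             # returns an RNNTransducerOutput dataclass
# loss = output.loss                          # instead of the old (logits, loss) tuple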
diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py
index bae35e0e..c9ba365e 100644
--- a/hyperion/torch/models/transducer/transducer.py
+++ b/hyperion/torch/models/transducer/transducer.py
@@ -17,23 +17,25 @@
 Note we use `rnnt_loss` from torchaudio, which exists only in
 torchaudio >= v0.10.0. It also means you have to use torch >= v1.10.0
 """
-from jsonargparse import ArgumentParser, ActionParser, ActionYesNo
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
 try:
     import k2
 except ModuleNotFoundError:
     from ...utils import dummy_k2 as k2
 
 import logging
+
 import torch
 import torch.nn as nn
 import torchaudio
 import torchaudio.functional
 
-from .encoder_interface import EncoderInterface
+from hyperion.utils.text import add_sos
 
 from ...torch_model import TorchModel
-from hyperion.utils.text import add_sos
 
 # from .conformer import Conformer
 from .decoder import Decoder
+from .encoder_interface import EncoderInterface
 from .joiner import Joiner
diff --git a/hyperion/torch/models/transducer/transformer.py b/hyperion/torch/models/transducer/transformer.py
index d9d5d6fb..a354b5f5 100644
--- a/hyperion/torch/models/transducer/transformer.py
+++ b/hyperion/torch/models/transducer/transformer.py
@@ -20,11 +20,11 @@
 import torch
 import torch.nn as nn
+from hyperion.utils.text import make_pad_mask
+
 from .encoder_interface import EncoderInterface
 from .subsampling import Conv2dSubsampling, VggSubsampling
 
-from hyperion.utils.text import make_pad_mask
-
 
 class Transformer(EncoderInterface):
     def __init__(
diff --git a/hyperion/torch/models/tvector/tvector.py b/hyperion/torch/models/tvector/tvector.py
index a46fc324..a4e4d148 100644
--- a/hyperion/torch/models/tvector/tvector.py
+++ b/hyperion/torch/models/tvector/tvector.py
@@ -5,10 +5,9 @@
 
 import logging
 
-from jsonargparse import ActionParser, ArgumentParser
-
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
 
 from ...narchs import ClassifHead, ConformerEncoderV1, TorchNALoader
 from ..layer_blocks import TDNNBlock
diff --git a/hyperion/torch/models/wav2transducer/__init__.py b/hyperion/torch/models/wav2transducer/__init__.py
index de4879a5..79af6349 100644
--- a/hyperion/torch/models/wav2transducer/__init__.py
+++ b/hyperion/torch/models/wav2transducer/__init__.py
@@ -5,6 +5,5 @@
 """
 
 from .hf_wav2vec2_transducer import HFWav2Vec2Transducer
-
-from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer
 from .hf_wav2vec2rnn_rnn_transducer import HFWav2Vec2RNNRNNTransducer
+from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer
diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py
index 922996f6..f68a6f7a 100644
--- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py
+++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py
@@ -2,17 +2,18 @@
 Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import logging
 import contextlib
-from typing import Union, Dict, List
-from jsonargparse import ArgumentParser, ActionParser
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Union
 
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
 
-from ...utils import remove_silence
 from ...torch_model import TorchModel
-from ..transducer import RNNTransducer
+from ...utils import remove_silence
+from ..transducer import RNNTransducer, RNNTransducerOutput
 
 
 class HFWav2RNNTransducer(TorchModel):
@@ -163,31 +164,23 @@
            we should return. If None, no encoder layers are returned.
           return_logits: if True, it adds the logits to the output dictionary.
         Returns:
-          Tensor with class logits with shape=(batch, num_classes) or
-          Dictionary with "logits", "h_enc" (list of hidden encoder layers),
-          "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features)
+          Dataclass with the transducer losses and, when return_feat_layers is
+          given, "h_feats" (list of hidden wav2vec feature layers).
         """
         feats, hid_feats, feat_lengths = self.forward_feats(
             x, x_lengths, return_feat_layers)
 
         feats = feats.permute(0, 2, 1)  # (N, C, T) ->(N, T, C)
-
-        output, loss = self.transducer(
+        output = self.transducer(
             feats,
             feat_lengths,
             y,
         )
 
-        if not return_feat_layers:
-            return output, loss
-
-        if not isinstance(output, dict):
-            # if the transducer just returned the logits we put then into a dictionary
-            # to append the hid feats later.
-            output["logits"] = output
+        if return_feat_layers:
+            output.h_feats = hid_feats
 
-        output["h_feats"] = hid_feats
-        return output, loss
+        return output
 
     def infer(self,
               x: torch.Tensor,
diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py
index ec4c83b0..4cebfd66 100644
--- a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py
+++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py
@@ -2,17 +2,18 @@
 Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu)
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-import logging
 import contextlib
-from jsonargparse import ArgumentParser, ActionParser
+import logging
 
 import torch
 import torch.nn as nn
-
-# import torch.nn.functional as nnf
+from jsonargparse import ActionParser, ArgumentParser
 
 from ...torch_model import TorchModel
 from ...utils import remove_silence
+
+# import torch.nn.functional as nnf
+
 # from ..wav2xvectors.hf_wav2xvector import HFWav2XVector
diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py
index 242a5ca1..bd58e2cd 100644
--- a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py
+++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py
@@ -3,18 +3,17 @@
 Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import logging
-from jsonargparse import ArgumentParser, ActionParser
-from typing import Union, Dict, Optional
+from typing import Dict, Optional, Union
 
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
 
-from ..transducer import Transducer
 from ...tpm import HFWav2Vec2
+from ..transducer import Transducer
 from .hf_wav2transducer import HFWav2Transducer
-
 
 class HFWav2Vec2Transducer(HFWav2Transducer):
     """Class extracting Wav2Vec2 + ResNet1d x-vectors from waveform.
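For reference, a sketch of how a caller of HFWav2RNNTransducer.forward adapts to the dataclass return above (illustrative only; `model`, `x`, `x_lengths`, and `y` are hypothetical, and the layer indices are arbitrary):

# before this patch:  output, loss = model(x, x_lengths, y)
output = model(x, x_lengths, y, return_feat_layers=[6, 12])
loss = output.loss            # RNNTransducerOutput field
w2v_layers = output.h_feats   # hidden wav2vec layers, set only when requested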
diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py index 412a182b..d2b13fb6 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py @@ -3,14 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from jsonargparse import ArgumentParser, ActionParser -from typing import Union, Dict, Optional +from typing import Dict, Optional, Union import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..transducer import RNNRNNTransducer from ...tpm import HFWav2Vec2 +from ..transducer import RNNRNNTransducer from .hf_wav2rnn_transducer import HFWav2RNNTransducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py index d89953b2..f4e02a23 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py @@ -3,14 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from jsonargparse import ArgumentParser, ActionParser -from typing import Union, Dict, Optional +from typing import Dict, Optional, Union import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser -from ..transducer import RNNTransducer from ...tpm import HFWav2Vec2 +from ..transducer import RNNTransducer from .hf_wav2rnn_transducer import HFWav2RNNTransducer diff --git a/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py index a5df4b8a..458e7cae 100644 --- a/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py @@ -3,11 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from typing import Dict, Optional, Union, Tuple -from jsonargparse import ActionParser, ArgumentParser +from typing import Dict, Optional, Tuple, Union import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import AudioFeatsMVN from ...torch_model import TorchModel diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py index b75ac53f..fb528809 100644 --- a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py @@ -5,10 +5,9 @@ import logging from typing import Dict, Optional, Union -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFHubert from ..xvectors import ResNet1dXVector diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py index 8a17379c..739213b4 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py @@ -5,10 +5,9 @@ import logging from typing import Dict, Optional, Union -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWav2Vec2 from ..xvectors import 
ResNet1dXVector diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 5599fa1e..6ff8f8b4 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -5,10 +5,9 @@ import contextlib import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...torch_model import TorchModel from ...utils import remove_silence diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py index 56a19130..87e9a6f8 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py @@ -5,10 +5,9 @@ import logging from typing import Dict, Optional, Union -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWavLM from ..xvectors import ResNet1dXVector diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index 0d9f1bc4..b545bfaf 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ..xvectors import ResNet1dXVector from .wav2xvector import Wav2XVector diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py index 1f7283a0..51e045da 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ..xvectors import ResNetXVector from .wav2xvector import Wav2XVector diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 4c21f478..838f3342 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -4,10 +4,9 @@ """ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import AudioFeatsMVN from ...torch_model import TorchModel diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py b/hyperion/torch/models/xvectors/efficient_net_xvector.py index a8663cd9..e2c46be5 100644 --- a/hyperion/torch/models/xvectors/efficient_net_xvector.py +++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import EfficientNet as EN from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index 09136b7d..2e3973a2 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ 
b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import ResNet1dEncoder as Encoder from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index c6889626..2d1fe95a 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import ResNetFactory as RNF from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index 203008be..e1c3e4a4 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -5,10 +5,9 @@ """ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import SpineNetFactory as SNF from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index 530ca63b..ae2a9d78 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import TDNNFactory as TF from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index 7c55844a..9d884ed6 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import TransformerEncoderV1 as TE from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 3807bbd8..eae4a48f 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -6,10 +6,9 @@ from enum import Enum from typing import Optional -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ...layer_blocks import TDNNBlock from ...layers import GlobalPool1dFactory as PF diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index 5f333fc8..4fe8b4ed 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -20,11 +20,11 @@ from .resnet2d_decoder import ResNet2dDecoder from .resnet2d_encoder import ResNet2dEncoder from .resnet_factory import ResNetFactory +from .rnn_encoder import RNNEncoder +from .rnn_transducer_decoder import RNNTransducerDecoder from .spinenet import * from .spinenet_factory import SpineNetFactory from .tdnn import TDNNV1 from .tdnn_factory import TDNNFactory from .torch_na_loader import TorchNALoader from 
.transformer_encoder_v1 import TransformerEncoderV1 -from .rnn_transducer_decoder import RNNTransducerDecoder -from .rnn_encoder import RNNEncoder diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 160ee61b..8a877d5e 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -2,10 +2,9 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ..layers import AudioFeatsFactory as AFF from ..layers import MeanVarianceNorm as MVN diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 5d179fdb..028efe29 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -3,10 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from torch.nn import Linear from ..layer_blocks import FCBlock diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index 3acd44d2..98160a25 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -3,10 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ..layer_blocks import ConformerEncoderBlockV1 as EBlock from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index f5ab74d5..57d9adec 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -5,10 +5,9 @@ import math -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import DC1dDecBlock from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py index 0c331a5e..aaf1bb2d 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -4,10 +4,9 @@ """ import math -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks.dc1d_blocks import DC1dEncBlock from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 4106cbfd..87a18bfe 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -5,10 +5,9 @@ import math -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import DC2dDecBlock from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index ce7b9677..70eeac3c 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ 
b/hyperion/torch/narchs/dc2d_encoder.py @@ -5,10 +5,9 @@ import math -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import DC2dEncBlock from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/narchs/efficient_net.py b/hyperion/torch/narchs/efficient_net.py index b9efdcef..1eddc3ff 100644 --- a/hyperion/torch/narchs/efficient_net.py +++ b/hyperion/torch/narchs/efficient_net.py @@ -5,10 +5,9 @@ import math -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.nn import Dropout, Linear from ..layer_blocks import MBConvBlock, MBConvInOutBlock diff --git a/hyperion/torch/narchs/etdnn.py b/hyperion/torch/narchs/etdnn.py index a73439b7..d2b2d298 100644 --- a/hyperion/torch/narchs/etdnn.py +++ b/hyperion/torch/narchs/etdnn.py @@ -4,7 +4,6 @@ """ import numpy as np - import torch import torch.nn as nn from torch.nn import Conv1d, Linear diff --git a/hyperion/torch/narchs/net_arch.py b/hyperion/torch/narchs/net_arch.py index 9a3fc65f..4b39804c 100644 --- a/hyperion/torch/narchs/net_arch.py +++ b/hyperion/torch/narchs/net_arch.py @@ -4,7 +4,6 @@ """ import numpy as np - import torch.nn as nn from ..torch_model import TorchModel diff --git a/hyperion/torch/narchs/resetdnn.py b/hyperion/torch/narchs/resetdnn.py index eb964fa5..c4dc7784 100644 --- a/hyperion/torch/narchs/resetdnn.py +++ b/hyperion/torch/narchs/resetdnn.py @@ -4,7 +4,6 @@ """ import numpy as np - import torch import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index e3264f33..eb605bff 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -5,7 +5,6 @@ import logging import numpy as np - import torch import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index 0c577174..d65bab00 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -4,10 +4,9 @@ """ import math -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import (DC1dDecBlock, ResNet1dBasicDecBlock, ResNet1dBNDecBlock, SEResNet1dBasicDecBlock, diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 5bdad186..ab184467 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -7,10 +7,9 @@ import math import numpy as np -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import (DC1dEncBlock, Res2Net1dBasicBlock, Res2Net1dBNBlock, ResNet1dBasicBlock, diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 426b37f5..50369c8d 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -5,10 +5,9 @@ import math -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import 
ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import (DC2dDecBlock, ResNet2dBasicDecBlock, ResNet2dBNDecBlock, SEResNet2dBasicDecBlock, diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index 84e6599e..8a76e348 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -6,10 +6,9 @@ import logging import math -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import (DC2dEncBlock, Res2Net2dBasicBlock, Res2Net2dBNBlock, ResNet2dBasicBlock, diff --git a/hyperion/torch/narchs/rnn_encoder.py b/hyperion/torch/narchs/rnn_encoder.py index dcf02564..593405c5 100644 --- a/hyperion/torch/narchs/rnn_encoder.py +++ b/hyperion/torch/narchs/rnn_encoder.py @@ -5,13 +5,12 @@ import logging import math -from typing import Dict, Optional, Union, Tuple +from typing import Dict, Optional, Tuple, Union import numpy as np -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from ..layer_blocks import TransformerConv2dSubsampler as Subsampler diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 64c71dcd..265f2c9b 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -3,14 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser -from typing import Optional, Dict, List, Tuple from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn import torchaudio import torchaudio.functional +from jsonargparse import ActionParser, ArgumentParser try: import k2 @@ -19,7 +19,8 @@ from ...utils.misc import filter_func_args from ...utils.text import add_sos -from ..layer_blocks import TransducerPredictor as Predictor, TransducerJoiner as Joiner +from ..layer_blocks import TransducerJoiner as Joiner +from ..layer_blocks import TransducerRNNPredictor as RNNPredictor, TransducerConvPredictor as ConvPredictor from .net_arch import NetArch @@ -40,56 +41,117 @@ class RNNTransducerDecoder(NetArch): Attributes: in_feats: input features dimension (encoder output) vocab_size: Number of tokens of the modeling unit including blank. - embed_dim: Dimension of the predictor input embedding. - blank_id: The ID of the blank symbol. - num_layers: Number of LSTM layers. - hid_feats: Hidden dimension for predictor layers. - embed_dropout_rate: Dropout rate for the embedding layer. - rnn_dropout_rate: Dropout for LSTM layers. - + predictor: Dictionary with the predictor options. + joiner: Dictionary with the joiner options. + blank_id: id of the null symbol. + rnnt_loss: type of rnn-t loss between torchaudio, k2 or k2_pruned. + rnnt_type: rnn-t variation between regular, modified or constrained. + delay_penalty: penalize symbol delay, which is used to make symbols + emit earlier. + reduction: type of reduction for rnn-t loss between sum or mean. + prune_range: how many symbols to keep for each frame in k2 rnn-t + pruned loss. + lm_scale: language model scale in rnn-t smoothed loss. + am_scale: acoustic model scale in rnn-t smoothed loss.
+ simple_loss_scale: weight of rnn-t simple loss when using k2 pruned loss. + pruned_warmup_steps: number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1. """ - def __init__(self, - in_feats: int, - vocab_size: int, - embed_dim: int, - num_pred_layers: int, - pred_hid_feats: int, - embed_dropout_rate: float = 0.0, - rnn_dropout_rate: float = 0.0, - rnn_type: str = "lstm", - blank_id: int = 0): + def __init__( + self, + in_feats: int, + vocab_size: int, + predictor: Dict, + joiner: Dict, + blank_id: int = 0, + rnnt_loss: str = "k2_pruned", + rnnt_type: str = "regular", + delay_penalty: float = 0.0, + reduction: str = "sum", + prune_range: int = 5, + lm_scale: float = 0.25, + am_scale: float = 0.0, + simple_loss_scale: float = 0.5, + pruned_warmup_steps: int = 2000, + ): super().__init__() self.in_feats = in_feats self.vocab_size = vocab_size - self.embed_dim = embed_dim - self.num_pred_layers = num_pred_layers - self.pred_hid_feats = pred_hid_feats - self.embed_dropout_rate = embed_dropout_rate - self.rnn_dropout_rate = rnn_dropout_rate - self.rnn_type = rnn_type + self.predictor_args = predictor + self.joiner_args = joiner self.blank_id = blank_id + self.rnnt_loss = rnnt_loss + self.rnnt_type = rnnt_type + self.delay_penalty = delay_penalty + self.reduction = reduction + self.prune_range = prune_range + self.lm_scale = lm_scale + self.am_scale = am_scale + self.simple_loss_scale = simple_loss_scale + self.pruned_warmup_steps = pruned_warmup_steps + + self._make_predictor() + self._make_joiner() + + if self.rnnt_loss == "k2_pruned": + self.simple_am_proj = nn.Linear(in_feats, vocab_size) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, + vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, + dtype=torch.int)) + + def _make_predictor(self): + pred_type = self.predictor_args["pred_type"] + self.predictor_args["in_feats"] = self.in_feats + self.predictor_args["vocab_size"] = self.vocab_size + self.predictor_args["blank_id"] = self.blank_id + if pred_type == "rnn": + pred_args = filter_func_args(RNNPredictor.__init__, + self.predictor_args) + self.predictor = RNNPredictor(**pred_args) + elif pred_type == "conv": + pred_args = filter_func_args(ConvPredictor.__init__, + self.predictor_args) + self.predictor = ConvPredictor(**pred_args) + else: + raise ValueError(f"Unknown predictor type {pred_type}") + + def _make_joiner(self): + joiner_type = self.joiner_args["joiner_type"] + + if joiner_type == "basic": + pred_feats = self.predictor_args["out_feats"] + hid_feats = self.joiner_args["hid_feats"] + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, + self.vocab_size) + else: + raise ValueError(f"Unknown joiner type {joiner_type}") - pred_args = filter_func_args(Predictor.__init__, locals()) - pred_args["num_layers"] = num_pred_layers - pred_args["hid_feats"] = pred_hid_feats - pred_args["out_feats"] = in_feats - self.predictor = Predictor(**pred_args) - self.joiner = Joiner(in_feats, vocab_size) - - def forward(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: k2.RaggedTensor) -> torch.Tensor: + def get_config(self): + config = { + "in_feats": self.in_feats, + "vocab_size": self.vocab_size, + "predictor": self.predictor_args, + "joiner": self.joiner_args, + "blank_id": self.blank_id, + "rnnt_loss": self.rnnt_loss, + "rnnt_type": self.rnnt_type, + "delay_penalty": self.delay_penalty, + "reduction": self.reduction, + "prune_range": self.prune_range, + "lm_scale": self.lm_scale, + "am_scale": self.am_scale, + "simple_loss_scale": self.simple_loss_scale, 
+ "pruned_warmup_steps": self.pruned_warmup_steps, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) - # get y_lengths - row_splits = y.shape.row_splits(1) - y_lengths = row_splits[1:] - row_splits[:-1] - # shift y adding token - sos_y = add_sos(y, sos_id=self.blank_id) - sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id) - sos_y_padded = sos_y_padded.to(torch.int64) - # apply predictor and joiner - pred_out, _ = self.predictor(sos_y_padded) + def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor): logits = self.joiner(x, pred_out) # rnnt_loss requires 0 padded targets # Note: y does not start with SOS @@ -101,9 +163,137 @@ def forward(self, x: torch.Tensor, x_lengths: torch.Tensor, logit_lengths=x_lengths, target_lengths=y_lengths, blank=self.blank_id, - reduction="sum", + reduction=self.reduction, ) - return logits, loss + return loss + + def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor): + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), + dtype=torch.int64, + device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + logits = self.joiner(x, pred_out) + + with torch.cuda.amp.autocast(enabled=False): + loss = k2.rnnt_loss( + logits=logits.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + return loss + + def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, + y: torch.Tensor, y_lengths: torch.Tensor, + pred_out: torch.Tensor): + + y_padded = y.pad(mode="constant", padding_value=0) + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), + dtype=torch.int64, + device=x.device) + boundary[:, 2] = y_lengths + boundary[:, 3] = x_lengths + + am_simple = self.simple_am_proj(x) + lm_simple = self.simple_lm_proj(pred_out) + with torch.cuda.amp.autocast(enabled=False): + loss_simple, (px_grad, py_grad) = k2.rnnt_loss_smoothed( + lm=lm_simple.float(), + am=am_simple.float(), + symbols=y_padded, + termination_symbol=self.blank_id, + lm_only_scale=self.lm_scale, + am_only_scale=self.am_scale, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + return_grad=True, + ) + + # ranges : [B, T, prune_range] + ranges = k2.get_rnnt_prune_ranges( + px_grad=px_grad, + py_grad=py_grad, + boundary=boundary, + s_range=self.prune_range, + ) + + # am_pruned : [B, T, prune_range, encoder_dim] + # lm_pruned : [B, T, prune_range, decoder_dim] + am_pruned, lm_pruned = k2.do_rnnt_pruning( + am=self.joiner.enc_proj(x), + lm=self.joiner.pred_proj(pred_out), + ranges=ranges, + ) + + # logits : [B, T, prune_range, vocab_size] + + # project_input=False since we applied the decoder's input projections + # prior to do_rnnt_pruning (this is an optimization for speed). 
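+ # For example (illustrative numbers, not from this patch): with + # prune_range=5 and a 500-token vocab, the pruned logits are + # [B, T, 5, 500] instead of the full [B, T, U+1, 500] rnn-t lattice, + # so joiner memory no longer grows with the transcript length U.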
+ logits = self.joiner(am_pruned, lm_pruned, project_input=False) + + with torch.cuda.amp.autocast(enabled=False): + loss_pruned = k2.rnnt_loss_pruned( + logits=logits.float(), + symbols=y_padded, + ranges=ranges, + termination_symbol=self.blank_id, + boundary=boundary, + rnnt_type=self.rnnt_type, + delay_penalty=self.delay_penalty, + reduction=self.reduction, + ) + + if self.cur_step > self.pruned_warmup_steps: + simple_loss_scale = self.simple_loss_scale + pruned_loss_scale = 1.0 + else: + r = self.cur_step / self.pruned_warmup_steps + simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) + pruned_loss_scale = 0.1 + 0.9 * r + self.cur_step += 1 + + loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned + + return loss, loss_simple, loss_pruned + + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + # get y_lengths + row_splits = y.shape.row_splits(1) + y_lengths = row_splits[1:] - row_splits[:-1] + # shift y adding sos token + sos_y = add_sos(y, sos_id=self.blank_id) + sos_y_padded = sos_y.pad(mode="constant", padding_value=self.blank_id) + sos_y_padded = sos_y_padded.to(torch.int64) + # apply predictor and joiner + pred_out, _ = self.predictor(sos_y_padded) + loss_simple = loss_pruned = None + if self.rnnt_loss == "k2_pruned": + loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( + x, x_lengths, y, y_lengths, pred_out) + elif self.rnnt_loss == "k2": + loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out) + elif self.rnnt_loss == "torchaudio": + loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, + pred_out) + + return loss, loss_simple, loss_pruned def decode(self, x: torch.Tensor, @@ -427,21 +617,6 @@ def change_config( self.predictor.change_config(override_dropouts, embed_dropout_rate, rnn_dropout_rate) - def get_config(self): - - config = { - "in_feats": self.in_feats, - "vocab_size": self.vocab_size, - "embed_dim": self.embed_dim, - "num_pred_layers": self.num_pred_layers, - "pred_hid_feats": self.pred_hid_feats, - "embed_dropout_rate": self.embed_dropout_rate, - "rnn_dropout_rate": self.rnn_dropout_rate, - "blank_id": self.blank_id, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - @staticmethod def filter_args(**kwargs): args = filter_func_args(RNNTransducerDecoder.__init__, kwargs) @@ -452,10 +627,79 @@ def filter_finetune_args(**kwargs): args = filter_func_args(RNNTransducerDecoder.change_config, kwargs) return args + @staticmethod + def add_pred_args(parser): + + pred_parser = ArgumentParser(prog="") + pred_parser.add_argument( + "--pred-type", + default="rnn", + choices=["rnn", "conv"], + help= + """type of predictor between RNN and Convolutional [rnn, conv]""") + pred_parser.add_argument("--embed-dim", + default=1024, + type=int, + help=("token embedding dimension")) + pred_parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for predictor input embeddings")) + pred_parser.add_argument("--rnn-dropout-rate", + default=0.0, + type=float, + help="""dropout prob for decoder RNN """) + pred_parser.add_argument( + "--rnn-type", + default="lstm", + choices=["lstm", "gru"], + help= + """type of recurrent network for the predictor in [lstm, gru]""") + + pred_parser.add_argument("--num-layers", + default=2, + type=int, + help="""number of layers of the
predictor """) + + pred_parser.add_argument("--hid-feats", + default=512, + type=int, + help="""hidden features of the predictor""") + pred_parser.add_argument("--out-feats", + default=512, + type=int, + help="""output features of the predictor""") + pred_parser.add_argument("--context-size", + default=2, + type=int, + help="""context length of the convolutional + predictor, 1->bigram, 2-> trigram,...""") + + parser.add_argument("--predictor", + action=ActionParser(parser=pred_parser)) + + @staticmethod + def add_joiner_args(parser): + + joiner_parser = ArgumentParser(prog="") + joiner_parser.add_argument( + "--joiner-type", + default="basic", + choices=["basic"], + help= + """type of joiner network, there is only basic joiner for now""") + joiner_parser.add_argument("--hid-feats", + default=512, + type=int, + help="""hidden features of the joiner""") + parser.add_argument("--joiner", + action=ActionParser(parser=joiner_parser)) + @staticmethod def add_class_args(parser, prefix=None, - skip=set(["in_feats", "blanck_id", "vocab_size"])): + skip=set(["in_feats", "blank_id", "vocab_size"])): if prefix is not None: outer_parser = parser @@ -476,35 +720,59 @@ def add_class_args(parser, type=int, required=True, help=("output prediction dimension")) - parser.add_argument("--embed-dim", - default=1024, - type=int, - help=("token embedding dimension")) + + RNNTransducerDecoder.add_pred_args(parser) + RNNTransducerDecoder.add_joiner_args(parser) parser.add_argument( - "--embed-dropout-rate", + "--rnnt-loss", + default="k2_pruned", + choices=["torchaudio", "k2", "k2_pruned"], + help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""") + parser.add_argument( + "--rnnt-type", + default="regular", + choices=["regular", "modified", "constrained"], + help= + """rnn-t variation between regular, modified or constrained.""") + parser.add_argument( + "--delay-penalty", default=0.0, type=float, - help=("dropout prob for predictor input embeddings")) - parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder RNN ")) + help= + """penalize symbol delay, which is used to make symbols emit earlier + for streaming models.""") parser.add_argument( - "--rnn-type", - default="lstm", - choices=["lstm", "gru"], - help=( - "type of recurrent network for thep predictor in [lstm, gru]")) - - parser.add_argument("--num-pred-layers", - default=2, - type=int, - help="""number of layers of the predictor """) - - parser.add_argument("--pred-hid-feats", - default=512, - type=int, - help="""hidden features of the predictor""") + "--reduction", + default="sum", + choices=["sum", "mean"], + help="""type of reduction for rnn-t loss between sum or mean""") + parser.add_argument( + "--prune-range", + default=5, + type=int, + help="""how many symbols to keep for each frame in k2 rnn-t + pruned loss.""") + parser.add_argument( + "--lm-scale", + default=0.25, + type=float, + help="""language model scale in rnn-t smoothed loss""") + parser.add_argument( + "--am-scale", + default=0.0, + type=float, + help="""acoustic model scale in rnn-t smoothed loss""") + parser.add_argument( + "--simple-loss-scale", + default=0.5, + type=float, + help="""weight of rnn-t simple loss when using k2 pruned loss""") + parser.add_argument( + "--pruned-warmup-steps", + default=2000, + type=int, + help="""number of steps to warm up the k2 rnn-t pruned loss + from 0.1 to 1""") if prefix is not None: outer_parser.add_argument("--" + prefix, diff --git a/hyperion/torch/narchs/spinenet.py
b/hyperion/torch/narchs/spinenet.py index 117c0733..da47ffe5 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -6,7 +6,6 @@ import logging import numpy as np - import torch import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear diff --git a/hyperion/torch/narchs/tdnn.py b/hyperion/torch/narchs/tdnn.py index 55e47e6a..8ac9be79 100644 --- a/hyperion/torch/narchs/tdnn.py +++ b/hyperion/torch/narchs/tdnn.py @@ -4,7 +4,6 @@ """ import numpy as np - import torch import torch.nn as nn from torch.nn import Linear diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index 4468185e..fd3de235 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -3,10 +3,9 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from ..layer_blocks import TransformerEncoderBlockV1 as EBlock diff --git a/hyperion/torch/optim/factory.py b/hyperion/torch/optim/factory.py index 95117b05..aa1acdc8 100644 --- a/hyperion/torch/optim/factory.py +++ b/hyperion/torch/optim/factory.py @@ -4,10 +4,9 @@ """ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.optim as optim +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_args from .radam import RAdam diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index b2198924..659f9dde 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import HubertConfig, HubertModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import HubertConfig, HubertModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index e1f21153..76d80aa4 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2Config, Wav2Vec2Model - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2Config, Wav2Vec2Model from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index b0a815c7..5dd6a539 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -8,11 +8,10 @@ from turtle import right from typing import List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import 
Wav2Vec2FeatureExtractor, Wav2Vec2Processor from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 0d5c5ad3..eec88dec 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import WavLMConfig, WavLMModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import WavLMConfig, WavLMModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 9f5fafe6..69e97cc6 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index e2d2d1f6..0523ad44 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index a6f20a8e..00a218f9 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -11,13 +11,12 @@ from enum import Enum from pathlib import Path -from fairscale.optim.grad_scaler import ShardedGradScaler -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.distributed as dist import torch.nn as nn +from fairscale.optim.grad_scaler import ShardedGradScaler +from jsonargparse import ActionParser, ArgumentParser from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index cbf94bc0..3f0b3f1f 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -2,21 +2,19 @@ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging import os from collections import OrderedDict as ODict -import logging - -from jsonargparse import ActionParser, ArgumentParser - import torch -import torchaudio import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset from .torch_trainer import TorchTrainer -from torch.distributed.elastic.multiprocessing.errors import record class 
TransducerTrainer(TorchTrainer): @@ -87,35 +85,6 @@ def __init__( super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # None, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - @record def train_epoch(self, data_loader): """Training epoch loop @@ -146,13 +115,10 @@ def train_epoch(self, data_loader): batch_size = input_data.shape[0] with self.amp_autocast(): - # print("xx", data.shape, data.shape[0] * data.shape[1] / 16000, - # torch.sum(audio_length).item() / 16000, - # torch.min(audio_length).item() / 16000, - # torch.max(audio_length).item() / 16000) - output, loss = self.model(input_data, - x_lengths=input_lengths, - y=target) + output = self.model(input_data, + x_lengths=input_lengths, + y=target) + loss = output.loss loss = loss.mean() / self.grad_acc_steps if self.use_amp: @@ -165,7 +131,10 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics["loss"] = loss.item() * self.grad_acc_steps + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() + for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) @@ -213,13 +182,14 @@ def validation_epoch(self, data_loader, swa_update_bn=False): # batch_size = data.shape[0] with self.amp_autocast(): - output, loss = self.model(input_data, - x_lengths=input_lengths, - y=target) - # output = self.model(data) - # loss = self.loss(output, target) + output = self.model(input_data, + x_lengths=input_lengths, + y=target) + + for k, v in output.items(): + if "loss" in k and v is not None: + batch_metrics[k] = output[k].item() - batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index f4877dc6..ba401cb7 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index fc9d98f1..03800e0d 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/vq_vae_trainer.py 
b/hyperion/torch/trainers/vq_vae_trainer.py index 35946e96..40b6b10d 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 303427de..af915d6b 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -7,11 +7,10 @@ import time from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 2a012dde..1e1b1778 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -7,11 +7,10 @@ import time from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 9d04af42..4e791347 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -6,11 +6,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 1aefb3d4..ad9c825c 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -6,13 +6,12 @@ import logging import os -from fairscale.nn.data_parallel import \ - FullyShardedDataParallel as FullyShardedDDP -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP - import torch import torch.distributed as dist import torch.nn as nn +from fairscale.nn.data_parallel import \ + FullyShardedDataParallel as FullyShardedDDP +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP from .devices import open_device diff --git a/hyperion/torch/utils/metric_acc.py b/hyperion/torch/utils/metric_acc.py index a82c174a..7b423a3e 100644 --- a/hyperion/torch/utils/metric_acc.py +++ b/hyperion/torch/utils/metric_acc.py @@ -6,7 +6,6 @@ from collections import OrderedDict as ODict import numpy as np - import torch import torch.distributed as dist diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index 67f492f9..a8adf9a1 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -5,6 +5,7 @@ from .class_info import ClassInfo from .feature_set import 
FeatureSet +from .hyp_dataclass import HypDataClass from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from .recording_set import RecordingSet from .rttm import RTTM diff --git a/hyperion/utils/hyp_dataclass.py b/hyperion/utils/hyp_dataclass.py new file mode 100644 index 00000000..f1e86d2c --- /dev/null +++ b/hyperion/utils/hyp_dataclass.py @@ -0,0 +1,31 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from dataclasses import dataclass + + +@dataclass +class HypDataClass: + """Dataclass that can imitate a dict""" + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, val): + return setattr(self, key, val) + + def keys(self): + return self.__dict__.keys() + #return self.__annotations__.keys() + + def items(self): + return self.__dict__.items() + # for k in self.keys(): + # yield k, getattr(self, k) + + @classmethod + def from_parent(cls, parent, **kwargs): + args = parent.__dict__ + args.update(kwargs) + return cls(**args) From 7115dbb65004ca6d7f95e14409d314fef9119890 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 24 Mar 2023 10:47:06 -0400 Subject: [PATCH 089/154] change configuration of residual networks with batchnorm after nonlinearity --- egs/voxceleb/v1.1/README.md | 72 +++--- ...rain_ecapatdnn2048x4_xvec_stage1_v3.0.yaml | 4 +- ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 7 +- ...train_ecapatdnn512x3_xvec_stage1_v3.0.yaml | 93 +++++++ ...train_ecapatdnn512x3_xvec_stage2_v3.0.yaml | 69 ++++++ ...onfig_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | 8 +- ...config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | 45 ++++ egs/voxceleb/v1.1/run_040_eval_be.sh | 179 +------------- egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py | 28 ++- egs/voxceleb/v1/steps_be/train_qmf.py | 26 +- .../torch/layer_blocks/res2net1d_blocks.py | 107 ++++++++- .../torch/layer_blocks/res2net2d_blocks.py | 55 +++-- hyperion/torch/layer_blocks/res2net_blocks.py | 42 ++-- .../torch/layer_blocks/resnet1d_blocks.py | 226 ++++++++++-------- .../torch/layer_blocks/resnet2d_blocks.py | 183 +++++++------- hyperion/torch/layer_blocks/resnet_blocks.py | 30 ++- .../torch/layer_blocks/seresnet_blocks.py | 35 +-- hyperion/torch/layers/activation_factory.py | 12 +- hyperion/torch/layers/swish.py | 112 +++++++++ .../models/xvectors/efficient_net_xvector.py | 4 +- .../torch/models/xvectors/resnet1d_xvector.py | 3 +- .../torch/models/xvectors/resnet_xvector.py | 2 + .../torch/models/xvectors/spinenet_xvector.py | 2 + .../torch/models/xvectors/tdnn_xvector.py | 4 +- .../models/xvectors/transformer_xvector_v1.py | 9 +- hyperion/torch/models/xvectors/xvector.py | 128 ++++++---- hyperion/torch/narchs/classif_head.py | 73 +++--- .../trainers/xvector_trainer_from_wav.py | 33 --- 28 files changed, 968 insertions(+), 623 deletions(-) create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 7b6b278f..1d438868 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -85,65 +85,67 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr ## Results - - ### VoxCeleb 1 Original-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ 
| ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.09 | 0.068 | 0.121 | -| | | | Cosine + AS-Norm | 1.0 | 0.064 | 0.110 | -| | | | Cosine + QMF | 0.87 | 0.059 | 0.076 | -| config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.053 | 0.083 | -| | | | Cosine + AS-Norm | 0.78 | 0.046 | 0.078 | -| | | | Cosine + QMF | 0.74 | 0.046 | 0.077 | -| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.78 | 0.061 | 0.110 | -| | | | Cosine + AS-Norm | 0.70 | 0.054 | 0.102 | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.10 | 0.069 | 0.124 | +| | | | Cosine + AS-Norm | 1.09 | 0.065 | 0.105 | +| | | | Cosine + QMF | 0.92 | 0.059 | 0.090 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | +| | | | Cosine + AS-Norm | 0.63 | 0.048 | 0.083 | | | | | Cosine + QMF | 0.66 | 0.047 | 0.090 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | || | + + + + ### VoxCeleb 1 Entire-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.21 | 0.075 | 0.129 | -| | | | Cosine + AS-Norm | 1.15 | 0.069 | 0.113 | -| | | | Cosine + QMF | 1.12 | 0.067 | 0.111 | -| config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.92 | 0.058 | 0.104 | -| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.089 | -| | | | Cosine + QMF | 0.88 | 0.054 | 0.092 | -| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.93 | 0.058 | 0.103 | -| | | | Cosine + AS-Norm | 0.88 | 0.052 | 0.092 | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.16 | 0.073 | 0.130 | +| | | | Cosine + AS-Norm | 1.13 | 0.068 | 0.117 | +| | | | Cosine + QMF | 1.06 | 0.065 | 0.108 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | +| | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.088 | | | | | Cosine + QMF | 0.90 | 0.053 | 0.090 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.17 | 0.129 | 0.212 | -| | | | Cosine + AS-Norm | 1.98 | 0.116 | 0.190 | -| | | | Cosine + QMF | 1.88 | 0.112 | 0.181 | -| config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.80 | 0.106 | 0.171 | -| | | | Cosine + AS-Norm | 1.59 | 0.091 | 0.146 | -| | | | Cosine + QMF | 1.59 | 0.092 | 0.151 | -| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.78 | 0.110 | 0.180 | -| | | | Cosine + AS-Norm | 1.61 | 0.097 | 0.159 | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.10 | 0.128 | 0.209 | +| | | | Cosine + AS-Norm | 1.99 | 0.117 | 0.191 | +| | | | Cosine + QMF | 1.82 | 0.111 | 0.183 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | +| | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.150 | | | | | Cosine + QMF | 1.62 | 0.096 | 0.158 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | + ### VoxSRC2022 dev | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_ecapatdnn512x3.v2.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.85 | 0.187 | 0.310 | -| | | | Cosine + AS-Norm | 2.69 | 0.182 | 0.310 | -| | | | Cosine + QMF | 2.80 | 0.196 | 0.338 | -| config_fbank80_stmn_idrnd_resnet100.v2.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.50 | 0.160 | 0.270 | -| | | | Cosine + AS-Norm | 2.31 | 0.139 | 0.240 | -| | | | Cosine + QMF | 2.54 | 0.153 | 0.248 | -| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.42 | 0.160 | 0.265 | -| | | | Cosine + AS-Norm | 2.32 | 0.152 | 0.273 | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.87 | 0.185 | 0.301 | +| | | | Cosine + AS-Norm | 2.84 | 0.182 | 0.307 | +| | | | Cosine + QMF | 2.62 | 0.175 | 0.282 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | +| | | | Cosine + AS-Norm | 2.19 | 0.145 | 0.265 | | | | | Cosine + QMF | 2.54 | 0.179 | 0.304 | - +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | ## Results before 2023 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml index 408bad1a..1633f4a2 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -61,6 +61,7 @@ model: endpoint_channels: 4096 norm_before: false dropout_rate: 0.2 + hid_act: swish pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 @@ -87,7 +88,8 @@ trainer: min_lr: 1.0e-06 warmup_steps: 15000 update_lr_on_opt_step: true + grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 40 + epochs: 35 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml index 91a7d0b8..877736b3 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -12,8 +12,6 @@ data: min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 - # max_chunk_length: 6.0 - # min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 class_name: class_id seg_weight_mode: data-prior @@ -31,8 +29,8 @@ data: sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 - max_chunk_length: 6.0 - min_chunk_length: 6.0 + max_chunk_length: 4.0 + min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 class_name: class_id seg_weight_mode: data-prior @@ -62,6 +60,7 @@ trainer: min_lr: 1.0e-6 warmup_steps: 8000 update_lr_on_opt_step: true + grad_clip: 250 use_amp: true log_interval: 1000 epochs: 15 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f15d453d --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml @@ -0,0 +1,93 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + 
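+ # assumed exp_lr semantics: linear warmup for warmup_steps, hold the base lr + # for hold_steps, then scale by decay_rate every decay_steps, floored at min_lr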
warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..45e55d97 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh index 5f7ed094..b093b37a 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh @@ -12,14 +12,14 @@ nnet_data=voxceleb2cat_train # x-vector cfg nnet_type=resnet1d -nnet_name=${feat_type}_ecapatdnn2048x4.v3.0 +nnet_name=${feat_type}_ecapatdnn2048x4.v4.0 -nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v4.0.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0030.pth +nnet_s1=$nnet_s1_dir/model_ep0035.pth -nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v4.0.yaml nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh new file mode 100644 index 00000000..5288f66b --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh @@ -0,0 +1,45 @@ +# ECAPA-TDNN small + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3.v3.12 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v3.12.yaml +nnet_s1_name=$nnet_name.s1 
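+# two-stage recipe: stage 1 trains from scratch for 40 epochs with the
+# exp_lr schedule (linear warmup -> hold -> halve every decay_steps);
+# stage 2 fine-tunes with a larger margin (m=0.3, intertop_m=0.1) and SWA.
+# The second nnet_s2 assignment below wins, so scoring uses the
+# SWA-averaged stage-2 checkpoint.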
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v3.12.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh index 37a344b6..4c7c2091 100755 --- a/egs/voxceleb/v1.1/run_040_eval_be.sh +++ b/egs/voxceleb/v1.1/run_040_eval_be.sh @@ -98,7 +98,7 @@ if [ $stage -le 3 ];then $xvector_dir/voxceleb1_test/xvector.scp \ $score_cosine_dir/voxceleb1_scores - $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir for f in $(ls $score_cosine_dir/*_results); @@ -143,7 +143,7 @@ if [ "$do_snorm" == "true" ];then if [ $stage -le 5 ];then echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" steps_be/eval_be_cos_snorm.sh \ - --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + --cmd "$train_cmd --mem 22G" --coh-nbest 1000 \ data/voxceleb1_test/trials \ data/voxceleb1_test/utt2model \ $xvector_dir/voxceleb1_test/xvector.scp \ @@ -209,7 +209,7 @@ if [ "$do_qmf" == "true" ];then $score_cosine_qmf_dir/voxceleb2_qmf_scores fi - + stage=9 if [ $stage -le 8 ];then echo "Eval Voxceleb 1 with Cosine scoring" @@ -333,176 +333,3 @@ if [ $stage -le 11 ];then done fi -exit -if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then - - echo "Eval voxsrc2 with Cosine scoring" - steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxsrc22_dev/trials \ - data/voxsrc22_dev/utt2model \ - $xvector_dir/voxsrc22_dev/xvector.scp \ - $score_cosine_dir/voxsrc22_dev_scores & - - # steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - # data/voxsrc22_test/trials \ - # data/voxsrc22_test/utt2model \ - # $xvector_dir/voxsrc22_test/xvector.scp \ - # $score_cosine_dir/voxsrc22_test_scores - - wait - $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \ - local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir - - for f in $(ls $score_cosine_dir/voxsrc22_dev_results); - do - echo $f - cat $f - echo "" - done - -fi - - -if [ "$do_snorm" == "true" ];then - if [ $stage -le 5 ];then - echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" - steps_be/eval_be_cos_snorm.sh \ - --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - data/voxceleb2cat_train/utt2spk \ - $xvector_dir/voxceleb2cat_train/xvector.scp \ - $score_cosine_snorm_dir/voxceleb1_scores - - $train_cmd --mem 10G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir - - for f in $(ls $score_cosine_snorm_dir/*_results); - do - echo $f - cat $f - echo "" - done - fi - - if [ $stage -le 6 ];then - echo "Eval voxsrc2 with Cosine scoring" - steps_be/eval_be_cos_snorm.sh \ - --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ - data/voxsrc22_dev/trials \ - data/voxsrc22_dev/utt2model \ - 
$xvector_dir/voxsrc22_dev/xvector.scp \ - data/voxceleb2cat_train/utt2spk \ - $xvector_dir/voxceleb2cat_train/xvector.scp \ - $score_cosine_snorm_dir/voxsrc22_dev_scores & - - # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \ - # data/voxsrc22_test/trials \ - # data/voxsrc22_test/utt2model \ - # $xvector_dir/voxsrc22_test/xvector.scp \ - # data/voxceleb2cat_train/utt2spk \ - # $xvector_dir/voxceleb2cat_train/xvector.scp \ - # $score_cosine_snorm_dir/voxsrc22_test_scores - - wait - $train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ - local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir - - for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results); - do - echo $f - cat $f - echo "" - done - fi -fi - - -if [ "$do_qmf" == "true" ];then - if [ $stage -le 7 ];then - echo "Train QMF in Vox2" - steps_be/train_be_cos_qmf.sh \ - --cmd "$train_cmd" --coh-nbest 1000 \ - data/voxceleb2cat_train/trials \ - data/voxceleb2cat_train/utt2model \ - $xvector_dir/voxceleb2cat_train/xvector.scp \ - $xvector_dir/voxceleb2cat_train/utt2num_frames \ - data/voxceleb2cat_train/snorm_utt2spk \ - $xvector_dir/voxceleb2cat_train/xvector.scp \ - $score_cosine_qmf_dir/voxceleb2_qmf_scores - - fi - - if [ $stage -le 8 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_be/eval_be_cos_qmf.sh \ - --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $xvector_dir/voxceleb1_test/utt2num_frames \ - data/voxceleb2cat_train/utt2spk \ - $xvector_dir/voxceleb2cat_train/xvector.scp \ - $score_cosine_qmf_dir/qmf.h5 \ - $score_cosine_qmf_dir/voxceleb1_scores - - $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir - $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm - $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf - - for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results); - do - echo $f - cat $f - echo "" - done - - fi - - if [ $stage -le 9 ];then - echo "Eval voxsrc2 with Cosine scoring" - steps_be/eval_be_cos_qmf.sh \ - --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ - data/voxsrc22_dev/trials \ - data/voxsrc22_dev/utt2model \ - $xvector_dir/voxsrc22_dev/xvector.scp \ - $xvector_dir/voxsrc22_dev/utt2num_frames \ - data/voxceleb2cat_train/utt2spk \ - $xvector_dir/voxceleb2cat_train/xvector.scp \ - $score_cosine_qmf_dir/qmf.h5 \ - $score_cosine_qmf_dir/voxsrc22_dev_scores & - - # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \ - # data/voxsrc22_test/trials \ - # data/voxsrc22_test/utt2model \ - # $xvector_dir/voxsrc22_test/xvector.scp \ - # $xvector_dir/voxsrc22_test/utt2num_frames \ - # data/voxceleb2cat_train/utt2spk \ - # $xvector_dir/voxceleb2cat_train/xvector.scp \ - # $score_cosine_qmf_dir/qmf.h5 \ - # $score_cosine_qmf_dir/voxsrc22_test_scores - - wait - $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev.log \ - local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir - $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \ - local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm - $train_cmd --mem 
10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \ - local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf - - for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results); - do - echo $f - cat $f - echo "" - done - fi - -fi - - diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py index e0e1c2da..e6b68ae8 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py @@ -52,6 +52,11 @@ def save_scores(s, score_file, q_name, i, j, p): s.save_txt(score_file) +def print_q_stats(scores, name): + s = f"{name} stats mean={np.mean(scores)} min={np.min(scores)} max={np.max(scores)} median={np.median(scores)}" + logging.info(s) + + def eval_plda( v_file, ndx_file, @@ -67,7 +72,7 @@ def eval_plda( seg_part_idx, num_seg_parts, coh_nbest, - **kwargs + **kwargs, ): if preproc_file is not None: @@ -105,20 +110,31 @@ def eval_plda( logging.info("read num_frames") u2nf = Utt2Info.load(num_frames_file) + # enroll_nf = np.log( + # np.clip( + # u2nf.filter(enroll_segs).info.astype(float) / 100, a_min=0.1, a_max=15.0, + # ) + # ) + # test_nf = np.log( + # np.clip( + # u2nf.filter(ndx.seg_set).info.astype(float) / 100, a_min=0.1, a_max=15.0, + # ) + # ) enroll_nf = np.log( np.clip( - u2nf.filter(enroll_segs).info.astype(float) / 100 - 2.0, + u2nf.filter(enroll_segs).info.astype(float) / 100 - 1.0, a_min=0.1, - a_max=12.0, # 6.0, + a_max=15.0, ) ) test_nf = np.log( np.clip( - u2nf.filter(ndx.seg_set).info.astype(float) / 100 - 2.0, + u2nf.filter(ndx.seg_set).info.astype(float) / 100 - 1.0, a_min=0.1, - a_max=12.0, # 6.0, + a_max=15.0, ) ) + t1 = time.time() logging.info("computing llr") scores = cosine_scoring(x_e, x_t) @@ -160,6 +176,8 @@ def eval_plda( "maxcohmu": np.maximum(mu_z, mu_t), "mincohmu": np.minimum(mu_z, mu_t), } + for k, v in q_measures.items(): + print_q_stats(v, k) f, loc = ismember(enroll, ndx.model_set) trial_mask = ndx.trial_mask[loc] diff --git a/egs/voxceleb/v1/steps_be/train_qmf.py b/egs/voxceleb/v1/steps_be/train_qmf.py index afd9d218..ee9733d8 100755 --- a/egs/voxceleb/v1/steps_be/train_qmf.py +++ b/egs/voxceleb/v1/steps_be/train_qmf.py @@ -26,6 +26,12 @@ from hyperion.np.classifiers import BinaryLogisticRegression as LR +def print_q_stats(q, name): + scores = q.scores[q.score_mask] + s = f"{name} stats mean={np.mean(scores)} min={np.min(scores)} max={np.max(scores)} median={np.median(scores)}" + logging.info(s) + + def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): logging.info("load key: %s", key_file) @@ -40,29 +46,37 @@ def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbo q_file = f"{score_file}_maxnf" logging.info("load max num-frames: %s", q_file) q = TrialScores.load_txt(q_file) + print_q_stats(q, "max-nf") maxnf_tar, maxnf_non = q.get_tar_non(key) q_file = f"{score_file}_minnf" logging.info("load min num-frames: %s", q_file) q = TrialScores.load_txt(q_file) + print_q_stats(q, "min-nf") minnf_tar, minnf_non = q.get_tar_non(key) q_file = f"{score_file}_maxcohmu" logging.info("load max cohort mean: %s", q_file) q = TrialScores.load_txt(q_file) + print_q_stats(q, "max-cohmu") maxcohmu_tar, maxcohmu_non = q.get_tar_non(key) q_file = f"{score_file}_mincohmu" logging.info("load min cohort mean: %s", q_file) q = TrialScores.load_txt(q_file) + print_q_stats(q, "min-cohmu") mincohmu_tar, mincohmu_non = q.get_tar_non(key) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) 
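+    # min_dcf here is computed on the raw scores, before QMF; the act_dcf
+    # logged after calibration should approach it if the logistic
+    # regression on the score plus the four quality measures above fits well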
n_miss = p_miss * ntar n_fa = p_fa * nnon logging.info( - "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" - % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + min_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, ) logging.info("train calibration") @@ -92,8 +106,12 @@ def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbo n_miss = p_miss * ntar n_fa = p_fa * nnon logging.info( - "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" - % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + act_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, ) output_file = f"{score_file}_qmf" diff --git a/hyperion/torch/layer_blocks/res2net1d_blocks.py b/hyperion/torch/layer_blocks/res2net1d_blocks.py index 1decc327..0fbdc301 100644 --- a/hyperion/torch/layer_blocks/res2net1d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net1d_blocks.py @@ -76,7 +76,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation={"name": "relu6", "inplace": True}, + activation={"name": "relu", "inplace": True}, stride=1, dropout_rate=0, drop_connect_rate=0, @@ -195,6 +195,9 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, time). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) split_x = torch.split(x, split_size, 1) @@ -213,7 +216,7 @@ def forward(self, x, x_mask=None): if self.norm_before: x_i = self.bn1s[i](x_i) x_i = self.act1(x_i) - if not self.norm_before: + if self.norm_after: x_i = self.bn1s[i](x_i) x.append(x_i) @@ -223,23 +226,28 @@ def forward(self, x, x_mask=None): x = torch.cat(x, dim=1) x = self.conv2(x) - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.se_layer: - x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - if self.downsample is not None: - residual = self.downsample(residual) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if not self.norm_before: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -377,6 +385,79 @@ def expansion(self): def forward(self, x, x_mask=None): """Forward function. + Args: + x: input tensor with shape = (batch, in_channels, in_heigh, in_width). + x_mask: Binary mask indicating which spatial dimensions are valid of + shape=(batch, time), (batch, 1, time). + + Returns: + Tensor with shape = (batch, out_channels, time). 
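+        Note: the shortcut is now downsampled at the top of forward so both
+        orderings below can reuse it; with norm_after the trunk ends with
+        act3 -> bn3 -> SE -> drop-connect before the residual addition, and
+        the output is not re-activated after the add.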
+ """ + residual = x + if self.downsample is not None: + residual = self.downsample(residual) + + x = self.conv1(x) + if self.norm_before: + x = self.bn1(x) + x = self.act1(x) + if self.norm_after: + x = self.bn1(x) + + split_x = torch.split(x, self.width, 1) + x = [] + for i in range(self.num_k): + if i == 0 or self.stride > 1: + x_i = split_x[i] + else: + x_i = x_i + split_x[i] + x_i = self.conv2s[i](x_i) + if self.norm_before: + x_i = self.bn2s[i](x_i) + x_i = self.act2(x_i) + if self.norm_after: + x_i = self.bn2s[i](x_i) + x.append(x_i) + + if self.scale > 1: + if self.stride == 1: + x.append(split_x[-1]) + else: + x.append(self.pool(split_x[-1])) + + x = torch.cat(x, dim=1) + + x = self.conv3(x) + if self.norm_after: + x = self.act3(x) + x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) + + if self.drop_connect_rate > 0: + x = self.drop_connect(x) + + x += residual + else: + if self.norm_before: + x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) + + if self.drop_connect_rate > 0: + x = self.drop_connect(x) + + x += residual + x = self.act3(x) + + if self.dropout_rate > 0: + x = self.dropout(x) + + return x + + def forward0(self, x, x_mask=None): + """Forward function. + Args: x: input tensor with shape = (batch, in_channels, in_heigh, in_width). x_mask: Binary mask indicating which spatial dimensions are valid of diff --git a/hyperion/torch/layer_blocks/res2net2d_blocks.py b/hyperion/torch/layer_blocks/res2net2d_blocks.py index d833a5e3..4050f936 100644 --- a/hyperion/torch/layer_blocks/res2net2d_blocks.py +++ b/hyperion/torch/layer_blocks/res2net2d_blocks.py @@ -195,6 +195,9 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) split_x = torch.split(x, split_size, 1) @@ -213,7 +216,7 @@ def forward(self, x, x_mask=None): if self.norm_before: x_i = self.bn1s[i](x_i) x_i = self.act1(x_i) - if not self.norm_before: + if self.norm_after: x_i = self.bn1s[i](x_i) x.append(x_i) @@ -223,20 +226,22 @@ def forward(self, x, x_mask=None): x = torch.cat(x, dim=1) x = self.conv2(x) - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.downsample is not None: - residual = self.downsample(residual) - - if self.se_layer: - x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if not self.norm_before: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -383,12 +388,14 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: x = self.bn1(x) x = self.act1(x) - if not self.norm_before: + if self.norm_after: x = self.bn1(x) split_x = torch.split(x, self.width, 1) @@ -402,7 +409,7 @@ def forward(self, x, x_mask=None): if self.norm_before: x_i = self.bn2s[i](x_i) x_i = self.act2(x_i) - if not self.norm_before: + if self.norm_after: x_i = self.bn2s[i](x_i) x.append(x_i) @@ -415,20 +422,22 @@ def forward(self, x, x_mask=None): x = torch.cat(x, dim=1) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.downsample is not None: - residual = self.downsample(residual) - - if self.se_layer: - x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if not self.norm_before: - x = self.bn3(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 6a785956..73255a24 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -179,6 +179,9 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) + split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) split_x = torch.split(x, split_size, 1) @@ -209,18 +212,18 @@ def forward(self, x, x_mask=None): x = self.conv2(x) if self.norm_before: x = self.bn2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.downsample is not None: - residual = self.downsample(residual) - - if self.se_layer: - x = self.se_layer(x, x_mask=x_mask) - - x += residual - x = self.act2(x) - - if not self.norm_before: + x += residual + x = self.act2(x) + else: + x = self.act2(x) x = self.bn2(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) + + x += residual if self.dropout_rate > 0: x = self.dropout(x) @@ -358,6 +361,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -392,15 +397,18 @@ def forward(self, x, x_mask=None): x = self.conv3(x) if self.norm_before: x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - if self.downsample is not None: - residual = self.downsample(residual) - - if self.se_layer: - x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act3(x) + else: + x = self.act3(x) + x = self.bn3(x) + if self.se_layer: + x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act3(x) + x += residual if not self.norm_before: x = self.bn3(x) diff --git a/hyperion/torch/layer_blocks/resnet1d_blocks.py b/hyperion/torch/layer_blocks/resnet1d_blocks.py index dd914eba..4ad9b8ce 100644 --- a/hyperion/torch/layer_blocks/resnet1d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet1d_blocks.py @@ -139,7 +139,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -210,6 +210,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -221,21 +223,22 @@ def forward(self, x, x_mask=None): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.downsample is not None: - residual = self.downsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -270,7 +273,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -342,6 +345,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -353,21 +358,22 @@ def forward(self, x, x_mask=None): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.upsample is not None: - residual = self.upsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -400,7 +406,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -484,6 +490,8 @@ def forward(self, x, x_mask=None): """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -502,20 +510,22 @@ def forward(self, x, x_mask=None): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.downsample is not None: - residual = self.downsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn3(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -548,7 +558,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -625,6 +635,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -643,20 +655,22 @@ def forward(self, x, x_mask=None): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.upsample is not None: - residual = self.upsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -690,7 +704,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -731,6 +745,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -742,22 +758,24 @@ def forward(self, x, x_mask=None): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x = self.se_layer(x, x_mask=x_mask) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.downsample is not None: - residual = self.downsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -791,7 +809,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -836,6 +854,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -847,22 +867,24 @@ def forward(self, x, x_mask=None): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: + if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x = self.se_layer(x, x_mask=x_mask) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.upsample is not None: - residual = self.upsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn2(x) - x += residual - x = self.act2(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -896,7 +918,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -939,6 +961,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -957,21 +981,24 @@ def forward(self, x, x_mask=None): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x = self.se_layer(x, x_mask=x_mask) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.downsample is not None: - residual = self.downsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn3(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -1005,7 +1032,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, drop_connect_rate=0, @@ -1048,6 +1075,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -1066,21 +1095,24 @@ def forward(self, x, x_mask=None): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - x = self.se_layer(x, x_mask=x_mask) - if self.drop_connect_rate > 0: - x = self.drop_connect(x) - - if self.upsample is not None: - residual = self.upsample(residual) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - x += residual - x = self.act3(x) + x = self.se_layer(x, x_mask=x_mask) + if self.drop_connect_rate > 0: + x = self.drop_connect(x) - if self.norm_after: - x = self.bn3(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -1115,7 +1147,7 @@ def __init__( in_scale, scale, upsampling_mode="nearest", - activation={"name": "relu6", "inplace": True}, + activation={"name": "relu", "inplace": True}, use_norm=True, norm_layer=None, norm_before=True, diff --git a/hyperion/torch/layer_blocks/resnet2d_blocks.py b/hyperion/torch/layer_blocks/resnet2d_blocks.py index 7fe89b56..6c2dca74 100644 --- a/hyperion/torch/layer_blocks/resnet2d_blocks.py +++ b/hyperion/torch/layer_blocks/resnet2d_blocks.py @@ -103,7 +103,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -168,6 +168,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -180,17 +182,16 @@ def forward(self, x, x_mask=None): x = self.conv2(x) - if self.norm_before: - x = self.bn2(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x += residual - x = self.act2(x) - if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x += residual + else: + if self.norm_before: + x = self.bn2(x) + + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -223,7 +224,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -289,6 +290,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -300,18 +303,16 @@ def forward(self, x, x_mask=None): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: - x = self.bn2(x) - - if self.upsample is not None: - residual = self.upsample(residual) - - x += residual - x = self.act2(x) - if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x += residual + else: + if self.norm_before: + x = self.bn2(x) + + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -342,7 +343,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -419,6 +420,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -437,17 +440,16 @@ def forward(self, x, x_mask=None): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: - x = self.bn3(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x += residual - x = self.act3(x) - if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x += residual + else: + if self.norm_before: + x = self.bn3(x) + + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -478,7 +480,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -549,6 +551,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -567,17 +571,16 @@ def forward(self, x, x_mask=None): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: + if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x += residual + else: + if self.norm_before: + x = self.bn3(x) - if self.upsample is not None: - residual = self.upsample(residual) - - x += residual - x = self.act3(x) - - if self.norm_after: - x = self.bn2(x) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -611,7 +614,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -650,6 +653,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -661,19 +666,18 @@ def forward(self, x, x_mask=None): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: - x = self.bn2(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act2(x) - if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn2(x) + + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -707,7 +711,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -750,6 +754,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -761,19 +767,18 @@ def forward(self, x, x_mask=None): x = self.bn1(x) x = self.conv2(x) - - if self.norm_before: - x = self.bn2(x) - - if self.upsample is not None: - residual = self.upsample(residual) - - x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act2(x) - if self.norm_after: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn2(x) + + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act2(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -805,7 +810,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -846,6 +851,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -864,18 +871,18 @@ def forward(self, x, x_mask=None): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: - x = self.bn3(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act3(x) - if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn3(x) + + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) @@ -907,7 +914,7 @@ def __init__( in_channels, channels, kernel_size=3, - activation="relu6", + activation="relu", stride=1, dropout_rate=0, groups=1, @@ -948,6 +955,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.upsample is not None: + residual = self.upsample(residual) x = self.conv1(x) if self.norm_before: @@ -966,18 +975,18 @@ def forward(self, x, x_mask=None): x = self.bn2(x) x = self.conv3(x) - if self.norm_before: - x = self.bn3(x) - - if self.upsample is not None: - residual = self.upsample(residual) - - x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act3(x) - if self.norm_after: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + else: + if self.norm_before: + x = self.bn3(x) + + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act3(x) if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/resnet_blocks.py b/hyperion/torch/layer_blocks/resnet_blocks.py index e25c0cbb..c077a54b 100644 --- a/hyperion/torch/layer_blocks/resnet_blocks.py +++ b/hyperion/torch/layer_blocks/resnet_blocks.py @@ -190,6 +190,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -204,15 +206,12 @@ def forward(self, x, x_mask=None): if self.norm_before: x = self.bn2(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x += residual - x = self.act2(x) - - if not self.norm_before: + x += residual + x = self.act2(x) + else: + x = self.act2(x) x = self.bn2(x) + x += residual if self.dropout_rate > 0: x = self.dropout(x) @@ -303,6 +302,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). """ residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -321,15 +322,12 @@ def forward(self, x, x_mask=None): x = self.conv3(x) if self.norm_before: x = self.bn3(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x += residual - x = self.act3(x) - - if not self.norm_before: + x += residual + x = self.act3(x) + else: + x = self.act3(x) x = self.bn3(x) + x += residual if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/seresnet_blocks.py b/hyperion/torch/layer_blocks/seresnet_blocks.py index 4807e94b..b13a7ff3 100644 --- a/hyperion/torch/layer_blocks/seresnet_blocks.py +++ b/hyperion/torch/layer_blocks/seresnet_blocks.py @@ -95,18 +95,19 @@ def forward(self, x, x_mask=None): x = self.conv2(x) - if self.norm_before: - x = self.bn2(x) - if self.downsample is not None: residual = self.downsample(residual) - x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act2(x) - - if not self.norm_before: + if self.norm_before: + x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act2(x) + else: + x = self.act2(x) x = self.bn2(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual if self.dropout_rate > 0: x = self.dropout(x) @@ -186,6 +187,8 @@ def forward(self, x, x_mask=None): Tensor with shape = (batch, out_channels, out_heigh, out_width). 
""" residual = x + if self.downsample is not None: + residual = self.downsample(residual) x = self.conv1(x) if self.norm_before: @@ -204,16 +207,14 @@ def forward(self, x, x_mask=None): x = self.conv3(x) if self.norm_before: x = self.bn3(x) - - if self.downsample is not None: - residual = self.downsample(residual) - - x = self.se_layer(x, x_mask=x_mask) - x += residual - x = self.act3(x) - - if not self.norm_before: + x = self.se_layer(x, x_mask=x_mask) + x += residual + x = self.act3(x) + else: + x = self.act3(x) x = self.bn3(x) + x = self.se_layer(x, x_mask=x_mask) + x += residual if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layers/activation_factory.py b/hyperion/torch/layers/activation_factory.py index d07b184e..9d972f95 100644 --- a/hyperion/torch/layers/activation_factory.py +++ b/hyperion/torch/layers/activation_factory.py @@ -6,7 +6,7 @@ import torch.nn as nn -from .swish import Swish +from .swish import Swish, DoubleSwish, Swish6, DoubleSwish6 act_dict = { "elu": nn.ELU, @@ -33,6 +33,9 @@ "logsoftmax": nn.LogSoftmax, "alogsoftmax": nn.AdaptiveLogSoftmaxWithLoss, "swish": Swish, + "double_swish": DoubleSwish, + "swish6": Swish6, + "double_swish6": DoubleSwish6, } @@ -90,7 +93,6 @@ def create_from_str(activation_name, **kwargs): except: # activation didn't have inplace option del kwargs["inplace"] - pass return act_dict[activation_name](**kwargs) @@ -186,3 +188,9 @@ def get_config(activation): } if isinstance(activation, Swish): return {"name": "swish"} + if isinstance(activation, DoubleSwish): + return {"name": "double_swish"} + if isinstance(activation, Swish6): + return {"name": "swish6"} + if isinstance(activation, DoubleSwish6): + return {"name": "double_swish6"} diff --git a/hyperion/torch/layers/swish.py b/hyperion/torch/layers/swish.py index a313455e..62225ad9 100644 --- a/hyperion/torch/layers/swish.py +++ b/hyperion/torch/layers/swish.py @@ -36,3 +36,115 @@ def __repr__(self): def __str__(self): s = "{}()".format(self.__class__.__name__) return s + + +class Swish6(nn.Module): + """Swish activation class, clamped to 6 + y = min(x, 6) * sigmoid(min(x,6)) + """ + + def forward(self, x): + return SwishImplementation.apply(x.clamp(max=6)) + + def __repr__(self): + return self.__str__() + + def __str__(self): + s = "{}()".format(self.__class__.__name__) + return s + + +class DoubleSwishImplementation(torch.autograd.Function): + """ Implementation for DoubleSwish Activation from + https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py + + f(x) = x * torch.sigmoid(x-1) = swish(swish(x)), + where swish(x) = x * sigmoid(x). + + Memory-efficient derivative computation: + f'(x) = = x * s'(x) + x' * s(x) = x * s'(x) + s(x). + where s(x) = simoid(x), and s'(x) = s(x) * (1-s(x)). + + f'(x) = x * s(x) * (1-s(x)) + s(x) = f(x) * (1-s(x)) + s(x) + """ + + @staticmethod + def forward(ctx, x: torch.Tensor) -> torch.Tensor: + requires_grad = x.requires_grad + x_dtype = x.dtype + if x.dtype == torch.float16: + x = x.to(torch.float32) + + s = torch.sigmoid(x - 1.0) + y = x * s + + if requires_grad: + deriv = y * (1 - s) + s + # notes on derivative of x * sigmoid(x - 1): + # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29 + # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bound + # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound. + # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which + # floors), should be expectation-preserving. 
+ floor = -0.043637 + ceil = 1.2 + d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( + deriv + ) + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x_dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: torch.Tensor) -> torch.Tensor: + (d,) = ctx.saved_tensors + # the same constants as used in forward pass. + floor = -0.043637 + ceil = 1.2 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class DoubleSwish(torch.nn.Module): + """ DoubleSwish activation + f(x) = x * torch.sigmoid(x-1) = swish(swish(x)), + where swish(x) = x * sigmoid(x). + """ + + def forward(self, x: torch.Tensor) -> torch.Tensor: + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return (x * torch.sigmoid(x - 1.0)).clamp(max=6) + + return DoubleSwishImplementation.apply(x) + + def __repr__(self): + return self.__str__() + + def __str__(self): + s = "{}()".format(self.__class__.__name__) + return s + + +class DoubleSwish6(torch.nn.Module): + """ DoubleSwish activation clamped to 6 + x = min(x, 6) + f(x) = x * torch.sigmoid(x-1) = swish(swish(x)), + where swish(x) = x * sigmoid(x). + """ + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.clamp(max=6) + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return (x * torch.sigmoid(x - 1.0)).clamp(max=6) + + return DoubleSwishImplementation.apply(x) + + def __repr__(self): + return self.__str__() + + def __str__(self): + s = "{}()".format(self.__class__.__name__) + return s diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py b/hyperion/torch/models/xvectors/efficient_net_xvector.py index a8663cd9..132bb51d 100644 --- a/hyperion/torch/models/xvectors/efficient_net_xvector.py +++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py @@ -52,11 +52,12 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=True, + head_use_in_norm=False, embed_layer=0, proj_feats=None, ): - logging.info("making %s encoder network" % (effnet_type)) + logging.info("making %s encoder network", effnet_type) encoder_net = EN( effnet_type, in_channels, @@ -98,6 +99,7 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_in_norm=head_use_in_norm, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=in_feats, diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index 1bce0f87..7af207c4 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -35,7 +35,7 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=True, - in_norm=False, + head_use_in_norm=False, embed_layer=0, proj_feats=None, ): @@ -62,6 +62,7 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_in_norm=head_use_in_norm, dropout_rate=dropout_rate, embed_layer=embed_layer, proj_feats=proj_feats, diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index c6889626..0e9eba22 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -46,6 +46,7 @@ def __init__( use_norm=True, norm_before=True, in_norm=False, + head_use_in_norm=False, embed_layer=0, proj_feats=None, se_r=16, @@ -94,6 +95,7 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, 
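+            # new option: batch-norm applied to the pooled embedding at the
+            # input of the classification head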
+ head_use_in_norm=head_use_in_norm, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=in_feats, diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index 203008be..0b27a840 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -50,6 +50,7 @@ def __init__( use_norm=True, norm_before=True, in_norm=False, + head_use_in_norm=False, embed_layer=0, proj_feats=None, se_r=16, @@ -102,6 +103,7 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_in_norm=head_use_in_norm, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=in_feats, diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index 530ca63b..38262cc3 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -43,11 +43,12 @@ def __init__( use_norm=True, norm_before=False, in_norm=False, + head_use_in_norm=False, embed_layer=0, proj_feats=None, ): - logging.info("making %s encoder network" % (tdnn_type)) + logging.info("making %s encoder network", tdnn_type) encoder_net = TF.create( tdnn_type, num_enc_blocks, @@ -83,6 +84,7 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_in_norm=head_use_in_norm, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=None, diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index 7c55844a..25e9c894 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -47,7 +47,6 @@ class TransformerXVectorV1(XVector): use_norm: if True use batch/layer norm norm_before: if True, use layer norm before layers, otherwise after - in_norm: add batchnorm at the input embed_layer: which layer to use to extract x-vectors proj_feats: add linear projection layer after the encoder to project feature dimension to proj_feats """ @@ -84,7 +83,7 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=False, - in_norm=False, + head_use_in_norm=False, embed_layer=0, proj_feats=None, ): @@ -128,6 +127,7 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_in_norm=head_use_in_norm, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=None, @@ -186,10 +186,6 @@ def enc_concat_after(self): def enc_ff_type(self): return self.encoder_net.ff_type - # @property - # def in_norm(self): - # return self.encoder_net.in_norm - def get_config(self): """Gets network config Returns: @@ -215,7 +211,6 @@ def get_config(self): "in_layer_type": self.in_layer_type, "enc_concat_after": self.enc_concat_after, } - #'in_norm': self.in_norm } config.update(base_config) return config diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 3807bbd8..8dc15fbc 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -11,6 +11,7 @@ import torch import torch.nn as nn +from ....utils.misc import filter_func_args from ...layer_blocks import TDNNBlock from ...layers import GlobalPool1dFactory as PF from ...narchs import ClassifHead, TorchNALoader @@ -18,10 +19,10 @@ from ...utils import eval_nnet_by_chunks, scale_seq_lengths -class XVectorTrainMode(Enum): - full = 0 - frozen = 1 
- ft_embed_affine = 2 +# class XVectorTrainMode(Enum): +# full = 0 +# frozen = 1 +# ft_embed_affine = 2 class XVector(TorchModel): @@ -46,6 +47,7 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=True, + head_use_in_norm=False, dropout_rate=0, embed_layer=0, in_feats=None, @@ -106,7 +108,7 @@ def __init__( # if head_norm_layer is none we use the global norm_layer if head_norm_layer is None and norm_layer is not None: - if norm_layer == "instance-norm" or norm_layer == "instance-norm-affine": + if norm_layer in ("instance-norm", "instance-norm-affine"): head_norm_layer = "batch-norm" else: head_norm_layer = norm_layer @@ -130,6 +132,7 @@ def __init__( use_norm=use_norm, norm_before=norm_before, dropout_rate=dropout_rate, + use_in_norm=head_use_in_norm, ) self.hid_act = hid_act @@ -137,6 +140,7 @@ def __init__( self.head_norm_layer = head_norm_layer self.use_norm = use_norm self.norm_before = norm_before + self.head_use_in_norm = head_use_in_norm self.dropout_rate = dropout_rate self.embed_layer = embed_layer @@ -282,12 +286,47 @@ def forward_logits(self, x, x_lengths=None, y=None): Returns: class logits tensor with shape=(batch, num_classes). """ + f = x max_in_length = x.size(-1) x = self._pre_enc(x) x = self.encoder_net(x) x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) y = self.classif_net(p, y) + # if not self.training: + # fnf = ( + # torch.any(torch.any(torch.logical_not(torch.isfinite(f)), dim=1), dim=1) + # .sum() + # .cpu() + # .item() + # ) + # xnf = ( + # torch.any(torch.any(torch.logical_not(torch.isfinite(x)), dim=1), dim=1) + # .sum() + # .cpu() + # .item() + # ) + # pnf = ( + # torch.any(torch.logical_not(torch.isfinite(p)), dim=1) + # .sum() + # .cpu() + # .item() + # ) + # ynf = ( + # torch.any(torch.logical_not(torch.isfinite(y)), dim=1) + # .sum() + # .cpu() + # .item() + # ) + # # if xnf + pnf + ynf > 0: + # logging.warning("ff %d xnf %d pnf %d ynf %d", fnf, xnf, pnf, ynf) + # if xnf > 0: + # ii = torch.any( + # torch.any(torch.logical_not(torch.isfinite(x)), dim=1), dim=1 + # ) + # xx = x[ii] + # logging.info(f"xx={xx}") + return y def forward_hid_feats( @@ -510,6 +549,7 @@ def get_config(self): "head_norm_layer": self.head_norm_layer, "use_norm": self.use_norm, "norm_before": self.norm_before, + "head_use_in_norm": self.head_use_in_norm, "dropout_rate": self.dropout_rate, "embed_layer": self.embed_layer, "in_feats": self.in_feats, @@ -656,42 +696,38 @@ def valid_train_modes(): @staticmethod def filter_args(**kwargs): - # if "wo_norm" in kwargs: - # kwargs["use_norm"] = not kwargs["wo_norm"] - # del kwargs["wo_norm"] - - # if "norm_after" in kwargs: - # kwargs["norm_before"] = not kwargs["norm_after"] - # del kwargs["norm_after"] - # get arguments for pooling pool_args = PF.filter_args(**kwargs["pool_net"]) - - valid_args = ( - "num_classes", - "embed_dim", - "num_embed_layers", - "hid_act", - "loss_type", - "cos_scale", - "margin", - "margin_warmup_epochs", - "intertop_k", - "intertop_margin", - "num_subcenters", - "use_norm", - "norm_before", - "in_feats", - "proj_feats", - "dropout_rate", - "norm_layer", - "head_norm_layer", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - + args = filter_func_args(ClassifHead.__init__, kwargs) args["pool_net"] = pool_args return args + # valid_args = ( + # "num_classes", + # "embed_dim", + # "num_embed_layers", + # "hid_act", + # "loss_type", + # "cos_scale", + # "margin", + # "margin_warmup_epochs", + # "intertop_k", + # "intertop_margin", + 
# "num_subcenters", + # "use_norm", + # "norm_before", + # "in_feats", + # "proj_feats", + # "dropout_rate", + # "norm_layer", + # "head_norm_layer", + # "head_use_in_norm", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + # args["pool_net"] = pool_args + # return args + @staticmethod def add_class_args(parser, prefix=None, skip=set()): @@ -793,19 +829,6 @@ def add_class_args(parser, prefix=None, skip=set()): except: pass - # parser.add_argument( - # "--wo-norm", - # default=False, - # action="store_true", - # help="without batch normalization", - # ) - - # parser.add_argument( - # "--norm-after", - # default=False, - # action="store_true", - # help="batch normalizaton after activation", - # ) parser.add_argument( "--use-norm", default=True, @@ -820,6 +843,13 @@ def add_class_args(parser, prefix=None, skip=set()): help="batch normalizaton before activation", ) + parser.add_argument( + "--head-use-in-norm", + default=False, + action=ActionYesNo, + help="batch normalizaton at the head input", + ) + try: parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") except: diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 5d179fdb..3e2997a6 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -3,12 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo import torch import torch.nn as nn from torch.nn import Linear +from ...utils.misc import filter_func_args from ..layer_blocks import FCBlock from ..layers import ActivationFactory as AF from ..layers import ArcLossOutput, CosLossOutput @@ -37,6 +38,7 @@ class ClassifHead(NetArch): norm_layer: norm_layer object or str indicating type norm layer, if None it uses BatchNorm1d use_norm: it True it uses layer/batch-normalization norm_before: if True, layer-norm is before the activation function + use_in_norm: put batchnorm at the input """ def __init__( @@ -57,6 +59,7 @@ def __init__( use_norm=True, norm_before=True, dropout_rate=0, + use_in_norm=False, ): super().__init__() @@ -67,6 +70,7 @@ def __init__( self.embed_dim = embed_dim self.num_classes = num_classes self.norm_layer = norm_layer + self.use_in_norm = use_in_norm if use_norm: norm_groups = None @@ -88,6 +92,10 @@ def __init__( self.intertop_margin = intertop_margin self.num_subcenters = num_subcenters + if self.use_in_norm: + assert not self.norm_before + self.in_norm = self._norm_layer(prev_feats) + prev_feats = in_feats fc_blocks = [] for i in range(num_embed_layers - 1): @@ -109,16 +117,21 @@ def __init__( else: act = hid_act - fc_blocks.append( - FCBlock( - prev_feats, - embed_dim, - activation=act, - norm_layer=self._norm_layer, - use_norm=use_norm, - norm_before=norm_before, + if self.use_in_norm: + fc_blocks.append( + FCBlock(prev_feats, embed_dim, activation=act, use_norm=False) + ) + else: + fc_blocks.append( + FCBlock( + prev_feats, + embed_dim, + activation=act, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) ) - ) self.fc_blocks = nn.ModuleList(fc_blocks) @@ -270,6 +283,9 @@ def put_layers_in_eval_mode(self, layer_list): def forward(self, x, y=None): + if self.use_in_norm: + x = self.in_norm(x) + for l in range(self.num_embed_layers): x = self.fc_blocks[l](x) @@ -286,6 +302,9 @@ def forward_hid_feats(self, x, y=None, return_layers=None, return_logits=False): if return_layers 
is None: return_layers = [] + if self.use_in_norm: + x = self.in_norm(x) + h = [] for l in range(self.num_embed_layers): x = self.fc_blocks[l](x) @@ -303,6 +322,9 @@ def forward_hid_feats(self, x, y=None, return_layers=None, return_logits=False): def extract_embed(self, x, embed_layer=0): + if self.use_in_norm: + x = self.in_norm(x) + for l in range(embed_layer): x = self.fc_blocks[l](x) @@ -341,6 +363,7 @@ def get_config(self): "use_norm": self.use_norm, "norm_before": self.norm_before, "dropout_rate": self.dropout_rate, + "use_in_norm": self.use_in_norm, } base_config = super().get_config() @@ -357,24 +380,7 @@ def filter_args(**kwargs): kwargs["norm_before"] = not kwargs["norm_after"] del kwargs["norm_after"] - valid_args = ( - "num_classes", - "embed_dim", - "num_embed_layers", - "hid_act", - "loss_type", - "s", - "margin", - "margin_warmup_epochs", - "intertop_k", - "intertop_margin", - "num_subcenters", - "use_norm", - "norm_before", - "dropout_rate", - "norm_layer", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args = filter_func_args(ClassifHead.__init__, kwargs) return args @staticmethod @@ -455,17 +461,24 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--wo-norm", default=False, - action="store_true", + action=ActionYesNo, help="without batch normalization", ) parser.add_argument( "--norm-after", default=False, - action="store_true", + action=ActionYesNo, help="batch normalizaton after activation", ) + parser.add_argument( + "--use-in-norm", + default=False, + action=ActionYesNo, + help="batch normalizaton in the classif head input", + ) + try: parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") except: diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 4a66f0eb..9541d7b0 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -84,35 +84,6 @@ def __init__( super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # loss=loss, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - self.feat_extractor = feat_extractor if device is not None: self.feat_extractor.to(device) @@ -135,10 +106,6 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - # input_data, target = ( - # data[self.input_key].to(self.device), - # data[self.target_key].to(self.device), - # ) input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) with torch.no_grad(): From 69884850fbad2a701c6e024671199f9d4ee34011 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 27 Mar 2023 16:30:02 -0400 Subject: [PATCH 090/154] add results with new ecapa v3 --- egs/voxceleb/v1.1/README.md | 46 ++++++++----------- ...onfig_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | 6 +-- ...config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | 6 +-- 
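A note on the `classif_head.py` hunk above: the `filter_args` rewrite replaces a hand-maintained `valid_args` tuple with `filter_func_args(ClassifHead.__init__, kwargs)`, so the accepted keys are derived from the constructor signature instead of being listed twice; `xvector_trainer_from_wav.py` applies the same pattern via `filter_func_args(super().__init__, locals())`. The real helper is added in `hyperion/utils/misc.py`; a minimal sketch of the idea (illustrative only, not the actual implementation) could look like:

```python
import inspect

def filter_func_args(func, kwargs):
    """Keep only the entries of kwargs that name a keyword parameter of func.

    Sketch for illustration; hyperion's version in hyperion/utils/misc.py
    may differ in signature and corner cases.
    """
    sig = inspect.signature(func)
    valid = {
        name
        for name, p in sig.parameters.items()
        if name != "self"
        and p.kind in (p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY)
    }
    return {k: v for k, v in kwargs.items() if k in valid}
```

With this, a newly added constructor argument such as `use_in_norm` automatically survives `filter_args` without anyone remembering to extend a tuple.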
 egs/voxceleb/v1.1/run_030_extract_xvectors.sh |  4 +-
 egs/voxceleb/v1.1/run_040_eval_be.sh          |  2 +-
 egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py   | 36 +++++++--------
 6 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md
index 1d438868..1ee9468f 100644
--- a/egs/voxceleb/v1.1/README.md
+++ b/egs/voxceleb/v1.1/README.md
@@ -89,63 +89,57 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr
 | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
 | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
-| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.10 | 0.069 | 0.124 |
-| | | | Cosine + AS-Norm | 1.09 | 0.065 | 0.105 |
-| | | | Cosine + QMF | 0.92 | 0.059 | 0.090 |
+| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.11 | 0.069 | 0.126 |
+| | | | Cosine + AS-Norm | 1.10 | 0.065 | 0.108 |
+| | | | Cosine + QMF | 0.95 | 0.059 | 0.084 |
 | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 |
-| | | | Cosine + AS-Norm | 0.63 | 0.048 | 0.083 |
-| | | | Cosine + QMF | 0.66 | 0.047 | 0.090 |
+| | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 |
+| | | | Cosine + QMF | 0.57 | 0.037 | 0.071 |
 | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | |
 | | | | Cosine + AS-Norm | | | |
 | | | | Cosine + QMF | || |
-
-
-
-
 ### VoxCeleb 1 Entire-Clean trial list

 | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
 | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
 | config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.16 | 0.073 | 0.130 |
-| | | | Cosine + AS-Norm | 1.13 | 0.068 | 0.117 |
-| | | | Cosine + QMF | 1.06 | 0.065 | 0.108 |
+| | | | Cosine + AS-Norm | 1.13 | 0.068 | 0.118 |
+| | | | Cosine + QMF | 1.06 | 0.064 | 0.112 |
 | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 |
-| | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.088 |
-| | | | Cosine + QMF | 0.90 | 0.053 | 0.090 |
+| | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 |
+| | | | Cosine + QMF | 0.76 | 0.047 | 0.083 |
 | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | |
 | | | | Cosine + AS-Norm | | | |
 | | | | Cosine + QMF | | | |
-
 ### VoxCeleb 1 Hard-Clean trial list

 | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
 | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
 | config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.10 | 0.128 | 0.209 |
-| | | | Cosine + AS-Norm | 1.99 | 0.117 | 0.191 |
-| | | | Cosine + QMF | 1.82 | 0.111 | 0.183 |
+| | | | Cosine + AS-Norm | 1.99 | 0.118 | 0.190 |
+| | | | Cosine + QMF | 1.84 | 0.111 | 0.184 |
 | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 |
-| | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.150 |
-| | | | Cosine + QMF | 1.62 | 0.096 | 0.158 |
+| | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 |
+| | | | Cosine + QMF | 1.44 | 0.087 | 0.145 |
 | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | |
 | | | | Cosine + AS-Norm | | | |
 | | | | Cosine + QMF | | | |
-
 ### VoxSRC2022 dev

 | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
 | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
-| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.87 | 0.185 | 0.301 |
-| | | | Cosine + AS-Norm | 2.84 | 0.182 | 0.307 |
-| | | | Cosine + QMF | 2.62 | 0.175 | 0.282 |
+| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.87 | 0.185 | 0.304 |
+| | | | Cosine + AS-Norm | 2.84 | 0.182 | 0.304 |
+| | | | Cosine + QMF | 2.61 | 0.172 | 0.283 |
 | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 |
-| | | | Cosine + AS-Norm | 2.19 | 0.145 | 0.265 |
-| | | | Cosine + QMF | 2.54 | 0.179 | 0.304 |
-| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || |
+| | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 |
+| | | | Cosine + QMF | 2.06 | 0.137 | 0.251 |
+| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || |
 | | | | Cosine + AS-Norm | | | |
-| | | | Cosine + QMF | | | |
+| | | | Cosine + QMF | | | |

 ## Results before 2023

diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh
index b093b37a..f2622b0e 100644
--- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh
@@ -12,14 +12,14 @@ nnet_data=voxceleb2cat_train

 # x-vector cfg
 nnet_type=resnet1d
-nnet_name=${feat_type}_ecapatdnn2048x4.v4.0
+nnet_name=${feat_type}_ecapatdnn2048x4.v3.0

-nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v4.0.yaml
+nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml
 nnet_s1_name=$nnet_name.s1
 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
 nnet_s1=$nnet_s1_dir/model_ep0035.pth

-nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v4.0.yaml
+nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml
 nnet_s2_name=${nnet_name}.s2
 nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
 nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh
index 5288f66b..a3ad0c29 100644
--- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh
+++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh
@@ -12,14 +12,14 @@ nnet_data=voxceleb2cat_train

 # x-vector cfg
 nnet_type=resnet1d
-nnet_name=${feat_type}_ecapatdnn512x3.v3.12
+nnet_name=${feat_type}_ecapatdnn512x3.v3.0

-nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v3.12.yaml
+nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml
 nnet_s1_name=$nnet_name.s1
 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
 nnet_s1=$nnet_s1_dir/model_ep0040.pth

-nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v3.12.yaml
+nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml
 nnet_s2_name=${nnet_name}.s2
 nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
 nnet_s2=$nnet_s2_dir/model_ep0030.pth
diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh
index 5bd2c17d..8c0949f4 100755
--- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh
+++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh
@@ -51,14 +51,14 @@ if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qm
   if [ $plda_num_augs -eq 0 ]; then
     steps_xvec/extract_xvectors_from_wav.sh \
       --cmd "$xvec_cmd" --nj 100 ${xvec_args} \
-      --random-utt-length true --min-utt-length 200 --max-utt-length 14000 \
+      --random-utt-length true --min-utt-length 200 --max-utt-length 3000 \
       --feat-config $feat_config \
       $nnet data/${name} \
       $xvector_dir/${name}
   else
     steps_xvec/extract_xvectors_from_wav.sh \
       --cmd "$xvec_cmd" --nj 300 ${xvec_args} \
-      --random-utt-length true --min-utt-length 200 --max-utt-length 14000 \
+      --random-utt-length true --min-utt-length 200 --max-utt-length 3000 \
       --feat-config $feat_config --aug-config $plda_aug_config --num-augs $plda_num_augs \
       $nnet data/${name} \
       $xvector_dir/${name}_augx${plda_num_augs} \
diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh
index 4c7c2091..0780584c 100755
--- a/egs/voxceleb/v1.1/run_040_eval_be.sh
+++ b/egs/voxceleb/v1.1/run_040_eval_be.sh
@@ -209,7 +209,7 @@ if [ "$do_qmf" == "true" ];then
$score_cosine_qmf_dir/voxceleb2_qmf_scores fi - stage=9 + if [ $stage -le 8 ];then echo "Eval Voxceleb 1 with Cosine scoring" diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py index e6b68ae8..7034126a 100755 --- a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py +++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py @@ -110,30 +110,27 @@ def eval_plda( logging.info("read num_frames") u2nf = Utt2Info.load(num_frames_file) - # enroll_nf = np.log( - # np.clip( - # u2nf.filter(enroll_segs).info.astype(float) / 100, a_min=0.1, a_max=15.0, - # ) - # ) - # test_nf = np.log( - # np.clip( - # u2nf.filter(ndx.seg_set).info.astype(float) / 100, a_min=0.1, a_max=15.0, - # ) - # ) + min_dur = 0.1 + max_dur = 30.0 + enroll_nf = np.log( np.clip( - u2nf.filter(enroll_segs).info.astype(float) / 100 - 1.0, - a_min=0.1, - a_max=15.0, + u2nf.filter(enroll_segs).info.astype(float) / 100, + a_min=min_dur, + a_max=max_dur, ) ) test_nf = np.log( np.clip( - u2nf.filter(ndx.seg_set).info.astype(float) / 100 - 1.0, - a_min=0.1, - a_max=15.0, + u2nf.filter(ndx.seg_set).info.astype(float) / 100, + a_min=min_dur, + a_max=max_dur, ) ) + log_min_dur = np.log(min_dur) + log_max_dur = np.log(max_dur) + enroll_nf = (enroll_nf - log_min_dur) / (log_max_dur - log_min_dur) + test_nf = (test_nf - log_min_dur) / (log_max_dur - log_min_dur) t1 = time.time() logging.info("computing llr") @@ -166,8 +163,9 @@ def eval_plda( dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] logging.info( - "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." - % (dt, dt / num_trials * 1000) + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, ) q_measures = { @@ -193,7 +191,7 @@ def eval_plda( return logging.info("applying qmf") - scores_fus = [scores.ravel()] + # scores_fus = [scores.ravel()] scores_fus = [scores_norm.ravel()] for q_name in ["maxnf", "minnf", "maxcohmu", "mincohmu"]: scores_fus.append(q_measures[q_name].ravel()) From b475d370538e2579a3d7b58e0f7650268f81e7c8 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 27 Mar 2023 17:09:31 -0400 Subject: [PATCH 091/154] changed default config --- egs/voxceleb/v1.1/default_config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/voxceleb/v1.1/default_config.sh b/egs/voxceleb/v1.1/default_config.sh index 8f713463..fd0e1bb1 120000 --- a/egs/voxceleb/v1.1/default_config.sh +++ b/egs/voxceleb/v1.1/default_config.sh @@ -1 +1 @@ -global_conf/config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh \ No newline at end of file +global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh \ No newline at end of file From 537071616378bafd38ee046e16604574fa525a77 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 14 Apr 2023 08:59:35 -0400 Subject: [PATCH 092/154] adapted adv.v1 recipe to persephone --- egs/voxceleb/adv.v1.1/README.md | 51 +- egs/voxceleb/adv.v1.1/conf | 1 - .../adv.v1.1/conf/advft_resnet34_xvec.yaml | 67 + egs/voxceleb/adv.v1.1/conf/clsp.conf | 11 + egs/voxceleb/adv.v1.1/conf/coe_gpu_long.conf | 13 + egs/voxceleb/adv.v1.1/conf/coe_gpu_rtx.conf | 11 + egs/voxceleb/adv.v1.1/conf/coe_gpu_short.conf | 11 + .../adv.v1.1/conf/fbank80_stmn_16k.yaml | 12 + .../adv.v1.1/conf/reverb_noise_aug.yaml | 35 + .../adv.v1.1/conf/train_lresnet34_xvec.yaml | 68 + .../adv.v1.1/conf/train_resetdnn_xvec.yaml | 79 ++ .../adv.v1.1/conf/train_resnet34_xvec.yaml | 68 + egs/voxceleb/adv.v1.1/conf/vad_16k.yaml | 8 + ...k80_stmn_lresnet34_transfer_resetdnn.v1.sh | 86 +- 
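In the `eval_be_cos_qmf.py` hunk of PATCH 090 above, the duration-based quality measures are now computed by clipping the segment duration to `[min_dur, max_dur] = [0.1, 30]` seconds, taking the log, and min-max normalizing into `[0, 1]`, replacing the earlier clipped `log(dur - 1.0)` variant. A standalone check of that transformation (the frame counts below are made-up examples, assuming 10 ms frames):

```python
import numpy as np

min_dur, max_dur = 0.1, 30.0  # seconds, as in the patch
num_frames = np.array([250.0, 1200.0, 60000.0])  # hypothetical utterances

dur = np.clip(num_frames / 100, min_dur, max_dur)  # 2.5 s, 12 s, 600 s -> 30 s
log_dur = np.log(dur)
# map log-duration into [0, 1] so all QMF inputs share a common scale
qmf_dur = (log_dur - np.log(min_dur)) / (np.log(max_dur) - np.log(min_dur))
print(qmf_dur.round(2))  # [0.56 0.84 1.  ]
```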
...34_transfer_resetdnn.v1_wavegan_defense.sh | 86 +- ...net34_transfer_fbank40_stmn_resetdnn.v1.sh | 68 +- ...ank80_stmn_resnet34_transfer_lresnet.v1.sh | 69 +- ...nk80_stmn_resnet34_transfer_resetdnn.v1.sh | 70 +- egs/voxceleb/adv.v1.1/local | 1 - .../adv.v1.1/local/attack_analysis.py | 200 +++ .../adv.v1.1/local/attack_analysis.sh | 79 ++ .../local/calibrate_voxceleb1_o_clean.sh | 55 + egs/voxceleb/adv.v1.1/local/make_musan.py | 189 +++ egs/voxceleb/adv.v1.1/local/make_musan.sh | 48 + egs/voxceleb/adv.v1.1/local/make_rirs_data.sh | 29 + .../adv.v1.1/local/make_trials_subset.py | 61 + .../adv.v1.1/local/make_voxceleb1_o.pl | 180 +++ .../adv.v1.1/local/make_voxceleb2cat.pl | 136 ++ egs/voxceleb/adv.v1.1/local/score_dcf.py | 88 ++ .../adv.v1.1/local/score_voxceleb1_o_clean.sh | 23 + .../local/score_voxceleb1_single_cond.sh | 21 + egs/voxceleb/adv.v1.1/run_002_compute_evad.sh | 1 - ...run_004_prepare_victim_xvec_train_data.sh} | 4 +- .../adv.v1.1/run_005_train_victim_xvector.sh | 58 + ...n_006_prepare_transfer_xvec_train_data.sh} | 0 .../run_007_train_transfer_xvector.sh | 68 + .../run_008_adv_finetune_victim_xvector.sh | 131 ++ ... run_009_extract_xvectors_victim_model.sh} | 10 +- ...un_010_extract_xvectors_transfer_model.sh} | 11 +- ...del.sh => run_011_eval_be_victim_model.sh} | 0 ...l.sh => run_012_eval_be_transfer_model.sh} | 0 ...ks.sh => run_013_eval_whitebox_attacks.sh} | 0 ...run_014_eval_transfer_blackbox_attacks.sh} | 0 ...itebox_attacks_with_randsmooth_defense.sh} | 178 --- ...ttacks_with_randsmooth_wavegan_defense.sh} | 0 ...h => run_017_eval_art_whitebox_attacks.sh} | 842 ++++++----- ..._018_eval_art_transfer_blackbox_attacks.sh | 633 +++++++++ ..._054_eval_art_transfer_blackbox_attacks.sh | 1260 ----------------- egs/voxceleb/adv.v1/conf | 1 - ...g_victim_lresnet34_transfer_resetdnn.v1.sh | 62 +- ...ig_victim_resnet34_transfer_resetdnn.v1.sh | 128 +- egs/voxceleb/adv.v1/local | 1 - egs/voxceleb/adv.v1/run_002_compute_evad.sh | 1 - egs/voxceleb/adv.v1/run_003_compute_fbank.sh | 69 - .../adv.v1/run_003_prepare_noises_rirs.sh | 67 + .../adv.v1/run_004_prepare_augment.sh | 123 -- .../run_004_prepare_victim_xvec_train_data.sh | 42 + .../adv.v1/run_005_compute_fbank_augment.sh | 57 - ...un_006_prepare_transfer_xvec_train_data.sh | 48 + .../run_008_extract_xvectors_victim_model.sh | 37 + .../run_010_prepare_victim_xvec_train_data.sh | 45 - .../adv.v1/run_011_train_victim_xvector.sh | 76 - ...un_012_prepare_transfer_xvec_train_data.sh | 53 - .../adv.v1/run_013_train_transfer_xvector.sh | 102 -- .../run_030_extract_xvectors_victim_model.sh | 38 - egs/voxceleb/v1.1/README.md | 24 +- ...rain_idrnd_resnet100_xvec_stage1_v3.0.yaml | 72 + ...rain_idrnd_resnet100_xvec_stage2_v3.0.yaml | 69 + ...onfig_fbank80_stmn_idrnd_resnet100.v3.0.sh | 44 + egs/voxceleb/v1.2/README.md | 263 ++++ egs/voxceleb/v1.2/cmd.sh | 28 + egs/voxceleb/v1.2/conf/clsp.conf | 11 + egs/voxceleb/v1.2/conf/coe_gpu_bigmem.conf | 11 + egs/voxceleb/v1.2/conf/coe_gpu_long.conf | 13 + egs/voxceleb/v1.2/conf/coe_gpu_rtx.conf | 11 + egs/voxceleb/v1.2/conf/coe_gpu_short.conf | 11 + egs/voxceleb/v1.2/conf/coe_gpu_v100.conf | 11 + .../v1.2/conf/fbank80_specaug1_stmn_16k.yaml | 24 + egs/voxceleb/v1.2/conf/fbank80_stmn_16k.yaml | 12 + ...rain_ecapatdnn2048x4_xvec_stage1_v3.0.yaml | 95 ++ ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 70 + ...train_ecapatdnn512x3_xvec_stage1_v3.0.yaml | 93 ++ ...train_ecapatdnn512x3_xvec_stage2_v3.0.yaml | 69 + egs/voxceleb/v1.2/conf/vad_16k.yaml | 8 + egs/voxceleb/v1.2/datapath.sh | 23 + 
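Besides hyperion's native attacks, the renamed `run_017`/`run_018` scripts in this file list drive attacks through ART, which the updated README installs with `pip install adversarial-robustness-toolbox[pytorch]`; hyperion's wrapper is `hyperion/torch/adv_attacks/art_attack_factory.py`, also modified in this patch. Independent of that factory, the underlying ART calls look roughly like the following sketch, where `model`, `num_classes`, and `x_benign` are placeholders:

```python
import torch.nn as nn
from art.attacks.evasion import FastGradientMethod
from art.estimators.classification import PyTorchClassifier

# model: a trained torch.nn.Module mapping waveforms to speaker logits
classifier = PyTorchClassifier(
    model=model,
    loss=nn.CrossEntropyLoss(),
    input_shape=(16000 * 4,),  # e.g. 4 s of 16 kHz audio samples
    nb_classes=num_classes,
)
attack = FastGradientMethod(estimator=classifier, eps=0.004)
x_adv = attack.generate(x=x_benign)  # numpy batch of perturbed waveforms
```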
egs/voxceleb/v1.2/default_config.sh | 1 + ...onfig_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | 44 + ...config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | 45 + egs/voxceleb/v1.2/path.sh | 5 + egs/voxceleb/v1.2/run_001_prepare_data.sh | 50 + .../eval_cosine_scoring_from_adv_test_wav.sh | 2 +- ...osine_scoring_from_adv_test_wav_wavegan.sh | 2 +- .../eval_cosine_scoring_from_art_test_wav.sh | 2 +- ...sine_scoring_from_transfer_adv_test_wav.sh | 2 +- ...sine_scoring_from_transfer_art_test_wav.sh | 2 +- .../eval_cosine_scoring_from_test_wav.sh | 2 +- .../xvectors/eval_xvec_logits_from_wav.sh | 2 +- hyperion/bin/adv_finetune_xvector_from_wav.py | 482 +++++++ ...l_xvec_cosine_scoring_from_adv_test_wav.py | 14 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 15 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 57 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 44 +- ...sine_scoring_from_transfer_adv_test_wav.py | 12 +- ...sine_scoring_from_transfer_art_test_wav.py | 27 +- hyperion/bin/prepare_data.py | 41 + hyperion/bin/train_xvector_from_wav.py | 9 +- hyperion/data_prep/__init__.py | 8 + hyperion/data_prep/data_prep.py | 56 + hyperion/data_prep/voxceleb2.py | 169 +++ .../torch/adv_attacks/art_attack_factory.py | 346 +++-- hyperion/torch/adv_attacks/attack_factory.py | 16 +- hyperion/utils/misc.py | 50 + 112 files changed, 5507 insertions(+), 3030 deletions(-) delete mode 120000 egs/voxceleb/adv.v1.1/conf create mode 100644 egs/voxceleb/adv.v1.1/conf/advft_resnet34_xvec.yaml create mode 100644 egs/voxceleb/adv.v1.1/conf/clsp.conf create mode 100644 egs/voxceleb/adv.v1.1/conf/coe_gpu_long.conf create mode 100644 egs/voxceleb/adv.v1.1/conf/coe_gpu_rtx.conf create mode 100644 egs/voxceleb/adv.v1.1/conf/coe_gpu_short.conf create mode 100644 egs/voxceleb/adv.v1.1/conf/fbank80_stmn_16k.yaml create mode 100644 egs/voxceleb/adv.v1.1/conf/reverb_noise_aug.yaml create mode 100644 egs/voxceleb/adv.v1.1/conf/train_lresnet34_xvec.yaml create mode 100644 egs/voxceleb/adv.v1.1/conf/train_resetdnn_xvec.yaml create mode 100644 egs/voxceleb/adv.v1.1/conf/train_resnet34_xvec.yaml create mode 100644 egs/voxceleb/adv.v1.1/conf/vad_16k.yaml delete mode 120000 egs/voxceleb/adv.v1.1/local create mode 100755 egs/voxceleb/adv.v1.1/local/attack_analysis.py create mode 100755 egs/voxceleb/adv.v1.1/local/attack_analysis.sh create mode 100755 egs/voxceleb/adv.v1.1/local/calibrate_voxceleb1_o_clean.sh create mode 100755 egs/voxceleb/adv.v1.1/local/make_musan.py create mode 100755 egs/voxceleb/adv.v1.1/local/make_musan.sh create mode 100755 egs/voxceleb/adv.v1.1/local/make_rirs_data.sh create mode 100755 egs/voxceleb/adv.v1.1/local/make_trials_subset.py create mode 100755 egs/voxceleb/adv.v1.1/local/make_voxceleb1_o.pl create mode 100755 egs/voxceleb/adv.v1.1/local/make_voxceleb2cat.pl create mode 100755 egs/voxceleb/adv.v1.1/local/score_dcf.py create mode 100755 egs/voxceleb/adv.v1.1/local/score_voxceleb1_o_clean.sh create mode 100755 egs/voxceleb/adv.v1.1/local/score_voxceleb1_single_cond.sh rename egs/voxceleb/adv.v1.1/{run_010_prepare_victim_xvec_train_data.sh => run_004_prepare_victim_xvec_train_data.sh} (94%) create mode 100755 egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh rename egs/voxceleb/adv.v1.1/{run_012_prepare_transfer_xvec_train_data.sh => run_006_prepare_transfer_xvec_train_data.sh} (100%) create mode 100755 egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh create mode 100755 egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh rename egs/voxceleb/adv.v1.1/{run_030_extract_xvectors_victim_model.sh 
=> run_009_extract_xvectors_victim_model.sh} (81%) rename egs/voxceleb/adv.v1.1/{run_031_extract_xvectors_transfer_model.sh => run_010_extract_xvectors_transfer_model.sh} (81%) rename egs/voxceleb/adv.v1.1/{run_040_eval_be_victim_model.sh => run_011_eval_be_victim_model.sh} (100%) rename egs/voxceleb/adv.v1.1/{run_041_eval_be_transfer_model.sh => run_012_eval_be_transfer_model.sh} (100%) rename egs/voxceleb/adv.v1.1/{run_043_eval_whitebox_attacks.sh => run_013_eval_whitebox_attacks.sh} (100%) rename egs/voxceleb/adv.v1.1/{run_044_eval_transfer_blackbox_attacks.sh => run_014_eval_transfer_blackbox_attacks.sh} (100%) rename egs/voxceleb/adv.v1.1/{run_045_eval_whitebox_attacks_with_randsmooth_defense.sh => run_015_eval_whitebox_attacks_with_randsmooth_defense.sh} (67%) rename egs/voxceleb/adv.v1.1/{run_046_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh => run_016_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh} (100%) rename egs/voxceleb/adv.v1.1/{run_053_eval_art_whitebox_attacks.sh => run_017_eval_art_whitebox_attacks.sh} (54%) create mode 100755 egs/voxceleb/adv.v1.1/run_018_eval_art_transfer_blackbox_attacks.sh delete mode 100755 egs/voxceleb/adv.v1.1/run_054_eval_art_transfer_blackbox_attacks.sh delete mode 120000 egs/voxceleb/adv.v1/conf delete mode 120000 egs/voxceleb/adv.v1/local delete mode 100755 egs/voxceleb/adv.v1/run_003_compute_fbank.sh create mode 100755 egs/voxceleb/adv.v1/run_003_prepare_noises_rirs.sh delete mode 100755 egs/voxceleb/adv.v1/run_004_prepare_augment.sh create mode 100755 egs/voxceleb/adv.v1/run_004_prepare_victim_xvec_train_data.sh delete mode 100755 egs/voxceleb/adv.v1/run_005_compute_fbank_augment.sh create mode 100755 egs/voxceleb/adv.v1/run_006_prepare_transfer_xvec_train_data.sh create mode 100755 egs/voxceleb/adv.v1/run_008_extract_xvectors_victim_model.sh delete mode 100755 egs/voxceleb/adv.v1/run_010_prepare_victim_xvec_train_data.sh delete mode 100755 egs/voxceleb/adv.v1/run_011_train_victim_xvector.sh delete mode 100755 egs/voxceleb/adv.v1/run_012_prepare_transfer_xvec_train_data.sh delete mode 100755 egs/voxceleb/adv.v1/run_013_train_transfer_xvector.sh delete mode 100755 egs/voxceleb/adv.v1/run_030_extract_xvectors_victim_model.sh create mode 100644 egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh create mode 100644 egs/voxceleb/v1.2/README.md create mode 100755 egs/voxceleb/v1.2/cmd.sh create mode 100644 egs/voxceleb/v1.2/conf/clsp.conf create mode 100644 egs/voxceleb/v1.2/conf/coe_gpu_bigmem.conf create mode 100644 egs/voxceleb/v1.2/conf/coe_gpu_long.conf create mode 100644 egs/voxceleb/v1.2/conf/coe_gpu_rtx.conf create mode 100644 egs/voxceleb/v1.2/conf/coe_gpu_short.conf create mode 100644 egs/voxceleb/v1.2/conf/coe_gpu_v100.conf create mode 100644 egs/voxceleb/v1.2/conf/fbank80_specaug1_stmn_16k.yaml create mode 100644 egs/voxceleb/v1.2/conf/fbank80_stmn_16k.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/vad_16k.yaml create mode 100644 egs/voxceleb/v1.2/datapath.sh create mode 120000 
egs/voxceleb/v1.2/default_config.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh create mode 100755 egs/voxceleb/v1.2/path.sh create mode 100755 egs/voxceleb/v1.2/run_001_prepare_data.sh create mode 100755 hyperion/bin/adv_finetune_xvector_from_wav.py create mode 100755 hyperion/bin/prepare_data.py create mode 100644 hyperion/data_prep/__init__.py create mode 100644 hyperion/data_prep/data_prep.py create mode 100644 hyperion/data_prep/voxceleb2.py diff --git a/egs/voxceleb/adv.v1.1/README.md b/egs/voxceleb/adv.v1.1/README.md index bccc494b..49801544 100644 --- a/egs/voxceleb/adv.v1.1/README.md +++ b/egs/voxceleb/adv.v1.1/README.md @@ -1,25 +1,15 @@ # VoxCeleb Adversarial Attacks Version 1.1 -Last update 2021/05/17 +Last update 2023/04/10 Recipe to evaluate Adversarial Attacks to x-Vector Speaker Verification Systems -## Differences w.r.t VoxCeleb adv.v1 recipe - -In recipe version V1: - - We compute speech augmentations and acoustic features offline and dump them to disk. - - Augmentation is performed using Kaldi scripts and wav-reverbate tool - - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. - -In this recipe: - - We compute speech augmentations and acoustic features are computed always on-the-fly, - we don't dump any features to disk. - - Augmentation is performed using Hyperin SpeechAugment class. - - The behavior of this class is controlled - by the the configuration file `conf/reverb_noise_aug.yml`, - which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - - Babble noise is created offline by mixing 3-10 single speaker files. +## Setup +To run attacks with Adversarial Robustness toolbox, you need to install it in the environment by +``` +pip install adversarial-robustness-toolbox[pytorch] +``` ## Threat Model @@ -92,48 +82,45 @@ run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.s - `run_002_compute_evad.sh` - Computes Energy VAD for all datasets - - `run_002b_compute_fbank.sh` - - Computes log-filter-banks acoustic features for all datasets - - `run_003_prepare_noises_rirs.sh` - Prepares MUSAN noises, music to be used by SpeechAugment class. - Creates Babble noise from MUSAN speech to be used by SpeechAugment class. - Prepares RIRs by compacting then into HDF5 files, to be used by SpeechAugment class. - - `run_010_prepare_xvec_train_data.sh` + - `run_004_prepare_victim_xvec_train_data.sh` - Prepares audios train the victim x-vector model - Transforms all the audios that we are going to use to train the x-vector into a common format, e.g., .flac. - Removes silence from the audios - Removes utterances shorter than 4secs and speakers with less than 8 utterances. 
     - Creates training and validation lists for x-vector training

-  - `run_011_train_victim_xvector.sh`
+  - `run_005_train_victim_xvector.sh`
     - Trains the victim x-vector network

-  - `run_012_prepare_transfer_xvec_train_data.sh`
+  - `run_006_prepare_transfer_xvec_train_data.sh`
     - Prepares audios to train the transfer white-box x-vector model
     - If the training data for the victim and transfer models is the same, it does nothing

-  - `run_013_train_transfer_xvector.sh`
+  - `run_007_train_transfer_xvector.sh`
     - Trains the transfer white-box x-vector network

-  - `run_030_extract_xvectors_victim_model.sh`
+  - `run_009_extract_xvectors_victim_model.sh`
     - Extracts x-vectors for the VoxCeleb1 test set using the victim model

-  - `run_031_extract_xvectors_transfer_model.sh`
+  - `run_010_extract_xvectors_transfer_model.sh`
     - Extracts x-vectors for the VoxCeleb1 test set using the transfer model

-  - `run_040_eval_be_victim_model.sh`
+  - `run_011_eval_be_victim_model.sh`
     - Eval cosine scoring back-end without attack on victim model x-vectors
     - Trains calibration for the victim model scores
     - Results are left in `exp/scores/$nnet_name/cosine/voxceleb1_o_clean_results`

-  - `run_041_eval_be_transfer_model.sh`
+  - `run_012_eval_be_transfer_model.sh`
     - Eval cosine scoring back-end without attack on transfer model x-vectors
     - Trains calibration for the transfer model scores
     - Results are left in `exp/scores/$transfer_nnet_name/cosine/voxceleb1_o_clean_results`

-  - `run_043_eval_whitebox_attacks.sh`
+  - `run_013_eval_whitebox_attacks.sh`
     - Eval white box attacks implemented in the Hyperion toolkit: FGSM, Iter-FGSM, PGD, Carlini-Wagner
     - Results are left in `exp/scores/$nnet_name/cosine_${attack_related_label}/voxceleb1_o_clean_results`
     - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF
@@ -141,7 +128,7 @@ run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.s
     - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk
     - Wavs are saved to `exp/scores/$nnet_name/cosine_${attack_related_label}/wav`

-  - `run_044_eval_transfer_blackbox_attacks.sh`
+  - `run_014_eval_transfer_blackbox_attacks.sh`
     - Eval transfer black box attacks implemented in the Hyperion toolkit: FGSM, Iter-FGSM, PGD, Carlini-Wagner
     - Results are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}/voxceleb1_o_clean_results`
     - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF
     - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk
     - Wavs are saved to `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}/wav`

-  - `run_045_eval_whitebox_attacks_with_randsmooth_defense.sh`
+  - `run_015_eval_whitebox_attacks_with_randsmooth_defense.sh`
     - Eval white box attacks with Gaussian randomized smoothing defense.
     - Results are left in `exp/scores/$nnet_name/cosine_${attack_related_label}_randsmooth${smooth_sigma}/voxceleb1_o_clean_results`

-  - `run_053_eval_art_whitebox_attacks.sh`
+  - `run_017_eval_art_whitebox_attacks.sh`
     - Eval white box attacks implemented in IBM's Adversarial Robustness Toolbox (ART): FGSM, Iter-FGSM, PGD, Carlini-Wagner
     - Results are left in `exp/scores/$nnet_name/cosine_art_${attack_related_label}/voxceleb1_o_clean_results`
     - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF
@@ -161,7 +148,7 @@ run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.s
     - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk
     - Wavs are saved to `exp/scores/$nnet_name/cosine_art_${attack_related_label}/wav`

-  - `run_054_eval_art_transfer_blackbox_attacks.sh`
+  - `run_018_eval_art_transfer_blackbox_attacks.sh`
     - Eval transfer black box attacks implemented in IBM's Adversarial Robustness Toolbox (ART): FGSM, Iter-FGSM, PGD, Carlini-Wagner
     - Results are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_art_${attack_related_label}/voxceleb1_o_clean_results`
     - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF
diff --git a/egs/voxceleb/adv.v1.1/conf b/egs/voxceleb/adv.v1.1/conf
deleted file mode 120000
index 7dfe9dce..00000000
--- a/egs/voxceleb/adv.v1.1/conf
+++ /dev/null
@@ -1 +0,0 @@
-../../sre19-cmn2/v1/conf
\ No newline at end of file
diff --git a/egs/voxceleb/adv.v1.1/conf/advft_resnet34_xvec.yaml b/egs/voxceleb/adv.v1.1/conf/advft_resnet34_xvec.yaml
new file mode 100644
index 00000000..fd9c95e1
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/conf/advft_resnet34_xvec.yaml
@@ -0,0 +1,67 @@
+data:
+  train:
+    dataset:
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+    sampler:
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 64
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+    data_loader:
+      num_workers: 8
+  val:
+    dataset:
+      class_names:
+      - class_id
+      aug_cfgs:
+      - conf/reverb_noise_aug.yaml
+      return_segment_info:
+      - class_id
+    sampler:
+      sampler_type: class_weighted_random_seg_chunk_sampler
+      min_batch_size: 32
+      max_chunk_length: 4.0
+      min_chunk_length: 4.0
+      num_chunks_per_seg_epoch: 6
+      class_name: class_id
+    data_loader:
+      num_workers: 8
+feats: fbank80_stmn_16k.yaml
+model:
+  cos_scale: 30.0
+  margin: 0.3
+  margin_warmup_epochs: 20.0
+attack:
+  attack_type: pgd
+  max_iters: 10
+  eps: 0.004
+  alpha: 0.0008
+  random_eps: true
+  p_attack: 0.5
+trainer:
+  optim:
+    opt_type: adam
+    lr: 0.05
+    amsgrad: true
+    beta1: 0.9
+    beta2: 0.95
+    weight_decay: 1.0e-05
+  lrsched: lrsched_exp_default.yaml
+    lrsch_type: exp_lr
+    decay_rate: 0.5
+    decay_steps: 8000
+    hold_steps: 8000
+    min_lr: 1.0e-05
+    update_lr_on_opt_step: true
+    warmup_steps: 1000
+  use_amp: true
+  log_interval: 1000
+  epochs: 20
+  eff_batch_size: 512
diff --git a/egs/voxceleb/adv.v1.1/conf/clsp.conf b/egs/voxceleb/adv.v1.1/conf/clsp.conf
new file mode 100644
index 00000000..4ed38246
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/conf/clsp.conf
@@ -0,0 +1,11 @@
+
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1 # Do not add
anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/adv.v1.1/conf/coe_gpu_long.conf b/egs/voxceleb/adv.v1.1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/adv.v1.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/adv.v1.1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/adv.v1.1/conf/coe_gpu_short.conf b/egs/voxceleb/adv.v1.1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/adv.v1.1/conf/fbank80_stmn_16k.yaml b/egs/voxceleb/adv.v1.1/conf/fbank80_stmn_16k.yaml new file mode 100644 index 00000000..f4091f5d --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/fbank80_stmn_16k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/adv.v1.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/adv.v1.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + 
min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/adv.v1.1/conf/train_lresnet34_xvec.yaml b/egs/voxceleb/adv.v1.1/conf/train_lresnet34_xvec.yaml new file mode 100644 index 00000000..609f6829 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/train_lresnet34_xvec.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v1.1/conf/train_resetdnn_xvec.yaml b/egs/voxceleb/adv.v1.1/conf/train_resetdnn_xvec.yaml new file mode 100644 index 00000000..c379ee76 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/train_resetdnn_xvec.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + tdnn_type: resetdnn + in_feats: 80 + num_enc_blocks: 5 + enc_hid_units: 512 + enc_expand_units: 1536 + kernel_size: + - 5 + - 3 + - 3 + - 3 + - 1 + dilation: + - 1 + - 2 + - 3 + - 4 + - 1 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.1 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v1.1/conf/train_resnet34_xvec.yaml b/egs/voxceleb/adv.v1.1/conf/train_resnet34_xvec.yaml new file mode 100644 index 00000000..73ddcb68 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/train_resnet34_xvec.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + class_names: + - 
class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v1.1/conf/vad_16k.yaml b/egs/voxceleb/adv.v1.1/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/adv.v1.1/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh index d102a77a..b569604d 100644 --- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh +++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1.sh @@ -3,93 +3,41 @@ # Both models uses the same features: 80 fbanks # Both models uses the same training data. 
-# victim acoustic features +# acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml + # victim x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=4 -max_chunk=4 -ipe=$nnet_num_augs -lr=0.05 - -nnet_type=lresnet34 -dropout=0 -embed_dim=256 -s=30 -margin_warmup=20 -margin=0.3 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 +nnet_cfg=conf/train_lresnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - -# transfer model acoustic features +# transfer feature extractor transfer_feat_config=$feat_config transfer_feat_type=$feat_type # transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb1cat -transfer_nnet_num_augs=6 -transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=4 -transfer_max_chunk=4 -transfer_ipe=$transfer_nnet_num_augs -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_name=${transfer_feat_type}_resetdnn5x512 +transfer_nnet_cfg=conf/train_resetdnn_xvec.yaml transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth +# adversarial finetuning +advft_nnet_name=${nnet_name}_advft +advft_nnet_cfg=conf/advft_lresnet34_xvec.yaml +advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name +advft_nnet=$advft_nnet_dir/model_ep0070.pth -# options for adversarial finetuning of the victim model 
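The `advft_*` variables removed in the next lines configured PGD adversarial fine-tuning of the victim model (10 iterations, `eps=0.004`, step `alpha=eps/5`, attack probability 0.5); those settings now live in the `attack:` block of `conf/advft_resnet34_xvec.yaml` shown earlier. As a rough sketch of the L-inf PGD perturbation such settings describe (illustrative only, not hyperion's implementation in `hyperion/torch/adv_attacks`):

```python
import torch

def pgd_linf(model, loss_fn, x, y, eps=0.004, alpha=0.0008, max_iters=10):
    # Repeatedly step along the sign of the gradient to increase the loss,
    # then project back into the L-inf ball of radius eps around x.
    x_adv = x.clone().detach()
    for _ in range(max_iters):
        x_adv.requires_grad_(True)
        grad = torch.autograd.grad(loss_fn(model(x_adv), y), x_adv)[0]
        with torch.no_grad():
            x_adv = x_adv + alpha * grad.sign()
            x_adv = x + (x_adv - x).clamp(-eps, eps)
    return x_adv.detach()
```

During fine-tuning, each training batch would be perturbed this way with probability `p_attack` before the usual forward/backward pass.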
-advft_batch_size_1gpu=32 -advft_eff_batch_size=128 # effective batch size -advft_margin=0.3 -advft_margin_warmup=20 -advft_nnet_num_epochs=20 -advft_eps=0.004 -advft_eps_step=$(echo $advft_eps | awk '{ print $1/5}') -advft_p=0.5 -advft_lr=0.05 -advft_iters=10 -advft_attack_opts="--attack.attack-type pgd --attack.max-iter $advft_iters --attack.eps $advft_eps --attack.alpha $advft_eps_step --attack.random-eps --p-attack $advft_p" -advft_opt_opt="--optim.opt-type adam --optim.lr $advft_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -advft_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 8000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -advft_nnet_name=$nnet_name.advft_p${advft_p}_pgd${advft_iters}e${advft_eps}step${advft_eps_step}_arcm${advft_margin}wup${advft_margin_warmup}_optv1_adam_lr${advft_lr} -advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name -advft_nnet=$advft_nnet_dir/model_ep0020.pth diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1_wavegan_defense.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1_wavegan_defense.sh index 09d6b993..8105df2c 100644 --- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1_wavegan_defense.sh +++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_lresnet34_transfer_resetdnn.v1_wavegan_defense.sh @@ -3,96 +3,42 @@ # Both models uses the same features: 80 fbanks # Both models uses the same training data. -# victim acoustic features +# acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn +#vad +vad_config=conf/vad_16k.yaml + # victim x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=4 -max_chunk=4 -ipe=$nnet_num_augs -lr=0.05 - -nnet_type=lresnet34 -dropout=0 -embed_dim=256 -s=30 -margin_warmup=20 -margin=0.3 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 +nnet_cfg=conf/train_lresnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - -# transfer model acoustic features +# transfer feature extractor transfer_feat_config=$feat_config transfer_feat_type=$feat_type # transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb1cat -transfer_nnet_num_augs=6 -transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=4 -transfer_max_chunk=4 -transfer_ipe=$transfer_nnet_num_augs -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train 
transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_name=${transfer_feat_type}_resetdnn5x512 +transfer_nnet_cfg=conf/train_resetdnn_xvec.yaml transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth - -# options for adversarial finetuning of the victim model -advft_batch_size_1gpu=32 -advft_eff_batch_size=128 # effective batch size -advft_margin=0.3 -advft_margin_warmup=20 -advft_nnet_num_epochs=20 -advft_eps=0.004 -advft_eps_step=$(echo $advft_eps | awk '{ print $1/5}') -advft_p=0.5 -advft_lr=0.05 -advft_iters=10 -advft_attack_opts="--attack.attack-type pgd --attack.max-iter $advft_iters --attack.eps $advft_eps --attack.alpha $advft_eps_step --attack.random-eps --p-attack $advft_p" -advft_opt_opt="--optim.opt-type adam --optim.lr $advft_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -advft_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 8000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -advft_nnet_name=$nnet_name.advft_p${advft_p}_pgd${advft_iters}e${advft_eps}step${advft_eps_step}_arcm${advft_margin}wup${advft_margin_warmup}_optv1_adam_lr${advft_lr} +# adversarial finetuning +advft_nnet_name=${nnet_name}_advft +advft_nnet_cfg=conf/advft_lresnet34_xvec.yaml advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name -advft_nnet=$advft_nnet_dir/model_ep0020.pth +advft_nnet=$advft_nnet_dir/model_ep0070.pth # WaveGAN configs smoothing_after_wavegan=true diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_fbank40_stmn_resetdnn.v1.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_fbank40_stmn_resetdnn.v1.sh index 54e47a29..3e7739d0 100644 --- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_fbank40_stmn_resetdnn.v1.sh +++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_fbank40_stmn_resetdnn.v1.sh @@ -3,7 +3,7 @@ # Both models uses the same features: 80 fbanks # Both models uses the same training data. 
-# victim acoustic features +# acoustic features feat_config=conf/fbank80_stmn_16k.yaml feat_type=fbank80_stmn @@ -12,70 +12,32 @@ vad_config=conf/vad_16k.yaml # victim x-vector training nnet_data=voxceleb2cat_train -nnet_num_augs=6 -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=4 -max_chunk=4 -ipe=1 -lr=0.05 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34 -nnet_type=resnet34 -dropout=0 -embed_dim=256 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -num_augs=5 +nnet_cfg=conf/train_resnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - -# transfer model acoustic features +# transfer feature extractor transfer_feat_config=conf/fbank40_stmn_16k.yaml transfer_feat_type=fbank40_stmn # transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb1cat -transfer_nnet_num_augs=6 -transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=4 -transfer_max_chunk=4 -transfer_ipe=$transfer_nnet_num_augs -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 40 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_name=${transfer_feat_type}_resetdnn5x512 +transfer_nnet_cfg=conf/train_resetdnn_xvec.yaml transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth +# adversarial finetuning +advft_nnet_name=${nnet_name}_advft +advft_nnet_cfg=conf/advft_resnet34_xvec.yaml +advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name +advft_nnet=$advft_nnet_dir/model_ep0070.pth + diff --git 
diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_lresnet.v1.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_lresnet.v1.sh
index 777b8b5d..00dfd4ff 100644
--- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_lresnet.v1.sh
+++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_lresnet.v1.sh
@@ -3,70 +3,39 @@
 # Both models uses the same features: 80 fbanks
 # Both models uses the same training data.
 
-# victim acoustic features
+# Victim model ResNet34 x-vector
+# For the black-box attacks we use Residual E-TDNN to generate the attacks and transfer them to the ResNet34
+# Both models use the same features: 80 fbanks
+# Both models use the same training data.
+
+# acoustic features
 feat_config=conf/fbank80_stmn_16k.yaml
 feat_type=fbank80_stmn
 
-# victim x-vector training
-nnet_data=voxceleb2cat
-nnet_num_augs=6
-aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml"
-
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=4
-max_chunk=4
-ipe=$nnet_num_augs
-lr=0.05
+#vad
+vad_config=conf/vad_16k.yaml
 
-nnet_type=resnet34
-dropout=0
-embed_dim=256
+# victim x-vector training
+nnet_data=voxceleb2cat_train
 
-s=30
-margin_warmup=20
-margin=0.3
+# victim x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_resnet34
 
-nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool"
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
-nnet_num_epochs=70
-num_augs=5
+nnet_cfg=conf/train_resnet34_xvec.yaml
 nnet_dir=exp/xvector_nnets/$nnet_name
 nnet=$nnet_dir/model_ep0070.pth
-
-# transfer model acoustic features
+# transfer feature extractor
 transfer_feat_config=$feat_config
 transfer_feat_type=$feat_type
 
 # transfer model training
-transfer_nnet_data=voxceleb2cat #this can be voxceleb2cat or voxceleb1cat
-transfer_nnet_num_augs=6
-transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml"
-
-transfer_batch_size_1gpu=128
-transfer_eff_batch_size=512 # effective batch size
-transfer_min_chunk=4
-transfer_max_chunk=4
-transfer_ipe=$transfer_nnet_num_augs
-transfer_lr=0.05
-
-transfer_nnet_type=lresnet34
-transfer_dropout=0
-transfer_embed_dim=256
-
-transfer_s=30
-transfer_margin_warmup=20
-transfer_margin=0.3
-
-transfer_nnet_opt="--resnet-type $transfer_nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool"
-transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1
-transfer_nnet_num_epochs=70
+transfer_nnet_data=voxceleb2cat_train
+transfer_nnet_type=resnet
+transfer_nnet_name=${transfer_feat_type}_lresnet34
+transfer_nnet_cfg=conf/train_lresnet34_xvec.yaml
 transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name
 transfer_nnet=$transfer_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_resetdnn.v1.sh
index 482f3b7b..6570f4a2 100644
--- a/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_resetdnn.v1.sh
+++ b/egs/voxceleb/adv.v1.1/global_conf/config_victim_fbank80_stmn_resnet34_transfer_resetdnn.v1.sh
@@ -3,75 +3,41 @@
 # Both models uses the same features: 80 fbanks
 # Both models uses the same training data.
 
-# victim acoustic features
+# acoustic features
 feat_config=conf/fbank80_stmn_16k.yaml
 feat_type=fbank80_stmn
 
+#vad
+vad_config=conf/vad_16k.yaml
+
 # victim x-vector training
 nnet_data=voxceleb2cat_train
-nnet_num_augs=6
-aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml"
-
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=4
-max_chunk=4
-ipe=$nnet_num_augs
-lr=0.05
-
-nnet_type=resnet34
-dropout=0
-embed_dim=256
-s=30
-margin_warmup=20
-margin=0.3
+# victim x-vector cfg
+nnet_type=resnet
+nnet_name=${feat_type}_resnet34
 
-nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool"
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${feat_type}_${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1
-nnet_num_epochs=70
+nnet_cfg=conf/train_resnet34_xvec.yaml
 nnet_dir=exp/xvector_nnets/$nnet_name
 nnet=$nnet_dir/model_ep0070.pth
-
-# transfer model acoustic features
+# transfer feature extractor
 transfer_feat_config=$feat_config
 transfer_feat_type=$feat_type
 
 # transfer model training
-transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb1cat
-transfer_nnet_num_augs=6
-transfer_aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml"
-
-transfer_batch_size_1gpu=128
-transfer_eff_batch_size=512 # effective batch size
-transfer_min_chunk=4
-transfer_max_chunk=4
-transfer_ipe=$transfer_nnet_num_augs
-transfer_lr=0.05
+transfer_nnet_data=voxceleb2cat_train
 transfer_nnet_type=resetdnn
-transfer_num_layers=5
-transfer_layer_dim=512
-transfer_expand_dim=1536
-transfer_dilation="1 2 3 4 1"
-transfer_kernel_sizes="5 3 3 3 1"
-transfer_dropout=0.1
-transfer_embed_dim=256
-
-transfer_s=30
-transfer_margin_warmup=20
-transfer_margin=0.3
-
-transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation"
-transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-transfer_nnet_name=${transfer_feat_type}_${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1
-transfer_nnet_num_epochs=70
-
+transfer_nnet_name=${transfer_feat_type}_resetdnn5x512
+transfer_nnet_cfg=conf/train_resetdnn_xvec.yaml
 transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name
 transfer_nnet=$transfer_nnet_dir/model_ep0070.pth
+# adversarial finetuning
+advft_nnet_name=${nnet_name}_advft
+advft_nnet_cfg=conf/advft_resnet34_xvec.yaml
+advft_nnet_dir=exp/xvector_nnets/$advft_nnet_name
+advft_nnet=$advft_nnet_dir/model_ep0070.pth
+
diff --git a/egs/voxceleb/adv.v1.1/local b/egs/voxceleb/adv.v1.1/local
deleted file mode 120000
index ce1cbf90..00000000
--- a/egs/voxceleb/adv.v1.1/local
+++ /dev/null
@@ -1 +0,0 @@
-../v1/local
\ No newline at end of file
diff --git a/egs/voxceleb/adv.v1.1/local/attack_analysis.py b/egs/voxceleb/adv.v1.1/local/attack_analysis.py
new file mode 100755
index 00000000..2e0fdb42
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/attack_analysis.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python
+"""
+ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+
+import sys
+import os
+import argparse
+import time
+import logging
+
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import float_cpu, config_logger
+from hyperion.np.metrics.verification_evaluator import (
+    VerificationAdvAttackEvaluator as Eval,
+)
+
+
+def evaluate_attacks(
+    key_file,
+    clean_score_file,
+    attack_score_files,
+    attack_stats_files,
+    output_path,
+    prior,
+):
+
+    output_dir = os.path.dirname(output_path)
+    if not os.path.isdir(output_dir):
+        os.makedirs(output_dir)
+
+    evaluator = Eval(
+        key_file, clean_score_file, attack_score_files, attack_stats_files, prior
+    )
+
+    # performance vs SNR
+    logging.info("compute perf vs snr for all trials")
+    df_clean = evaluator.compute_dcf_eer(return_df=True)
+    df_clean.insert(0, "snr", np.inf)
+
+    df = evaluator.compute_dcf_eer_vs_stats(
+        "snr",
+        [-10, 0, 10, 20, 30, 40, 50, 60],
+        "all",
+        higher_better=True,
+        return_df=True,
+    )
+    file_path = "%s_attack_all_snr_results.csv" % (output_path)
+    df = pd.concat([df_clean, df], ignore_index=True)
+    df.to_csv(file_path)
+    file_path = "%s_attack_all_snr" % (output_path)
+    evaluator.plot_dcf_eer_vs_stat_v1(
+        df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True
+    )
+
+    logging.info("compute perf vs snr for tar trials")
+    df = evaluator.compute_dcf_eer_vs_stats(
+        "snr",
+        [-10, 0, 10, 20, 30, 40, 50, 60],
+        "tar",
+        higher_better=True,
+        return_df=True,
+    )
+    file_path = "%s_attack_tar_snr_results.csv" % (output_path)
+    df = pd.concat([df_clean, df], ignore_index=True)
+    df.to_csv(file_path)
+    file_path = "%s_attack_tar_snr" % (output_path)
+    evaluator.plot_dcf_eer_vs_stat_v1(
+        df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True
+    )
+
+    logging.info("compute perf vs snr for non trials")
+    df = evaluator.compute_dcf_eer_vs_stats(
+        "snr",
+        [-10, 0, 10, 20, 30, 40, 50, 60],
+        "non",
+        higher_better=True,
+        return_df=True,
+    )
+    file_path = "%s_attack_non_snr_results.csv" % (output_path)
+    df = pd.concat([df_clean, df], ignore_index=True)
+    df.to_csv(file_path)
+    file_path = "%s_attack_non_snr" % (output_path)
+    evaluator.plot_dcf_eer_vs_stat_v1(
+        df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True
+    )
+
+    logging.info("find best attacks from snr point of view")
+    for i in range(len(attack_score_files)):
+        file_path = "%s_best_snr_tar_attacks_%d.csv" % (output_path, i)
+        evaluator.save_best_attacks(
+            file_path,
+            "snr",
+            "tar",
+            num_best=10,
+            min_delta=1,
+            attack_idx=i,
+            higher_better=True,
+        )
+
+        file_path = "%s_best_snr_non_attacks_%d.csv" % (output_path, i)
+        evaluator.save_best_attacks(
+            file_path,
+            "snr",
+            "non",
+            num_best=10,
+            min_delta=1,
+            attack_idx=i,
+            higher_better=True,
+        )
+
+    # performance vs Linf
+    logging.info("compute perf vs linf for all trials")
+    eps = np.ceil(np.asarray([0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]) * 2 ** 15)
+    df = evaluator.compute_dcf_eer_vs_stats(
+        "n_linf", eps, "all", higher_better=False, return_df=True
+    )
+    file_path = "%s_attack_all_linf_results.csv" % (output_path)
+    df.to_csv(file_path)
+    file_path = "%s_attack_all_linf" % (output_path)
+    evaluator.plot_dcf_eer_vs_stat_v1(
+        df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True
+    )
+
+    logging.info("compute perf vs linf for tar trials")
+    df = evaluator.compute_dcf_eer_vs_stats(
+        "n_linf", eps, "tar", higher_better=False, return_df=True
+    )
+    file_path = "%s_attack_tar_linf_results.csv" % (output_path)
+    df.to_csv(file_path)
+    file_path = "%s_attack_tar_linf" % (output_path)
+    evaluator.plot_dcf_eer_vs_stat_v1(
+        df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True
+    )
+
+    logging.info("compute perf vs linf for non trials")
+    df = evaluator.compute_dcf_eer_vs_stats(
+        "n_linf", eps, "non", higher_better=False, return_df=True
+    )
+    file_path = "%s_attack_non_linf_results.csv" % (output_path)
+    df.to_csv(file_path)
+    file_path = "%s_attack_non_linf" % (output_path)
+    evaluator.plot_dcf_eer_vs_stat_v1(
+        df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True
+    )
+
+    # find the best attacks in terms of linf
+    logging.info("find best attacks from linf point of view")
+    for i in range(len(attack_score_files)):
+        file_path = "%s_best_linf_tar_attacks_%d.csv" % (output_path, i)
+        evaluator.save_best_attacks(
+            file_path,
+            "n_linf",
+            "tar",
+            num_best=10,
+            min_delta=1,
+            attack_idx=i,
+            higher_better=False,
+        )
+
+        file_path = "%s_best_linf_non_attacks_%d.csv" % (output_path, i)
+        evaluator.save_best_attacks(
+            file_path,
+            "n_linf",
+            "non",
+            num_best=10,
+            min_delta=1,
+            attack_idx=i,
+            higher_better=False,
+        )
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        fromfile_prefix_chars="@",
+        description="Analyses performance of adversarial attacks for spk. verif.",
+    )
+
+    parser.add_argument("--key-file", required=True)
+    parser.add_argument("--clean-score-file", required=True)
+    parser.add_argument("--attack-score-files", required=True, nargs="+")
+    parser.add_argument("--attack-stats-files", required=True, nargs="+")
+    parser.add_argument("--output-path", required=True)
+    parser.add_argument("--prior", default=0.05, type=float)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    evaluate_attacks(**vars(args))
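The script above is driven entirely by its command line. As a quick orientation, a minimal sketch of an invocation is shown below; the score/stats paths are illustrative placeholders that follow the recipe's naming conventions, not verified outputs of any particular stage.

    # Hypothetical call of local/attack_analysis.py; all paths are illustrative.
    from subprocess import run

    run(
        [
            "local/attack_analysis.py",
            "--key-file", "data/voxceleb1_test/trials_o_clean",
            "--clean-score-file", "exp/scores/clean/voxceleb1_scores",
            "--attack-score-files", "exp/scores/fgsm/voxceleb1_scores",
            "--attack-stats-files", "exp/scores/fgsm/voxceleb1_stats",
            "--output-path", "exp/attack_analysis/fgsm/voxceleb1",
            "--prior", "0.05",
        ],
        check=True,
    )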
diff --git a/egs/voxceleb/adv.v1.1/local/attack_analysis.sh b/egs/voxceleb/adv.v1.1/local/attack_analysis.sh
new file mode 100755
index 00000000..42249873
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/attack_analysis.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+cmd=run.pl
+prior=0.05
+. parse_options.sh || exit 1;
+
+if [ $# -ne 5 ]; then
+  echo "Usage: $0 <key> <clean-scores> <attack-scores> <attack-stats> <output-path>"
+  exit 1;
+fi
+
+set -e
+
+key=$1
+clean_scores=$2
+adv_scores="$3"
+adv_stats="$4"
+output_path=$5
+
+output_dir=$(dirname $output_path)
+base=$(basename $output_path)
+logdir=$output_dir/log
+mkdir -p $logdir
+
+if [ "$(hostname --domain)" == "cm.gemini" ];then
+    module load texlive
+fi
+
+$cmd $logdir/analysis_${base}.log \
+    local/attack_analysis.py \
+    --key-file $key \
+    --clean-score-file $clean_scores \
+    --attack-score-files $adv_scores \
+    --attack-stats-files $adv_stats \
+    --output-path $output_path
+
+scores_v=($adv_scores)
+for((i=0;i<${#scores_v[@]};i++))
+do
+    scores_dir=$(dirname ${scores_v[$i]})
+    wav_out_dir0=${output_path}_wavs
+
+    for t in tar non
+    do
+        if [ "$t" == "tar" ];then
+            t2=tar2non
+        else
+            t2=non2tar
+        fi
+        wav_in_dir=$scores_dir/wav/$t2
+        if [ ! -d "$wav_in_dir" ];then
+            continue
+        fi
+        for m in snr linf
+        do
+            best_file=${output_path}_best_${m}_${t}_attacks_$i.csv
+            if [ ! -f $best_file ];then
+                continue
+            fi
+            wav_out_dir=${wav_out_dir0}/best_${m}_${t}_attacks_$i
+            mkdir -p $wav_out_dir
+            for f in $(awk -F "," 'BEGIN{getline;}{ print $2"-"$3".wav"}' $best_file)
+            do
+                ff=$wav_in_dir/$f
+                if [ -f $ff ];then
+                    cp -v $ff $wav_out_dir > $logdir/copywavs_${base}.log 2>&1
+                fi
+            done
+        done
+    done
+done
+
+
diff --git a/egs/voxceleb/adv.v1.1/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/adv.v1.1/local/calibrate_voxceleb1_o_clean.sh
new file mode 100755
index 00000000..736c3fb0
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/calibrate_voxceleb1_o_clean.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Copyright 2019 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+
+set -e
+
+cmd=run.pl
+prior=0.05
+l2_reg=1e-5
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 <score-dir>"
+  exit 1;
+fi
+
+score_dir=$1
+cal_score_dir=${score_dir}_cal_v1
+
+mkdir -p $cal_score_dir
+
+echo "$0 train calibration on VoxCeleb1 Original Clean"
+
+model_file=$cal_score_dir/cal_tel.h5
+train_scores=$score_dir/voxceleb1_scores
+train_key=data/voxceleb1_test/trials_o_clean
+
+$cmd $cal_score_dir/train_cal_tel.log \
+    steps_be/train-calibration-v1.py --score-file $train_scores \
+    --key-file $train_key --model-file $model_file --prior $prior --lambda-reg $l2_reg
+
+ndxs=(voxceleb1_test/trials_o_clean)
+scores=(voxceleb1)
+n_ndx=${#ndxs[*]}
+for((i=0;i<$n_ndx;i++))
+do
+    echo "$0 eval calibration on ${scores[$i]}"
+    scores_in=$score_dir/${scores[$i]}_scores
+    scores_out=$cal_score_dir/${scores[$i]}_scores
+    ndx=data/${ndxs[$i]}
+    $cmd $cal_score_dir/eval_cal_${scores[$i]}.log \
+        steps_be/eval-calibration-v1.py --in-score-file $scores_in \
+        --ndx-file $ndx --model-file $model_file --out-score-file $scores_out &
+
+done
+wait
+
+
+
+
+
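For context, steps_be/train-calibration-v1.py fits a score calibration at a target prior with L2 regularization. The sketch below shows the general idea as I understand it, an affine log-likelihood-ratio calibration trained by prior-weighted logistic regression; the function name and objective are assumptions for illustration, not the recipe's actual implementation.

    # Assumed form of the calibration: s' = a*s + b, fitted by weighted
    # cross-entropy at the effective prior. Illustrative sketch only.
    import numpy as np
    from scipy.optimize import minimize

    def train_affine_calibration(tar, non, prior=0.05):
        logit_p = np.log(prior / (1 - prior))

        def weighted_xent(w):
            a, b = w
            p_tar = 1.0 / (1.0 + np.exp(-(a * tar + b) - logit_p))  # P(tar | score)
            p_non = 1.0 / (1.0 + np.exp((a * non + b) + logit_p))   # P(non | score)
            return -prior * np.log(p_tar).mean() - (1 - prior) * np.log(p_non).mean()

        return minimize(weighted_xent, x0=[1.0, 0.0], method="Nelder-Mead").x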
diff --git a/egs/voxceleb/adv.v1.1/local/make_musan.py b/egs/voxceleb/adv.v1.1/local/make_musan.py
new file mode 100755
index 00000000..b0ae6846
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/make_musan.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+# Copyright 2015   David Snyder
+# Copyright 2019   Johns Hopkins University (Jesus Villalba) (added fs support)
+# Apache 2.0.
+#
+# This file is meant to be invoked by make_musan.sh.
+
+import os, sys
+
+
+def process_music_annotations(path):
+    utt2spk = {}
+    utt2vocals = {}
+    lines = open(path, "r").readlines()
+    for line in lines:
+        utt, genres, vocals, musician = line.rstrip().split()[:4]
+        # For this application, the musician ID isn't important
+        utt2spk[utt] = utt
+        utt2vocals[utt] = vocals == "Y"
+    return utt2spk, utt2vocals
+
+
+def prepare_music(root_dir, fs, use_vocals):
+    utt2vocals = {}
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    music_dir = os.path.join(root_dir, "music")
+    for root, dirs, files in os.walk(music_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+            elif str(file) == "ANNOTATIONS":
+                utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
+                utt2spk.update(utt2spk_part)
+                utt2vocals.update(utt2vocals_part)
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2vocals:
+        if utt in utt2wav:
+            if use_vocals or not utt2vocals[utt]:
+                utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+                if fs == 8:
+                    utt2wav_str = (
+                        utt2wav_str
+                        + utt
+                        + " sox -t wav "
+                        + utt2wav[utt]
+                        + " -r 8k -t wav - |\n"
+                    )
+                else:
+                    utt2wav_str = (
+                        utt2wav_str
+                        + utt
+                        + " sox -t wav "
+                        + utt2wav[utt]
+                        + " -r 16k -t wav - |\n"
+                    )
+                num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In music directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
+def prepare_speech(root_dir, fs):
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    speech_dir = os.path.join(root_dir, "speech")
+    for root, dirs, files in os.walk(speech_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+                utt2spk[utt] = utt
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2spk:
+        if utt in utt2wav:
+            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+            if fs == 8:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 8k -t wav - |\n"
+                )
+            else:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 16k -t wav - |\n"
+                )
+            num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In speech directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
+def prepare_noise(root_dir, fs):
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    noise_dir = os.path.join(root_dir, "noise")
+    for root, dirs, files in os.walk(noise_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+                utt2spk[utt] = utt
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2spk:
+        if utt in utt2wav:
+            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+            if fs == 8:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 8k -t wav - |\n"
+                )
+            else:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 16k -t wav - |\n"
+                )
+            num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In noise directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
+def main():
+    in_dir = sys.argv[1]
+    fs = int(sys.argv[2])
+    out_dir = sys.argv[3]
+    use_vocals = sys.argv[4] == "Y"
+    utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals)
+    utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs)
+    utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs)
+    utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
+    utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
+    wav_fi = open(os.path.join(out_dir, "wav.scp"), "w")
+    wav_fi.write(utt2wav)
+    utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w")
+    utt2spk_fi.write(utt2spk)
+
+
+if __name__ == "__main__":
+    main()
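The wav.scp entries that make_musan.py writes are Kaldi-style piped-command entries rather than plain paths. A standalone illustration of the line format, with a made-up utterance id and path:

    # The utterance id and path below are made up; only the format matters.
    utt = "music-fma-0000"
    path = "/export/musan/music/fma/music-fma-0000.wav"
    for fs in (8, 16):
        print(f"{utt} sox -t wav {path} -r {fs}k -t wav - |")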
diff --git a/egs/voxceleb/adv.v1.1/local/make_musan.sh b/egs/voxceleb/adv.v1.1/local/make_musan.sh
new file mode 100755
index 00000000..4a6d30f9
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/make_musan.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Copyright 2015   David Snyder
+# Copyright 2019   Johns Hopkins University (Jesus Villalba) (added fs support)
+# Apache 2.0.
+#
+# This script, called by ../run.sh, creates the MUSAN
+# data directory. The required dataset is freely available at
+#   http://www.openslr.org/17/
+
+set -e
+use_vocals='Y'
+
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ];then
+  echo "Usage: $0 [options] <in-dir> <fs> <data-dir>";
+  echo "e.g.: $0 /export/corpora/JHU/musan 8 data"
+  exit 1;
+fi
+
+in_dir=$1
+fs=$2
+data_dir=$3
+
+mkdir -p $data_dir/musan.tmp
+
+echo "Preparing ${data_dir}/musan..."
+mkdir -p ${data_dir}/musan
+local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals}
+
+utils/fix_data_dir.sh ${data_dir}/musan
+
+grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music
+grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech
+grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise
+utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \
+    ${data_dir}/musan ${data_dir}/musan_music
+utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \
+    ${data_dir}/musan ${data_dir}/musan_speech
+utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \
+    ${data_dir}/musan ${data_dir}/musan_noise
+
+utils/fix_data_dir.sh ${data_dir}/musan_music
+utils/fix_data_dir.sh ${data_dir}/musan_speech
+utils/fix_data_dir.sh ${data_dir}/musan_noise
+
+rm -rf $data_dir/musan.tmp
+
diff --git a/egs/voxceleb/adv.v1.1/local/make_rirs_data.sh b/egs/voxceleb/adv.v1.1/local/make_rirs_data.sh
new file mode 100755
index 00000000..c6652eda
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/make_rirs_data.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+#
+# Apache 2.0.
+set -e
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <rir-dir> <fs> <data-dir>"
+  echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom"
+fi
+
+rir_dir=$1
+fs=$2
+data_dir=$3
+
+mkdir -p $data_dir
+
+rir_list=$rir_dir/rir_list
+if [ "$fs" -eq 16 ];then
+    awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp
+else
+    awk '{
+key=$5; sub(/.*\//,"",key);
+print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \
+        $rir_list > $data_dir/wav.scp
+fi
+awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room
+
diff --git a/egs/voxceleb/adv.v1.1/local/make_trials_subset.py b/egs/voxceleb/adv.v1.1/local/make_trials_subset.py
new file mode 100755
index 00000000..da230842
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/make_trials_subset.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import logging
+import numpy as np
+
+from hyperion.hyp_defs import float_cpu, config_logger
+from hyperion.utils import SparseTrialKey
+
+
+def make_trials(in_key_file, out_key_file, ntar, nnon, seed):
+
+    rng = np.random.RandomState(seed=seed)
+
+    logging.info("Load key: %s" % in_key_file)
+    key = SparseTrialKey.load_txt(in_key_file)
+
+    nz_idx = key.tar.nonzero()
+    nnz = len(nz_idx[0])
+    p = rng.permutation(nnz)[ntar:]
+    nz_idx = (nz_idx[0][p], nz_idx[1][p])
+    key.tar[nz_idx] = False
+
+    nz_idx = key.non.nonzero()
+    nnz = len(nz_idx[0])
+    p = rng.permutation(nnz)[nnon:]
+    nz_idx = (nz_idx[0][p], nz_idx[1][p])
+    key.non[nz_idx] = False
+
+    logging.info("Saving key: %s" % out_key_file)
+    key.save_txt(out_key_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Makes a subset of a trial key")
+
+    parser.add_argument("--in-key-file", required=True)
+    parser.add_argument("--out-key-file", required=True)
+    parser.add_argument("--ntar", required=True, type=int)
+    parser.add_argument("--nnon", required=True, type=int)
+    parser.add_argument("--seed", default=112358, type=int)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    make_trials(**namespace_to_dict(args))
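The subsetting logic in make_trials_subset.py keeps a random ntar/nnon subset of the true entries in each sparse matrix by permuting the nonzero indices and clearing everything past the first n. The same idea on a toy dense boolean mask:

    # Toy version of the subsetting above; SparseTrialKey is not needed
    # to see the logic.
    import numpy as np

    rng = np.random.RandomState(seed=112358)
    tar = np.eye(4, dtype=bool)                  # 4 target trials on the diagonal
    ntar = 2                                     # number of target trials to keep

    nz = tar.nonzero()
    drop = rng.permutation(len(nz[0]))[ntar:]    # indices beyond the first ntar
    tar[(nz[0][drop], nz[1][drop])] = False
    assert tar.sum() == ntar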
diff --git a/egs/voxceleb/adv.v1.1/local/make_voxceleb1_o.pl b/egs/voxceleb/adv.v1.1/local/make_voxceleb1_o.pl
new file mode 100755
index 00000000..dce92245
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/make_voxceleb1_o.pl
@@ -0,0 +1,180 @@
+#!/usr/bin/perl
+#
+# Copyright 2018  Ewald Enzinger
+#           2018  David Snyder
+#           2020  Jesus Villalba
+#
+# Usage: make_voxceleb1.pl /export/voxceleb1 data/
+# Create trial lists for Voxceleb1 original,
+# with cleaned and non-cleaned versions
+# Attention:
+#  - This script is for the old version of the dataset without anonymized speaker-ids
+#  - This script assumes that the voxceleb1 dataset has all speaker directories
+#    dumped in the same wav directory, NOT separated dev and test directories
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
+  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
+  exit(1);
+}
+
+($data_base, $out_dir) = @ARGV;
+my $out_dir = "$out_dir/voxceleb1_test";
+
+if (system("mkdir -p $out_dir") != 0) {
+  die "Error making directory $out_dir";
+}
+
+my $url_base="http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta";
+my @trials_basename = ("very_test.txt", "very_test2.txt");
+my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt");
+my @trials = ("trials_o", "trials_o_clean");
+
+my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv";
+my $meta_path = "$data_base/vox1_meta.csv";
+if (! -e "$meta_path") {
+  $meta_path = "$out_dir/vox1_meta.csv";
+  system("wget -O $meta_path $meta_url");
+}
+
+open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path";
+my %id2spkr = ();
+my %spkr2gender = ();
+my %spkr2nation = ();
+while (<META_IN>) {
+  chomp;
+  my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t";
+  $id2spkr{$vox_id} = $spkr_id;
+  $spkr2gender{$spkr_id} = $gender;
+  $nation =~ s@ @-@g;
+  $spkr2nation{$spkr_id} = $nation;
+}
+close(META_IN) or die;
+
+my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
+my $lid_path = "$data_base/lang_vox1_final.csv";
+if (! -e "$lid_path") {
+  $lid_path = "$out_dir/lang_vox1_final.csv";
+  system("wget -O $lid_path $lid_url");
+}
+open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path";
+my %utt2lang = ();
+while (<LID_IN>) {
+  chomp;
+  my ($utt_id, $lang, $score) = split ',';
+  my ($vox_id, $vid_id, $file_id) = split '/', $utt_id;
+  my $spkr_id = $id2spkr{$vox_id};
+  my $utt_id = "$spkr_id-$vid_id-00$file_id";
+  $utt_id =~ s@\.wav$@@;
+  $utt2lang{$utt_id} = $lang;
+}
+close(LID_IN) or die;
+
+#download trials from voxceleb web page
+for($i = 0; $i <= $#trials; $i++) {
+
+  my $file_i = "$out_dir/$trials_basename[$i]";
+  my $url_i = $trials_url[$i];
+  my $trial_i = "$out_dir/$trials[$i]";
+  if (! -e $file_i) {
+    system("wget -O $file_i $url_i");
+  }
+  #mapping from new speaker ids and file-names to old ones
+  open(TRIAL_IN, "<", "$file_i") or die "Could not open the verification trials file $file_i";
+  open(TRIAL_OUT, ">", "$trial_i") or die "Could not open the output file $trial_i";
+  while (<TRIAL_IN>) {
+    chomp;
+    my ($tar_or_non, $path1, $path2) = split;
+
+    # Create entry for left-hand side of trial
+    my ($vox_id, $rec_id, $segment) = split('/', $path1);
+    $segment =~ s/\.wav$//;
+    my $spkr_id = $id2spkr{$vox_id};
+    my $utt_id1 = "$spkr_id-$rec_id-00$segment";
+
+    # Create entry for right-hand side of trial
+    my ($vox_id, $rec_id, $segment) = split('/', $path2);
+    $segment =~ s/\.wav$//;
+    my $spkr_id = $id2spkr{$vox_id};
+    my $utt_id2 = "$spkr_id-$rec_id-00$segment";
+
+    my $target = "nontarget";
+    if ($tar_or_non eq "1") {
+      $target = "target";
+    }
+    print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
+  }
+
+  close(TRIAL_IN) or die;
+  close(TRIAL_OUT) or die;
+
+}
+
+
+opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!";
+my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
+closedir $dh;
+
+open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
+open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
+open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
+open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation";
+open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang";
+
+foreach (@spkr_dirs) {
+  my $spkr_id = $_;
+  my $new_spkr_id = $spkr_id;
+  # If we're using a newer version of VoxCeleb1, we need to "deanonymize"
+  # the speaker labels.
+  if (exists $id2spkr{$spkr_id}) {
+    $new_spkr_id = $id2spkr{$spkr_id};
+  }
+  print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n";
+  print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n";
+
+  opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
+  my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
+  closedir $dh;
+  foreach (@files) {
+    my $filename = $_;
+    my $rec_id = substr($filename, 0, 11);
+    my $segment = substr($filename, 12, 7);
+    my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
+    my $utt_id = "$new_spkr_id-$rec_id-$segment";
+    print WAV "$utt_id", " $wav", "\n";
+    print SPKR "$utt_id", " $new_spkr_id", "\n";
+    if (exists $utt2lang{$utt_id}) {
+      print LANG "$utt_id", " $utt2lang{$utt_id}", "\n";
+    }
+    else {
+      print LANG "$utt_id N/A\n";
+    }
+  }
+}
+
+close(SPKR) or die;
+close(WAV) or die;
+close(LANG) or die;
+close(GENDER) or die;
+close(NAT) or die;
+
+if (system(
+    "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) {
+  die "Error creating trials file in directory $out_dir";
+}
+
+if (system(
+    "awk '{ print \$1,\$1 }' $out_dir/trials | sort -u > $out_dir/utt2model") != 0) {
+  die "Error creating utt2model file in directory $out_dir";
+}
+
+if (system(
+    "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
+
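The utterance ids built by this script follow the convention <speaker>-<recording>-00<segment>. A small illustration; the id-to-name entry is one well-known VoxCeleb1 pair, included only to make the format concrete:

    # Illustrative only; id2spkr would be read from vox1_meta.csv.
    id2spkr = {"id10001": "A.J._Buckley"}
    vox_id, rec_id, segment = "id10001", "1zcIwhmdeo4", "00001.wav"
    utt_id = f"{id2spkr[vox_id]}-{rec_id}-00{segment.replace('.wav', '')}"
    print(utt_id)  # A.J._Buckley-1zcIwhmdeo4-0000001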
-e "$meta_path") { + $meta_path = "$out_dir/vox2_meta.csv"; + system("wget --no-check-certificate -O $meta_path $meta_url"); +} +open(META_IN, "<", "$meta_path") or die "Could not open the output file $meta_path"; +my %spkr2gender = (); +while () { + chomp; + my ($spkr, $vox_id, $vgg_id, $gender, $set) = split; + $spkr2gender{$vox_id} = $gender; +} +close(META_IN) or die; + +print "Reading languages estimated voxlingua \n"; +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox2_final.csv"; +my $lid_path = "$data_base/lang_vox2_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox2_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + $utt_id =~ s@/@-@g; + $utt_id =~ s@-[^-]*\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; + +opendir my $dh, "$dataset_path" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$dataset_path/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +my $num_spkrs = @spkr_dirs; +my $count = 0; +foreach (@spkr_dirs) { + my $spkr_id = $_; + + $count++ ; + print " processing speaker $spkr_id $count / $num_spkrs \n"; + print GENDER "$spkr_id $spkr2gender{$spkr_id}\n"; + + opendir my $dh, "$dataset_path/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$dataset_path/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + my $utt_id = "$spkr_id-$rec_id"; + my $file_list = "$out_dir/lists_cat/$utt_id.txt"; + if (system("find $dataset_path/$spkr_id/$rec_id -name \"*.m4a\" -printf \"file %p\\n\" > $file_list") != 0){ + die "Error creating $file_list"; + } + my $wav = "ffmpeg -v 8 -f concat -safe 0 -i $file_list -f wav -acodec pcm_s16le -|"; + if($fs == 8){ + $wav = $wav." 
+      $wav = $wav." sox -t wav - -t wav -r 8k - |"
+    }
+    print WAV "$utt_id", " $wav", "\n";
+    print SPKR "$utt_id", " $spkr_id", "\n";
+    if (exists $utt2lang{$utt_id}) {
+      print LANG "$utt_id", " $utt2lang{$utt_id}", "\n";
+    }
+    else {
+      print LANG "$utt_id N/A\n";
+    }
+  }
+}
+close(SPKR) or die;
+close(WAV) or die;
+close(LANG) or die;
+close(GENDER) or die;
+
+if (system(
+    "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
+if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
+  die "Error validating directory $out_dir";
+}
diff --git a/egs/voxceleb/adv.v1.1/local/score_dcf.py b/egs/voxceleb/adv.v1.1/local/score_dcf.py
new file mode 100755
index 00000000..3524d222
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/score_dcf.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import sys
+import os
+import argparse
+import time
+import logging
+
+import numpy as np
+
+from hyperion.hyp_defs import float_cpu, config_logger
+from hyperion.utils import SparseTrialScores, SparseTrialKey
+from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval
+
+
+def score_dcf(key_file, score_file, output_path):
+
+    logging.info("Load key: %s" % key_file)
+    key = SparseTrialKey.load_txt(key_file)
+    logging.info("Load scores: %s" % score_file)
+    scr = SparseTrialScores.load_txt(score_file)
+    logging.info("separating tar/non")
+    tar, non = scr.get_tar_non(key)
+    logging.info("computing EER/DCF")
+    priors = np.array([0.001, 0.005, 0.01, 0.05])
+    min_dcf, act_dcf, eer, _, min_pmiss, min_pfa, act_pmiss, act_pfa = fast_eval(
+        tar, non, priors, return_probs=True
+    )
+
+    output_dir = os.path.dirname(output_path)
+    if not os.path.isdir(output_dir):
+        os.makedirs(output_dir)
+
+    ntar = len(tar)
+    nnon = len(non)
+
+    output_file = output_path + "_results"
+    with open(output_file, "w") as f:
+        s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f} ntar: {9:d} nnon: {10:d}\n".format(
+            eer * 100,
+            min_dcf[3],
+            act_dcf[3],
+            min_dcf[2],
+            act_dcf[2],
+            min_dcf[1],
+            act_dcf[1],
+            min_dcf[0],
+            act_dcf[0],
+            ntar,
+            nnon,
+        )
+        f.write(s)
+        logging.info(s)
+        s = "min-pmiss={} min-pfa={} act-pmiss={} act-pfa={}".format(
+            min_pmiss, min_pfa, act_pmiss, act_pfa
+        )
+        logging.info(s)
+        s = "min-Nmiss={} min-Nfa={} act-Nmiss={} act-Nfa={}".format(
+            min_pmiss * ntar, min_pfa * nnon, act_pmiss * ntar, act_pfa * nnon
+        )
+        logging.info(s)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        fromfile_prefix_chars="@",
+        description="Computes EER and DCF",
+    )
+
+    parser.add_argument("--key-file", required=True)
+    parser.add_argument("--score-file", required=True)
+    parser.add_argument("--output-path", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    score_dcf(**vars(args))
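A quick way to sanity-check the fast_eval call pattern used in score_dcf.py is to feed it synthetic scores (this assumes hyperion is installed; the score distributions are made up):

    # Synthetic tar/non scores pushed through the same call as score_dcf.py.
    import numpy as np
    from hyperion.np.metrics import fast_eval_dcf_eer as fast_eval

    rng = np.random.RandomState(0)
    tar = rng.randn(1000) + 2.0    # target scores, shifted up
    non = rng.randn(10000)         # non-target scores
    priors = np.array([0.001, 0.005, 0.01, 0.05])
    min_dcf, act_dcf, eer, _, min_pmiss, min_pfa, act_pmiss, act_pfa = fast_eval(
        tar, non, priors, return_probs=True
    )
    print("EER: {:.2f}%".format(eer * 100))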
diff --git a/egs/voxceleb/adv.v1.1/local/score_voxceleb1_o_clean.sh b/egs/voxceleb/adv.v1.1/local/score_voxceleb1_o_clean.sh
new file mode 100755
index 00000000..b8247efc
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/score_voxceleb1_o_clean.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <data-dir> <score-dir>"
+  exit 1;
+fi
+
+set -e
+
+data_dir=$1
+score_dir=$2
+
+for cond in o_clean
+do
+    echo "Voxceleb $cond"
+    key=$data_dir/trials_$cond
+    #Compute performance
+    python local/score_dcf.py --key-file $key --score-file $score_dir/voxceleb1_scores --output-path $score_dir/voxceleb1_${cond} &
+done
+wait
+
diff --git a/egs/voxceleb/adv.v1.1/local/score_voxceleb1_single_cond.sh b/egs/voxceleb/adv.v1.1/local/score_voxceleb1_single_cond.sh
new file mode 100755
index 00000000..7531037e
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/local/score_voxceleb1_single_cond.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0.
+#
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <data-dir> <condition> <score-dir>"
+  exit 1;
+fi
+
+set -e
+
+data_dir=$1
+cond=$2
+score_dir=$3
+
+echo "Voxceleb $cond"
+key=$data_dir/trials_$cond
+#Compute performance
+python local/score_dcf.py --key-file $key --score-file $score_dir/voxceleb1_scores --output-path $score_dir/voxceleb1_${cond}
+
+
diff --git a/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh b/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh
index eeae00ac..e854b393 100755
--- a/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh
+++ b/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh
@@ -9,7 +9,6 @@ set -e
 nodes=fs01
 storage_name=$(date +'%m_%d_%H_%M')
 vaddir=`pwd`/exp/vad_e
-vad_config=conf/vad_16k.yaml
 
 stage=1
 config_file=default_config.sh
diff --git a/egs/voxceleb/adv.v1.1/run_010_prepare_victim_xvec_train_data.sh b/egs/voxceleb/adv.v1.1/run_004_prepare_victim_xvec_train_data.sh
similarity index 94%
rename from egs/voxceleb/adv.v1.1/run_010_prepare_victim_xvec_train_data.sh
rename to egs/voxceleb/adv.v1.1/run_004_prepare_victim_xvec_train_data.sh
index f89c9822..0e10ea68 100755
--- a/egs/voxceleb/adv.v1.1/run_010_prepare_victim_xvec_train_data.sh
+++ b/egs/voxceleb/adv.v1.1/run_004_prepare_victim_xvec_train_data.sh
@@ -16,7 +16,7 @@ config_file=default_config.sh
 if [ $stage -le 2 ]; then
     # This script preprocess audio for x-vector training
     steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \
-	--storage_name voxceleb-adv.v2.1-$(date +'%m_%d_%H_%M') --use-bin-vad true \
+	--storage_name voxceleb-adv.v1.1-$(date +'%m_%d_%H_%M') --use-bin-vad true \
	data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil
     utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil
@@ -29,7 +29,6 @@ if [ $stage -le 3 ]; then
     # We also want several utterances per speaker. Now we'll throw out speakers
     # with fewer than 4 utterances.
     hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil
-
 fi
 
 if [ $stage -le 4 ]; then
@@ -39,4 +38,3 @@ if [ $stage -le 4 ]; then
	data/${nnet_data}_proc_audio_no_sil/lists_xvec
 fi
 
-exit
diff --git a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh
new file mode 100755
index 00000000..37a91211
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Copyright
+#                2019   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=4
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+list_dir=data/${nnet_data}_proc_audio_no_sil
+
+#add extra args from the command line arguments
+if [ -n "$num_workers" ];then
+    extra_args="--data.train.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+    extra_args="$extra_args --trainer.use-tensorboard"
+fi
+if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-adv.v1 --trainer.wandb.name $nnet_name.$(date -Iminutes)"
+fi
+
+if [ "$interactive" == "true" ];then
+    export cuda_cmd=run.pl
+fi
+
+# Network Training
+if [ $stage -le 1 ]; then
+
+    mkdir -p $nnet_dir/log
+    $cuda_cmd \
+        --gpu $ngpu $nnet_dir/log/train.log \
+        hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+        train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \
+        --data.train.dataset.audio-file $list_dir/wav.scp \
+        --data.train.dataset.time-durs-file $list_dir/utt2dur \
+        --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \
+        --data.train.dataset.class-files $list_dir/lists_xvec/class2int \
+        --data.val.dataset.audio-file $list_dir/wav.scp \
+        --data.val.dataset.time-durs-file $list_dir/utt2dur \
+        --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \
+        --trainer.exp-path $nnet_dir \
+        --num-gpus $ngpu
+
+fi
+
diff --git a/egs/voxceleb/adv.v1.1/run_012_prepare_transfer_xvec_train_data.sh b/egs/voxceleb/adv.v1.1/run_006_prepare_transfer_xvec_train_data.sh
similarity index 100%
rename from egs/voxceleb/adv.v1.1/run_012_prepare_transfer_xvec_train_data.sh
rename to egs/voxceleb/adv.v1.1/run_006_prepare_transfer_xvec_train_data.sh
diff --git a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh
new file mode 100755
index 00000000..70bab280
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Copyright
+#                2019   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=4
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+if [ "$nnet" == "$transfer_nnet" ];then
+    echo "Victim and transfer model are the same"
+    echo "Skipping this step"
+    exit 0
+fi
+
+list_dir=data/${transfer_nnet_data}_proc_audio_no_sil
+nnet_type=$transfer_nnet_type
+nnet_dir=$transfer_nnet_dir
+nnet_cfg=$transfer_nnet_cfg
+nnet_args=$transfer_nnet_args
+
+#add extra args from the command line arguments
+if [ -n "$num_workers" ];then
+    extra_args="--data.train.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+    extra_args="$extra_args --trainer.use-tensorboard"
+fi
+if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-adv.v1 --trainer.wandb.name $nnet_name.$(date -Iminutes)"
+fi
+
+if [ "$interactive" == "true" ];then
+    export cuda_cmd=run.pl
+fi
+
+# Network Training
+if [ $stage -le 1 ]; then
+
+    mkdir -p $nnet_dir/log
+    $cuda_cmd \
+        --gpu $ngpu $nnet_dir/log/train.log \
+        hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+        train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \
+        --data.train.dataset.audio-file $list_dir/wav.scp \
+        --data.train.dataset.time-durs-file $list_dir/utt2dur \
+        --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \
+        --data.train.dataset.class-files $list_dir/lists_xvec/class2int \
+        --data.val.dataset.audio-file $list_dir/wav.scp \
+        --data.val.dataset.time-durs-file $list_dir/utt2dur \
+        --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \
+        --trainer.exp-path $nnet_dir \
+        --num-gpus $ngpu
+
+fi
+
diff --git a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh
new file mode 100755
index 00000000..12f1e5fd
--- /dev/null
+++ b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+# Copyright
+#                2019   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+ngpu=4
+config_file=default_config.sh
+interactive=false
+num_workers=""
+use_tb=false
+use_wandb=false
+
+. parse_options.sh || exit 1;
+. $config_file
+. datapath.sh
+
+if [ "$nnet" == "$transfer_nnet" ];then
+    echo "Victim and transfer model are the same"
+    echo "Skipping this step"
+    exit 0
+fi
+
+list_dir=data/${nnet_data}_proc_audio_no_sil
+nnet_dir=$advft_nnet_dir
+nnet_cfg=$advft_nnet_cfg
+nnet_args=$advft_nnet_args
+
+#add extra args from the command line arguments
+if [ -n "$num_workers" ];then
+    extra_args="--data.train.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+    extra_args="$extra_args --trainer.use-tensorboard"
+fi
+if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-adv.v1 --trainer.wandb.name $nnet_name.$(date -Iminutes)"
+fi
+
+if [ "$interactive" == "true" ];then
+    export cuda_cmd=run.pl
+fi
+
+# Network Training
+if [ $stage -le 1 ]; then
+
+    mkdir -p $nnet_dir/log
+    $cuda_cmd \
+        --gpu $ngpu $nnet_dir/log/train.log \
+        hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+        adv_finetune_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \
+        --data.train.dataset.audio-file $list_dir/wav.scp \
+        --data.train.dataset.time-durs-file $list_dir/utt2dur \
+        --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \
+        --data.train.dataset.class-files $list_dir/lists_xvec/class2int \
+        --data.val.dataset.audio-file $list_dir/wav.scp \
+        --data.val.dataset.time-durs-file $list_dir/utt2dur \
+        --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \
+        --trainer.exp-path $nnet_dir \
+        --num-gpus $ngpu
+
+fi
+
+
+
+# #!/bin/bash
+# # Copyright
+# #                2019   Johns Hopkins University (Author: Jesus Villalba)
+# # Apache 2.0.
+# #
+# . ./cmd.sh
+# . ./path.sh
+# set -e
+
+# stage=1
+# ngpu=4
+# config_file=default_config.sh
+# resume=false
+# interactive=false
+# num_workers=8
+
+# . parse_options.sh || exit 1;
+# . $config_file
+# . datapath.sh
+
+# batch_size=$(($advft_batch_size_1gpu*$ngpu))
+# grad_acc_steps=$(echo $batch_size $advft_eff_batch_size | awk '{ print int($2/$1+0.5)}')
+# log_interval=$(echo 100*$grad_acc_steps | bc)
+# list_dir=data/${nnet_data}_proc_audio_no_sil
+
+# args=""
+# if [ "$resume" == "true" ];then
+#     args="--resume"
+# fi
+
+# if [ "$interactive" == "true" ];then
+#     export cuda_cmd=run.pl
+# fi
+
+# # Network Training
+# if [ $stage -le 1 ]; then
+#     mkdir -p $advft_nnet_dir/log
+#     $cuda_cmd --gpu $ngpu $advft_nnet_dir/log/train.log \
+#         hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+#         torch-adv-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \
+#         --audio-path $list_dir/wav.scp \
+#         --time-durs-file $list_dir/utt2dur \
+#         --train-list $list_dir/lists_xvec/train.scp \
+#         --val-list $list_dir/lists_xvec/val.scp \
+#         --class-file $list_dir/lists_xvec/class2int \
+#         --min-chunk-length $min_chunk --max-chunk-length $max_chunk \
+#         --iters-per-epoch $ipe \
+#         --batch-size $batch_size \
+#         --num-workers $num_workers \
+#         --grad-acc-steps $grad_acc_steps $advft_opt_opt $advft_lrs_opt \
+#         --epochs $advft_nnet_num_epochs \
+#         --s $s --margin $advft_margin --margin-warmup-epochs $advft_margin_warmup \
+#         --num-gpus $ngpu \
+#         --train-mode ft-full \
+#         --log-interval $log_interval \
+#         --in-model-path $nnet \
+#         --exp-path $advft_nnet_dir $advft_attack_opts $args
+
+# fi
+# #
+
+# exit
diff --git a/egs/voxceleb/adv.v1.1/run_030_extract_xvectors_victim_model.sh b/egs/voxceleb/adv.v1.1/run_009_extract_xvectors_victim_model.sh
similarity index 81%
rename from egs/voxceleb/adv.v1.1/run_030_extract_xvectors_victim_model.sh
rename to egs/voxceleb/adv.v1.1/run_009_extract_xvectors_victim_model.sh
index ff068c1b..2df747e6 100755
--- a/egs/voxceleb/adv.v1.1/run_030_extract_xvectors_victim_model.sh
+++ b/egs/voxceleb/adv.v1.1/run_009_extract_xvectors_victim_model.sh
@@ -36,11 +36,11 @@ if [ $stage -le 1 ]; then
     do
	num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
	nj=$(($num_spk < 100 ? $num_spk:100))
-	steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \
-	    --feat-config $feat_config \
-	    $nnet data/$name \
-	    $xvector_dir/$name
+	steps_xvec/extract_xvectors_from_wav.sh \
+	    --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \
+	    --feat-config $feat_config \
+	    $nnet data/$name \
+	    $xvector_dir/$name
     done
 fi
 
-exit
diff --git a/egs/voxceleb/adv.v1.1/run_031_extract_xvectors_transfer_model.sh b/egs/voxceleb/adv.v1.1/run_010_extract_xvectors_transfer_model.sh
similarity index 81%
rename from egs/voxceleb/adv.v1.1/run_031_extract_xvectors_transfer_model.sh
rename to egs/voxceleb/adv.v1.1/run_010_extract_xvectors_transfer_model.sh
index df29fc12..7e2488b3 100755
--- a/egs/voxceleb/adv.v1.1/run_031_extract_xvectors_transfer_model.sh
+++ b/egs/voxceleb/adv.v1.1/run_010_extract_xvectors_transfer_model.sh
@@ -33,11 +33,12 @@ if [ $stage -le 1 ]; then
     do
	num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
	nj=$(($num_spk < 100 ? $num_spk:100))
-	steps_xvec/extract_xvectors_from_wav.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \
-	    --feat-config $feat_config \
-	    $nnet data/$name \
-	    $xvector_dir/$name
+	steps_xvec/extract_xvectors_from_wav.sh \
+	    --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \
+	    --feat-config $feat_config \
+	    $nnet data/$name \
+	    $xvector_dir/$name
     done
 fi
 
-exit
+
diff --git a/egs/voxceleb/adv.v1.1/run_040_eval_be_victim_model.sh b/egs/voxceleb/adv.v1.1/run_011_eval_be_victim_model.sh
similarity index 100%
rename from egs/voxceleb/adv.v1.1/run_040_eval_be_victim_model.sh
rename to egs/voxceleb/adv.v1.1/run_011_eval_be_victim_model.sh
diff --git a/egs/voxceleb/adv.v1.1/run_041_eval_be_transfer_model.sh b/egs/voxceleb/adv.v1.1/run_012_eval_be_transfer_model.sh
similarity index 100%
rename from egs/voxceleb/adv.v1.1/run_041_eval_be_transfer_model.sh
rename to egs/voxceleb/adv.v1.1/run_012_eval_be_transfer_model.sh
diff --git a/egs/voxceleb/adv.v1.1/run_043_eval_whitebox_attacks.sh b/egs/voxceleb/adv.v1.1/run_013_eval_whitebox_attacks.sh
similarity index 100%
rename from egs/voxceleb/adv.v1.1/run_043_eval_whitebox_attacks.sh
rename to egs/voxceleb/adv.v1.1/run_013_eval_whitebox_attacks.sh
diff --git a/egs/voxceleb/adv.v1.1/run_044_eval_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1.1/run_014_eval_transfer_blackbox_attacks.sh
similarity index 100%
rename from egs/voxceleb/adv.v1.1/run_044_eval_transfer_blackbox_attacks.sh
rename to egs/voxceleb/adv.v1.1/run_014_eval_transfer_blackbox_attacks.sh
diff --git a/egs/voxceleb/adv.v1.1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh b/egs/voxceleb/adv.v1.1/run_015_eval_whitebox_attacks_with_randsmooth_defense.sh
similarity index 67%
rename from egs/voxceleb/adv.v1.1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh
rename to egs/voxceleb/adv.v1.1/run_015_eval_whitebox_attacks_with_randsmooth_defense.sh
index 3077ecf6..fc9d6a7d 100755
--- a/egs/voxceleb/adv.v1.1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh
+++ b/egs/voxceleb/adv.v1.1/run_015_eval_whitebox_attacks_with_randsmooth_defense.sh
@@ -376,181 +376,3 @@ fi
 
 exit
 
-
-# #!/bin/bash
-# # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
-# #
-# # Apache 2.0.
-# #
-# . ./cmd.sh
-# . ./path.sh
-# set -e
-
-# stage=1
-# config_file=default_config.sh
-# use_gpu=false
-# do_analysis=false
-# save_wav=false
-
-# . parse_options.sh || exit 1;
-# . $config_file
-# . datapath.sh
-
-# if [ "$use_gpu" == "true" ];then
-#     eval_args="--use-gpu true"
-#     eval_cmd="$cuda_eval_cmd"
-# else
-#     eval_cmd="$train_cmd"
-# fi
-
-# xvector_dir=exp/xvectors/$nnet_name
-# score_dir=exp/scores/$nnet_name
-
-# score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores
-# cal_file=$score_dir/cosine_cal_v1/cal_tel.h5
-
-# #thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90)
-# thr005=2.94
-# thr001=4.60
-# thr0001=6.90
-# declare -a score_array
-# declare -a stats_array
-
-# if [ $stage -le 1 ];then
-
-#     for sigma in 0.001 0.01
-#     do
-#         score_array=()
-#         stats_array=()
-#         for eps in 0.00001 0.0001 0.001 0.01 0.1
-#         do
-#             score_plda_dir=$score_dir/cosine_fgsm_e${eps}_randsmooth${sigma}
-#             echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps"
-#             steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \
-#                 --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \
-#                 --attack-type fgsm --eps $eps \
-#                 --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
-#                 --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \
-#                 $trial_list \
-#                 data/voxceleb1_test/utt2model \
-#                 data/voxceleb1_test \
-#                 $xvector_dir/voxceleb1_test/xvector.scp \
-#                 $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
-#             score_array+=($score_plda_dir/voxceleb1_scores)
-#             stats_array+=($score_plda_dir/voxceleb1_stats)
-
-#             $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
-#                 local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
-
-#             for f in $(ls $score_plda_dir/*_results);
-#             do
-#                 echo $f
-#                 cat $f
-#                 echo ""
-#             done
-#         done
-#         if [ "${do_analysis}" == "true" ];then
-#             score_analysis_dir=$score_dir/cosine_fgsm_eall_randsmooth$sigma
-#             local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
-#                 $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \
-#                 $score_analysis_dir/voxceleb1 &
-#         fi
-#     done
-
-# fi
-
-
-
-
-# if [ $stage -le 3 ];then
-#     for sigma in 0.001 0.01
-#     do
-#         score_array=()
-#         stats_array=()
-#         for eps in 0.00001 0.0001 0.001 0.01 0.1
-#         do
-#             alpha=$(echo $eps | awk '{ print $0/5.}')
-#             score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha}_randsmooth$sigma
-#             echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps"
-#             steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \
-#                 --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \
-#                 --attack-type rand-fgsm --eps $eps --alpha $alpha --smooth-sigma $sigma\
-#                 --save-wav $save_wav --save-wav-path $score_plda_dir/wav \
-#                 --cal-file $cal_file --threshold $thr005 \
-#                 $trial_list \
-#                 data/voxceleb1_test/utt2model \
-#                 data/voxceleb1_test \
-#                 $xvector_dir/voxceleb1_test/xvector.scp \
-#                 $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats
-
-#             score_array+=($score_plda_dir/voxceleb1_scores)
-#             stats_array+=($score_plda_dir/voxceleb1_stats)
-
-#             $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \
-#                 local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir
-
-#             for f in $(ls $score_plda_dir/*_results);
-#             do
-#                 echo $f
-#                 cat $f
-#                 echo ""
-#             done
-
-#         done
-
-#         if [ "${do_analysis}" == "true" ];then
-#             score_analysis_dir=$score_dir/cosine_randfgsm_eall_randsmooth$sigma
-#             local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \
-#                 $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \
-#                 $score_analysis_dir/voxceleb1 &
-#         fi
-#     done
-# fi
-
-
-# if [ $stage -le 4 ];then
in 0.001 0.01 -# do -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha}_randsmooth$sigma -# echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type iter-fgsm --eps $eps --alpha $alpha \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_iterfgsm_eall_randsmooth$sigma -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# done -# fi - -# wait diff --git a/egs/voxceleb/adv.v1.1/run_046_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh b/egs/voxceleb/adv.v1.1/run_016_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh similarity index 100% rename from egs/voxceleb/adv.v1.1/run_046_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh rename to egs/voxceleb/adv.v1.1/run_016_eval_whitebox_attacks_with_randsmooth_wavegan_defense.sh diff --git a/egs/voxceleb/adv.v1.1/run_053_eval_art_whitebox_attacks.sh b/egs/voxceleb/adv.v1.1/run_017_eval_art_whitebox_attacks.sh similarity index 54% rename from egs/voxceleb/adv.v1.1/run_053_eval_art_whitebox_attacks.sh rename to egs/voxceleb/adv.v1.1/run_017_eval_art_whitebox_attacks.sh index 92fbcc92..3a92630f 100755 --- a/egs/voxceleb/adv.v1.1/run_053_eval_art_whitebox_attacks.sh +++ b/egs/voxceleb/adv.v1.1/run_017_eval_art_whitebox_attacks.sh @@ -510,7 +510,7 @@ if [ $stage -le 12 ];then echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner Linf attack confidence=$confidence" steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \ + --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.initial-c 1e-5" \ --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ --cal-file $cal_file --threshold $thr005 \ $trial_list \ @@ -540,434 +540,510 @@ if [ $stage -le 12 ];then fi +if [ $stage -le 14 ];then + score_array=() + stats_array=() + for norm in inf 1 2 + do + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_autopgdl${norm}_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with Auto-PGD $norm attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ + --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type auto-pgd --attack.eps $eps --attack.eps-step 
$alpha --attack.max-iter 10 --attack.norm $norm" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done -# #!/bin/bash -# # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# # -# # Apache 2.0. -# # -# . ./cmd.sh -# . ./path.sh -# set -e - -# stage=1 -# config_file=default_config.sh -# use_gpu=false -# do_analysis=false -# save_wav=false - -# . parse_options.sh || exit 1; -# . $config_file -# . datapath.sh - -# if [ "$use_gpu" == "true" ];then -# eval_args="--use-gpu true" -# eval_cmd="$cuda_eval_cmd" -# else -# eval_cmd="$train_cmd" -# fi + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) -# xvector_dir=exp/xvectors/$nnet_name -# score_dir=exp/scores/$nnet_name + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_autopgdl${norm}_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + done +fi + +if [ $stage -le 15 ];then + score_array=() + stats_array=() + for norm in inf 1 2 + do + for eps in 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_autocgdl${norm}_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with Auto-CGD $norm attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ + --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type auto-cgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm $norm" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done -# score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -# cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) -# #thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -# thr005=2.94 -# thr001=4.60 -# thr0001=6.90 + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_autocgdl${norm}_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + done +fi -# declare -a score_array -# declare -a stats_array +if [ $stage -le 16 ];then + score_array=() + stats_array=() + for eps in 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_deepfool_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with DeepFool attack 
eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ + --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type deepfool --attack.eps $eps --attack.max-iter 100" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_deepfool_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi -# if [ $stage -le 1 ];then +if [ $stage -le 17 ];then -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/cosine_art_fgsm_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type fgm --attack-opt "--attack-eps $eps" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + for confidence in 0 #1 + do + score_plda_dir=$score_dir/cosine_art_elasticnet_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with ElasticNet attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type elasticnet --attack.confidence $confidence --attack.max-iter 100 --attack.lr 0.01" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) + done -# done -# if [ "${do_analysis}" == 
"true" ];then -# score_analysis_dir=$score_dir/cosine_art_fgsm_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# fi +fi -# if [ $stage -le 2 ];then -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_art_fgsm_minimal_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGSM minimal attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type fgm --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-minimal" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats +if [ $stage -le 20 ];then + + for norm in inf 2 + do + score_plda_dir=$score_dir/cosine_art_hopskipjump_norm${norm} + echo "Eval Voxceleb 1 with Cosine scoring with Hopskipjump attack norm=$norm" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type hop-skip-jump --attack.norm $norm --attack.max-iter 50 --attack.max-eval 10000 --attack.init-eval 10 --attack.init-size 100" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) + done -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_art_fgsm_minimal_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi +fi -# fi -# if [ $stage -le 3 ];then -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/cosine_art_fgml1_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type fgm --attack-opt "--attack-eps $eps --attack-norm 1" \ -# --save-wav $save_wav --save-wav-path 
$score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats +if [ $stage -le 23 ];then + + for eta in 0.01 + do + score_plda_dir=$score_dir/cosine_art_newtonfool_eta$eta + echo "Eval Voxceleb 1 with Cosine scoring with NewtonFool attack eta=$eta" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type newtonfool --attack.eta $eta" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi -# score_array+=($score_plda_dir/voxceleb1_scores) + done -# stats_array+=($score_plda_dir/voxceleb1_stats) + -# done +fi -# if [ "${do_analysis}" == "true" ];then +if [ $stage -le 25 ];then -# score_analysis_dir=$score_dir/cosine_art_fgml1_minimal_eall + -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + for lambda_tv in 0.3 -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + do -# $score_analysis_dir/voxceleb1 & + score_plda_dir=$score_dir/cosine_art_shadow_lambda${lambda_tv}
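+    # Note: the Shadow attack takes no eps budget; going by ART's ShadowAttack,
+    # --attack.lambda-tv weights a total-variation penalty that keeps the
+    # perturbation smooth, so larger values should give smoother, typically
+    # less audible noise. lambda_tv=0.3 is a single untuned operating point;
+    # a sweep could mirror the eps loops above, e.g.:
+    #   for lambda_tv in 0.1 0.3 1.0   # illustrative values, not tuned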
-# fi + echo "Eval Voxceleb 1 with Cosine scoring with Shadow attack lambda=$lambda_tv" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type shadow --attack.lambda-tv $lambda_tv" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# score_array+=($score_plda_dir/voxceleb1_scores) - -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_art_fgml1_minimal_eall + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi -# fi + done +fi -# if [ $stage -le 5 ];then -# score_array=() -# stats_array=() +if [ $stage -le 26 ];then + score_array=() + stats_array=() + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/cosine_art_wass_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with Wasserstein attack eps=$eps" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type wasserstein --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.reg 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done -# score_array+=($score_plda_dir/voxceleb1_scores) + score_array+=($score_plda_dir/voxceleb1_scores)
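+      # In the Wasserstein stage above, --attack.reg is assumed to map to the
+      # entropic regularization of ART's Wasserstein attack, which controls the
+      # Sinkhorn-style projection onto the Wasserstein ball; smaller values
+      # should give a tighter but slower projection. reg=1 is untuned here.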
+      stats_array+=($score_plda_dir/voxceleb1_stats) -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_art_fgml2_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# fi + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/cosine_art_wass_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi +fi -# if [ $stage -le 6 ];then -# score_array=() -# stats_array=() +if [ $stage -le 27 ];then -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_art_fgml2_minimal_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 minimal attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type fgm --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-minimal --attack-norm 2" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + for confidence in 0 #1 + do + score_plda_dir=$score_dir/cosine_art_zoo_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with Zoo attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ + --feat-config $feat_config \ + --attack-opts "--attack.attack-type zoo --attack.confidence $confidence" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi -# score_array+=($score_plda_dir/voxceleb1_scores) + done -# stats_array+=($score_plda_dir/voxceleb1_stats) +fi -# done -# if [ "${do_analysis}" == "true" ];then +# The attacks below have issues when applied to audio -# score_analysis_dir=$score_dir/cosine_art_fgml2_minimal_eall ..." -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ +# if [ $stage -le 13 ];then -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi +# for eps in 0.0001 +# do +# score_plda_dir=$score_dir/cosine_art_boundary_eps${eps} +# alpha=$(echo $eps | awk '{ print $0/5.}') -# fi +# echo "Eval Voxceleb 1 with Cosine scoring with boundary attack eps=$eps" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ +# --cmd "$eval_cmd" $eval_args --nj 400 \ -#
score_plda_dir=$score_dir/cosine_art_iterfgsm_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with IterFGM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type bim --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type boundary --attack.eps $eps --attack.delta $eps --attack.max-iter 5000" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + +# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + +# for f in $(ls $score_plda_dir/*_results); +# do +# echo $f +# cat $f +# echo "" +# done +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_plda_dir +# local/attack_analysis.sh \ +# --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean \ +# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ +# $score_analysis_dir/voxceleb1 & +# fi # done -# score_array+=($score_plda_dir/voxceleb1_scores) # fi -# stats_array+=($score_plda_dir/voxceleb1_stats) -# it needs access to hidden layers +# if [ $stage -le 18 ];then +# for eps in 0.00001 0.0001 0.001 0.01 0.1 +# do +# alpha=$(echo $eps | awk '{ print $0/5.}') +# score_plda_dir=$score_dir/cosine_art_fadv_e${eps} +# echo "Eval Voxceleb 1 with Cosine scoring with feature adversaries attack eps=$eps" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ +# --cmd "$eval_cmd" $eval_args --nj 80 \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type feature-adv --attack.delta $eps --attack.eps-step $alpha --attack.max-iter 100" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet
$score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + +# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + +# for f in $(ls $score_plda_dir/*_results); # do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_art_pgdlinf_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type pgd --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - +# echo $f +# cat $f +# echo "" # done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_art_pgdlinf_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi + +# score_array+=($score_plda_dir/voxceleb1_scores) +# stats_array+=($score_plda_dir/voxceleb1_stats) + +# done +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_dir/cosine_art_fadv_eall +# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ +# $score_analysis_dir/voxceleb1 & +# fi # fi - -# if [ $stage -le 9 ];then +# if [ $stage -le 19 ];then # score_array=() # stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 +# for norm in inf 1 2 # do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_art_pgdl1_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type pgd --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10 --attack-norm 1" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats +# for sigma in 0.0002 +# do +# score_plda_dir=$score_dir/cosine_art_geoda${norm}_s${sigma} +# echo "Eval Voxceleb 1 with Cosine scoring with GeoDA $norm sigma=$sigma" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh \ +# --cmd "$eval_cmd" $eval_args --nj 80 \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type geoda --attack.max-iter 4000 --attack.sigma-geoda $sigma --attack.norm $norm" \ +# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ +# --cal-file $cal_file --threshold $thr005 \ +# $trial_list \ +# data/voxceleb1_test/utt2model \ +# 
data/voxceleb1_test \ +# $xvector_dir/voxceleb1_test/xvector.scp \ +# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats # $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir +# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir # for f in $(ls $score_plda_dir/*_results); # do -# echo $f -# cat $f -# echo "" +# echo $f +# cat $f +# echo "" # done # score_array+=($score_plda_dir/voxceleb1_scores) # stats_array+=($score_plda_dir/voxceleb1_stats) -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_art_pgdl1_eall +# done +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_dir/cosine_art_geoda${norm}_sall # local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi +# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ +# $score_analysis_dir/voxceleb1 & +# fi +# done # fi -# if [ $stage -le 10 ];then -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 +# +# if [ $stage -le 21 ];then + +# for norm in inf 1 2 # do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_art_pgdl2_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type pgd --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10 --attack-norm 2" \ +# score_plda_dir=$score_dir/cosine_art_brendel_norm${norm} +# echo "Eval Voxceleb 1 with Cosine scoring with Brendel attack norm=$norm" +# steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type brendel --attack.norm $norm --attack.max-iter 1000 --attack.lr 1e-3 --attack.binary-search-steps 10 --attack.init-size 100" \ # --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ # --cal-file $cal_file --threshold $thr005 \ # $trial_list \ @@ -985,28 +1061,28 @@ fi # cat $f # echo "" # done - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) +# if [ "${do_analysis}" == "true" ];then +# score_analysis_dir=$score_plda_dir +# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ +# $trial_list $score_clean \ +# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ +# $score_analysis_dir/voxceleb1 & +# fi # done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_art_pgdl2_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi + # fi -# if [ $stage -le 11 ];then +## it needs to train some importance vector +# if [ $stage -le 22 ];then -# for confidence in 0 #1 +# for norm in 2 # do -# score_plda_dir=$score_dir/cosine_art_cwl2_conf${confidence} -# echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" +# score_plda_dir=$score_dir/cosine_art_lowprofool_norm${norm} +# echo "Eval Voxceleb 1 with Cosine scoring with LowProFool attack norm=$norm" # steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ -# 
--feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type cw-l2 --attack-opt "--attack-confidence $confidence" \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type low-pro-fool --attack.norm $norm --attack.max-iter 100" \ # --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ # --cal-file $cal_file --threshold $thr005 \ # $trial_list \ @@ -1036,16 +1112,16 @@ fi # fi +## Too SLOW +# if [ $stage -le 24 ];then -# if [ $stage -le 12 ];then - -# for confidence in 0 #1 +# for theta in 0.1 # do -# score_plda_dir=$score_dir/cosine_art_cwlinf_conf${confidence} -# echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner Linf attack confidence=$confidence" +# score_plda_dir=$score_dir/cosine_art_jsma_theta$theta +# echo "Eval Voxceleb 1 with Cosine scoring with JSMA attack theta=$theta" # steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type cw-linf --attack-opt "--attack-confidence $confidence --attack-eps 0.3" \ +# --feat-config $feat_config \ +# --attack-opts "--attack.attack-type jsma --attack.theta $theta" \ # --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ # --cal-file $cal_file --threshold $thr005 \ # $trial_list \ @@ -1074,5 +1150,3 @@ fi # done # fi - - diff --git a/egs/voxceleb/adv.v1.1/run_018_eval_art_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1.1/run_018_eval_art_transfer_blackbox_attacks.sh new file mode 100755 index 00000000..bc6390f2 --- /dev/null +++ b/egs/voxceleb/adv.v1.1/run_018_eval_art_transfer_blackbox_attacks.sh @@ -0,0 +1,633 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +use_gpu=false +do_analysis=false +save_wav=false +use_trials_subset=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +transfer_feat_config=$feat_config + +if [ "$use_gpu" == "true" ];then + eval_args="--use-gpu true" + eval_cmd="$cuda_eval_cmd" +else + eval_cmd="$train_cmd" +fi + +if [ "$use_trials_subset" == "true" ];then + condition=o_clean_1000_1000 +else + condition=o_clean +fi +trial_list=data/voxceleb1_test/trials_$condition + +xvector_dir=exp/xvectors/$nnet_name +score_dir=exp/scores/$nnet_name + +score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores +cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 + +transfer_xvector_dir=exp/xvectors/$transfer_nnet_name +transfer_score_dir=exp/scores/$transfer_nnet_name +transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5 + +#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) +thr005=2.94 +thr001=4.60 +thr0001=6.90 +declare -a score_array +declare -a stats_array + +if [ $stage -le 1 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 2 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGSM minimal attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls
$score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + + +if [ $stage -le 3 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM L1 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 4 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L1 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_eall + 
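+	# attack_analysis.sh contrasts the clean calibrated scores with the
+	# per-eps score/stats files accumulated in score_array/stats_array, so a
+	# single report covers the whole eps sweep of this attack family.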
local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + + +if [ $stage -le 5 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM L2 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 6 ];then + + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L2 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 7 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print
$0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with iter FGSM attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + + +if [ $stage -le 8 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 9 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" + 
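+	# PGD is iterative FGM with projection back onto the eps-ball; the step
+	# size is eps/5, so --attack.max-iter 10 lets each sample move up to
+	# 10*(eps/5) = 2*eps before projection. The awk step above computes, e.g.:
+	#   eps=0.01; awk -v e=$eps 'BEGIN{ print e/5. }'   # -> 0.002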
steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 10 ];then + score_array=() + stats_array=() + + for eps in 0.00001 0.0001 0.001 0.01 0.1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_e${eps} + echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + score_array+=($score_plda_dir/voxceleb1_scores) + stats_array+=($score_plda_dir/voxceleb1_stats) + + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_eall + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ + $score_analysis_dir/voxceleb1 & + fi + +fi + + +if [ $stage -le 11 ];then + + for confidence in 0 #1 + do
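+	# --attack.confidence is the Carlini-Wagner margin (kappa): 0 asks for
+	# minimally perturbed adversarial examples, while larger values force
+	# higher-confidence errors at the cost of more distortion, which in a
+	# transfer setup like this tends to carry over better from the white-box
+	# to the black-box model.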
"--attack.attack-type cw-l2 --attack.confidence $confidence" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + +fi + + +if [ $stage -le 12 ];then + + for confidence in 0 #1 + do + alpha=$(echo $eps | awk '{ print $0/5.}') + score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwlinf_conf${confidence} + echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner LInf attack confidence=$confidence" + steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \ + --feat-config $feat_config \ + --transfer-feat-config $transfer_feat_config \ + --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \ + --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ + --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ + --threshold $thr005 \ + $trial_list \ + data/voxceleb1_test/utt2model \ + data/voxceleb1_test \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $nnet \ + $transfer_xvector_dir/voxceleb1_test/xvector.scp \ + $transfer_nnet \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats + + $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ + local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir + + for f in $(ls $score_plda_dir/*_results); + do + echo $f + cat $f + echo "" + done + if [ "${do_analysis}" == "true" ];then + score_analysis_dir=$score_plda_dir + local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ + $trial_list $score_clean \ + $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ + $score_analysis_dir/voxceleb1 & + fi + + done + +fi + +wait + diff --git a/egs/voxceleb/adv.v1.1/run_054_eval_art_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1.1/run_054_eval_art_transfer_blackbox_attacks.sh deleted file mode 100755 index bdcdeae4..00000000 --- a/egs/voxceleb/adv.v1.1/run_054_eval_art_transfer_blackbox_attacks.sh +++ /dev/null @@ -1,1260 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -use_trials_subset=false - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -transfer_feat_config=$feat_config - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -if [ "$use_trials_subset" == "true" ];then - condition=o_clean_1000_1000 -else - condition=o_clean -fi -trial_list=data/voxceleb1_test/trials_$condition - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -transfer_xvector_dir=exp/xvectors/$transfer_nnet_name -transfer_score_dir=exp/scores/$transfer_nnet_name -transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 2 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls 
$score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 3 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 4 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_eall - 
local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 5 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 6 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring FGM minimal L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 7 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print 
$0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with iter FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 8 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 9 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" - 
steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 10 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 11 ];then - - for confidence in 0 #1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwl2_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts 
"--attack.attack-type cw-l2 --attack.confidence $confidence" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - - -if [ $stage -le 12 ];then - - for confidence in 0 #1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwlinf_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner LInf attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - $trial_list \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - $trial_list $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - -wait - - -# #!/bin/bash -# # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# # -# # Apache 2.0. -# # -# . ./cmd.sh -# . ./path.sh -# set -e - -# stage=1 -# config_file=default_config.sh -# use_gpu=false -# do_analysis=false -# save_wav=false - -# . parse_options.sh || exit 1; -# . $config_file -# . 
datapath.sh - -# if [ "$use_gpu" == "true" ];then -# eval_args="--use-gpu true" -# eval_cmd="$cuda_eval_cmd" -# else -# eval_cmd="$train_cmd" -# fi - -# xvector_dir=exp/xvectors/$nnet_name -# score_dir=exp/scores/$nnet_name - -# score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -# cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -# transfer_xvector_dir=exp/xvectors/$transfer_nnet_name -# transfer_score_dir=exp/scores/$transfer_nnet_name -# transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5 - -# #thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -# thr005=2.94 -# thr001=4.60 -# thr0001=6.90 -# declare -a score_array -# declare -a stats_array - -# if [ $stage -le 1 ];then - -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type fgm --attack-opt "--attack-eps $eps" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 2 ];then - -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type fgm --attack-opt "--attack-eps $eps --attack-minimal" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in 
$(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - - -# if [ $stage -le 3 ];then - -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGM L1 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type fgm --attack-opt "--attack-eps $eps --attack-norm 1" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 4 ];then - -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L1 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type fgm --attack-opt "--attack-eps $eps --attack-minimal --attack-norm 1" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# 
score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - - -# if [ $stage -le 5 ];then - -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with FGM L2 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type fgm --attack-opt "--attack-eps $eps --attack-norm 2" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 6 ];then - -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring FGM minimal L2 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type fgm --attack-opt "--attack-eps $eps --attack-minimal --attack-norm 2" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ 
"${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 7 ];then -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with iter FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type bim --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - - -# if [ $stage -le 8 ];then -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat -# --attack-type pgd --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10" \ - -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if 
[ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 9 ];then -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat -# --attack-type pgd --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10 --attack-norm 1" \ - -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 10 ];then -# score_array=() -# stats_array=() - -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_e${eps} -# echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat -# --attack-type pgd --attack-opt "--attack-eps $eps --attack-eps-step $alpha --attack-max-iter 10 --attack-norm 2" \ - -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - 
-# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_eall -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi - -# fi - - -# if [ $stage -le 11 ];then - -# for confidence in 0 #1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwl2_conf${confidence} -# echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type cw-l2 --attack-opt "--attack-confidence $confidence" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_plda_dir -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ -# $score_analysis_dir/voxceleb1 & -# fi - -# done - -# fi - - -# if [ $stage -le 12 ];then - -# for confidence in 0 #1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwlinf_conf${confidence} -# echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner LInf attack confidence=$confidence" -# steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type cw-linf --attack-opt "--attack-confidence $confidence --attack-eps 0.3" \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# $trial_list \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_single_cond.sh data/voxceleb1_test $condition $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_plda_dir -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# $trial_list $score_clean \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ -# $score_analysis_dir/voxceleb1 & -# fi - -# done - -# fi - -# wait - diff 
--git a/egs/voxceleb/adv.v1/conf b/egs/voxceleb/adv.v1/conf deleted file mode 120000 index 7dfe9dce..00000000 --- a/egs/voxceleb/adv.v1/conf +++ /dev/null @@ -1 +0,0 @@ -../../sre19-cmn2/v1/conf \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh index 172da763..39016679 100644 --- a/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh +++ b/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh @@ -3,63 +3,29 @@ # Both models use the same features: 80 fbanks # Both models use the same training data. -# victim x-vector training -nnet_data=voxceleb2cat_train_combined - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.05 +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn -nnet_type=lresnet34 -dropout=0 -embed_dim=256 +#vad +vad_config=conf/vad_16k.yaml -s=30 -margin_warmup=20 -margin=0.3 +# victim x-vector training +nnet_data=voxceleb2cat_train -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -num_augs=5 +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34 +nnet_cfg=conf/train_lresnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # transfer model training -transfer_nnet_data=voxceleb2cat_train_combined #this can be voxceleb2cat or voxceleb2cat_combined - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=400 -transfer_max_chunk=400 -transfer_ipe=1 -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb2cat_combined transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_cfg=train_resetdnn_xvec.yaml +transfer_nnet_name=${feat_type}_resetdnn5x512 
transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh index 407c0cfd..81f78c60 100644 --- a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh +++ b/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh @@ -3,64 +3,92 @@ # Both models use the same features: 80 fbanks # Both models use the same training data. +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + # victim x-vector training -nnet_data=voxceleb2cat_train_combined - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.05 - -nnet_type=resnet34 -dropout=0 -embed_dim=256 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -num_augs=5 +nnet_data=voxceleb2cat_train + +# victim x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34 + +nnet_cfg=conf/train_lresnet34_xvec.yaml nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth - # transfer model training -transfer_nnet_data=voxceleb2cat_train_combined #this can be voxceleb2cat or voxceleb2cat_combined - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=400 -transfer_max_chunk=400 -transfer_ipe=1 -transfer_lr=0.05 +transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb2cat_combined transfer_nnet_type=resetdnn -transfer_num_layers=5 -transfer_layer_dim=512 -transfer_expand_dim=1536 -transfer_dilation="1 2 3 4 1" -transfer_kernel_sizes="5 3 3 3 1" -transfer_dropout=0.1 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - +transfer_nnet_name=${feat_type}_resetdnn5x512 transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name transfer_nnet=$transfer_nnet_dir/model_ep0070.pth + +
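Both global_conf diffs above replace the long inline option strings (nnet_opt, opt_opt, lrs_opt) with a single pointer to a YAML training config, nnet_cfg=conf/train_lresnet34_xvec.yaml. That YAML file is not included in this patch, so the sketch below is only a guess at how the deleted shell variables might map onto it: every key name is an assumption, while the optimizer and scheduler values are exactly the ones the removed opt_opt and lrs_opt strings carried.

# Hypothetical sketch only: conf/train_lresnet34_xvec.yaml is not shown in this
# patch, so the key names below are assumed; the values come from the deleted
# opt_opt/lrs_opt shell variables.
cat <<'EOF' > conf/train_lresnet34_xvec.yaml.sketch
optim:
  opt_type: adam
  lr: 0.05
  beta1: 0.9
  beta2: 0.95
  weight_decay: 1.0e-5
  amsgrad: true
lrsched:
  lrsch_type: exp_lr
  decay_rate: 0.5
  decay_steps: 8000
  hold_steps: 40000
  min_lr: 1.0e-5
  warmup_steps: 1000
  update_lr_on_opt_step: true
use_amp: true
epochs: 70
EOF

One practical effect is visible in the names: ${feat_type}_lresnet34 replaces the old convention of encoding every hyperparameter into the network directory name.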
diff --git a/egs/voxceleb/adv.v1/local b/egs/voxceleb/adv.v1/local deleted file mode 120000 index ce1cbf90..00000000 --- a/egs/voxceleb/adv.v1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/run_002_compute_evad.sh b/egs/voxceleb/adv.v1/run_002_compute_evad.sh index 98b0db7d..cc3d8296 100755 --- a/egs/voxceleb/adv.v1/run_002_compute_evad.sh +++ b/egs/voxceleb/adv.v1/run_002_compute_evad.sh @@ -9,7 +9,6 @@ set -e nodes=fs01 storage_name=$(date +'%m_%d_%H_%M') vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml stage=1 config_file=default_config.sh diff --git a/egs/voxceleb/adv.v1/run_003_compute_fbank.sh b/egs/voxceleb/adv.v1/run_003_compute_fbank.sh deleted file mode 100755 index 7bd8b6a3..00000000 --- a/egs/voxceleb/adv.v1/run_003_compute_fbank.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. 
./path.sh -set -e -nodes=fs01 -storage_name=$(date +'%m_%d_%H_%M') -fbankdir=`pwd`/exp/fbank -vaddir=`pwd`/exp/fbank -vaddir_gt=`pwd`/exp/vad_gt - -stage=1 -config_file=default_config.sh -feat_vers="numpy" - -. parse_options.sh || exit 1; - -if [ "$feat_vers" == "kaldi" ];then - make_fbank=steps/make_fbank.sh - fbank_cfg=conf/fbank80_16k.conf -else - fbank_cfg=conf/fbank80_16k.yaml - if [ "$feat_vers" == "numpy" ];then - make_fbank=steps_pyfe/make_fbank.sh - else - make_fbank=steps_pyfe/make_torch_fbank.sh - fi -fi - -# Make filterbanks -if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $fbankdir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/fbank/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $fbankdir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $fbankdir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $fbankdir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $fbankdir/storage - else - echo "we don't distribute data between multiple machines" - fi - fi -fi - -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - $make_fbank --write-utt2num-frames true --fbank-config $fbank_cfg --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_fbank/$name $fbankdir - utils/fix_data_dir.sh data/${name} - done -fi - - diff --git a/egs/voxceleb/adv.v1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/adv.v1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..a448af9a --- /dev/null +++ b/egs/voxceleb/adv.v1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! 
-d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/voxceleb/adv.v1/run_004_prepare_augment.sh b/egs/voxceleb/adv.v1/run_004_prepare_augment.sh deleted file mode 100755 index 7d78ae92..00000000 --- a/egs/voxceleb/adv.v1/run_004_prepare_augment.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -# In this script, we augment the SWBD,SRE,MX6 and Voxceleb data with reverberation, -# noise, music, and babble, and combined it with the clean data. -# The combined list will be used to train the xvector DNN. - -frame_shift=0.01 - -if [ $stage -le 1 ]; then - - if [ ! -d "RIRS_NOISES" ]; then - if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then - ln -s ../../sre19-cmn2/v1/RIRS_NOISES - else - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip - fi - fi - - # Prepare the MUSAN corpus, which consists of music, speech, and noise - # suitable for augmentation. - local/make_musan.sh $musan_root 16 data - - # Get the duration of the MUSAN recordings. This will be used by the - # script augment_data_dir.py. - for name in speech noise music; do - utils/data/get_utt2dur.sh data/musan_${name} - mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur - done - -fi - - -if [ $stage -le 2 ]; then - - for name in voxceleb2cat_train - do - export TMPDIR=data/tmp - mkdir -p $TMPDIR - - awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/$name/utt2num_frames > data/$name/reco2dur - - # Make a reverberated version of the list. Note that we don't add any - # additive noise here. 
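A note on the rvb_opts block in the deleted script below: the leading number in each --rir-set-parameters value (0.2 for the real isotropic RIRs, 0.4 each for the simulated small and medium rooms) is the relative sampling weight of that RIR set, so the three weights are chosen to sum to 1. The stand-alone check below is not part of the recipe; it only illustrates how those weights can be validated before launching the reverberation job (the array layout is copied from the deleted script).

# Illustration only: verify that the RIR-set sampling weights sum to ~1.
rvb_opts=()
rvb_opts+=(--rir-set-parameters "0.2, RIRS_NOISES/real_rirs_isotropic_noises/rir_list")
rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
printf '%s\n' "${rvb_opts[@]}" \
  | sed -n 's/^\([0-9.]*\),.*/\1/p' \
  | awk '{ s += $1 } END { if (s > 0.999 && s < 1.001) print "RIR weights ok"; else print "warning: RIR weights sum to " s }'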
- - # Make a version with reverberated speech - rvb_opts=() - rvb_opts+=(--rir-set-parameters "0.2, RIRS_NOISES/real_rirs_isotropic_noises/rir_list") - rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/smallroom/rir_list") - rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") - - python steps/data/reverberate_data_dir.py \ - "${rvb_opts[@]}" \ - --speech-rvb-probability 1 \ - --pointsource-noise-addition-probability 0 \ - --isotropic-noise-addition-probability 0 \ - --num-replications 1 \ - --source-sampling-rate 16000 \ - data/${name} data/${name}_reverb - cp data/${name}/vad.scp data/${name}_reverb/ - utils/copy_data_dir.sh --utt-suffix "-reverb" data/${name}_reverb data/${name}_reverb.new - rm -rf data/${name}_reverb - mv data/${name}_reverb.new data/${name}_reverb - - - # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0:13:8" --fg-noise-dir "data/musan_noise" data/${name} data/${name}_noise - # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/${name} data/${name}_music - # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13:10" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/${name} data/${name}_babble - - - awk '{ $1=$1"-reverb"; print $0}' data/${name}/reco2dur > data/${name}_reverb/reco2dur - - # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0:13:8" --fg-noise-dir "data/musan_noise" data/${name}_reverb data/${name}_reverb_noise - # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/${name}_reverb data/${name}_reverb_music - # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13:10" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/${name}_reverb data/${name}_reverb_babble - - - # Combine noise only - utils/combine_data.sh data/${name}_noise_all \ - data/${name}_noise data/${name}_music data/${name}_babble - - # Combine reverbs - utils/combine_data.sh data/${name}_reverb_all data/${name}_reverb \ - data/${name}_reverb_noise data/${name}_reverb_music data/${name}_reverb_babble - - # Combine reverb, noise, music, and babble into one directory. - utils/combine_data.sh data/${name}_aug data/${name}_reverb_all data/${name}_noise_all - unset TMPDIR - done - -fi - - -if [ $stage -le 3 ];then - # Take a random subset of the augmentations - utils/subset_data_dir.sh data/voxceleb2cat_train_aug \ - $(wc -l data/voxceleb2cat_train/utt2spk | awk '{ print int('$num_augs'*$1)}') \ - data/voxceleb2cat_train_augx${num_augs} - utils/fix_data_dir.sh data/voxceleb2cat_train_augx${num_augs} -fi - - -exit diff --git a/egs/voxceleb/adv.v1/run_004_prepare_victim_xvec_train_data.sh b/egs/voxceleb/adv.v1/run_004_prepare_victim_xvec_train_data.sh new file mode 100755 index 00000000..6939052e --- /dev/null +++ b/egs/voxceleb/adv.v1/run_004_prepare_victim_xvec_train_data.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 2 ]; then + # This script preprocesses audio for x-vector training + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ + --storage_name voxceleb-adv.v1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil + hyp_utils/kaldi/utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 3 ]; then + # Now, we remove files shorter than 4s + hyp_utils/remove_short_audios.sh --min-len 4 data/${nnet_data}_proc_audio_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 4 utterances. + hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 4 ]; then + # Prepare train and validation lists for x-vectors + local/make_train_lists_sup_embed_with_augm.sh \ + data/${nnet_data}_proc_audio_no_sil \ + data/${nnet_data}_proc_audio_no_sil/lists_xvec +fi + +exit diff --git a/egs/voxceleb/adv.v1/run_005_compute_fbank_augment.sh b/egs/voxceleb/adv.v1/run_005_compute_fbank_augment.sh deleted file mode 100755 index 10d13e03..00000000 --- a/egs/voxceleb/adv.v1/run_005_compute_fbank_augment.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -fbankdir=`pwd`/exp/fbank - -stage=1 -config_file=default_config.sh -feat_vers="numpy" - -. parse_options.sh || exit 1; -. $config_file - -if [ "$feat_vers" == "kaldi" ];then - make_fbank=steps/make_fbank.sh - fbank_cfg=conf/fbank80_16k.conf -else - fbank_cfg=conf/fbank80_16k.yaml - if [ "$feat_vers" == "numpy" ];then - make_fbank=steps_pyfe/make_fbank.sh - else - make_fbank=steps_pyfe/make_torch_fbank.sh - fi -fi - -export TMPDIR=data/tmp -mkdir -p $TMPDIR - -if [ $stage -le 1 ];then - - # Make filterbanks for the augmented data. Note that we do not compute a new - # vad.scp file here. Instead, we use the vad.scp from the clean version of - # the list. - for name in voxceleb2cat_train_augx${num_augs} - do - $make_fbank --write-utt2num-frames true \ - --fbank-config $fbank_cfg --nj 120 --cmd "$train_cmd" \ - data/$name exp/make_fbank/$name $fbankdir - fix_data_dir.sh data/$name - done - -fi - - -if [ $stage -le 2 ];then - - # Combine the clean and augmented lists. - utils/combine_data.sh --extra-files "utt2num_frames" data/voxceleb2cat_train_combined data/voxceleb2cat_train_augx${num_augs} data/voxceleb2cat_train - -fi - -exit - diff --git a/egs/voxceleb/adv.v1/run_006_prepare_transfer_xvec_train_data.sh b/egs/voxceleb/adv.v1/run_006_prepare_transfer_xvec_train_data.sh new file mode 100755 index 00000000..f80d2924 --- /dev/null +++ b/egs/voxceleb/adv.v1/run_006_prepare_transfer_xvec_train_data.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$transfer_nnet_data" == "$nnet_data" ];then + echo "Training data for victim and transfer model are the same" + echo "Skipping this step" + exit 0 +fi + +if [ $stage -le 2 ]; then + # This script preprocesses audio for x-vector training + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ + --storage_name voxceleb-adv.v1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${transfer_nnet_data} data/${transfer_nnet_data}_proc_audio_no_sil exp/${transfer_nnet_data}_proc_audio_no_sil + hyp_utils/kaldi/utils/fix_data_dir.sh data/${transfer_nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 3 ]; then + # Now, we remove files shorter than 4s + hyp_utils/remove_short_audios.sh --min-len 4 data/${transfer_nnet_data}_proc_audio_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 4 utterances. + hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${transfer_nnet_data}_proc_audio_no_sil + +fi + +if [ $stage -le 4 ]; then + # Prepare train and validation lists for x-vectors + local/make_train_lists_sup_embed_with_augm.sh \ + data/${transfer_nnet_data}_proc_audio_no_sil \ + data/${transfer_nnet_data}_proc_audio_no_sil/lists_xvec +fi + +exit diff --git a/egs/voxceleb/adv.v1/run_008_extract_xvectors_victim_model.sh b/egs/voxceleb/adv.v1/run_008_extract_xvectors_victim_model.sh new file mode 100755 index 00000000..03234eaa --- /dev/null +++ b/egs/voxceleb/adv.v1/run_008_extract_xvectors_victim_model.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=1 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=12800 +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 4G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + for name in voxceleb1_test + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + --feat-config $feat_config \ + $nnet data/$name \ + $xvector_dir/$name + done +fi diff --git a/egs/voxceleb/adv.v1/run_010_prepare_victim_xvec_train_data.sh b/egs/voxceleb/adv.v1/run_010_prepare_victim_xvec_train_data.sh deleted file mode 100755 index 25a59571..00000000 --- a/egs/voxceleb/adv.v1/run_010_prepare_victim_xvec_train_data.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - -# Now we prepare the features to generate examples for xvector training. -if [ $stage -le 2 ]; then - # This script applies CMVN and removes nonspeech frames. Note that this is somewhat - wasteful, as it roughly doubles the amount of training data on disk. After - creating training examples, this can be removed. 
- steps_xvec/prepare_feats_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ - --storage_name voxceleb-adv.v1-$(date +'%m_%d_%H_%M') \ - data/${nnet_data} data/${nnet_data}_no_sil exp/${nnet_data}_no_sil - utils/fix_data_dir.sh data/${nnet_data}_no_sil - -fi - - -if [ $stage -le 3 ]; then - # Now, we need to remove features that are too short after removing silence - # frames. We want atleast 4s (400 frames) per utterance. - hyp_utils/remove_short_utts.sh --min-len 400 data/${nnet_data}_no_sil - - # We also want several utterances per speaker. Now we'll throw out speakers - # with fewer than 8 utterances. - hyp_utils/remove_spk_few_utts.sh --min-num-utts 8 data/${nnet_data}_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh data/${nnet_data}_no_sil data/${nnet_data}_no_sil/lists_xvec -fi - -exit diff --git a/egs/voxceleb/adv.v1/run_011_train_victim_xvector.sh b/egs/voxceleb/adv.v1/run_011_train_victim_xvector.sh deleted file mode 100755 index 141afa62..00000000 --- a/egs/voxceleb/adv.v1/run_011_train_victim_xvector.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=8 - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - - if [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]]; then - train_exec=torch-train-resnet-xvec.py - elif [[ ${nnet_type} =~ efficientnet ]]; then - train_exec=torch-train-efficientnet-xvec.py - elif [[ ${nnet_type} =~ tdnn ]]; then - train_exec=torch-train-tdnn-xvec.py - elif [[ ${nnet_type} =~ transformer ]]; then - train_exec=torch-train-transformer-xvec-v1.py - else - echo "$nnet_type not supported" - exit 1 - fi - - mkdir -p $nnet_dir/log - $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec \ - --data-rspec scp:$list_dir/feats.scp \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --num-frames-file $list_dir/utt2num_frames \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --embed-dim $embed_dim $nnet_opt $opt_opt $lrs_opt \ - --epochs $nnet_num_epochs \ - --s $s --margin $margin --margin-warmup-epochs $margin_warmup \ - --dropout-rate $dropout \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args - -fi - - -exit diff --git a/egs/voxceleb/adv.v1/run_012_prepare_transfer_xvec_train_data.sh b/egs/voxceleb/adv.v1/run_012_prepare_transfer_xvec_train_data.sh deleted file mode 100755 index b622e992..00000000 --- a/egs/voxceleb/adv.v1/run_012_prepare_transfer_xvec_train_data.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. 
./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - -if [ "$transfer_nnet_data" == "$nnet_data" ];then - echo "Training data for victim and transfer model are the same" - echo "Skipping this step" - exit 0 -fi - -# Now we prepare the features to generate examples for xvector training. -if [ $stage -le 2 ]; then - # This script applies CMVN and removes nonspeech frames. Note that this is somewhat - # wasteful, as it roughly doubles the amount of training data on disk. After - # creating training examples, this can be removed. - steps_xvec/prepare_feats_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ - --storage_name voxceleb-$(date +'%m_%d_%H_%M') \ - data/${transfer_nnet_data} data/${transfer_nnet_data}_no_sil \ - exp/${transfer_nnet_data}_no_sil - utils/fix_data_dir.sh data/${transfer_nnet_data}_no_sil - -fi - - -if [ $stage -le 3 ]; then - # Now, we need to remove features that are too short after removing silence - # frames. We want atleast 4s (400 frames) per utterance. - hyp_utils/remove_short_utts.sh --min-len 400 data/${transfer_nnet_data}_no_sil - - # We also want several utterances per speaker. Now we'll throw out speakers - # with fewer than 8 utterances. - hyp_utils/remove_spk_few_utts.sh --min-num-utts 8 data/${transfer_nnet_data}_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh data/${transfer_nnet_data}_no_sil \ - data/${transfer_nnet_data}_no_sil/lists_xvec -fi - -exit diff --git a/egs/voxceleb/adv.v1/run_013_train_transfer_xvector.sh b/egs/voxceleb/adv.v1/run_013_train_transfer_xvector.sh deleted file mode 100755 index ad2c0177..00000000 --- a/egs/voxceleb/adv.v1/run_013_train_transfer_xvector.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=8 - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -if [ "$nnet" == "$transfer_nnet" ];then - echo "Victim and transfer model are the same" - echo "Skipping this step" - exit 0 -fi - -nnet_data=$transfer_nnet_data -batch_size_1gpu=$transfer_batch_size_1gpu -eff_batch_size=$transfer_eff_batch_size -min_chunk=$transfer_min_chunk -max_chunk=$transfer_max_chunk -ipe=$transfer_ipe - -nnet_type=$transfer_nnet_type -dropout=$transfer_dropout -embed_dim=$transfer_embed_dim - -s=$transfer_s -margin_warmup=$transfer_margin_warmup -margin=$transfer_margin - -nnet_dir=$transfer_nnet_dir -nnet_opt=$transfer_nnet_opt -opt_opt=$transfer_opt_opt -lrs_opt=$transfer_lrs_opt - -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - - if [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]]; then - train_exec=torch-train-resnet-xvec.py - elif [[ ${nnet_type} =~ efficientnet ]]; then - train_exec=torch-train-efficientnet-xvec.py - elif [[ ${nnet_type} =~ tdnn ]]; then - train_exec=torch-train-tdnn-xvec.py - elif [[ ${nnet_type} =~ transformer ]]; then - train_exec=torch-train-transformer-xvec-v1.py - else - echo "$nnet_type not supported" - exit 1 - fi - - mkdir -p $nnet_dir/log - $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec \ - --data-rspec scp:$list_dir/feats.scp \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --num-frames-file $list_dir/utt2num_frames \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --embed-dim $embed_dim $nnet_opt $opt_opt $lrs_opt \ - --epochs $nnet_num_epochs \ - --s $s --margin $margin --margin-warmup-epochs $margin_warmup \ - --dropout-rate $dropout \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args - -fi - - -exit diff --git a/egs/voxceleb/adv.v1/run_030_extract_xvectors_victim_model.sh b/egs/voxceleb/adv.v1/run_030_extract_xvectors_victim_model.sh deleted file mode 100755 index 02eb78de..00000000 --- a/egs/voxceleb/adv.v1/run_030_extract_xvectors_victim_model.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false - -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length 12800" - xvec_cmd="$cuda_eval_cmd" -else - xvec_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name - -if [ $stage -le 1 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? 
$num_spk:100)) - steps_xvec/extract_xvectors.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - $nnet data/$name \ - $xvector_dir/$name - done -fi - -exit diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 1ee9468f..b8a17dc6 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -95,9 +95,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | | | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | | | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | || | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | +| | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | ### VoxCeleb 1 Entire-Clean trial list @@ -109,9 +109,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | | | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | | | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076 | +| | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | +| | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | ### VoxCeleb 1 Hard-Clean trial list @@ -123,9 +123,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | | | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | | | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | +| | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | +| | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | ### VoxSRC2022 dev @@ -137,9 +137,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | | | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | | | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | +| | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | +| | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | ## Results before 2023 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..9e302200 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..469e166b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git 
a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh new file mode 100644 index 00000000..003bf978 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh @@ -0,0 +1,44 @@ +# IdRnd ResNet100 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet100.v3.0 + +nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0029.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md new file mode 100644 index 00000000..1ee9468f --- /dev/null +++ b/egs/voxceleb/v1.2/README.md @@ -0,0 +1,263 @@ +# VoxCeleb V1.2 + +Recipe for the VoxCeleb Speaker Verification Task + +## Differences w.r.t VoxCeleb V1 recipe + +In recipe version V1: + - We compute speech augmentations and acoustic features offline and dump them to disk. + - Augmentation is performed using Kaldi scripts and the wav-reverberate tool + - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. + +In this recipe: + - We compute speech augmentations and acoustic features always on-the-fly, + we don't dump any features to disk. + - Augmentation is performed using the Hyperion SpeechAugment class. + - The behavior of this class is controlled + by the configuration file `conf/reverb_noise_aug.yaml`, + which mimics the proportions of noise and RIR types, and SNRs used in V1 of the recipe. + - Babble noise is created offline by mixing 3-10 single speaker files. + + +## Citing + +## Training Data + + - x-Vector network is trained on Voxceleb2 dev + test with augmentations + - MUSAN noise + - RIR reverberation + +## Test data + + - Test data is VoxCeleb 1 + - We evaluate 6 conditions: + - VoxCeleb-O (Original): Original Voxceleb test set with 40 speakers + - Voxceleb-O-cleaned: VoxCeleb-O cleaned-up of some errors + - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 + - Voxceleb-E-cleaned: VoxCeleb-E cleaned-up of some errors + - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials. 
+ - Voxceleb-H-cleaned: VoxCeleb-H cleaned-up of some errors + +## Usage + + - Run the run_0*.sh scripts in sequence + - By default it will use the Light ResNet (16 base channels) + - For better performance use the full ResNet (64 base channels) with the `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as follows: +```bash +run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true +run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +``` + + - To train with mixed precision, use the config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` + +## Recipe Steps: + + - `run_001_prepare_data.sh` + - Data preparation script to generate Kaldi style data directories for + - VoxCeleb2 train+test + - VoxCeleb1 O/E/H eval sets + + - `run_002_compute_evad.sh` + - Computes Energy VAD for all datasets + + - `run_003_prepare_noises_rirs.sh` + - Prepares MUSAN noise and music to be used by SpeechAugment class. + - Creates Babble noise from MUSAN speech to be used by SpeechAugment class. + - Prepares RIRs by compacting them into HDF5 files, to be used by SpeechAugment class. + + - `run_010_prepare_xvec_train_data.sh` + - Transforms all the audio files that we use to train the x-vector network into a common format, e.g., .flac. + - Removes silence from the audio files + - Removes utterances shorter than 4 seconds and speakers with fewer than 8 utterances. + - Creates training and validation lists for x-vector training + + - `run_011_train_xvector.sh` + - Trains the x-vector network + + - `run_030_extract_xvectors.sh` + - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training + - Extracts x-vectors for VoxCeleb1 test sets + + - `run_040_eval_be.sh` + - Trains PLDA and evaluates PLDA and cosine scoring back-ends + + +## Results + +### VoxCeleb 1 Original-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.11 | 0.069 | 0.126 | +| | | | Cosine + AS-Norm | 1.10 | 0.065 | 0.108 | +| | | | Cosine + QMF | 0.95 | 0.059 | 0.084 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | +| | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | +| | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | || | + +### VoxCeleb 1 Entire-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.16 | 0.073 | 0.130 | +| | | | Cosine + AS-Norm | 1.13 | 0.068 | 0.118 | +| | | | Cosine + QMF | 1.06 | 0.064 | 0.112 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | +| | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | +| | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | + +### VoxCeleb 1 Hard-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.10 | 0.128 | 0.209 | +| | | | Cosine + AS-Norm | 1.99 | 0.118 | 0.190 | +| | | | Cosine + QMF | 1.84 | 0.111 | 0.184 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | +| | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | +| | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.87 | 0.185 | 0.304 | +| | | | Cosine + AS-Norm | 2.84 | 0.182 | 0.304 | +| | | | Cosine + QMF | 2.61 | 0.172 | 0.283 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | +| | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | +| | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | + +## Results before 2023 + +### VoxCeleb 1 Original-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | +| | | | Cosine | 2.04 | 0.138 | 0.210 | +| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | +| | | | Cosine | 1.22 | 0.082 | 0.129 | +| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | +| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | +| | | | Cosine | 1.44 | 0.100 | 0.173 | +| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | +| | | | Cosine | 1.17 | 0.081 | 0.110 | +| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | PLDA | 1.37 | 0.104 | 0.179 | +| | | | Cosine | 1.31 | 0.080 | 0.139 | +| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121&#10;
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | PLDA | 1.29 | 0.088 | 0.129 | +| | | | Cosine | 1.23 | 0.083 | 0.136 | +| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | +| | | | Cosine | 1.29 | 0.089 | 0.146 | +| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | +| | | | Cosine | 1.18 | 0.078 | 0.115 | +| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | +| | | | Cosine | 1.12 | 0.073 | 0.131 | +| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16&#10;
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | +| | | | Cosine | 1.31 | 0.084 | 0.132 | +| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | +| | | | Cosine | 1.12 | 0.071 | 0.103 | +| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | +| | | | Cosine | 0.96 | 0.065 | 0.110 | +| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | +| | | | Cosine | 0.93 | 0.067 | 0.108 | +| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | +| | | | Cosine | 0.85 | 0.060 | 0.094 | +| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | +| | | | Cosine | 1.29 | 0.084 | 0.140 | +| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | +| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | +| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | + + +### VoxCeleb 1 Entire-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | +| | | | Cosine | 1.93 | 0.122 | 0.201 | +| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | +| | | | Cosine | 1.24 | 0.080 | 0.136 | +| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | +| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | +| | | | Cosine | 1.30 | 0.082 | 0.150 | +| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | +| | | | Cosine | 1.09 | 0.071 | 0.124 | +| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | PLDA | 1.45 | 0.097 | 0.165 | +| | | | Cosine | 1.15 | 0.076 | 0.132 | +| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121&#10;
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | PLDA | 1.47 | 0.094 | 0.165 | +| | | | Cosine | 1.27 | 0.082 | 0.148 | +| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | +| | | | Cosine | 1.22 | 0.079 | 0.134 | +| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | +| | | | Cosine | 1.16 | 0.074 | 0.130 | +| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | +| | | | Cosine | 1.11 | 0.071 | 0.125 | +| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16&#10;
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | +| | | | Cosine | 1.24 | 0.080 | 0.140 | +| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | +| | | | Cosine | 1.05 | 0.067 | 0.117 | +| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | +| | | | Cosine | 1.05 | 0.069 | 0.121 | +| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | +| | | | Cosine | 0.98 | 0.063 | 0.110 | +| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | +| | | | Cosine | 0.94 | 0.061 | 0.107 | +| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | +| | | | Cosine | 1.27 | 0.079 | 0.142 | +| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | +| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | +| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | + + +### VoxCeleb 1 Hard-Clean trial list + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | +| | | | Cosine | 3.27 | 0.188 | 0.303 | +| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | +| | | | Cosine | 2.32 | 0.139 | 0.232 | +| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | +| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | +| | | | Cosine | 2.33 | 0.142 | 0.235 | +| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | +| | | | Cosine | 2.14 | 0.126 | 0.203 | +| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | PLDA | 2.57 | 0.153 | 0.255 | +| | | | Cosine | 2.11 | 0.127 | 0.205 | +| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121&#10;
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | PLDA | 2.64 | 0.157 | 0.244 | +| | | | Cosine | 2.33 | 0.141 | 0.232 | +| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | +| | | | Cosine | 2.26 | 0.133 | 0.224 | +| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | +| | | | Cosine | 2.17 | 0.128 | 0.215 | +| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | +| | | | Cosine | 2.11 | 0.124 | 0.204 | +| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16&#10;
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | +| | | | Cosine | 2.45 | 0.141 | 0.225 | +| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | +| | | | Cosine | 1.95 | 0.113 | 0.181 | +| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | +| | | | Cosine | 1.99 | 0.119 | 0.196 | +| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | +| | | | Cosine | 1.89 | 0.112 | 0.184 | +| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | +| | | | Cosine | 1.84 | 0.110 | 0.186 | +| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | +| | | | Cosine | 2.26 | 0.134 | 0.214 | +| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | +| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | +| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | diff --git a/egs/voxceleb/v1.2/cmd.sh b/egs/voxceleb/v1.2/cmd.sh new file mode 100755 index 00000000..040f458b --- /dev/null +++ b/egs/voxceleb/v1.2/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v1.2/conf/clsp.conf b/egs/voxceleb/v1.2/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v1.2/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_long.conf b/egs/voxceleb/v1.2/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v1.2/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_rtx.conf b/egs/voxceleb/v1.2/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_short.conf b/egs/voxceleb/v1.2/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ 
b/egs/voxceleb/v1.2/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v1.2/conf/coe_gpu_v100.conf b/egs/voxceleb/v1.2/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v1.2/conf/fbank80_specaug1_stmn_16k.yaml b/egs/voxceleb/v1.2/conf/fbank80_specaug1_stmn_16k.yaml new file mode 100644 index 00000000..8df42fc6 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/fbank80_specaug1_stmn_16k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.2/conf/fbank80_stmn_16k.yaml b/egs/voxceleb/v1.2/conf/fbank80_stmn_16k.yaml new file mode 100644 index 00000000..f4091f5d --- /dev/null +++ b/egs/voxceleb/v1.2/conf/fbank80_stmn_16k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..1633f4a2 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -0,0 +1,95 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + 
se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.2 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + #min_lr: 1.0e-05 + min_lr: 1.0e-06 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..877736b3 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f15d453d --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml @@ -0,0 +1,93 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 
+ res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..45e55d97 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. 
+trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/vad_16k.yaml b/egs/voxceleb/v1.2/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/v1.2/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v1.2/datapath.sh b/egs/voxceleb/v1.2/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/v1.2/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/v1.2/default_config.sh b/egs/voxceleb/v1.2/default_config.sh new file mode 120000 index 00000000..fd0e1bb1 --- /dev/null +++ b/egs/voxceleb/v1.2/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh new file mode 100644 index 00000000..f2622b0e --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh @@ -0,0 +1,44 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn2048x4.v3.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh new file mode 100644 index 00000000..a3ad0c29 --- /dev/null +++ 
b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.0.sh @@ -0,0 +1,45 @@ +# ECAPA-TDNN small + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3.v3.0 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0040.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/path.sh b/egs/voxceleb/v1.2/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/voxceleb/v1.2/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh new file mode 100755 index 00000000..831eb1bc --- /dev/null +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh +. $config_file + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. 
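[Editor's note] The swa_model_ep00xx checkpoints that the global configs point nnet_s2 at come from the swa_* fields in the stage-2 trainers (swa_start: 31, swa_lr: 1e-4, swa_anneal_epochs: 2 for the 512x3 model). A minimal sketch of the intended stochastic-weight-averaging semantics in terms of torch.optim.swa_utils; hyperion's Trainer wires this up internally, so this is an illustration rather than its actual code:

import torch
from torch.optim.swa_utils import AveragedModel, SWALR

model = torch.nn.Linear(10, 2)                        # stand-in for the x-vector net
opt = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
swa_model = AveragedModel(model)
swa_sched = SWALR(opt, swa_lr=1e-4, anneal_epochs=2)  # swa_lr / swa_anneal_epochs

for epoch in range(35):                               # epochs: 35
    ...                                               # one regular training epoch
    if epoch >= 31:                                   # swa_start: 31
        swa_model.update_parameters(model)            # accumulate the weight average
        swa_sched.step()

# the averaged weights are what gets saved and extracted from (swa_model_ep00xx);
# PyTorch recommends an update_bn pass over the training data before using them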
+ hyp_utils/conda_env.sh \ + prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train + #local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train +fi +exit +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + # This script is for the old version of the dataset + # local/make_voxceleb1_oeh.pl $voxceleb1_root data + # Use this for the newer version of voxceleb1: + local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_dev.py \ + --vox1-corpus-dir $voxceleb1_root \ + --voxsrc22-corpus-dir $voxsrc22_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then +# local/prepare_voxsrc22_test.py \ +# --corpus-dir $voxsrc22_root \ +# --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # # split vox2 into 2 parts, for cohort and qmf training + local/make_vox2_trials.py --data-dir data/voxceleb2cat_train +fi diff --git a/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav.sh b/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav.sh index 0d7e5d4c..7a97bb56 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav.sh @@ -85,7 +85,7 @@ echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-adv-test-wav.py \ + eval_xvec_cosine_scoring_from_adv_test_wav.py \ --feats $feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav_wavegan.sh b/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav_wavegan.sh index f083ecb8..5ad16f77 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav_wavegan.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_adv_test_wav_wavegan.sh @@ -92,7 +92,7 @@ fi echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py \ + eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py \ --feats $feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/eval_cosine_scoring_from_art_test_wav.sh b/hyp_utils/adv/eval_cosine_scoring_from_art_test_wav.sh index 3abd289b..bca8266e 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_art_test_wav.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_art_test_wav.sh @@ -88,7 +88,7 @@ echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ART_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-art-test-wav.py \ + eval_xvec_cosine_scoring_from_art_test_wav.py \ --feats $feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh b/hyp_utils/adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh index 51c248fd..008b6ccc 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh @@ -94,7 +94,7 @@ echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $num_gpus \ - 
torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py \ + eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py \ --feats $feat_config --transfer_feats $transfer_feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/adv/eval_cosine_scoring_from_transfer_art_test_wav.sh b/hyp_utils/adv/eval_cosine_scoring_from_transfer_art_test_wav.sh index 7f497d02..b60cdee4 100755 --- a/hyp_utils/adv/eval_cosine_scoring_from_transfer_art_test_wav.sh +++ b/hyp_utils/adv/eval_cosine_scoring_from_transfer_art_test_wav.sh @@ -96,7 +96,7 @@ echo "$0: score $key_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ART_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py \ + eval_xvec_cosine_scoring_from_transfer_art_test_wav.py \ --feats $feat_config --transfer_feats $transfer_feat_config ${args} \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git a/hyp_utils/xvectors/eval_cosine_scoring_from_test_wav.sh b/hyp_utils/xvectors/eval_cosine_scoring_from_test_wav.sh index b17a3ea2..963fd91b 100755 --- a/hyp_utils/xvectors/eval_cosine_scoring_from_test_wav.sh +++ b/hyp_utils/xvectors/eval_cosine_scoring_from_test_wav.sh @@ -70,7 +70,7 @@ echo "$0: score $ndx_file to $output_dir" $cmd JOB=1:$nj $log_dir/${name}.JOB.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $num_gpus \ - torch-eval-xvec-cosine-scoring-from-test-wav.py \ + eval_xvec_cosine_scoring_from_test_wav.py \ --feats $feat_config ${args} \ --v-file scp:$vector_file \ --ndx-file $ndx_file \ diff --git a/hyp_utils/xvectors/eval_xvec_logits_from_wav.sh b/hyp_utils/xvectors/eval_xvec_logits_from_wav.sh index bdd53862..4765e809 100755 --- a/hyp_utils/xvectors/eval_xvec_logits_from_wav.sh +++ b/hyp_utils/xvectors/eval_xvec_logits_from_wav.sh @@ -84,7 +84,7 @@ fi if [ $stage -le 0 ];then $cmd JOB=1:$nj $output_dir/log/eval_logits.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-eval-xvec-logits-from-wav.py \ + eval_xvec_logits_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ --part-idx JOB --num-parts $nj \ --input $data_dir/wav.scp \ diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py new file mode 100755 index 00000000..f387c7ac --- /dev/null +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +import torch +import torch.nn as nn +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.adv_attacks import AttackFactory +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import SpineNetXVector as SpineXVec +from hyperion.torch.models import TDNNXVector as TDXVec 
+from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "efficientnet": EXVec, + "tdnn": TDXVec, + "transformer": TFXVec, + "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_feats(rank, **kwargs): + feat_args = AF.filter_args(**kwargs["feats"]) + if rank == 0: + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + if rank == 0: + logging.info("feat-extractor={}".format(feat_extractor)) + return feat_extractor + + +def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def init_attack(feat_extractor, model, wav_scale, **kwargs): + victim_model = nn.Sequential(feat_extractor, model) + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = { + "eps_scale": wav_scale, + "loss": nn.functional.cross_entropy, + "time_dim": 1, + } + attack_args.update(extra_args) + logging.info("attacks args={}".format(attack_args)) + attack = AttackFactory.create(victim_model, **attack_args) + return attack + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, 
world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + feat_extractor = init_feats(**kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + kwargs["wav_scale"] = train_loader.dataset.wav_scale + attack = init_attack(feat_extractor, model, **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + attack, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + AF.add_class_args(parser, prefix="feats") + xvec_class.add_finetune_args(parser, prefix="model") + AttackFactory.add_class_args(parser, prefix="attack") + + parser.add_argument("--in-model-file", required=True) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="""Fine-tune x-vector model from audio files + with adversarial training""" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +# def init_data( +# audio_path, +# train_list, +# val_list, +# train_aug_cfg, 
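[Editor's note] Aside on init_hard_prototype_mining above: when the sampler is configured for hard-prototype mining (num_hard_prototypes: 8 in the stage-2 configs), the model's compute_prototype_affinity() supplies a class-by-class similarity matrix and the sampler draws batches from groups of confusable speakers. A rough sketch of the selection step, with assumed shapes only (not hyperion's implementation):

import torch

def hardest_prototypes(affinity, num_hard=8):
    # affinity: (num_classes, num_classes) similarity between class prototypes,
    # e.g. cosine similarity between classification-head weight vectors
    affinity = affinity.clone()
    affinity.fill_diagonal_(float("-inf"))          # a class is not its own negative
    return affinity.topk(num_hard, dim=-1).indices  # (num_classes, num_hard)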
+# val_aug_cfg, +# num_workers, +# num_gpus, +# rank, +# **kwargs +# ): + +# ad_args = AD.filter_args(**kwargs) +# sampler_args = Sampler.filter_args(**kwargs) +# if rank == 0: +# logging.info("audio dataset args={}".format(ad_args)) +# logging.info("sampler args={}".format(sampler_args)) +# logging.info("init datasets") + +# train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) +# val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) + +# if rank == 0: +# logging.info("init samplers") +# train_sampler = Sampler(train_data, **sampler_args) +# val_sampler = Sampler(val_data, **sampler_args) + +# num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) +# largs = ( +# {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} +# ) + +# train_loader = torch.utils.data.DataLoader( +# train_data, batch_sampler=train_sampler, **largs +# ) + +# test_loader = torch.utils.data.DataLoader( +# val_data, batch_sampler=val_sampler, **largs +# ) + +# return train_loader, test_loader + + +# def init_feats(rank, **kwargs): +# feat_args = AF.filter_args(**kwargs["feats"]) +# if rank == 0: +# logging.info("feat args={}".format(feat_args)) +# logging.info("initializing feature extractor") +# feat_extractor = AF(trans=True, **feat_args) +# if rank == 0: +# logging.info("feat-extractor={}".format(feat_extractor)) +# return feat_extractor + + +# def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): +# xvec_args = XVec.filter_finetune_args(**kwargs) +# if rank == 0: +# logging.info("xvector network ft args={}".format(xvec_args)) +# xvec_args["num_classes"] = num_classes +# model = TML.load(in_model_path) +# model.rebuild_output_layer(**xvec_args) +# if train_mode == "ft-embed-affine": +# model.freeze_preembed_layers() +# if rank == 0: +# logging.info("x-vector-model={}".format(model)) +# return model + + +# def init_attack(feat_extractor, model, wav_scale, **kwargs): +# victim_model = nn.Sequential(feat_extractor, model) +# attack_args = AttackFactory.filter_args(**kwargs["attack"]) +# extra_args = { +# "eps_scale": wav_scale, +# "loss": nn.functional.cross_entropy, +# "time_dim": 1, +# } +# attack_args.update(extra_args) +# logging.info("attacks args={}".format(attack_args)) +# attack = AttackFactory.create(victim_model, **attack_args) +# return attack + + +# def train_xvec(gpu_id, args): + +# config_logger(args.verbose) +# del args.verbose +# logging.debug(args) + +# kwargs = namespace_to_dict(args) +# torch.manual_seed(args.seed) +# set_float_cpu("float32") + +# train_mode = kwargs["train_mode"] + +# ddp_args = ddp.filter_ddp_args(**kwargs) +# device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) +# kwargs["rank"] = rank + +# train_loader, test_loader = init_data(**kwargs) +# feat_extractor = init_feats(**kwargs) +# model = init_xvector(train_loader.dataset.num_classes, **kwargs) +# kwargs["wav_scale"] = train_loader.dataset.wav_scale +# attack = init_attack(feat_extractor, model, **kwargs) + +# trn_args = Trainer.filter_args(**kwargs) +# if rank == 0: +# logging.info("trainer args={}".format(trn_args)) +# metrics = {"acc": CategoricalAccuracy()} +# trainer = Trainer( +# model, +# feat_extractor, +# attack, +# device=device, +# metrics=metrics, +# ddp=world_size > 1, +# train_mode=train_mode, +# **trn_args +# ) +# if args.resume: +# trainer.load_last_checkpoint() +# trainer.fit(train_loader, test_loader) + +# ddp.ddp_cleanup() + + +# if __name__ == "__main__": + +# parser = ArgumentParser( +# 
description="Fine-tune x-vector model with adv attacks on wav domain" +# ) + +# parser.add_argument("--cfg", action=ActionConfigFile) +# parser.add_argument("--audio-path", required=True) +# parser.add_argument("--train-list", dest="train_list", required=True) +# parser.add_argument("--val-list", dest="val_list", required=True) + +# AD.add_argparse_args(parser) +# Sampler.add_argparse_args(parser) + +# parser.add_argument("--train-aug-cfg", default=None) +# parser.add_argument("--val-aug-cfg", default=None) + +# parser.add_argument( +# "--num-workers", type=int, default=5, help="num_workers of data loader" +# ) + +# AF.add_class_args(parser, prefix="feats") +# parser.add_argument("--in-model-path", required=True) + +# XVec.add_finetune_args(parser) +# AttackFactory.add_class_args(parser, prefix="attack") + +# Trainer.add_class_args(parser) +# ddp.add_ddp_args(parser) + +# # parser.add_argument('--num-gpus', type=int, default=1, +# # help='number of gpus, if 0 it uses cpu') +# parser.add_argument( +# "--seed", type=int, default=1123581321, help="random seed (default: 1)" +# ) +# parser.add_argument( +# "--resume", +# action="store_true", +# default=False, +# help="resume training from checkpoint", +# ) +# parser.add_argument( +# "--train-mode", +# default="ft-full", +# choices=["ft-full", "ft-embed-affine"], +# help=( +# "ft-full: adapt full x-vector network" +# "ft-embed-affine: adapt affine transform before embedding" +# ), +# ) + +# # parser.add_argument('--attack-eps', required=True, type=float, +# # help='epsilon adversarial attack') +# # parser.add_argument('--attack-eps-step', required=True, type=float, +# # help='eps step adversarial attack') +# # parser.add_argument('--attack-random-eps', default=False, +# # action='store_true', +# # help='use random eps in adv. 
attack') + +# # parser.add_argument('--attack-max-iter', default=10, type=int, +# # help='number of iterations for adversarial optimization') + +# # parser.add_argument('--p-attack', default=0.5, type=float, +# # help='ratio of batches with adv attack') + +# parser.add_argument( +# "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int +# ) +# parser.add_argument("--local_rank", default=0, type=int) + +# args = parser.parse_args() +# gpu_id = args.local_rank +# del args.local_rank + +# if gpu_id == 0: +# try: +# config_file = Path(args.exp_path) / "config.yaml" +# parser.save(args, str(config_file), format="yaml", overwrite=True) +# except: +# pass + +# # torch docs recommend using forkserver +# multiprocessing.set_start_method("forkserver") +# train_xvec(gpu_id, args) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 437127b2..f0a2d010 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -10,8 +10,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -223,9 +227,7 @@ def eval_cosine_scoring( vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( - device - ) + vad = torch.tensor(vad, dtype=torch.bool).to(device) model.vad_t = vad logging.info( "utt %s detected %d/%d (%.2f %%) speech frames" @@ -244,7 +246,7 @@ def eval_cosine_scoring( for i in range(key.num_models): if key.tar[i, j] or key.non[i, j]: t3 = time.time() - model.x_e = x_e[i].to(device) + model.x_e = x_e[i : i + 1].to(device) if key.tar[i, j]: if attack.targeted: t = non diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index aaa91214..23c9bf68 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,13 +7,18 @@ import os import sys import time + # [Added Sonal May21] from pathlib import Path import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -281,9 +286,7 @@ def eval_cosine_scoring_wavegan( vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor( - vad.astype(np.bool, copy=False), dtype=torch.bool, device=device - ) + vad = torch.tensor(vad, dtype=torch.bool).to(device) model.vad_t = vad logging.info( "utt %s detected %d/%d (%.2f %%) speech frames" @@ -302,7 +305,7 @@ def eval_cosine_scoring_wavegan( for i in range(key.num_models): if key.tar[i, j] or key.non[i, j]: t3 = time.time() - model.x_e = x_e[i].to(device) + model.x_e = x_e[i : i + 1].to(device) if key.tar[i, j]: if attack.targeted: t = non diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 8d4add76..c204e218 100755 --- 
a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -11,9 +11,13 @@ import numpy as np import pandas as pd -from art.classifiers import PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -24,8 +28,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -102,6 +107,10 @@ def __init__( self.threshold = threshold def forward(self, s_t): + if s_t.dim() == 4: + # this is for attacks that only work in 4D inputs + s_t = s_t[0, 0] + f_t = s_t f_t = self.feat_extractor(s_t) if self.vad_t is not None: @@ -116,6 +125,10 @@ def forward(self, s_t): f_t = f_t.transpose(1, 2).contiguous() x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) + if self.x_e is None: + # this is for auto-pgd, when it runs a dummy evaluation + self.x_e = x_t + x_t = l2_norm(x_t) x_e = l2_norm(self.x_e) tar_score = torch.sum(x_e * x_t, dim=-1, keepdim=True) @@ -164,15 +177,15 @@ def eval_cosine_scoring( model.to(device) model.eval() - tar = np.asarray([1], dtype=np.int) - non = np.asarray([0], dtype=np.int) + tar = np.asarray([1], dtype=int) + non = np.asarray([0], dtype=int) logging.info("loading key and enrollment x-vectors") key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) audio_args = AR.filter_args(**kwargs) - audio_reader = AR(test_wav_file) + audio_reader = AR(test_wav_file, **audio_args) wav_scale = audio_reader.wav_scale if save_adv_wav: @@ -207,7 +220,7 @@ def eval_cosine_scoring( for j in range(key.num_tests): t1 = time.time() - logging.info("scoring test utt %s" % (key.seg_set[j])) + logging.info("scoring test utt %s", key.seg_set[j]) s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] @@ -224,18 +237,14 @@ def eval_cosine_scoring( vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( - device - ) + vad = torch.tensor(vad, dtype=torch.bool).to(device) model.vad_t = vad logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key.seg_set[j], - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, ) t2 = time.time() @@ -246,7 +255,7 @@ def eval_cosine_scoring( model=model, loss=nn.CrossEntropyLoss(), optimizer=None, - input_shape=[1, s.shape[1]], + input_shape=(s.shape[1],), nb_classes=2, clip_values=(-wav_scale, wav_scale), device_type=device_type, @@ -254,10 +263,11 @@ def eval_cosine_scoring( attack_args["num_samples"] = s.shape[-1] attack = AttackFactory.create(model_art, **attack_args) + # s = s[None, 
None, :, :] for i in range(key.num_models): if key.tar[i, j] or key.non[i, j]: t3 = time.time() - model.x_e = x_e[i].to(device) + model.x_e = x_e[i : i + 1].to(device) if key.tar[i, j]: if attack.targeted: t = non @@ -270,6 +280,7 @@ def eval_cosine_scoring( t = non s_adv = attack.generate(s, t) + # s_adv = s_adv[0, 0] s_adv = torch.from_numpy(s_adv).to(device) with torch.no_grad(): scores[i, j] = model(s_adv).cpu().numpy()[0, 1] @@ -327,13 +338,13 @@ def eval_cosine_scoring( if num_seg_parts > 1: score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) - logging.info("saving scores to %s" % (score_file)) + logging.info("saving scores to %s", score_file) s = TrialScores( key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) ) s.save_txt(score_file) - logging.info("saving stats to %s" % (stats_file)) + logging.info("saving stats to %s", stats_file) attack_stats.to_csv(stats_file) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 0e9493c0..53349dc4 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -128,7 +132,7 @@ def eval_cosine_scoring( with torch.no_grad(): for j in range(ndx.num_tests): t1 = time.time() - logging.info("scoring test utt %s" % (ndx.seg_set[j])) + logging.info("scoring test utt %s", ndx.seg_set[j]) s, fs = audio_reader.read([ndx.seg_set[j]]) s = s[0] fs = fs[0] @@ -144,21 +148,15 @@ def eval_cosine_scoring( t4 = time.time() tot_frames = x_t.shape[1] if vad_spec is not None: - vad = torch.as_tensor( - v_reader.read([ndx.seg_set[j]], num_frames=x_t.shape[1])[0].astype( - np.uint8, copy=False - ), - dtype=torch.uint8, - ).to(device) + vad = v_reader.read([ndx.seg_set[j]], num_frames=x_t.shape[1])[0] + vad = torch.tensor(vad, dtype=torch.bool).to(device) x_t = x_t[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - ndx.seg_set[j], - x_t.shape[1], - tot_frames, - x_t.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + ndx.seg_set[j], + x_t.shape[1], + tot_frames, + x_t.shape[1] / tot_frames * 100, ) t5 = time.time() @@ -169,9 +167,9 @@ def eval_cosine_scoring( for i in range(ndx.num_models): if ndx.trial_mask[i, j]: - y_e_i = torch.as_tensor(y_e[i], dtype=torch.get_default_dtype()).to( - device - ) + y_e_i = torch.as_tensor( + y_e[i : i + 1], dtype=torch.get_default_dtype() + ).to(device) y_e_i = l2_norm(y_e_i) scores_ij = torch.sum(y_e_i * y_t, dim=-1) if calibrator is None: @@ -213,9 +211,9 @@ def eval_cosine_scoring( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--ndx-file", dest="ndx_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--ndx-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -241,7 +239,7 @@ def eval_cosine_scoring( ) parser.add_argument( - "--use-gpu", 
default=False, action="store_true", help="extract xvectors in gpu" + "--use-gpu", default=False, action="store_true", help="evaluate in gpu" ) parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index e0754498..1636e23b 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -10,8 +10,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -260,8 +264,8 @@ def eval_cosine_scoring( for i in range(key.num_models): if key.tar[i, j] or key.non[i, j]: t3 = time.time() - model.x_e = x_e[i].to(device) - tmodel.x_e = t_x_e[i].to(device) + model.x_e = x_e[i : i + 1].to(device) + tmodel.x_e = t_x_e[i : i + 1].to(device) if key.tar[i, j]: if attack.targeted: t = non diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 0f9f375d..fd75ce7a 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -11,9 +11,13 @@ import numpy as np import pandas as pd -from art.classifiers import PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -24,8 +28,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -185,8 +190,8 @@ def eval_cosine_scoring( tmodel.to(device) tmodel.eval() - tar = np.asarray([1], dtype=np.int) - non = np.asarray([0], dtype=np.int) + tar = np.asarray([1], dtype=int) + non = np.asarray([0], dtype=int) logging.info("loading key and enrollment x-vectors") key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) @@ -249,9 +254,7 @@ def eval_cosine_scoring( vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( - device - ) + vad = torch.tensor(vad, dtype=torch.bool).to(device) model.vad_t = vad tmodel.vad_t = vad logging.info( @@ -283,8 +286,8 @@ def eval_cosine_scoring( for i in range(key.num_models): if key.tar[i, j] or key.non[i, j]: t3 = time.time() - model.x_e = x_e[i].to(device) - tmodel.x_e = t_x_e[i].to(device) + model.x_e = x_e[i : i + 1].to(device) + tmodel.x_e = t_x_e[i : i + 1].to(device) if key.tar[i, j]: if attack.targeted: t = non diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py new 
file mode 100755 index 00000000..b7370b9b --- /dev/null +++ b/hyperion/bin/prepare_data.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.data_prep import DataPrep + + +def make_parser(data_prep_class): + parser = ArgumentParser() + data_prep_class.add_class_args(parser) + return parser + + +if __name__ == "__main__": + parser = ArgumentParser( + description="""Prepares a dataset into relational database tables""" + ) + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in DataPrep.registry.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + data_prep_class = DataPrep.registry[args.subcommand] + args = namespace_to_dict(args)[args.subcommand] + + data_prep = data_prep_class(**args) + data_prep.prepare() diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index 57a33b56..da8ebc3f 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -10,12 +10,15 @@ import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, set_float_cpu -# from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py new file mode 100644 index 00000000..7caae8c4 --- /dev/null +++ b/hyperion/data_prep/__init__.py @@ -0,0 +1,8 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +# from .data_prep import data_prep_registry +from .data_prep import DataPrep +from .voxceleb2 import VoxCeleb2DataPrep diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py new file mode 100644 index 00000000..966adeef --- /dev/null +++ b/hyperion/data_prep/data_prep.py @@ -0,0 +1,56 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ActionYesNo +from pathlib import Path + + +class DataPrep: + """Base class for data preparation classes. + + Attributes: + corpus_dir: input data directory + output_dir: output data directory + use_kaldi_ids: puts speaker-id in front of segment id like kaldi + target_sample_freq: target sampling frequency to convert the audios to. 
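[Editor's note] prepare_data.py above builds one CLI subcommand per entry in DataPrep.registry and dispatches on args.subcommand. A self-contained sketch of that jsonargparse pattern with a toy registry (the real flags come from each class's add_class_args):

from jsonargparse import ArgumentParser

registry = {"voxceleb2": object}        # stands in for DataPrep.registry

parser = ArgumentParser()
subcommands = parser.add_subcommands()
for name, cls in registry.items():
    sub = ArgumentParser()
    sub.add_argument("--corpus-dir", required=True)
    subcommands.add_subcommand(name, sub)

args = parser.parse_args(["voxceleb2", "--corpus-dir", "/data/vox2"])
print(args.subcommand)                  # "voxceleb2": selects the class to run
print(args.voxceleb2.corpus_dir)        # nested namespace, cf. namespace_to_dict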
+ """ + + registry = {} + + def __init__(self, corpus_dir, output_dir, use_kaldi_ids, target_sample_freq): + self.corpus_dir = Path(corpus_dir) + self.output_dir = Path(output_dir) + self.use_kaldi_ids = use_kaldi_ids + self.target_sample_freq = target_sample_freq + + self.output_dir.mkdir(exist_ok=True, parents=True) + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls.registry[cls.dataset_name()] = cls + + @staticmethod + def dataset_name(): + raise NotImplementedError() + + @staticmethod + def add_class_args(parser): + parser.add_argument( + "--corpus-dir", required=True, help="""input data directory""", + ) + parser.add_argument( + "--output-dir", required=True, help="""output data directory""", + ) + parser.add_argument( + "--use-kaldi-ids", + default=False, + action=ActionYesNo, + help="""put speaker-id in front of segment id like kaldi""", + ) + parser.add_argument( + "--target-sample-freq", + default=None, + type=int, + help="""target sampling frequency to convert the audios to""", + ) diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py new file mode 100644 index 00000000..25692349 --- /dev/null +++ b/hyperion/data_prep/voxceleb2.py @@ -0,0 +1,169 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ActionYesNo +from pathlib import Path +import re + +import pandas as pd +import numpy as np + +from ..utils.misc import urlretrieve_progress +from ..utils import RecordingSet, SegmentSet, ClassInfo +from .data_prep import DataPrep + + +class VoxCeleb2DataPrep(DataPrep): + """Class for preparing VoxCeleb2 database into tables + + Attributes: + corpus_dir: input data directory + subset: subset of the data dev or test + cat_videos: concatenate utterances from the same video. + output_dir: output data directory + use_kaldi_ids: puts speaker-id in front of segment id like kaldi + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir, + subset, + cat_videos, + output_dir, + use_kaldi_ids, + target_sample_freq, + ): + super().__init__(corpus_dir, output_dir, use_kaldi_ids, target_sample_freq) + self.subset = subset + self.cat_videos = cat_videos + + @staticmethod + def dataset_name(): + return "voxceleb2" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + default="dev", + choices=["dev", "test"], + help="""vox2 subset in [dev, test]""", + ) + parser.add_argument( + "--cat-videos", + default=False, + action=ActionYesNo, + help="""concatenate utterances from the same video.""", + ) + + def _get_metadata(self): + file_name = "vox2_meta.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.openslr.org/resources/49/vox2_meta.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_meta = pd.read_csv(file_path, sep="\t") + print(df_meta.head()) + df_meta.set_index("VoxCeleb2 ID") + return df_meta + + def _get_langs_est(self): + file_name = "lang_vox2_final.csv" + file_path = self.corpus_dir / file_name + if not file_path.exists(): + file_path = self.output_dir / file_name + if not file_path.exists(): + url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox2_final.csv" + file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) + + df_lang = pd.read_csv(file_path, sep=",") + + def get_video(x): + x = re.sub("/.*.wav$", "", x) + x = re.sub("^.*/", "", x) + return x + + df_lang["video"] = df_lang["filename"].apply(get_video) + df_lang["filename"].drop(["filename"], axis=1, inplace=True) + df_lang.drop_duplicates(inplace=True) + df_lang.set_index("video") + return df_lang + + def prepare(self): + df_meta = self._get_metadata() + df_lang = self._get_langs_est() + rec_dir = self.corpus_dir / self.subset + rec_files = list(rec_dir.glob("**/*.m4a")) + speakers = [f.parents[1].name for f in rec_files] + video_ids = [f.parent.name for f in rec_files] + if self.concat_videos: + lists_cat_dir = self.output_dir / "lists_cat" + lists_cat_dir.mkdir(exist_ok=True, parents=True) + uniq_video_ids, uniq_video_idx, video_idx = np.unique( + video_ids, return_index=True, return_inverse=True + ) + rec_ids = uniq_video_ids + speakers = speakers[uniq_video_idx] + if self.use_kaldi_ids: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] + else: + rec_ids = uniq_video_ids + + file_paths = [] + for i, video_id in enumerate(uniq_video_ids): + list_file = lists_cat_dir / f"{video_id}.txt" + with open(list_file, "w") as fw: + rec_mask = video_idx == i + recs_i = rec_files[rec_mask] + for rec in recs_i: + fw.write(f"{rec}\n") + + file_path = f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|" + file_paths.append(file_path) + + video_ids = uniq_video_ids + + else: + file_names = [f.name for f in rec_files] + if self.use_kaldi_ids: + rec_ids = [ + f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names) + ] + else: + rec_ids = [f"{v}-{f}" for v, f in zip(video_ids, file_names)] + + file_paths = [] + for rec_file in rec_files: + file_path = f"ffmpeg -v 8 -i {rec_file} -f wav -acodec pcm_s16le - |" + file_paths.append(file_path) + + recs = pd.DataFrame({"id": rec_ids, "file_path": file_paths}) + recs = RecordingSet(recs) + segments = pd.DataFrame( + { + "id": rec_ids, + "video_ids": video_ids, + "speaker": speakers, + 
"gender": df_meta.loc[speakers, "Gender"], + } + ) + segments = SegmentSet(segments) + uniq_speakers = np.unique(speakers) + speakers = pd.DataFrame( + { + "id": uniq_speakers, + "vgg_id": df_meta.loc[uniq_speakers, "VGGFace2 ID"], + "gender": df_meta.loc[uniq_speakers, "Gender"], + } + ) + speakers = ClassInfo(speakers) + + print(recs) + print(segments) + print(speakers) diff --git a/hyperion/torch/adv_attacks/art_attack_factory.py b/hyperion/torch/adv_attacks/art_attack_factory.py index ba103acf..801ba948 100644 --- a/hyperion/torch/adv_attacks/art_attack_factory.py +++ b/hyperion/torch/adv_attacks/art_attack_factory.py @@ -4,13 +4,24 @@ """ import numpy as np -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo try: from art.attacks import evasion as attacks except ImportError: pass +from ...utils.misc import filter_func_args + + +def make_4d_hook(func): + def wrapper(x, *args, **kwargs): + x = x[None, None] + y = func(x, *args, **kwargs) + return y[0, 0] + + return wrapper + class ARTAttackFactory(object): @staticmethod @@ -28,11 +39,10 @@ def create( num_random_init=0, minimal=False, random_eps=False, - min_eps=None, + min_eps=1e-6, beta=0.001, theta=0.1, gamma=1.0, - etha=0.01, confidence=0.0, lr=1e-2, lr_decay=0.5, @@ -42,9 +52,12 @@ def create( max_iter=10, overshoot=1.1, num_grads=10, - c=1e-3, max_halving=5, max_doubling=5, + tau_decr_factor=0.9, + initial_c=1e-5, + largest_c=20.0, + c_incr_factor=2.0, decision_rule="EN", init_eval=100, max_eval=10000, @@ -53,31 +66,44 @@ def create( use_importance=False, abort_early=True, th=None, + es: int = 0, sigma=0.5, lambda_tv=0.3, - labmda_c=1.0, + lambda_c=1.0, lambda_s=0.5, reg=3000, kernel_size=5, eps_factor=1.1, eps_iter=10, + p_wassertein=2, conj_sinkhorn_iter=400, proj_sinkhorn_iter=400, + sub_dim: int = 10, + bin_search_tol: float = 0.1, + lambda_geoda: float = 0.6, + sigma_geoda: float = 0.0002, + lambda_fadv=0.0, + layers_fadv=[1], + thr_lowpro: float = 0.5, + lambda_lowpro: float = 1.5, + eta_lowpro: float = 0.2, + eta_lowpro_decay: float = 0.98, + eta_lowpro_min: float = 1e-7, + eta_newton: float = 0.01, targeted=False, num_samples=1, eps_scale=1, batch_size=1, ): - eps = eps * eps_scale - eps_step = eps_step * eps_scale - if min_eps is not None: + if attack_type not in ["feature-adv"]: + eps = eps * eps_scale + eps_step = eps_step * eps_scale min_eps = min_eps * eps_scale + delta = delta * eps_scale - attack_set = set( - ["fgm", "pgd", "auto-pgd", "boundary", "cw-linf", "wasserstein"] - ) - if attack_type in attack_set: + attack_l12 = set(["fgm", "pgd", "auto-pgd", "wasserstein"]) + if attack_type in attack_l12: if norm == 1: eps = eps * num_samples eps_step = eps_step * num_samples @@ -98,14 +124,14 @@ def create( epsilon=eps, step_adapt=step_adapt, max_iter=max_iter, - num_trials=num_trials, + num_trial=num_trial, sample_size=sample_size, init_size=init_size, min_epsilon=min_eps, ) - if attack_type == "hop-skin-jump": - return attacks.HopSkinJump( + if attack_type == "hop-skip-jump": + return attacks.HopSkipJump( model, targeted=targeted, norm=norm, @@ -132,7 +158,7 @@ def create( ) if attack_type == "deepfool": - attacks.DeepFool( + return attacks.DeepFool( model, max_iter=max_iter, epsilon=eps, @@ -141,7 +167,7 @@ def create( ) if attack_type == "elasticnet": - attacks.ElasticNet( + return attacks.ElasticNet( model, confidence=confidence, targeted=targeted, @@ -149,13 +175,25 @@ def create( binary_search_steps=binary_search_steps, max_iter=max_iter, 
beta=beta, - initial_const=c, + initial_const=initial_c, batch_size=batch_size, decision_rule=decision_rule, ) + if attack_type == "feature-adv": + return attacks.FeatureAdversariesPyTorch( + model, + delta=delta, + lambda_=lambda_fadv, + layer=tuple(layers_fadv), + max_iter=max_iter, + batch_size=batch_size, + step_size=eps_step, + random_start=num_random_init > 0, + ) + if attack_type == "threshold": - attacks.ThresholdAttack(model, th=th, es=es, targeted=targeted) + return attacks.ThresholdAttack(model, th=th, es=es, targeted=targeted) if attack_type == "fgm": return attacks.FastGradientMethod( @@ -193,15 +231,48 @@ def create( ) if attack_type == "auto-pgd": - return attacks.AutoProjectedGradientDescent( + if len(model.input_shape) == 1: + # autopgd only works with image kind shape + model._input_shape = (1, 1, model.input_shape[0]) + attack = attacks.AutoProjectedGradientDescent( model, norm=norm, eps=eps, eps_step=eps_step, max_iter=max_iter, targeted=targeted, - nb_random_init=num_random_init, - random_eps=random_eps, + nb_random_init=max(1, num_random_init), + batch_size=batch_size, + ) + attack.generate = make_4d_hook(attack.generate) + return attack + + if attack_type == "auto-cgd": + if len(model.input_shape) == 1: + # autopgd only works with image kind shape + model._input_shape = (1, 1, model.input_shape[0]) + attack = attacks.AutoConjugateGradient( + model, + norm=norm, + eps=eps, + eps_step=eps_step, + max_iter=max_iter, + targeted=targeted, + nb_random_init=max(1, num_random_init), + batch_size=batch_size, + ) + attack.generate = make_4d_hook(attack.generate) + return attack + + if attack_type == "geoda": + return attacks.GeoDA( + model, + norm=norm, + sub_dim=sub_dim, + max_iter=max_iter, + bin_search_tol=bin_search_tol, + lambda_param=lambda_geoda, + sigma=sigma_geoda, batch_size=batch_size, ) @@ -210,14 +281,21 @@ def create( model, theta=theta, gamma=gamma, batch_size=batch_size ) - if attack_type == "newtonfool": - return attacks.NewtonFool( - model, eta=eta, max_iter=max_iter, batch_size=batch_size + if attack_type == "low-pro-fool": + return attacks.LowProFool( + model, + n_steps=max_iter, + threshold=thr_lowpro, + lambd=lambda_lowpro, + eta=eta_lowpro, + eta_decay=eta_lowpro_decay, + eta_min=eta_lowpro_min, + norm=norm, ) - if attack_type == "threshold": + if attack_type == "newtonfool": return attacks.NewtonFool( - model, eta=eta, max_iter=max_iter, batch_size=batch_size + model, eta=eta_newton, max_iter=max_iter, batch_size=batch_size ) if attack_type == "cw-l2": @@ -227,8 +305,8 @@ def create( learning_rate=lr, binary_search_steps=binary_search_steps, max_iter=max_iter, - initial_const=c, targeted=targeted, + initial_const=initial_c, max_halving=max_halving, max_doubling=max_doubling, batch_size=batch_size, @@ -241,19 +319,20 @@ def create( learning_rate=lr, max_iter=max_iter, targeted=targeted, - max_halving=max_halving, - max_doubling=max_doubling, - eps=eps, + decrease_factor=tau_decr_factor, + initial_const=initial_c, + largest_const=largest_c, + const_factor=c_incr_factor, batch_size=batch_size, ) if attack_type == "zoo": - return attacks.ZooMethod( + return attacks.ZooAttack( model, confidence, learning_rate=lr, max_iter=max_iter, - initial_const=c, + initial_const=initial_c, targeted=targeted, binary_search_steps=binary_search_steps, abort_early=abort_early, @@ -265,22 +344,33 @@ def create( ) if attack_type == "shadow": - return attacks.ShadowAttack( + if len(model.input_shape) == 1: + # autopgd only works with image kind shape + model._input_shape = (1, 
1, model.input_shape[0]) + + attack = attacks.ShadowAttack( model, sigma=sigma, - num_steps=num_iters, + nb_steps=max_iter, learning_rate=lr, lambda_tv=lambda_tv, lambda_c=lambda_c, lambda_s=lambda_s, - batch_norm=batch_norm, + batch_size=batch_size, targeted=targeted, ) + attack.generate = make_4d_hook(attack.generate) + return attack if attack_type == "wasserstein": - return attacks.Wasserstein( + if len(model.input_shape) == 1: + # autopgd only works with image kind shape + model._input_shape = (1, 1, model.input_shape[0]) + + attack = attacks.Wasserstein( model, targeted=targeted, + p=p_wassertein, regularization=reg, kernel_size=kernel_size, eps=eps, @@ -292,6 +382,8 @@ def create( projected_sinkhorn_max_iter=proj_sinkhorn_iter, batch_size=batch_size, ) + attack.generate = make_4d_hook(attack.generate) + return attack raise Exception("%s is not a valid attack type" % (attack_type)) @@ -307,59 +399,7 @@ def filter_args(**kwargs): else: kwargs["norm"] = int(kwargs["norm"]) - valid_args = ( - "attack_type", - "eps", - "delta", - "step_adapt", - "num_trial", - "sample_size", - "init_size", - "norm", - "eps_step", - "num_random_init", - "minimal", - "random_eps", - "min_eps", - "beta", - "theta", - "gamma", - "etha", - "confidence", - "decision_rule", - "lr", - "lr_decay", - "lr_num_decay", - "momentum", - "binary_search_steps", - "max_iter", - "init_eval", - "max_eval", - "overshoot", - "num_grads", - "c", - "max_halving", - "max_doubling", - "variable_h", - "abort_early", - "num_parallel", - "use_importance", - "th", - "sigma", - "lambda_tv", - "labmda_c", - "lambda_s", - "reg", - "kernel_size", - "eps_factor", - "eps_iter", - "conj_sinkhorn_iter", - "proj_sinkhorn_iter", - "targeted", - ) - - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - + args = filter_func_args(ARTAttackFactory.create, kwargs) return args @staticmethod @@ -371,7 +411,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--attack-type", type=str.lower, - default="fgsm", + default="fgm", choices=[ "boundary", "brendel", @@ -380,12 +420,15 @@ def add_class_args(parser, prefix=None): "bim", "pgd", "auto-pgd", + "auto-cgd", + "feature-adv", + "low-pro-fool", "jsma", "newtonfool", "cw-l2", "cw-linf", "elasticnet", - "hop-skin-jump", + "hop-skip-jump", "zoo", "threshold", "shadow", @@ -571,7 +614,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--min-eps", - default=None, + default=1e-6, type=float, help=("Stop attack if perturbation is smaller than min_eps."), ) @@ -614,12 +657,33 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--c", + "--initial-c", default=1e-2, type=float, help=("Initial weight of constraint function f in carlini-wagner attack"), ) + parser.add_argument( + "--largest-c", + default=20.0, + type=float, + help=("largest weight of constraint function f in carlini-wagner attack"), + ) + + parser.add_argument( + "--c-incr-factor", + default=2, + type=float, + help=("factor to increment c in carline-wagner-l0/inf attack"), + ) + + parser.add_argument( + "--tau-decr-factor", + default=0.9, + type=float, + help=("factor to reduce tau in carline-wagner-linf attack"), + ) + parser.add_argument( "--max-halving", default=5, @@ -635,10 +699,10 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--no-abort", - default=False, - action="store_true", - help=("do not abort early in optimizer iterations"), + "--abort-early", + default=True, + action=ActionYesNo, + help=("abort early in optimizer iterations"), ) parser.add_argument( 
@@ -670,6 +734,14 @@ def add_class_args(parser, prefix=None): "Threshold for threshold attack, None indicates finding the minimum threshold" ), ) + parser.add_argument( + "--es", + default=0, + type=int, + help=( + "Indicates whether the attack uses CMAES (0) or DE (1) as Evolutionary Strategy" + ), + ) parser.add_argument( "--sigma", @@ -704,6 +776,19 @@ def add_class_args(parser, prefix=None): "Scalar penalty weight for similarity of color channels in perturbation" ), ) + parser.add_argument( + "--lambda-fadv", + default=0.0, + type=float, + help=("Regularization parameter of the L-inf soft constraint"), + ) + parser.add_argument( + "--layers-fadv", + default=[1], + type=int, + nargs="+", + help=("indices of the representation layers"), + ) parser.add_argument( "--reg", @@ -730,6 +815,12 @@ def add_class_args(parser, prefix=None): type=int, help=("Number of iterations to increase the epsilon."), ) + parser.add_argument( + "--p-wassertein", + default=2, + type=int, + help=("Lp distance for the Wasserstein distance"), + ) parser.add_argument( "--conj-sinkhorn-iter", default=400, @@ -743,6 +834,65 @@ def add_class_args(parser, prefix=None): help=("maximum number of iterations for the projected sinkhorn optimizer"), ) + parser.add_argument( + "--thr-lowpro", + type=float, + default=0.5, + help="""Lowest prediction probability of a valid adversary for low-pro-fool""", + ) + parser.add_argument( + "--lambda-lowpro", + type=float, + default=1.5, + help="""Amount of lp-norm impact on objective function for low-pro-fool""", + ) + parser.add_argument( + "--eta-lowpro", + type=float, + default=0.2, + help="""Rate of updating the perturbation vectors for low-pro-fool""", + ) + parser.add_argument( + "--eta-lowpro-decay", + type=float, + default=0.98, + help="""Step-by-step decrease of eta for low-pro-fool""", + ) + parser.add_argument( + "--eta-lowpro-min", type=float, default=1e-7, help="""Minimal eta value""" + ) + parser.add_argument( + "--eta-newton", type=float, default=0.01, help="""eta for newtonfool""" + ) + # parser.add_argument( + # "--sub-dim", + # default=10, + # type=int, + # help="Dimensionality of 2D frequency space (DCT).", + # ) + + # parser.add_argument( + # "--bin-search-tol", + # default=0.1, + # type=float, + # help="""Maximum remaining L2 perturbation defining binary search + # convergence""", + # ) + # parser.add_argument( + # "--lambda-geoda", + # default=0.6, + # type=float, + # help="""The lambda of equation 19 with lambda_param=0 corresponding to a + # single iteration and lambda_param=1 to a uniform distribution of + # iterations per step.""", + # ) + # parser.add_argument( + # "--sigma-geoda", + # default=0.0002, + # type=float, + # help="""Variance of the Gaussian perturbation.""", + # ) parser.add_argument( "--targeted", default=False, diff --git a/hyperion/torch/adv_attacks/attack_factory.py b/hyperion/torch/adv_attacks/attack_factory.py index 5d53f6bc..ca89a794 100644 --- a/hyperion/torch/adv_attacks/attack_factory.py +++ b/hyperion/torch/adv_attacks/attack_factory.py @@ -30,7 +30,7 @@ def create( binary_search_steps=9, max_iter=10, abort_early=True, - c=1e-3, + initial_c=1e-3, reduce_c=False, c_incr_factor=2, tau_decr_factor=0.9, @@ -47,6 +47,7 @@ def create( eps = eps * eps_scale alpha = alpha * eps_scale + norm = float(norm) if attack_type == "fgsm": return FGSMAttack( @@ -98,7 +99,7 @@ def create( binary_search_steps, max_iter, abort_early, - c, + initial_c, norm_time=norm_time, time_dim=time_dim, use_snr=use_snr, @@ -114,7 +115,7 @@ def create( lr, max_iter,
abort_early, - c, + initial_c, reduce_c, c_incr_factor, indep_channels, @@ -130,7 +131,7 @@ def create( lr, max_iter, abort_early, - c, + initial_c, reduce_c, c_incr_factor, tau_decr_factor, @@ -219,9 +220,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--norm", - type=float, - default=float("inf"), - choices=[float("inf"), 1, 2], + default="inf", + choices=["inf", "1", "2"], help=("Attack perturbation norm"), ) @@ -284,7 +284,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--c", + "--initial-c", default=1e-2, type=float, help=( diff --git a/hyperion/utils/misc.py b/hyperion/utils/misc.py index 6813c6b7..369962fd 100644 --- a/hyperion/utils/misc.py +++ b/hyperion/utils/misc.py @@ -115,3 +115,53 @@ def filter_func_args(func, kwargs, skip=set()): args = sig.bind_partial(**my_kwargs).arguments return args + + +from tqdm import tqdm + + +def tqdm_urlretrieve_hook(t): + """Wraps tqdm instance. + Don't forget to close() or __exit__() + the tqdm instance once you're done with it (easiest using `with` syntax). + Example + ------- + >>> from urllib.request import urlretrieve + >>> with tqdm(...) as t: + ... reporthook = tqdm_urlretrieve_hook(t) + ... urlretrieve(..., reporthook=reporthook) + Source: https://github.com/tqdm/tqdm/blob/master/examples/tqdm_wget.py + """ + last_b = [0] + + def update_to(b=1, bsize=1, tsize=None): + """ + b : int, optional + Number of blocks transferred so far [default: 1]. + bsize : int, optional + Size of each block (in tqdm units) [default: 1]. + tsize : int, optional + Total size (in tqdm units). If [default: None] or -1, + remains unchanged. + """ + if tsize not in (None, -1): + t.total = tsize + displayed = t.update((b - last_b[0]) * bsize) + last_b[0] = b + return displayed + + return update_to + + +def urlretrieve_progress(url, filename=None, data=None, desc=None): + """ + Works exactly like urllib.request.urlretrieve, but attaches a tqdm hook to display + a progress bar of the download. + Use "desc" argument to display a user-readable string that informs what is being downloaded. 
+ Taken from lhotse: https://github.com/lhotse-speech/lhotse/blob/master/lhotse/utils.py + """ + from urllib.request import urlretrieve + + with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=desc) as t: + reporthook = tqdm_urlretrieve_hook(t) + return urlretrieve(url=url, filename=filename, reporthook=reporthook, data=data) From 947047d3b7641125d4c0ee527db7e51d48cd8d8d Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 14 Apr 2023 11:05:21 -0400 Subject: [PATCH 093/154] deleted adv.v1 and vae.v1 recipes --- egs/voxceleb/adv.v1/README.md | 164 ----- egs/voxceleb/adv.v1/cmd.sh | 25 - egs/voxceleb/adv.v1/datapath.sh | 21 - egs/voxceleb/adv.v1/default_config.sh | 1 - ...g_victim_lresnet34_transfer_resetdnn.v1.sh | 32 - ...fig_victim_resnet34_transfer_lresnet.v1.sh | 61 -- ...ig_victim_resnet34_transfer_resetdnn.v1.sh | 94 --- egs/voxceleb/adv.v1/hyp_utils | 1 - egs/voxceleb/adv.v1/path.sh | 6 - egs/voxceleb/adv.v1/run_001_prepare_data.sh | 26 - egs/voxceleb/adv.v1/run_002_compute_evad.sh | 56 -- .../adv.v1/run_003_prepare_noises_rirs.sh | 67 -- .../run_004_prepare_victim_xvec_train_data.sh | 42 -- ...un_006_prepare_transfer_xvec_train_data.sh | 48 -- .../run_008_extract_xvectors_victim_model.sh | 37 -- ...run_031_extract_xvectors_transfer_model.sh | 41 -- .../adv.v1/run_040_eval_be_victim_model.sh | 57 -- .../adv.v1/run_041_eval_be_transfer_model.sh | 58 -- .../adv.v1/run_042_eval_victim_from_wav.sh | 54 -- .../adv.v1/run_043_eval_whitebox_attacks.sh | 346 ---------- .../run_044_eval_transfer_blackbox_attacks.sh | 481 -------------- ...hitebox_attacks_with_randsmooth_defense.sh | 544 --------------- .../run_053_eval_art_whitebox_attacks.sh | 536 --------------- ..._054_eval_art_transfer_blackbox_attacks.sh | 626 ------------------ egs/voxceleb/adv.v1/steps | 1 - egs/voxceleb/adv.v1/steps_adv | 1 - egs/voxceleb/adv.v1/steps_be | 1 - egs/voxceleb/adv.v1/steps_fe | 1 - egs/voxceleb/adv.v1/steps_pyfe | 1 - egs/voxceleb/adv.v1/steps_xvec | 1 - egs/voxceleb/adv.v1/utils | 1 - egs/voxceleb/vae.v1/README.md | 89 --- egs/voxceleb/vae.v1/cmd.sh | 25 - egs/voxceleb/vae.v1/conf | 1 - egs/voxceleb/vae.v1/datapath.sh | 21 - egs/voxceleb/vae.v1/default_config.sh | 1 - ...e_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh | 30 - ..._resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh | 29 - ...ig_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh | 29 - ...ig_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh | 30 - ...g_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh | 30 - ...g_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh | 30 - ...e_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh | 30 - ...ae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh | 30 - ...ae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh | 30 - ...e_resnet2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh | 31 - ...e_resnet2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh | 31 - ...q_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh | 45 -- ...meansvq_z256cb512x16_c142.opt.lr0.01.v1.sh | 33 - ...kmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh | 33 - ...kmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh | 34 - ...kmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh | 33 - ...meansvq_z256cb512x16_c142.opt.lr0.01.v1.sh | 33 - ...svq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | 43 -- ...vq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh | 43 -- ...svq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | 43 -- ...akmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh | 33 - ...meansvq_z256cb512x128_c18.opt.lr0.01.v1.sh | 33 - ...meansvq_z256cb512x16_c142.opt.lr0.01.v1.sh | 33 - ...kmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh | 33 - ...meansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh | 33 - 
...kmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh | 33 - ...kmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh | 33 - ..._z256cb512x4_c569_predvar.opt.lr0.01.v1.sh | 34 - ...kmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh | 33 - ...kmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh | 33 - ...akmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh | 46 -- ...svq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | 42 -- ...akmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh | 45 -- ...svq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | 45 -- ...svq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | 43 -- egs/voxceleb/vae.v1/hyp_utils | 1 - egs/voxceleb/vae.v1/local | 1 - egs/voxceleb/vae.v1/path.sh | 5 - egs/voxceleb/vae.v1/run_001_prepare_data.sh | 28 - egs/voxceleb/vae.v1/run_002_compute_evad.sh | 57 -- egs/voxceleb/vae.v1/run_003_compute_fbank.sh | 70 -- .../vae.v1/run_004_prepare_augment.sh | 123 ---- .../vae.v1/run_005_compute_fbank_augment.sh | 57 -- .../run_010_prepare_gen_model_train_data.sh | 45 -- egs/voxceleb/vae.v1/run_011_train_model.sh | 137 ---- egs/voxceleb/vae.v1/run_012_eval_recons.sh | 44 -- .../vae.v1/run_013_eval_xvector_asv.sh | 63 -- egs/voxceleb/vae.v1/steps | 1 - egs/voxceleb/vae.v1/steps_be | 1 - egs/voxceleb/vae.v1/steps_fe | 1 - egs/voxceleb/vae.v1/steps_pyfe | 1 - egs/voxceleb/vae.v1/steps_xvec | 1 - egs/voxceleb/vae.v1/utils | 1 - 89 files changed, 5427 deletions(-) delete mode 100644 egs/voxceleb/adv.v1/README.md delete mode 100755 egs/voxceleb/adv.v1/cmd.sh delete mode 100644 egs/voxceleb/adv.v1/datapath.sh delete mode 120000 egs/voxceleb/adv.v1/default_config.sh delete mode 100644 egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh delete mode 100644 egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_lresnet.v1.sh delete mode 100644 egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh delete mode 120000 egs/voxceleb/adv.v1/hyp_utils delete mode 100755 egs/voxceleb/adv.v1/path.sh delete mode 100755 egs/voxceleb/adv.v1/run_001_prepare_data.sh delete mode 100755 egs/voxceleb/adv.v1/run_002_compute_evad.sh delete mode 100755 egs/voxceleb/adv.v1/run_003_prepare_noises_rirs.sh delete mode 100755 egs/voxceleb/adv.v1/run_004_prepare_victim_xvec_train_data.sh delete mode 100755 egs/voxceleb/adv.v1/run_006_prepare_transfer_xvec_train_data.sh delete mode 100755 egs/voxceleb/adv.v1/run_008_extract_xvectors_victim_model.sh delete mode 100755 egs/voxceleb/adv.v1/run_031_extract_xvectors_transfer_model.sh delete mode 100755 egs/voxceleb/adv.v1/run_040_eval_be_victim_model.sh delete mode 100755 egs/voxceleb/adv.v1/run_041_eval_be_transfer_model.sh delete mode 100755 egs/voxceleb/adv.v1/run_042_eval_victim_from_wav.sh delete mode 100755 egs/voxceleb/adv.v1/run_043_eval_whitebox_attacks.sh delete mode 100755 egs/voxceleb/adv.v1/run_044_eval_transfer_blackbox_attacks.sh delete mode 100755 egs/voxceleb/adv.v1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh delete mode 100755 egs/voxceleb/adv.v1/run_053_eval_art_whitebox_attacks.sh delete mode 100755 egs/voxceleb/adv.v1/run_054_eval_art_transfer_blackbox_attacks.sh delete mode 120000 egs/voxceleb/adv.v1/steps delete mode 120000 egs/voxceleb/adv.v1/steps_adv delete mode 120000 egs/voxceleb/adv.v1/steps_be delete mode 120000 egs/voxceleb/adv.v1/steps_fe delete mode 120000 egs/voxceleb/adv.v1/steps_pyfe delete mode 120000 egs/voxceleb/adv.v1/steps_xvec delete mode 120000 egs/voxceleb/adv.v1/utils delete mode 100644 egs/voxceleb/vae.v1/README.md delete mode 100755 egs/voxceleb/vae.v1/cmd.sh delete mode 120000 
egs/voxceleb/vae.v1/conf delete mode 100644 egs/voxceleb/vae.v1/datapath.sh delete mode 120000 egs/voxceleb/vae.v1/default_config.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256swish_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh delete mode 100644 
egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh delete mode 100644 egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh delete mode 120000 egs/voxceleb/vae.v1/hyp_utils delete mode 120000 egs/voxceleb/vae.v1/local delete mode 100755 egs/voxceleb/vae.v1/path.sh delete mode 100755 egs/voxceleb/vae.v1/run_001_prepare_data.sh delete mode 100755 egs/voxceleb/vae.v1/run_002_compute_evad.sh delete mode 100755 egs/voxceleb/vae.v1/run_003_compute_fbank.sh delete mode 100755 egs/voxceleb/vae.v1/run_004_prepare_augment.sh delete mode 100755 egs/voxceleb/vae.v1/run_005_compute_fbank_augment.sh delete mode 100755 egs/voxceleb/vae.v1/run_010_prepare_gen_model_train_data.sh delete mode 100755 egs/voxceleb/vae.v1/run_011_train_model.sh delete mode 100755 egs/voxceleb/vae.v1/run_012_eval_recons.sh delete mode 100755 egs/voxceleb/vae.v1/run_013_eval_xvector_asv.sh delete mode 120000 egs/voxceleb/vae.v1/steps delete mode 120000 egs/voxceleb/vae.v1/steps_be delete mode 120000 egs/voxceleb/vae.v1/steps_fe delete mode 120000 egs/voxceleb/vae.v1/steps_pyfe delete mode 120000 egs/voxceleb/vae.v1/steps_xvec delete mode 120000 egs/voxceleb/vae.v1/utils diff --git a/egs/voxceleb/adv.v1/README.md b/egs/voxceleb/adv.v1/README.md deleted file mode 100644 index cace8c2c..00000000 --- a/egs/voxceleb/adv.v1/README.md +++ /dev/null @@ -1,164 +0,0 @@ -# VoxCeleb Adversarial Attacks Version 1 - -Last update 2021/04/22 - -Recipe to evaluate Adversarial Attacks on x-Vector Speaker Verification Systems - -## Threat Model - -Speaker verification pipeline where: - - Enrollment side is not under attack, x-vectors for enrollment utterances are - pre-computed and stored on disk - - Test side is under Adversarial Attacks. - The attack adds an imperceptible perturbation to the - test waveform to make the system: - - Classify target trials as non-targets - - Classify non-target trials as targets - -As attacks happen in the waveform domain, test x-vectors cannot be precomputed and -need to be recomputed for each trial. -Also, the speaker verification pipeline needs to be fully differentiable from wave to score, -so the attack algorithm can optimize the perturbation noise. - -However, to train the x-vector network, this recipe computes acoustic features and speech augmentations off-line. -See version adv.v1.1 for a newer recipe, which computes features -and augmentations on the fly. - -Two broad types of attacks: - - White-box: the attacker has access to the x-vector model under attack - - Transfer-based Black-box: the attacker doesn't have access to the x-vector model under attack (black-box model), - but has access to another x-vector model (white-box). Perturbation is obtained from the white-box model - and used to attack the black-box model.
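(Editor's illustration, not part of the original README.) In both settings the attack differentiates the cosine score with respect to the test waveform; a minimal per-trial FGSM-style sketch, assuming a differentiable PyTorch x-vector extractor `xvec_model` and a precomputed enrollment embedding `enroll_xvec` (both names are illustrative):

```python
import torch.nn.functional as F

def fgsm_on_cosine_score(xvec_model, enroll_xvec, test_wav, eps, is_target_trial):
    test_wav = test_wav.clone().requires_grad_(True)
    # score = cosine similarity between enrollment and test x-vectors
    score = F.cosine_similarity(enroll_xvec, xvec_model(test_wav), dim=-1)
    # push target-trial scores down and non-target-trial scores up
    loss = score if is_target_trial else -score
    loss.backward()
    return (test_wav - eps * test_wav.grad.sign()).detach()
```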
- -Multiple attack algorithms: FGSM, Iter-FGSM, PGD, Carlini-Wagner. - -## Citing - - If you use this recipe, please cite: -``` -@inproceedings{Villalba2020, -address = {Shanghai, China}, -author = {Villalba, Jes{\'{u}}s and Zhang, Yuekai and Dehak, Najim}, -booktitle = {Interspeech 2020}, -month = {sep}, -title = {{x-Vectors Meet Adversarial Attacks : Benchmarking Adversarial Robustness in Speaker Verification}}, -year = {2020} -} -``` - -## Training Data - - - x-Vector network is trained on Voxceleb2 dev + test with augmentations - - MUSAN noise - - RIR reverberation - -## Test Data - - - Test data is VoxCeleb 1 Original Clean trial list. - - We don't use the larger Entire and Hard lists because of the high computing cost - of these experiments. - -## Usage - - - Run the run_0*.sh scripts in sequence - - By default it will use ResNet34 as victim model and Residual E-TDNN as transfer model - - You can change that by modifying the configuration script. - - For example, to use LResNet34 as transfer model use `config_victim_resnet34_transfer_lresnet.v1.sh` - when calling each of the steps as -```bash -run_0*.sh --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.sh -``` - -## Recipe Steps: - - - `run_001_prepare_data.sh` - - Data preparation script to generate Kaldi style data directories for - - VoxCeleb2 train+test - - VoxCeleb1 Original eval sets - - - `run_002_compute_evad.sh` - - Computes Energy VAD for all datasets - - - `run_003_compute_fbank.sh` - - Computes log-filter-bank acoustic features for all datasets - - - `run_004_prepare_augment.sh` - - Prepares Kaldi style data directories for augmented training data with MUSAN noise and RIR reverberation. - - - `run_005_compute_fbank_augment.sh` - - Computes log-filter-banks for augmented datasets - - - `run_010_prepare_victim_xvec_train_data.sh` - - Prepares features to train the victim x-vector model - - Applies short-time mean normalization and removes silence frames - - Removes utterances shorter than 4 secs and speakers with fewer than 8 utterances. - - Creates training and validation lists for x-vector training - - - `run_011_train_victim_xvector.sh` - - Trains the victim x-vector network - - - `run_012_prepare_transfer_xvec_train_data.sh` - - Prepares features to train the transfer white-box x-vector model - - If training data for victim and transfer models is the same, it does nothing - - - `run_013_train_transfer_xvector.sh` - - Trains the transfer white-box x-vector network - - - `run_030_extract_xvectors_victim_model.sh` - - Extracts x-vectors for VoxCeleb1 test set using the victim model - - - `run_031_extract_xvectors_transfer_model.sh` - - Extracts x-vectors for VoxCeleb1 test set using the transfer model - - - `run_040_eval_be_victim_model.sh` - - Eval cosine scoring back-end without attack on victim model x-vectors - - Trains calibration for the victim model scores - - Results are left in `exp/scores/$nnet_name/cosine/voxceleb1_o_clean_results` - - - `run_041_eval_be_transfer_model.sh` - - Eval cosine scoring back-end without attack on transfer model x-vectors - - Trains calibration for the transfer model scores - - Results are left in `exp/scores/$transfer_nnet_name/cosine/voxceleb1_o_clean_results` - - - `run_042_eval_victim_from_wav.sh` - - Eval cosine scoring back-end without attack on victim model x-vectors - from the test waveform, computing features and x-vectors on the fly.
- - This script is just to check that we get the same result as in step 40. - - You don't need to run it. - - Results are left in `exp/scores/$nnet_name/cosine_from_wav/voxceleb1_o_clean_results` - - - `run_043_eval_whitebox_attacks.sh` - - Eval white box attacks implemented in the Hyperion toolkit: FGSM, Iter-FGSM, PGD, Carlini-Wagner - - Results are left in `exp/scores/$nnet_name/cosine_${attack_related_label}/voxceleb1_o_clean_results` - - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF - - Curves are left in `exp/scores/$nnet_name/cosine_${attack_related_label}_eall/` - - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk - - Wavs are saved to `exp/scores/$nnet_name/cosine_${attack_related_label}/wav` - - - `run_044_eval_transfer_blackbox_attacks.sh` - - Eval transfer black box attacks implemented in the Hyperion toolkit: FGSM, Iter-FGSM, PGD, Carlini-Wagner - - Results are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}/voxceleb1_o_clean_results` - - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF - - Curves are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}_eall/` - - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk - - Wavs are saved to `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_${attack_related_label}/wav` - - - `run_045_eval_whitebox_attacks_with_randsmooth_defense.sh` - - Eval white box attacks with Gaussian randomized smoothing defense. - - Results are left in `exp/scores/$nnet_name/cosine_${attack_related_label}_randsmooth${smooth_sigma}/voxceleb1_o_clean_results` - - - `run_053_eval_art_whitebox_attacks.sh` - - Eval white box attacks implemented in IBM's Adversarial Robustness Toolbox (ART): FGSM, Iter-FGSM, PGD, Carlini-Wagner - - Results are left in `exp/scores/$nnet_name/cosine_art_${attack_related_label}/voxceleb1_o_clean_results` - - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF - - Curves are left in `exp/scores/$nnet_name/cosine_art_${attack_related_label}_eall/` - - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk - - Wavs are saved to `exp/scores/$nnet_name/cosine_art_${attack_related_label}/wav` - - - `run_054_eval_art_transfer_blackbox_attacks.sh` - - Eval transfer black box attacks implemented in IBM's Adversarial Robustness Toolbox (ART): FGSM, Iter-FGSM, PGD, Carlini-Wagner - - Results are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_art_${attack_related_label}/voxceleb1_o_clean_results` - - When using option `--do-analysis true` it calculates curves: SNR vs EER, SNR vs actual DCF, Linf vs EER, Linf vs actual DCF - - Curves are left in `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_art_${attack_related_label}_eall/` - - When using `--save-wav true`, it writes adversarial wavs of successful attacks to disk - - Wavs are saved to `exp/scores/$nnet_name/transfer.$transfer_nnet/cosine_art_${attack_related_label}/wav`
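(Editor's note, not in the original README.) Combining the options documented above, a typical white-box evaluation with analysis curves and saved adversarial wavs would be launched as:

```bash
run_043_eval_whitebox_attacks.sh \
  --config-file global_conf/config_victim_resnet34_transfer_lresnet.v1.sh \
  --do-analysis true --save-wav true
```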
diff --git a/egs/voxceleb/adv.v1/cmd.sh b/egs/voxceleb/adv.v1/cmd.sh deleted file mode 100755 index 9fb941ae..00000000 --- a/egs/voxceleb/adv.v1/cmd.sh +++ /dev/null @@ -1,25 +0,0 @@ -# you can change cmd.sh depending on what type of queue you are using. -# If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run -# commands one by one: most recipes will exhaust the memory on your -# machine). queue.pl works with GridEngine (qsub). slurm.pl works -# with slurm. Different queues are configured differently, with different -# queue names and different ways of specifying things like memory; -# to account for these differences you can create and edit the file -# conf/queue.conf to match your queue's configuration. Search for -# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, -# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. - -if [ "$(hostname -d)" == "cm.gemini" ];then - #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" - export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" - export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" - export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" -else - export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\"" - export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\"" - export cuda_eval_cmd="$train_cmd" -fi - - - diff --git a/egs/voxceleb/adv.v1/datapath.sh b/egs/voxceleb/adv.v1/datapath.sh deleted file mode 100644 index 6d48a66d..00000000 --- a/egs/voxceleb/adv.v1/datapath.sh +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Paths to the databases used in the experiment - -#paths to databases - -if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then - voxceleb1_root=/export/corpora5/VoxCeleb1_v1 - voxceleb2_root=/export/corpora5/VoxCeleb2 - musan_root=/export/corpora5/JHU/musan -elif [ "$(hostname --domain)" == "cm.gemini" ];then - voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 - voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 - musan_root=/expscratch/dgromero/corpora-open/musan -else - echo "Put your database paths here" - exit 1 -fi - - diff --git a/egs/voxceleb/adv.v1/default_config.sh b/egs/voxceleb/adv.v1/default_config.sh deleted file mode 120000 index c91ded65..00000000 --- a/egs/voxceleb/adv.v1/default_config.sh +++ /dev/null @@ -1 +0,0 @@ -global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh deleted file mode 100644 index 39016679..00000000 --- a/egs/voxceleb/adv.v1/global_conf/config_victim_lresnet34_transfer_resetdnn.v1.sh +++ /dev/null @@ -1,32 +0,0 @@ -# Victim model Light ResNet34 x-vector -# For the black-box attacks we use Residual E-TDNN to generate the attacks and transfer them to the Light ResNet34 -# Both models use the same features: 80 fbanks -# Both models use the same training data.
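(Editor's sketch, not part of the original file.) The run_0*.sh recipe steps consume configs like this one by sourcing them, roughly as the scripts later in this patch do:

```bash
config_file=default_config.sh
. parse_options.sh || exit 1   # allows overriding with --config-file
. $config_file                 # defines $nnet_name, $nnet, $transfer_nnet, ...
echo $nnet   # -> exp/xvector_nnets/fbank80_stmn_lresnet34/model_ep0070.pth
```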
- -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# victim x-vector training -nnet_data=voxceleb2cat_train - -# victim x-vector cfg -nnet_type=resnet -nnet_name=${feat_type}_lresnet34 -nnet_cfg=conf/train_lresnet34_xvec.yaml -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - -# transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb2cat_combined - -transfer_nnet_type=resetdnn -transfer_nnet_cfg=train_resetdnn_xvec.yaml -transfer_nnet_name=${feat_type}_resetdnn5x512 -transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name -transfer_nnet=$transfer_nnet_dir/model_ep0070.pth - - diff --git a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_lresnet.v1.sh b/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_lresnet.v1.sh deleted file mode 100644 index 97f4283e..00000000 --- a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_lresnet.v1.sh +++ /dev/null @@ -1,61 +0,0 @@ -# Victim model ResNet34 x-vector -# For the black-box attacks we use Light ResNet34 to generate the attacks and transfer them to the ResNet34 -# Both models use the same features: 80 fbanks -# Both models use the same training data. - -# victim x-vector training -nnet_data=voxceleb2cat_train_combined - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.05 - -nnet_type=resnet34 -dropout=0 -embed_dim=256 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=70 -num_augs=5 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - - -# transfer model training -transfer_nnet_data=voxceleb2cat_train_combined #this can be voxceleb2cat or voxceleb2cat_combined - -transfer_batch_size_1gpu=128 -transfer_eff_batch_size=512 # effective batch size -transfer_min_chunk=400 -transfer_max_chunk=400 -transfer_ipe=1 -transfer_lr=0.05 - -transfer_nnet_type=lresnet34 -transfer_dropout=0 -transfer_embed_dim=256 - -transfer_s=30 -transfer_margin_warmup=20 -transfer_margin=0.3 - -transfer_nnet_opt="--resnet-type $transfer_nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -transfer_nnet_name=${transfer_nnet_type}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -transfer_nnet_num_epochs=70 - -transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name -transfer_nnet=$transfer_nnet_dir/model_ep0070.pth - -
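(Editor's note, not in the original file.) Substituting the values above, the composed experiment names expand to:

```bash
# nnet_name          -> resnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
# transfer_nnet_name -> lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
```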
diff --git a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh b/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh deleted file mode 100644 index 81f78c60..00000000 --- a/egs/voxceleb/adv.v1/global_conf/config_victim_resnet34_transfer_resetdnn.v1.sh +++ /dev/null @@ -1,94 +0,0 @@ -# Victim model ResNet34 x-vector -# For the black-box attacks we use Residual E-TDNN to generate the attacks and transfer them to the ResNet34 -# Both models use the same features: 80 fbanks -# Both models use the same training data. - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# victim x-vector training -nnet_data=voxceleb2cat_train - -# victim x-vector cfg -nnet_type=resnet -nnet_name=${feat_type}_lresnet34 - -nnet_cfg=conf/train_lresnet34_xvec.yaml -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth - -# transfer model training -transfer_nnet_data=voxceleb2cat_train #this can be voxceleb2cat or voxceleb2cat_combined - -transfer_nnet_type=resetdnn -transfer_nnet_name=${feat_type}_resetdnn5x512 -transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name -transfer_nnet=$transfer_nnet_dir/model_ep0070.pth - - - -# # victim x-vector training -# nnet_data=voxceleb2cat_train_combined - -# batch_size_1gpu=32 -# eff_batch_size=512 # effective batch size -# min_chunk=400 -# max_chunk=400 -# ipe=1 -# lr=0.05 - -# nnet_type=resnet34 -# dropout=0 -# embed_dim=256 - -# s=30 -# margin_warmup=20 -# margin=0.3 - -# nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool" -# opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -# lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -# nnet_name=${nnet_type}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -# nnet_num_epochs=70 -# num_augs=5 -# nnet_dir=exp/xvector_nnets/$nnet_name -# nnet=$nnet_dir/model_ep0070.pth - - -# # transfer model training -# transfer_nnet_data=voxceleb2cat_train_combined #this can be voxceleb2cat or voxceleb2cat_combined - -# transfer_batch_size_1gpu=128 -# transfer_eff_batch_size=512 # effective batch size -# transfer_min_chunk=400 -# transfer_max_chunk=400 -# transfer_ipe=1 -# transfer_lr=0.05 - -# transfer_nnet_type=resetdnn -# transfer_num_layers=5 -# transfer_layer_dim=512 -# transfer_expand_dim=1536 -# transfer_dilation="1 2 3 4 1" -# transfer_kernel_sizes="5 3 3 3 1" -# transfer_dropout=0.1 -# transfer_embed_dim=256 - -# transfer_s=30 -# transfer_margin_warmup=20 -# transfer_margin=0.3 - -# transfer_nnet_opt="--tdnn-type $transfer_nnet_type --in-feats 80 --num-enc-blocks $transfer_num_layers --enc-hid-units $transfer_layer_dim --enc-expand-units $transfer_expand_dim --kernel-size $transfer_kernel_sizes --dilation $transfer_dilation" -# transfer_opt_opt="--optim.opt-type adam --optim.lr $transfer_lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp" -# transfer_lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -#
transfer_nnet_name=${transfer_nnet_type}_nl${transfer_num_layers}ld${transfer_layer_dim}_e${transfer_embed_dim}_arcs${transfer_s}m${transfer_margin}_do${transfer_dropout}_adam_lr${transfer_lr}_b${transfer_eff_batch_size}_amp.v1 -# transfer_nnet_num_epochs=70 - -# transfer_nnet_dir=exp/xvector_nnets/$transfer_nnet_name -# transfer_nnet=$transfer_nnet_dir/model_ep0070.pth - - diff --git a/egs/voxceleb/adv.v1/hyp_utils b/egs/voxceleb/adv.v1/hyp_utils deleted file mode 120000 index f6d1eb7a..00000000 --- a/egs/voxceleb/adv.v1/hyp_utils +++ /dev/null @@ -1 +0,0 @@ -../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/path.sh b/egs/voxceleb/adv.v1/path.sh deleted file mode 100755 index 42bfa7be..00000000 --- a/egs/voxceleb/adv.v1/path.sh +++ /dev/null @@ -1,6 +0,0 @@ - -export HYP_ROOT=$(readlink -f `pwd -P`/../../..) -export TOOLS_ROOT=$HYP_ROOT/tools - -. $TOOLS_ROOT/path.sh -HYP_ART_ENV=$HYP_ENV diff --git a/egs/voxceleb/adv.v1/run_001_prepare_data.sh b/egs/voxceleb/adv.v1/run_001_prepare_data.sh deleted file mode 100755 index 8af0f353..00000000 --- a/egs/voxceleb/adv.v1/run_001_prepare_data.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. datapath.sh - - -if [ $stage -le 1 ];then - - # Prepare the VoxCeleb2 dataset for training. - local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train -fi - -if [ $stage -le 2 ];then - # prepare voxceleb1 for test - local/make_voxceleb1_o.pl $voxceleb1_root data -fi diff --git a/egs/voxceleb/adv.v1/run_002_compute_evad.sh b/egs/voxceleb/adv.v1/run_002_compute_evad.sh deleted file mode 100755 index cc3d8296..00000000 --- a/egs/voxceleb/adv.v1/run_002_compute_evad.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -nodes=fs01 -storage_name=$(date +'%m_%d_%H_%M') -vaddir=`pwd`/exp/vad_e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - - -if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $fbankdir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $fbankdir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $fbankdir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $fbankdir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $fbankdir/storage - else - echo "we don't distribute data between multiple machines" - fi - fi -fi - -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? 
$num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done -fi - - diff --git a/egs/voxceleb/adv.v1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/adv.v1/run_003_prepare_noises_rirs.sh deleted file mode 100755 index a448af9a..00000000 --- a/egs/voxceleb/adv.v1/run_003_prepare_noises_rirs.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -# We prepare the noise files and RIR for online speech augmentation - -if [ $stage -le 1 ]; then - - # Prepare the MUSAN corpus, which consists of music, speech, and noise - # suitable for augmentation. - local/make_musan.sh $musan_root 16 data - - for name in musan_noise musan_music - do - steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ - --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ - data/${name} data/${name}_proc_audio exp/${name}_proc_audio - utils/fix_data_dir.sh data/${name}_proc_audio - done - -fi - -if [ $stage -le 2 ]; then - - # Create Babble noise from MUSAN speech files - for name in musan_speech - do - steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ - --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ - data/${name} data/${name}_babble exp/${name}_babble - # utils/fix_data_dir.sh data/${name}_babble - done -fi - -if [ $stage -le 3 ]; then - if [ ! -d "RIRS_NOISES" ]; then - if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then - ln -s ../../sre19-cmn2/v1/RIRS_NOISES - else - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip - fi - fi - local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom - local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom - local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real - for rirs in rirs_smallroom rirs_mediumroom rirs_real - do - #pack all rirs in h5 files - steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs - done - -fi - - diff --git a/egs/voxceleb/adv.v1/run_004_prepare_victim_xvec_train_data.sh b/egs/voxceleb/adv.v1/run_004_prepare_victim_xvec_train_data.sh deleted file mode 100755 index 6939052e..00000000 --- a/egs/voxceleb/adv.v1/run_004_prepare_victim_xvec_train_data.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. 
$config_file - -if [ $stage -le 2 ]; then - # This script preprocess audio for x-vector training - steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ - --storage_name voxceleb-adv.v1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ - data/${nnet_data} data/${nnet_data}_proc_audio_no_sil exp/${nnet_data}_proc_audio_no_sil - hyp_utils/kaldi/utils/fix_data_dir.sh data/${nnet_data}_proc_audio_no_sil - -fi - -if [ $stage -le 3 ]; then - # Now, we remove files with less than 4s - hyp_utils/remove_short_audios.sh --min-len 4 data/${nnet_data}_proc_audio_no_sil - - # We also want several utterances per speaker. Now we'll throw out speakers - # with fewer than 4 utterances. - hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${nnet_data}_proc_audio_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh \ - data/${nnet_data}_proc_audio_no_sil \ - data/${nnet_data}_proc_audio_no_sil/lists_xvec -fi - -exit diff --git a/egs/voxceleb/adv.v1/run_006_prepare_transfer_xvec_train_data.sh b/egs/voxceleb/adv.v1/run_006_prepare_transfer_xvec_train_data.sh deleted file mode 100755 index f80d2924..00000000 --- a/egs/voxceleb/adv.v1/run_006_prepare_transfer_xvec_train_data.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - -if [ "$transfer_nnet_data" == "$nnet_data" ];then - echo "Training data for victim and transfer model are the same" - echo "Skipping this step" - exit 0 -fi - -if [ $stage -le 2 ]; then - # This script preprocess audio for x-vector training - steps_xvec/preprocess_audios_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ - --storage_name voxceleb-adv.v1-$(date +'%m_%d_%H_%M') --use-bin-vad true \ - data/${transfer_nnet_data} data/${transfer_nnet_data}_proc_audio_no_sil exp/${transfer_nnet_data}_proc_audio_no_sil - hyp_utils/kaldi/utils/fix_data_dir.sh data/${transfer_nnet_data}_proc_audio_no_sil - -fi - -if [ $stage -le 3 ]; then - # Now, we remove files with less than 4s - hyp_utils/remove_short_audios.sh --min-len 4 data/${transfer_nnet_data}_proc_audio_no_sil - - # We also want several utterances per speaker. Now we'll throw out speakers - # with fewer than 4 utterances. - hyp_utils/remove_spk_few_utts.sh --min-num-utts 4 data/${transfer_nnet_data}_proc_audio_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh \ - data/${transfer_nnet_data}_proc_audio_no_sil \ - data/${transfer_nnet_data}_proc_audio_no_sil/lists_xvec -fi - -exit diff --git a/egs/voxceleb/adv.v1/run_008_extract_xvectors_victim_model.sh b/egs/voxceleb/adv.v1/run_008_extract_xvectors_victim_model.sh deleted file mode 100755 index 03234eaa..00000000 --- a/egs/voxceleb/adv.v1/run_008_extract_xvectors_victim_model.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -nnet_stage=1 -config_file=default_config.sh -use_gpu=false -xvec_chunk_length=12800 -. parse_options.sh || exit 1; -. 
$config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" - xvec_cmd="$cuda_eval_cmd --mem 4G" -else - xvec_cmd="$train_cmd --mem 12G" -fi - -if [ $stage -le 2 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors_from_wav.sh \ - --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - --feat-config $feat_config \ - $nnet data/$name \ - $xvector_dir/$name - done -fi diff --git a/egs/voxceleb/adv.v1/run_031_extract_xvectors_transfer_model.sh b/egs/voxceleb/adv.v1/run_031_extract_xvectors_transfer_model.sh deleted file mode 100755 index 5daf2ec8..00000000 --- a/egs/voxceleb/adv.v1/run_031_extract_xvectors_transfer_model.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false - -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length 12800" - xvec_cmd="$cuda_eval_cmd" -else - xvec_cmd="$train_cmd" -fi - -nnet_name=$transfer_nnet_name -nnet=$transfer_nnet - -xvector_dir=exp/xvectors/$nnet_name - -if [ $stage -le 1 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? $num_spk:100)) - steps_xvec/extract_xvectors.sh --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - $nnet data/$name \ - $xvector_dir/$name - done -fi - -exit diff --git a/egs/voxceleb/adv.v1/run_040_eval_be_victim_model.sh b/egs/voxceleb/adv.v1/run_040_eval_be_victim_model.sh deleted file mode 100755 index ac8c8a24..00000000 --- a/egs/voxceleb/adv.v1/run_040_eval_be_victim_model.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name -score_plda_dir=$score_dir/cosine - -if [ $stage -le 1 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi - - -if [ $stage -le 2 ];then - local/calibrate_voxceleb1_o_clean.sh --cmd "$train_cmd" $score_plda_dir - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test ${score_plda_dir}_cal_v1 - - for f in $(ls ${score_plda_dir}_cal_v1/*_results); - do - echo $f - cat $f - echo "" - done - - -fi diff --git a/egs/voxceleb/adv.v1/run_041_eval_be_transfer_model.sh b/egs/voxceleb/adv.v1/run_041_eval_be_transfer_model.sh deleted file mode 100755 index b9451768..00000000 --- a/egs/voxceleb/adv.v1/run_041_eval_be_transfer_model.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. 
./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - - -nnet_name=$transfer_nnet_name -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name -score_plda_dir=$score_dir/cosine - -if [ $stage -le 1 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi - -if [ $stage -le 2 ];then - local/calibrate_voxceleb1_o_clean.sh --cmd "$train_cmd" $score_plda_dir - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test ${score_plda_dir}_cal_v1 - - for f in $(ls ${score_plda_dir}_cal_v1/*_results); - do - echo $f - cat $f - echo "" - done - - -fi diff --git a/egs/voxceleb/adv.v1/run_042_eval_victim_from_wav.sh b/egs/voxceleb/adv.v1/run_042_eval_victim_from_wav.sh deleted file mode 100755 index b8ee5ada..00000000 --- a/egs/voxceleb/adv.v1/run_042_eval_victim_from_wav.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 -score_plda_dir=$score_dir/cosine_from_wav - -if [ $stage -le 1 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_xvec/eval_cosine_scoring_from_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 200 \ - --feat-config conf/fbank80_stmn_16k.yaml \ - --cal-file $cal_file \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi - - diff --git a/egs/voxceleb/adv.v1/run_043_eval_whitebox_attacks.sh b/egs/voxceleb/adv.v1/run_043_eval_whitebox_attacks.sh deleted file mode 100755 index 55500abd..00000000 --- a/egs/voxceleb/adv.v1/run_043_eval_whitebox_attacks.sh +++ /dev/null @@ -1,346 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgsm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 2 ];then - score_array=() - stats_array=() - for snr in 30 20 10 0 - do - score_plda_dir=$score_dir/cosine_fgsm_snr${snr} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack snr=$snr" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type snr-fgsm --attack.snr $snr" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_snrall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 3 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha} - echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 
\ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type rand-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_randfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 4 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha} - echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type iter-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 5 ];then - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/cosine_cwl2_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for 
f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - done - done - done - -fi - - -if [ $stage -le 6 ];then - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwrms_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner RMS attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.norm-time --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - -fi - - -if [ $stage -le 7 ];then - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwsnr_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner SNR attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.use-snr --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done -fi - - -exit - diff --git a/egs/voxceleb/adv.v1/run_044_eval_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1/run_044_eval_transfer_blackbox_attacks.sh deleted file mode 100755 index 937b4b6b..00000000 --- 
a/egs/voxceleb/adv.v1/run_044_eval_transfer_blackbox_attacks.sh +++ /dev/null @@ -1,481 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -transfer_feat_config=$feat_config - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -transfer_xvector_dir=exp/xvectors/$transfer_nnet_name -transfer_score_dir=exp/scores/$transfer_nnet_name -transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgsm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 2 ];then - - score_array=() - stats_array=() - - for snr in 30 20 10 0 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_fgsm_snr${snr} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack snr=$snr" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type snr-fgsm --attack.snr $snr" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - 
$transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_fgsm_snrall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 3 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_randfgsm_e${eps}_a${alpha} - echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type rand-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_randfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 4 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_iterfgsm_e${eps}_a${alpha} - echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type iter-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -
local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 5 ];then - - for confidence in 0 1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwl2_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - -fi - - -if [ $stage -le 6 ];then - - for confidence in 0 1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwrms_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner RMS attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 200 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.norm-time --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - 
done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - -fi - - -if [ $stage -le 7 ];then - - for confidence in 0 1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwsnr_conf${confidence}_lr${lr}_noabort_it$it - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner SNR attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.use-snr --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - -fi - - -# if [ $stage -le -8 ];then - -# for confidence in 0 1 -# do -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwl0_conf${confidence} -# echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L0 attack confidence=$confidence" -# steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 1000 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type cw-l0 --confidence $confidence --c-factor 10 \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# data/voxceleb1_test/trials_o_clean \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_plda_dir -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# data/voxceleb1_test/trials_o_clean $score_clean \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ -# $score_analysis_dir/voxceleb1 & -# fi -# done - -# fi
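The cwl2, cwrms and cwsnr stages above run the same Carlini-Wagner style optimization and differ only in how the perturbation size is penalized: plain squared L2 for cw-l2, and, as the flag names suggest, a duration-normalized (RMS) penalty for --attack.norm-time and a signal-energy-relative (SNR) penalty for --attack.use-snr. A minimal sketch of one gradient step under those assumptions; embed_fn, cw_step and the exact penalty formulas are illustrative, not hyperion's actual attack API:

    import torch
    import torch.nn.functional as F

    def cw_step(x, delta, enroll_emb, embed_fn, threshold,
                confidence=0.0, c=1.0, lr=1e-3, penalty="l2"):
        """One CW-style step against a cosine-scoring verifier (sketch)."""
        delta = delta.detach().requires_grad_(True)
        score = F.cosine_similarity(embed_fn(x + delta), enroll_emb, dim=-1)
        # hinge loss: keep pushing until the target trial scores below the
        # calibrated threshold by at least `confidence`
        attack_loss = torch.clamp(score - threshold + confidence, min=0.0).sum()
        if penalty == "l2":      # --attack.attack-type cw-l2
            dist = delta.pow(2).sum()
        elif penalty == "rms":   # assumed meaning of --attack.norm-time
            dist = delta.pow(2).mean()
        else:                    # "snr": assumed meaning of --attack.use-snr
            dist = delta.pow(2).sum() / x.pow(2).sum().clamp_min(1e-8)
        (dist + c * attack_loss).backward()
        return (delta - lr * delta.grad).detach()

Under this reading, --attack.no-abort would simply run all --attack.max-iter steps instead of stopping at the first step that crosses the threshold.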
- - -# if [ $stage -le 9 ];then - -# for confidence in 0 1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_cwlinf_conf${confidence} -# echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner LInf attack confidence=$confidence" -# steps_adv/eval_cosine_scoring_from_transfer_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --transfer-feat-config $transfer_feat_conf --transfer-audio-feat $transfer_feat \ -# --attack-type cw-linf --confidence $confidence --c-factor 2 \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ -# --threshold $thr005 \ -# data/voxceleb1_test/trials_o_clean \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet \ -# $transfer_xvector_dir/voxceleb1_test/xvector.scp \ -# $transfer_nnet \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_plda_dir -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# data/voxceleb1_test/trials_o_clean $score_clean \ -# $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ -# $score_analysis_dir/voxceleb1 & -# fi - -# done - -# fi - -wait - diff --git a/egs/voxceleb/adv.v1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh b/egs/voxceleb/adv.v1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh deleted file mode 100755 index ad2e4cdf..00000000 --- a/egs/voxceleb/adv.v1/run_045_eval_whitebox_attacks_with_randsmooth_defense.sh +++ /dev/null @@ -1,544 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml -sigmas="0.001 0.01" -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - for sigma in $sigmas - do - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_fgsm_e${eps}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgsm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_eall_randsmooth${sigma} - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 2 ];then - - for sigma in $sigmas - do - score_array=() - stats_array=() - for snr in 30 20 10 0 - do - score_plda_dir=$score_dir/cosine_fgsm_snr${snr}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack snr=$snr" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type snr-fgsm --attack.snr $snr" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_fgsm_snrall_randsmooth${sigma} - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi
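What distinguishes this script from the plain whitebox recipe is the --smooth-sigma option passed to every stage: the randomized-smoothing defence perturbs the test waveform with Gaussian noise of standard deviation sigma before embedding, so that small adversarial perturbations are partially averaged out. A minimal sketch of the idea, assuming a hypothetical embed_fn and that several noisy draws are averaged (a single draw is also possible):

    import torch

    def smoothed_embedding(x, embed_fn, sigma, n_draws=8):
        """Average x-vectors over Gaussian-noised copies of the input (sketch)."""
        embs = [embed_fn(x + sigma * torch.randn_like(x)) for _ in range(n_draws)]
        return torch.stack(embs).mean(dim=0)

Larger sigma buys more robustness at the cost of clean accuracy, which is why the recipe sweeps the two values in sigmas.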
- - -if [ $stage -le 3 ];then - for sigma in $sigmas - do - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type rand-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_randfgsm_eall_randsmooth${sigma} - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 4 ];then - for sigma in $sigmas - do - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type iter-fgsm --attack.eps $eps --attack.alpha $alpha" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav --smooth-sigma $sigma \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_iterfgsm_eall_randsmooth${sigma} - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - done -fi - - -if [ $stage -le 5 ];then - - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - - score_plda_dir=$score_dir/cosine_cwl2_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence lr=$lr num-its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -
--cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - done - done - done - done -fi - - -if [ $stage -le 6 ];then - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwrms_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner RMS attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.norm-time --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - done -fi - - -if [ $stage -le 7 ];then - for sigma in $sigmas - do - - for confidence in 0 #1 - do - for lr in 0.001 - do - for it in 10 - do - score_plda_dir=$score_dir/cosine_cwsnr_conf${confidence}_lr${lr}_noabort_it${it}_randsmooth${sigma} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner SNR attack confidence=$confidence lr=$lr num_its=$it" - steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd -tc 15" $eval_args --nj 100 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence --attack.lr $lr --attack.no-abort --attack.use-snr --attack.max-iter $it" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - 
echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - done - done - done -fi - - -exit - - - -# #!/bin/bash -# # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# # -# # Apache 2.0. -# # -# . ./cmd.sh -# . ./path.sh -# set -e - -# stage=1 -# config_file=default_config.sh -# use_gpu=false -# do_analysis=false -# save_wav=false - -# . parse_options.sh || exit 1; -# . $config_file -# . datapath.sh - -# if [ "$use_gpu" == "true" ];then -# eval_args="--use-gpu true" -# eval_cmd="$cuda_eval_cmd" -# else -# eval_cmd="$train_cmd" -# fi - -# xvector_dir=exp/xvectors/$nnet_name -# score_dir=exp/scores/$nnet_name - -# score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -# cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -# #thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -# thr005=2.94 -# thr001=4.60 -# thr0001=6.90 -# declare -a score_array -# declare -a stats_array - -# if [ $stage -le 1 ];then - -# for sigma in 0.001 0.01 -# do -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# score_plda_dir=$score_dir/cosine_fgsm_e${eps}_randsmooth${sigma} -# echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type fgsm --eps $eps \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ -# data/voxceleb1_test/trials_o_clean \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_fgsm_eall_randsmooth$sigma -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# done - -# fi - - - - -# if [ $stage -le 3 ];then -# for sigma in 0.001 0.01 -# do -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_randfgsm_e${eps}_a${alpha}_randsmooth$sigma -# echo "Eval Voxceleb 1 with Cosine scoring with Rand-FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type rand-fgsm --eps $eps --alpha $alpha --smooth-sigma $sigma\ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 \ -# data/voxceleb1_test/trials_o_clean \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores 
$score_plda_dir/voxceleb1_stats - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# done - -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_randfgsm_eall_randsmooth$sigma -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# done -# fi - - -# if [ $stage -le 4 ];then -# for sigma in 0.001 0.01 -# do -# score_array=() -# stats_array=() -# for eps in 0.00001 0.0001 0.001 0.01 0.1 -# do -# alpha=$(echo $eps | awk '{ print $0/5.}') -# score_plda_dir=$score_dir/cosine_iterfgsm_e${eps}_a${alpha}_randsmooth$sigma -# echo "Eval Voxceleb 1 with Cosine scoring with Iterative FGSM attack eps=$eps" -# steps_adv/eval_cosine_scoring_from_adv_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ -# --feat-config conf/fbank80_16k.pyconf --audio-feat logfb \ -# --attack-type iter-fgsm --eps $eps --alpha $alpha \ -# --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ -# --cal-file $cal_file --threshold $thr005 --smooth-sigma $sigma \ -# data/voxceleb1_test/trials_o_clean \ -# data/voxceleb1_test/utt2model \ -# data/voxceleb1_test \ -# $xvector_dir/voxceleb1_test/xvector.scp \ -# $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - -# score_array+=($score_plda_dir/voxceleb1_scores) -# stats_array+=($score_plda_dir/voxceleb1_stats) - -# $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ -# local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - -# for f in $(ls $score_plda_dir/*_results); -# do -# echo $f -# cat $f -# echo "" -# done - -# done -# if [ "${do_analysis}" == "true" ];then -# score_analysis_dir=$score_dir/cosine_iterfgsm_eall_randsmooth$sigma -# local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ -# data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ -# $score_analysis_dir/voxceleb1 & -# fi -# done -# fi - -# wait diff --git a/egs/voxceleb/adv.v1/run_053_eval_art_whitebox_attacks.sh b/egs/voxceleb/adv.v1/run_053_eval_art_whitebox_attacks.sh deleted file mode 100755 index 3d01fbfa..00000000 --- a/egs/voxceleb/adv.v1/run_053_eval_art_whitebox_attacks.sh +++ /dev/null @@ -1,536 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 - -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_art_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 2 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgsm_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgsm_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - -if [ $stage -le 3 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_art_fgml1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 attack eps=$eps" - 
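# The stages in this script sweep ART's fast gradient method (FGM) under
# different norms via --attack.norm: with no norm given, the step is
# eps * sign(gradient) (the usual L-inf FGSM); with --attack.norm 1 or 2 the
# gradient is first normalized by its L1 or L2 norm and then scaled by eps.
# The *_minimal variants add --attack.minimal, which in ART's
# FastGradientMethod searches in increments of --attack.eps-step for the
# smallest perturbation (up to eps) that already changes the decision.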
steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 4 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgml1_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L1 minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml1_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 5 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/cosine_art_fgml2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls 
$score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 6 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_fgml2_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM-L2 minimal attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.eps-step $alpha --attack.minimal --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_fgml2_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 7 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_iterfgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with IterFGM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 8 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo 
$eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdlinf_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_pgdlinf_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - - -if [ $stage -le 9 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdl1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_pgdl1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 10 ];then - score_array=() - stats_array=() - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/cosine_art_pgdl2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - 
$xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/cosine_art_pgdl2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi -fi - -if [ $stage -le 11 ];then - - for confidence in 0 #1 - do - score_plda_dir=$score_dir/cosine_art_cwl2_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - - -if [ $stage -le 12 ];then - - for confidence in 0 #1 - do - score_plda_dir=$score_dir/cosine_art_cwlinf_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner Linf attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 400 \ - --feat-config $feat_config \ - --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - - diff --git a/egs/voxceleb/adv.v1/run_054_eval_art_transfer_blackbox_attacks.sh b/egs/voxceleb/adv.v1/run_054_eval_art_transfer_blackbox_attacks.sh deleted file mode 100755 index 254cef78..00000000 --- 
a/egs/voxceleb/adv.v1/run_054_eval_art_transfer_blackbox_attacks.sh +++ /dev/null @@ -1,626 +0,0 @@ -#!/bin/bash -# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -do_analysis=false -save_wav=false -feat_config=conf/fbank80_stmn_16k.yaml - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -transfer_feat_config=$feat_config - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name -score_dir=exp/scores/$nnet_name - -score_clean=$score_dir/cosine_cal_v1/voxceleb1_scores -cal_file=$score_dir/cosine_cal_v1/cal_tel.h5 - -transfer_xvector_dir=exp/xvectors/$transfer_nnet_name -transfer_score_dir=exp/scores/$transfer_nnet_name -transfer_cal_file=$transfer_score_dir/cosine_cal_v1/cal_tel.h5 - -#thresholds for p=(0.05,0.01,0.001) -> thr=(2.94, 4.60, 6.90) -thr005=2.94 -thr001=4.60 -thr0001=6.90 -declare -a score_array -declare -a stats_array - -if [ $stage -le 1 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 2 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - 
$transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgsm_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 3 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 4 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM minimal L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh 
data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml1_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 5 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with FGM L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 6 ];then - - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring FGM minimal L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type fgm --attack.eps $eps --attack.minimal --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == 
"true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_fgml2_minimal_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 7 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with iter FGSM attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type bim --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_iterfgsm_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - - -if [ $stage -le 8 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD Linf attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdlinf_eall - local/attack_analysis.sh --cmd 
"$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 9 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L1 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 1" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl1_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - $score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 10 ];then - score_array=() - stats_array=() - - for eps in 0.00001 0.0001 0.001 0.01 0.1 - do - alpha=$(echo $eps | awk '{ print $0/5.}') - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_e${eps} - echo "Eval Voxceleb 1 with Cosine scoring with PGD L2 attack eps=$eps" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 80 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type pgd --attack.eps $eps --attack.eps-step $alpha --attack.max-iter 10 --attack.norm 2" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - score_array+=($score_plda_dir/voxceleb1_scores) - stats_array+=($score_plda_dir/voxceleb1_stats) - - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_pgdl2_eall - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean "${score_array[*]}" "${stats_array[*]}" \ - 
$score_analysis_dir/voxceleb1 & - fi - -fi - - -if [ $stage -le 11 ];then - - for confidence in 0 #1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwl2_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner L2 attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 20 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-l2 --attack.confidence $confidence" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - - -if [ $stage -le 12 ];then - - for confidence in 0 #1 - do - score_plda_dir=$score_dir/transfer.${transfer_nnet_name}/cosine_art_cwlinf_conf${confidence} - echo "Eval Voxceleb 1 with Cosine scoring with Carlini-Wagner Linf attack confidence=$confidence" - steps_adv/eval_cosine_scoring_from_transfer_art_test_wav.sh --cmd "$eval_cmd" $eval_args --nj 40 \ - --feat-config $feat_config \ - --transfer-feat-config $transfer_feat_config \ - --attack-opts "--attack.attack-type cw-linf --attack.confidence $confidence --attack.eps 0.3" \ - --save-wav $save_wav --save-wav-path $score_plda_dir/wav \ - --cal-file $cal_file --transfer-cal-file $transfer_cal_file \ - --threshold $thr005 \ - data/voxceleb1_test/trials_o_clean \ - data/voxceleb1_test/utt2model \ - data/voxceleb1_test \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $nnet \ - $transfer_xvector_dir/voxceleb1_test/xvector.scp \ - $transfer_nnet \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats - - $train_cmd --mem 10G $score_plda_dir/log/score_voxceleb1.log \ - local/score_voxceleb1_o_clean.sh data/voxceleb1_test $score_plda_dir - - for f in $(ls $score_plda_dir/*_results); - do - echo $f - cat $f - echo "" - done - if [ "${do_analysis}" == "true" ];then - score_analysis_dir=$score_plda_dir - local/attack_analysis.sh --cmd "$train_cmd --mem 10G" \ - data/voxceleb1_test/trials_o_clean $score_clean \ - $score_plda_dir/voxceleb1_scores $score_plda_dir/voxceleb1_stats \ - $score_analysis_dir/voxceleb1 & - fi - - done - -fi - -wait - diff --git a/egs/voxceleb/adv.v1/steps b/egs/voxceleb/adv.v1/steps deleted file mode 120000 index aede39fe..00000000 --- a/egs/voxceleb/adv.v1/steps +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/steps_adv b/egs/voxceleb/adv.v1/steps_adv deleted file mode 120000 index fa9be351..00000000 --- a/egs/voxceleb/adv.v1/steps_adv +++ /dev/null @@ -1 
+0,0 @@ -hyp_utils/adv \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/steps_be b/egs/voxceleb/adv.v1/steps_be deleted file mode 120000 index b2098c2a..00000000 --- a/egs/voxceleb/adv.v1/steps_be +++ /dev/null @@ -1 +0,0 @@ -../v1/steps_be \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/steps_fe b/egs/voxceleb/adv.v1/steps_fe deleted file mode 120000 index 73ccc1eb..00000000 --- a/egs/voxceleb/adv.v1/steps_fe +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/kaldi/vad \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/steps_pyfe b/egs/voxceleb/adv.v1/steps_pyfe deleted file mode 120000 index 7b9d122a..00000000 --- a/egs/voxceleb/adv.v1/steps_pyfe +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/feats \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/steps_xvec b/egs/voxceleb/adv.v1/steps_xvec deleted file mode 120000 index af66a94d..00000000 --- a/egs/voxceleb/adv.v1/steps_xvec +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/xvectors \ No newline at end of file diff --git a/egs/voxceleb/adv.v1/utils b/egs/voxceleb/adv.v1/utils deleted file mode 120000 index 3d590a1d..00000000 --- a/egs/voxceleb/adv.v1/utils +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/README.md b/egs/voxceleb/vae.v1/README.md deleted file mode 100644 index 1514fae4..00000000 --- a/egs/voxceleb/vae.v1/README.md +++ /dev/null @@ -1,89 +0,0 @@ -# VoxCeleb Version 3 - -Last update 2020/07/09 - -This recipe is a work in progress. - -Recipe to evaluate generative models on VoxCeleb. -We train models on VoxCeleb2 and evaluate on full VoxCeleb1. -The goal is to evaluate the ability of generative models to -reconstruct VoxCeleb1 data or to generate data from scratch. - -## Models included: - - The following models can be evaluated with this recipe: - - Basic Autoencoders (AE) - - Variational Autoencoders (VAE) - - VQ-VAE - - Denoising AE, VAE, VQ-VAE - -## Training Data - - - Autoencoders, VAE, VQ-VAE, GAN are trained on - - VoxCeleb2 dev+test - - Denoising versions are trained on - - VoxCeleb2 dev+test + augmentation with - - MUSAN noise - - RIR reverberation - -## Test Data - - - Test data is the full VoxCeleb 1 - -## Usage - - - Run the run_stepnumber_*.sh scripts in sequence - - Depending on the model that you are testing, you can skip some steps - - if not running denoising versions, skip steps 3 and 4 - - Run only the train/eval steps corresponding to the model that you are using - -## Results - -We compute the average of the metrics across VoxCeleb1; values in parentheses are std. -We report EER on the VoxCeleb1 Test Original Clean Task using reconstructed log-filter-banks and the LResNet34 x-vector trained in recipe v1.1. -Baseline EER=1.94% when using original log-filter-banks. - -### Models trained without augmentation - -| Config | Model Type | Architecture | Latent-channels | Compression (bits x/bits z) | ELBO/dim (std) | MSE (std) | L1 (std) | codebook size | EER(%) | -| ------ | ---------- | ------------ | :--------: | :-------: | :----: | :----: | :----: | :----: | :----: | -| config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh | VAE | DC1d Enc-Dec
dc-blocks=4 / hid-channels=256 | 80 | 8 | -1.96 (0.62) | 1.57 (0.91) | 0.90 (0.24) | | 16.36 | -| config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh | VAE | DC1d Enc-Dec
dc-blocks=9 / hid-channels=256 | 80 | 8 | -1.95 (0.62) | 1.56 (0.91) | 0.89 (0.24) | -| config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh | VAE | ResNet1d Enc-Dec
res-blocks=4/ hid-channels=256 | 80 | 8 | -1.97 (0.65) | 1.55 (0.93) | 0.89 (0.25) | | 15.05 | -| config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh | VAE | ResNet1d Enc-Dec
res-blocks=8/ hid-channels=256 | 80 | 8 | -1.98 (0.65) | 1.55 (0.93) | 0.88 (0.25) | | 13.45 | -| config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh | VAE | ResNet1d Enc-Dec
res-blocks=16/ hid-channels=256 | 80 | 8 | -1.98 (0.69) | 1.54 (0.94) | 0.88 (0.25) | | 13.45 | -| config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh | VAE | DC2d Enc-Dec
dc-blocks=4 / hid-channels=64 | 80 | 0.8 | -2.25 (1.00) | 1.49 (1.06) | 0.84 (0.29) | | 10.04 | -| config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh | VAE | DC2d Enc-Dec
dc-blocks=8 / hid-channels=64 | 80 | 0.8 | -2.23 (1.00) | 1.49 (1.06) | 0.84 (0.29) | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 2275 | -1.84 (0.21) | 2.20 (0.71) | 1.12 (0.16) | 512 | 28.42 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 1138 | -1.79 (0.32) | 1.86 (0.78) | 1.01 (0.19) | 512x2 | 22.08 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 569 | -1.40 (0.43) | 1.69 (0.83) | 0.95 (0.21) | 512x4 | 19.18 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 569 | -1.78 (0.42) | 1.70 (0.83) | 0.95 (0.21) | 512x4 | 18.16 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 284 | -1.87 (0.59) | 1.56 (0.89) | 0.89 (0.23) | 512x8 | 15.48 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 142 | -2.04 (0.83) | 1.46 (0.96) | 0.84 (0.27) | 512x16 | 11.77 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 71 | -2.15 (1.4) | 1.43 (1.08) | 0.80 (0.32) | 512x32 | 8.13 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 36 | -9.27 (8.31) | 1.49 (1.22) | 0.79 (0.36) | 512x64 | 6.41 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 18 | -20.97 (20.62) | 1.46 (1.24) | 0.77 (0.38) | 512x128 | 5.67 | -| config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 9 | -27.91 (26.00) | 1.49 (1.27) | 0.78 (0.39) | 512x256 | 5.41 | -| config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc
blocks=6/ d_model=512 / heads=8 / d_ff=2048 | 512 | 36 | -1.74(0.31) | 0.48 (0.15) | 0.52 (0.08) | 512x8 | 10.49 | -| config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc
blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048 | 512 | 36 | -1.61(0.15) | 0.42 (0.08) | 0.49 (0.05) | 512x8 | 4.26 | -| config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc
blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048
RAdam Opt. | 512 | 36 | -1.33(0.15) | 0.28 (0.05) | 0.40 (0.03) | 512x8 | 4.06 | -| config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc
blocks=6 / d_model=512 / heads=8 / d_ff=2048
Rel. Pos Enc.
RAdam Opt. | 512 | 36 | -1.29(0.10) | 0.27 (0.05) | 0.39 (0.03) | 512x8 | 4.21 | -| config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc
blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048
Rel. Pos Enc.
RAdam Opt. | 512 | 36 | -1.30(0.09) | 0.27 (0.04) | 0.39 (0.03) | 512x8 | 4.02 | -| config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Conformer Enc
blocks=6 / d_model=512 / heads=8 / att-context=25 / d_ff=2048
RAdam Opt. | 512 | 36 | -1.26(0.10) | 0.28 (0.04) | 0.39 (0.03) | 512x8 | 4.06 | - - -### Models trained with augmentation (Denoising versions) - -| Config | Model Type | Architecture | Latent-channels | Compression (bits x/bits z) | ELBO/dim (std) | MSE (std) | L1 (std) | codebook size | EER(%) | -| ------ | ---------- | ------------ | :--------: | :-------: | :----: | :----: | :----: | :----: | :----: | -| config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh | VAE | ResNet1d Enc-Dec
res-blocks=16 / hid-channels=256 | 80 | 8 | -1.77 (0.33) | 1.67 (0.87) | 0.94 (0.22) | | 16.70 | -| config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh | VAE | ResNet2d Enc-Dec
res-blocks=16 / base-channels=64 | 80 | 0.8 | -1.77 (0.39) | 1.57 (0.92) | 0.89 (0.25) | | 12.40 | -| config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 569 | -1.75 (0.29) | 1.78 (0.84) | 0.98 (0.21) | 512x4 | 18.37 | -| config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 284 | -1.80 (0.42) | 1.69 (0.83) | 0.95 (0.21) | 512x8 | 15.19 | -| config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 142 | -1.81 (0.42) | 1.55 (0.97) | 0.87 (0.26) | 512x16 | 11.37 | -| config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh | VQ-VAE | ResNet1d Enc-Dec
res-blocks=8 / hid-channels=256 | 256 | 71 | -1.95 (0.49) | 1.47 (1.03) | 0.83 (0.30) | 512x32 | 8.75 | -| config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh | VQ-VAE | Xformer Enc
blocks=6/ d_model=512 / heads=8 / att-context=25 / d_ff=2048
Radam Opt. | 512 | 36 | -1.85 (0.13) | 0.56 (0.31) | 0.57 (0.11) | 512x8 | 5.3 | -| config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh | VQ-VAE | Xformer Enc
blocks=6/ d_model=512 / heads=8 / d_ff=2048
Rel. Pos. Enc
Radam Opt. | 512 | 36 | -1.77 (0.05) | 0.43 (0.10) | 0.51 (0.04) | 512x8 | 4.56 | -| config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh | VQ-VAE | Conformer Enc
blocks=6/ d_model=512 / heads=8 / d_ff=2048
Rel. Pos. Enc
Radam Opt. | 512 | 36 | -1.83 (0.05) | 0.59 (0.11) | 0.59 (0.04) | 512x8 | 6.56 | - - diff --git a/egs/voxceleb/vae.v1/cmd.sh b/egs/voxceleb/vae.v1/cmd.sh deleted file mode 100755 index fe9c55b0..00000000 --- a/egs/voxceleb/vae.v1/cmd.sh +++ /dev/null @@ -1,25 +0,0 @@ -# you can change cmd.sh depending on what type of queue you are using. -# If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run -# commands one by one: most recipes will exhaust the memory on your -# machine). queue.pl works with GridEngine (qsub). slurm.pl works -# with slurm. Different queues are configured differently, with different -# queue names and different ways of specifying things like memory; -# to account for these differences you can create and edit the file -# conf/queue.conf to match your queue's configuration. Search for -# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, -# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. - -if [ "$(hostname -d)" == "cm.gemini" ];then - #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" - export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" - export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" - export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" -else - export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" - export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" - export cuda_eval_cmd="$train_cmd" -fi - - - diff --git a/egs/voxceleb/vae.v1/conf b/egs/voxceleb/vae.v1/conf deleted file mode 120000 index 7dfe9dce..00000000 --- a/egs/voxceleb/vae.v1/conf +++ /dev/null @@ -1 +0,0 @@ -../../sre19-cmn2/v1/conf \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/datapath.sh b/egs/voxceleb/vae.v1/datapath.sh deleted file mode 100644 index 632362a7..00000000 --- a/egs/voxceleb/vae.v1/datapath.sh +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# -# Paths to the databases used in the experiment - -#paths to databases - -if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then - voxceleb1_root=/export/corpora5/VoxCeleb1 - voxceleb2_root=/export/corpora5/VoxCeleb2 - musan_root=/export/corpora5/JHU/musan -elif [ "$(hostname --domain)" == "cm.gemini" ];then - voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 - voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 - musan_root=/expscratch/dgromero/corpora-open/musan -else - echo "Put your database paths here" - exit 1 -fi - - diff --git a/egs/voxceleb/vae.v1/default_config.sh b/egs/voxceleb/vae.v1/default_config.sh deleted file mode 120000 index 5755326d..00000000 --- a/egs/voxceleb/vae.v1/default_config.sh +++ /dev/null @@ -1 +0,0 @@ -global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh deleted file mode 100644 index 19b1cedf..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,30 +0,0 @@ -# Denoising VAE with symmetric ResNet1D encoder-decoder with -# 16 residual blocks, 256 dim per block, latent_dim=80, compression factor=8 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=128 -eff_batch_size=512 # effective batch size 
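-# (sketch, assumption: the trainer reaches eff_batch_size via gradient
-#  accumulation and/or multiple GPUs, e.g. 512 = batch_size_1gpu(128) x 4)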
-min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=dvae -narch=resnet1d -vae_opt="--in-feats 80" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 3 4 6 3 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 3 4 6 3 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b16d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=90 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0090.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh deleted file mode 100644 index 68fbba13..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_dvae_resnet2d_b16c64_z80_c0.8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,29 +0,0 @@ -# Denoising VAE with symmetric ResNet2D encoder-decoder with -# 16 residual blocks, 64 base channels, latent_dim=80, compression factor=0.8 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=dvae -narch=resnet2d -enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 2 2 2 2 --enc.resb-channels 64 128 256 512 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 2 2 2 2 --dec.resb-channels 512 256 128 64 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b16c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=100 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0100.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh deleted file mode 100644 index 3dc324ae..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b4d256_z80_c8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,29 +0,0 @@ -# VAE with 
symmetric DC1 encoder-decoder with 4 layers, 256 dim per layer, latent_dim=80, compression factor=8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=512 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=dc1d -vae_opt="--in-feats 80" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.conv-repeats 1 1 1 1 --enc.conv-channels 256 --enc.conv-kernel-sizes 3 --enc.conv-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.conv-repeats 1 1 1 1 --dec.conv-channels 256 --dec.conv-kernel-sizes 3 --dec.conv-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b4d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=540 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0540.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh deleted file mode 100644 index 11d79a6b..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_dc1d_b9d256_z80_c8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,30 +0,0 @@ -# VAE with symmetric DC1 encoder-decoder with 9 layers, 256 dim per layer, latent_dim=80, compression factor=8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=512 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=dc1d -vae_opt="--in-feats 80" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.conv-repeats 2 2 3 2 --enc.conv-channels 256 --enc.conv-kernel-sizes 3 --enc.conv-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.conv-repeats 2 2 3 2 --dec.conv-channels 256 --dec.conv-kernel-sizes 3 --dec.conv-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" - -nnet_name=${model_type}_${narch}_b9d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=550 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0550.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh deleted file mode 100644 index 
6de722df..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,30 +0,0 @@ -# VAE with symmetric deep conv 2D encoder-decoder with -# 4 residual blocks, 64 base channels , latent_channels=80, compression factor=0.8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=64 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=dc2d -vae_opt="" -enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.conv-repeats 1 1 1 1 --enc.conv-channels 64 128 256 512 --enc.conv-kernel-sizes 3 --enc.conv-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.conv-repeats 1 1 1 1 --dec.conv-channels 512 256 128 64 --dec.conv-kernel-sizes 3 --dec.conv-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b4c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=500 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0440.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh deleted file mode 100644 index 879ce269..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_dc2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,30 +0,0 @@ -# VAE with symmetric deep conv 2D encoder-decoder with -# 8 residual blocks, 64 base channels , latent_channels=80, compression factor=0.8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=dc2d -vae_opt="" -enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.conv-repeats 2 2 2 2 --enc.conv-channels 64 128 256 512 --enc.conv-kernel-sizes 3 --enc.conv-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.conv-repeats 2 2 2 2 --dec.conv-channels 512 256 128 64 --dec.conv-kernel-sizes 3 --dec.conv-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=400 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0400.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth 
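The compression factors baked into these config names can be sanity-checked with a little arithmetic (a minimal sketch, assuming 80-dim float32 input features, the x8 temporal downsampling implied by the strides 1 2 2 2 in these encoders, and log2(512)=9 bits per codebook index):

# bits-in / bits-out for the VQ-VAE ResNet1d configs
for g in 4 8 16 32 64 128 256; do
  awk -v g=$g 'BEGIN{ printf "512x%d -> c=%.0f\n", g, 80*32*8/(g*9) }'
done
# -> 569, 284, 142, 71, 36, 18, 9, matching the c569 ... c9 name suffixes

The same arithmetic gives c8 for the plain VAEs (80 float latents at 1/8 frame rate) and, if the 2d latents keep a 1/8-downsampled frequency axis, c0.8 for the dc2d/resnet2d models.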
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh deleted file mode 100644 index aca516a1..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b16d256_z80_c8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,30 +0,0 @@ -# VAE with symmetric ResNet1D encoder-decoder with -# 16 residual blocks, 256 dim per block, latent_dim=80, compression factor=8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=128 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=resnet1d -vae_opt="--in-feats 80" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 3 4 6 3 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 3 4 6 3 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b16d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=410 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0410.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh deleted file mode 100644 index be0a00b6..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b4d256_z80_c8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,30 +0,0 @@ -# VAE with symmetric ResNet1D encoder-decoder with -# 16 residual blocks, 256 dim per block, latent_dim=80, compression factor=8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=128 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=resnet1d -vae_opt="--in-feats 80" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 1 1 1 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 1 1 1 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b4d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=370 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0370.pth - -# xvector network trained with 
recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh deleted file mode 100644 index 167b3837..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet1d_b8d256_z80_c8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,30 +0,0 @@ -# VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=80, compression factor=8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=128 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=resnet1d -vae_opt="--in-feats 80" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_z${latent_dim}_c8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=420 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0420.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh deleted file mode 100644 index 0240d1d0..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b4c64_z80_c0.8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,31 +0,0 @@ -# VAE with symmetric ResNet2D encoder-decoder with -# 4 residual blocks, 64 base channels , latent_channels=80, compression factor=0.8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=resnet2d -vae_opt="" -enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 1 1 1 --enc.resb-channels 64 128 256 512 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 1 1 1 --dec.resb-channels 512 256 128 64 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" 
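-# (sketch, assumption: exp_lr holds the lr for hold-steps after warmup, then
-#  multiplies it by decay-rate every decay-steps, i.e. the lr halves every 16k
-#  optimizer steps until it floors at min-lr=1e-5)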
-nnet_name=${model_type}_${narch}_b4c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=600 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0205.pth - - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh deleted file mode 100644 index ff503162..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vae_resnet2d_b8c64_z80_c0.8.opt.lr0.01.v1.sh +++ /dev/null @@ -1,31 +0,0 @@ -# VAE with symmetric ResNet2D encoder-decoder with -# 8 residual blocks, 64 base channels, latent_channels=80, compression factor=0.8 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=80 -model_type=vae -narch=resnet2d -vae_opt="" -enc_opt="--enc.in-conv-channels 64 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 2 2 2 2 --enc.resb-channels 64 128 256 512 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 80 --dec.in-conv-channels 512 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 2 2 2 2 --dec.resb-channels 512 256 128 64 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8c64_z${latent_dim}_c0.8_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=205 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0205.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth - diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh deleted file mode 100644 index 98af99a2..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.0025.v6.sh +++ /dev/null @@ -1,45 +0,0 @@ -# VQ-VAE with Conformer Encoder for Enc and Dec with -# 6 conformer blocks, relative pos encoder, d_model=512, heads=8, d_ff=2048, -# latent_dim=512, codebook=512x8, compression factor=36, att-context=25 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.0025 - -model_type=vq-dvae - -dropout=0 -narch=conformer-enc-v1 -blocks=6 -d_model=512 -heads=8 -d_ff=2048 -att_context=25 -conv_kernel=31 - -latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" 
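-# (sketch: 8 groups x log2(512)=9 bits = 72 bits per latent frame vs
-#  80 feats x 32 bits = 2560 bits per input frame, so 2560/72 ~ 36, the c36
-#  in the config name, assuming the conformer keeps the input frame rate)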
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context --enc.conv-kernel-sizes $conv_kernel" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context --dec.conv-kernel-sizes $conv_kernel" - -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 10000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 10000 --lrsched.update-lr-on-opt-step" - -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}cbk${conv_kernel}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv6_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=40 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0040.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh deleted file mode 100644 index 841207ea..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=142 - -nnet_data=voxceleb2cat_train_combined -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=16 -narch=resnet1d -model_type=vq-dvae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=100 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0100.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name 
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh
deleted file mode 100644
index 795a8d4f..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=71
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=32
-narch=resnet1d
-model_type=vq-dvae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=100
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0100.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh
deleted file mode 100644
index da17dc19..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=569
-# Trained for denoising
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=4
-narch=resnet1d
-model_type=vq-dvae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=90
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0090.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh
deleted file mode 100644
index a2d8005e..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=284
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=8
-narch=resnet1d
-model_type=vq-dvae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=100
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0100.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256swish_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256swish_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh
deleted file mode 100644
index 435460c2..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_resnet1d_b8d256swish_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=142
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=16
-narch=resnet1d
-model_type=vq-dvae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2 --enc.hid-act swish"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2 --dec.hid-act swish"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256swish_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=100
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0100.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
deleted file mode 100644
index f99031d1..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-# VQ-VAE with Transformer Encoder for Enc and Dec with
-# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-
-model_type=vq-dvae
-
-dropout=0
-narch=transformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-att_context=25
-
-latent_dim=512
-vq_type=multi-ema-k-means-vq
-vq_clusters=512
-num_groups=8
-
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context"
-dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context"
-
-opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-
-nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=40
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0040.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh b/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh
deleted file mode 100644
index 03fe5a33..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqdvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.005.v6.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-# VQ-VAE with Transformer Encoder for Enc and Dec with
-# 6 transformer blocks, relative pos encoder, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36
-
-nnet_data=voxceleb2cat_train_combined
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.005
-
-model_type=vq-dvae
-
-dropout=0
-narch=transformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-att_context=25
-
-latent_dim=512
-vq_type=multi-ema-k-means-vq
-vq_clusters=512
-num_groups=8
-
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context --enc.rel-pos-enc"
-dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context --dec.rel-pos-enc"
-
-opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 10000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 10000 --lrsched.update-lr-on-opt-step"
-
-nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}rpe_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv6_radam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=40
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0040.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
deleted file mode 100644
index e4962443..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_conformer_lac25b6d512h8cbk31ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-# VQ-VAE with Conformer Encoder for Enc and Dec with
-# 6 conformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=16
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-
-model_type=vq-vae
-
-dropout=0
-narch=conformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-att_context=25
-conv_kernel=31
-
-latent_dim=512
-vq_type=multi-ema-k-means-vq
-vq_clusters=512
-num_groups=8
-
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context --enc.conv-kernel-sizes $conv_kernel"
-dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context --dec.conv-kernel-sizes $conv_kernel"
-
-opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}cbk${conv_kernel}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=120
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0120.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh
deleted file mode 100644
index 31487e05..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512_c2275.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=2275
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=1
-narch=resnet1d
-model_type=vq-vae
-vq_type=ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
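# A note on the vq_type naming in these configs: the single-group file above
# sets vq_type=ema-k-means-vq with num_groups=1, while every config with
# num_groups>1 switches to multi-ema-k-means-vq. The names suggest that the
# multi-group quantizer splits the latent vector into num_groups sub-vectors,
# each quantized against its own codebook of vq_clusters entries; this reading
# is inferred from the names and the quoted compression factors, not stated in
# the deleted files themselves.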
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=370 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0370.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh deleted file mode 100644 index 56deb6c8..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x128_c18.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=18 - -nnet_data=voxceleb2cat_train -batch_size_1gpu=256 -eff_batch_size=512 # effective batch size -min_chunk=400 -max_chunk=400 -ipe=1 -lr=0.01 -dropout=0 -latent_dim=256 -vq_clusters=512 -num_groups=128 -narch=resnet1d -model_type=vq-vae -vq_type=multi-ema-k-means-vq -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2" -dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=550 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0550.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh deleted file mode 100644 index f5b56dc2..00000000 --- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x16_c142.opt.lr0.01.v1.sh +++ /dev/null @@ -1,33 +0,0 @@ -# VQ-VAE with symmetric ResNet1D encoder-decoder with -# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=142 - 
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=16
-narch=resnet1d
-model_type=vq-vae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=440
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0440.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh
deleted file mode 100644
index 7998a6c3..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x256_c9.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=9
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=256
-narch=resnet1d
-model_type=vq-vae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=360
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0360.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh
deleted file mode 100644
index 1252c9e4..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x2_c1138.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=1138
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=2
-narch=resnet1d
-model_type=vq-vae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=510
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0510.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh
deleted file mode 100644
index 59327eb4..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x32_c71.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=71
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=32
-narch=resnet1d
-model_type=vq-vae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=440
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0440.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh
deleted file mode 100644
index 2082dd74..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=569
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=4
-narch=resnet1d
-model_type=vq-vae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=370
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0370.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh
deleted file mode 100644
index 6ce2b144..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x4_c569_predvar.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=569
-# P(x|z) with sample dependent variances predicted by nnet
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=4
-narch=resnet1d
-model_type=vq-vae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups --px-pdf normal-diag-cov"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_predvar_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=400
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0400.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh
deleted file mode 100644
index 8ef652f3..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x64_c36.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=36
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=64
-narch=resnet1d
-model_type=vq-vae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
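# The compression factors quoted in these headers are consistent with simple
# codebook arithmetic, assuming 32-bit input features and the 8x time
# downsampling implied by the encoder strides 1 2 2 2:
#   factor ~ (80 feats x 32 bit x 8 frames) / (num_groups x log2(512) bit)
# e.g. for the num_groups=64 config above: 20480 / (64 x 9) = 35.6 ~ 36 (c36),
# and for num_groups=1: 20480 / 9 = 2275 (c2275).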
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=460
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0460.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh
deleted file mode 100644
index 56498b78..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_resnet1d_b8d256_emakmeansvq_z256cb512x8_c284.opt.lr0.01.v1.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# VQ-VAE with symmetric ResNet1D encoder-decoder with
-# 8 residual blocks, 256 dim per block, latent_dim=256, codebook=512, compression factor=284
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=256
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-dropout=0
-latent_dim=256
-vq_clusters=512
-num_groups=8
-narch=resnet1d
-model_type=vq-vae
-vq_type=multi-ema-k-means-vq
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.in-conv-channels 256 --enc.in-kernel-size 5 --enc.in-stride 1 --enc.resb-repeats 1 2 3 2 --enc.resb-channels 256 --enc.resb-kernel-sizes 3 --enc.resb-strides 1 2 2 2"
-dec_opt="--dec.in-channels 256 --dec.in-conv-channels 256 --dec.in-kernel-size 3 --dec.in-stride 1 --dec.resb-repeats 1 2 3 2 --dec.resb-channels 256 --dec.resb-kernel-sizes 3 --dec.resb-strides 1 2 2 2"
-
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 16000 --lrsched.hold-steps 16000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 8000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b8d256_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv1_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=430
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0430.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh
deleted file mode 100644
index 3c193e06..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-# VQ-VAE with Transformer Encoder for Enc and Dec with
-# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=16
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-
-model_type=vq-vae
-
-dropout=0
-narch=transformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-
-latent_dim=512
-vq_type=multi-ema-k-means-vq
-vq_clusters=512
-num_groups=8
-
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type scaled-dot-prod-v1"
-dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type scaled-dot-prod-v1"
-
-
-#opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-#lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 12000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b${blocks}d${d_model}h${heads}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=160
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0160.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
-
-
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
deleted file mode 100644
index ba68e597..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-# VQ-VAE with Transformer Encoder for Enc and Dec with
-# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=16
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-
-model_type=vq-vae
-
-dropout=0
-narch=transformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-
-latent_dim=512
-vq_type=multi-ema-k-means-vq
-vq_clusters=512
-num_groups=8
-
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type scaled-dot-prod-v1 --enc.rel-pos-enc"
-dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type scaled-dot-prod-v1 --dec.rel-pos-enc"
-
-
-opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_b${blocks}d${d_model}h${heads}linff${d_ff}rpe_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=150
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0150.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh
deleted file mode 100644
index f02db8e9..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36.opt.lr0.01.v4.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-# VQ-VAE with Transformer Encoder for Enc and Dec with
-# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-
-model_type=vq-vae
-
-dropout=0
-narch=transformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-att_context=25
-
-latent_dim=512
-vq_type=multi-ema-k-means-vq
-vq_clusters=512
-num_groups=8
-
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context"
-dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context"
-
-
-#opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-#lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 12000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_adam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=170
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0170.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
deleted file mode 100644
index 59a8843d..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-# VQ-VAE with Transformer Encoder for Enc and Dec with
-# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-
-model_type=vq-vae
-
-dropout=0
-narch=transformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-att_context=25
-
-latent_dim=512
-vq_type=multi-ema-k-means-vq
-vq_clusters=512
-num_groups=8
-
-vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups"
-enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context"
-dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context"
-
-
-#opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp"
-#lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 12000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5"
-lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step"
-nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data
-nnet_num_epochs=170
-num_augs=5
-nnet_dir=exp/vae_nnets/$nnet_name
-nnet=$nnet_dir/model_ep0170.pth
-
-# xvector network trained with recipe v1.1
-xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1
-xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name
-xvec_nnet=$xvec_nnet_dir/model_ep0070.pth
diff --git a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh b/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
deleted file mode 100644
index a04f4b58..00000000
--- a/egs/voxceleb/vae.v1/global_conf/config_vqvae_transformer_lac25b6d512h8ff2048rpe_emakmeansvq_z512cb512x8_c36_radam.opt.lr0.01.v4.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-# VQ-VAE with Transformer Encoder for Enc and Dec with
-# 6 transformer blocks, d_model=512, heads=8, d_ff=2048, latent_dim=512, codebook=512x8, compression factor=36
-
-nnet_data=voxceleb2cat_train
-batch_size_1gpu=32
-eff_batch_size=512 # effective batch size
-min_chunk=400
-max_chunk=400
-ipe=1
-lr=0.01
-
-model_type=vq-vae
-
-dropout=0
-narch=transformer-enc-v1
-blocks=6
-d_model=512
-heads=8
-d_ff=2048
-att_context=25
-
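# batch_size_1gpu and eff_batch_size are reconciled at train time by gradient
# accumulation in run_011_train_model.sh (deleted further below); a rough
# sketch of that arithmetic for this config:
#   batch_size=$((32 * ngpu))      # e.g. 128 with ngpu=4
#   grad_acc_steps=4               # round(512 / 128)
# The run script computes grad_acc_steps with awk as int(eff/batch + 0.5),
# clamped to at least 1.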
-latent_dim=512 -vq_type=multi-ema-k-means-vq -vq_clusters=512 -num_groups=8 - -vae_opt="--in-feats 80 --z-dim $latent_dim --vq-type $vq_type --vq-clusters $vq_clusters --vq-groups $num_groups" -enc_opt="--enc.num-blocks $blocks --enc.d-model $d_model --enc.num-heads $heads --enc.ff-type linear --enc.d-ff $d_ff --enc.in-layer-type linear --enc.att-type local-scaled-dot-prod-v1 --enc.att-context $att_context --enc.rel-pos-enc" -dec_opt="--dec.in-feats $latent_dim --dec.num-blocks $blocks --dec.d-model $d_model --dec.num-heads $heads --dec.ff-type linear --dec.d-ff $d_ff --dec.in-layer-type linear --dec.att-type local-scaled-dot-prod-v1 --dec.att-context $att_context --dec.rel-pos-enc" - - -opt_opt="--optim.opt-type radam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 8000 --lrsched.hold-steps 2000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" -nnet_name=${model_type}_${narch}_lac${att_context}b${blocks}d${d_model}h${heads}linff${d_ff}rpe_${vq_type}_z${latent_dim}c${vq_clusters}x${num_groups}_do${dropout}_optv4_radam_lr${lr}_b${eff_batch_size}.$nnet_data -nnet_num_epochs=160 -num_augs=5 -nnet_dir=exp/vae_nnets/$nnet_name -nnet=$nnet_dir/model_ep0160.pth - -# xvector network trained with recipe v1.1 -xvec_nnet_name=fbank80_stmn_lresnet34_e256_arcs30m0.3_do0_adam_lr0.05_b512_amp.v1 -xvec_nnet_dir=../v1.1/exp/xvector_nnets/$xvec_nnet_name -xvec_nnet=$xvec_nnet_dir/model_ep0070.pth diff --git a/egs/voxceleb/vae.v1/hyp_utils b/egs/voxceleb/vae.v1/hyp_utils deleted file mode 120000 index f6d1eb7a..00000000 --- a/egs/voxceleb/vae.v1/hyp_utils +++ /dev/null @@ -1 +0,0 @@ -../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/local b/egs/voxceleb/vae.v1/local deleted file mode 120000 index ce1cbf90..00000000 --- a/egs/voxceleb/vae.v1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/path.sh b/egs/voxceleb/vae.v1/path.sh deleted file mode 100755 index 6994fdab..00000000 --- a/egs/voxceleb/vae.v1/path.sh +++ /dev/null @@ -1,5 +0,0 @@ - -export HYP_ROOT=$(readlink -f `pwd -P`/../../..) -export TOOLS_ROOT=$HYP_ROOT/tools - -. $TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/vae.v1/run_001_prepare_data.sh b/egs/voxceleb/vae.v1/run_001_prepare_data.sh deleted file mode 100755 index 65ff18d0..00000000 --- a/egs/voxceleb/vae.v1/run_001_prepare_data.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. datapath.sh - - -if [ $stage -le 1 ];then - - # Prepare the VoxCeleb2 dataset for training. - local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train - #local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test - #utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test -fi - -if [ $stage -le 2 ];then - # prepare voxceleb1 for test - local/make_voxceleb1_oeh.pl $voxceleb1_root data -fi diff --git a/egs/voxceleb/vae.v1/run_002_compute_evad.sh b/egs/voxceleb/vae.v1/run_002_compute_evad.sh deleted file mode 100755 index eeae00ac..00000000 --- a/egs/voxceleb/vae.v1/run_002_compute_evad.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. 
./cmd.sh -. ./path.sh -set -e -nodes=fs01 -storage_name=$(date +'%m_%d_%H_%M') -vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - - -if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $vaddir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $vaddir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $vaddir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $vaddir/storage - else - echo "we don't distribute data between multiple machines" - fi - fi -fi - -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done -fi - - diff --git a/egs/voxceleb/vae.v1/run_003_compute_fbank.sh b/egs/voxceleb/vae.v1/run_003_compute_fbank.sh deleted file mode 100755 index 713a34cb..00000000 --- a/egs/voxceleb/vae.v1/run_003_compute_fbank.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -nodes=fs01 -storage_name=$(date +'%m_%d_%H_%M') -fbankdir=`pwd`/exp/fbank -vaddir=`pwd`/exp/fbank -vaddir_gt=`pwd`/exp/vad_gt - -stage=1 -config_file=default_config.sh -feat_vers="numpy" - -. parse_options.sh || exit 1; - -if [ "$feat_vers" == "kaldi" ];then - make_fbank=steps/make_fbank.sh - fbank_cfg=conf/fbank80_16k.conf -else - fbank_cfg=conf/fbank80_16k.yaml - if [ "$feat_vers" == "numpy" ];then - make_fbank=steps_pyfe/make_fbank.sh - else - make_fbank=steps_pyfe/make_torch_fbank.sh - fi -fi - -# Make filterbanks -if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $fbankdir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/fbank/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $fbankdir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $fbankdir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $fbankdir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $fbankdir/storage - else - echo "we don't distribute data between multiple machines" - fi - fi -fi - -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? 
$num_spk:40)) - $make_fbank --write-utt2num-frames true --fbank-config $fbank_cfg --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_fbank/$name $fbankdir - utils/fix_data_dir.sh data/${name} - done - -fi - - diff --git a/egs/voxceleb/vae.v1/run_004_prepare_augment.sh b/egs/voxceleb/vae.v1/run_004_prepare_augment.sh deleted file mode 100755 index 7d78ae92..00000000 --- a/egs/voxceleb/vae.v1/run_004_prepare_augment.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -# In this script, we augment the SWBD,SRE,MX6 and Voxceleb data with reverberation, -# noise, music, and babble, and combined it with the clean data. -# The combined list will be used to train the xvector DNN. - -frame_shift=0.01 - -if [ $stage -le 1 ]; then - - if [ ! -d "RIRS_NOISES" ]; then - if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then - ln -s ../../sre19-cmn2/v1/RIRS_NOISES - else - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip - fi - fi - - # Prepare the MUSAN corpus, which consists of music, speech, and noise - # suitable for augmentation. - local/make_musan.sh $musan_root 16 data - - # Get the duration of the MUSAN recordings. This will be used by the - # script augment_data_dir.py. - for name in speech noise music; do - utils/data/get_utt2dur.sh data/musan_${name} - mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur - done - -fi - - -if [ $stage -le 2 ]; then - - for name in voxceleb2cat_train - do - export TMPDIR=data/tmp - mkdir -p $TMPDIR - - awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/$name/utt2num_frames > data/$name/reco2dur - - # Make a reverberated version of the list. Note that we don't add any - # additive noise here. 
- - # Make a version with reverberated speech - rvb_opts=() - rvb_opts+=(--rir-set-parameters "0.2, RIRS_NOISES/real_rirs_isotropic_noises/rir_list") - rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/smallroom/rir_list") - rvb_opts+=(--rir-set-parameters "0.4, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") - - python steps/data/reverberate_data_dir.py \ - "${rvb_opts[@]}" \ - --speech-rvb-probability 1 \ - --pointsource-noise-addition-probability 0 \ - --isotropic-noise-addition-probability 0 \ - --num-replications 1 \ - --source-sampling-rate 16000 \ - data/${name} data/${name}_reverb - cp data/${name}/vad.scp data/${name}_reverb/ - utils/copy_data_dir.sh --utt-suffix "-reverb" data/${name}_reverb data/${name}_reverb.new - rm -rf data/${name}_reverb - mv data/${name}_reverb.new data/${name}_reverb - - - # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0:13:8" --fg-noise-dir "data/musan_noise" data/${name} data/${name}_noise - # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/${name} data/${name}_music - # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13:10" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/${name} data/${name}_babble - - - awk '{ $1=$1"-reverb"; print $0}' data/${name}/reco2dur > data/${name}_reverb/reco2dur - - # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0:13:8" --fg-noise-dir "data/musan_noise" data/${name}_reverb data/${name}_reverb_noise - # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/${name}_reverb data/${name}_reverb_music - # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13:10" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/${name}_reverb data/${name}_reverb_babble - - - # Combine noise only - utils/combine_data.sh data/${name}_noise_all \ - data/${name}_noise data/${name}_music data/${name}_babble - - # Combine reverbs - utils/combine_data.sh data/${name}_reverb_all data/${name}_reverb \ - data/${name}_reverb_noise data/${name}_reverb_music data/${name}_reverb_babble - - # Combine reverb, noise, music, and babble into one directory. - utils/combine_data.sh data/${name}_aug data/${name}_reverb_all data/${name}_noise_all - unset TMPDIR - done - -fi - - -if [ $stage -le 3 ];then - # Take a random subset of the augmentations - utils/subset_data_dir.sh data/voxceleb2cat_train_aug \ - $(wc -l data/voxceleb2cat_train/utt2spk | awk '{ print int('$num_augs'*$1)}') \ - data/voxceleb2cat_train_augx${num_augs} - utils/fix_data_dir.sh data/voxceleb2cat_train_augx${num_augs} -fi - - -exit diff --git a/egs/voxceleb/vae.v1/run_005_compute_fbank_augment.sh b/egs/voxceleb/vae.v1/run_005_compute_fbank_augment.sh deleted file mode 100755 index 10d13e03..00000000 --- a/egs/voxceleb/vae.v1/run_005_compute_fbank_augment.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e -fbankdir=`pwd`/exp/fbank - -stage=1 -config_file=default_config.sh -feat_vers="numpy" - -. parse_options.sh || exit 1; -. 
$config_file - -if [ "$feat_vers" == "kaldi" ];then - make_fbank=steps/make_fbank.sh - fbank_cfg=conf/fbank80_16k.conf -else - fbank_cfg=conf/fbank80_16k.yaml - if [ "$feat_vers" == "numpy" ];then - make_fbank=steps_pyfe/make_fbank.sh - else - make_fbank=steps_pyfe/make_torch_fbank.sh - fi -fi - -export TMPDIR=data/tmp -mkdir -p $TMPDIR - -if [ $stage -le 1 ];then - - # Make filterbanks for the augmented data. Note that we do not compute a new - # vad.scp file here. Instead, we use the vad.scp from the clean version of - # the list. - for name in voxceleb2cat_train_augx${num_augs} - do - $make_fbank --write-utt2num-frames true \ - --fbank-config $fbank_cfg --nj 120 --cmd "$train_cmd" \ - data/$name exp/make_fbank/$name $fbankdir - fix_data_dir.sh data/$name - done - -fi - - -if [ $stage -le 2 ];then - - # Combine the clean and augmented lists. - utils/combine_data.sh --extra-files "utt2num_frames" data/voxceleb2cat_train_combined data/voxceleb2cat_train_augx${num_augs} data/voxceleb2cat_train - -fi - -exit - diff --git a/egs/voxceleb/vae.v1/run_010_prepare_gen_model_train_data.sh b/egs/voxceleb/vae.v1/run_010_prepare_gen_model_train_data.sh deleted file mode 100755 index c2f5c832..00000000 --- a/egs/voxceleb/vae.v1/run_010_prepare_gen_model_train_data.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh - -. parse_options.sh || exit 1; -. $config_file - -# Now we prepare the features to generate examples for xvector training. -if [ $stage -le 2 ]; then - # This script applies CMVN and removes nonspeech frames. Note that this is somewhat - # wasteful, as it roughly doubles the amount of training data on disk. After - # creating training examples, this can be removed. - steps_xvec/prepare_feats_for_nnet_train.sh --nj 40 --cmd "$train_cmd" \ - --storage_name voxceleb-vae.v1-$(date +'%m_%d_%H_%M') \ - data/${nnet_data} data/${nnet_data}_no_sil exp/${nnet_data}_no_sil - utils/fix_data_dir.sh data/${nnet_data}_no_sil - -fi - - -if [ $stage -le 3 ]; then - # Now, we need to remove features that are too short after removing silence - # frames. We want at least 4s (400 frames) per utterance. - hyp_utils/remove_short_utts.sh --min-len 400 data/${nnet_data}_no_sil - - # We also want several utterances per speaker. Now we'll throw out speakers - # with fewer than 8 utterances. - hyp_utils/remove_spk_few_utts.sh --min-num-utts 8 data/${nnet_data}_no_sil - -fi - -if [ $stage -le 4 ]; then - # Prepare train and validation lists for x-vectors - local/make_train_lists_sup_embed_with_augm.sh data/${nnet_data}_no_sil data/${nnet_data}_no_sil/lists_xvec -fi - -exit diff --git a/egs/voxceleb/vae.v1/run_011_train_model.sh b/egs/voxceleb/vae.v1/run_011_train_model.sh deleted file mode 100755 index 8c9bb4d4..00000000 --- a/egs/voxceleb/vae.v1/run_011_train_model.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=1 -config_file=default_config.sh -resume=false -interactive=false -num_workers=8 - -. parse_options.sh || exit 1; -. $config_file -. 
datapath.sh - -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ x=int($2/$1+0.5); if(x==0){ x=1 }; print x }') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - - - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $nnet_dir/log - - if [ "$model_type" == "vae" ] || [ "$model_type" == "vq-vae" ];then - # Train VAE - train_exec=torch-train-${model_type}.py - $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --num-gpus $ngpu \ - $train_exec $narch:$narch \ - --data-rspec scp:$list_dir/feats.scp \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --num-frames-file $list_dir/utt2num_frames \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $opt_opt $lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $nnet_num_epochs \ - --z-dim $latent_dim $enc_opt $dec_opt $vae_opt \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args - - # train_exec=torch-train-${narch}-${model_type}.py - - # $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - # hyp_utils/conda_env.sh --num-gpus $ngpu \ - # $train_exec \ - # --data-rspec scp:$list_dir/feats.scp \ - # --train-list $list_dir/lists_xvec/train.scp \ - # --val-list $list_dir/lists_xvec/val.scp \ - # --num-frames-file $list_dir/utt2num_frames \ - # --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - # --iters-per-epoch $ipe \ - # --batch-size $batch_size \ - # --num-workers $num_workers $opt_opt $lrs_opt \ - # --grad-acc-steps $grad_acc_steps \ - # --epochs $nnet_num_epochs \ - # --z-dim $latent_dim $enc_opt $dec_opt $vae_opt \ - # --num-gpus $ngpu \ - # --log-interval $log_interval \ - # --exp-path $nnet_dir $args - - elif [[ "$model_type" =~ "dvae" ]];then - # Train Denoising VAE - train_exec=torch-train-${model_type}.py - - $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --num-gpus $ngpu \ - $train_exec $narch:$narch\ - --data-rspec scp:$list_dir/feats.scp \ - --train-list $list_dir/lists_xvec/train.scp \ - --train-pair-list $list_dir/lists_xvec/augm2clean.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --val-pair-list $list_dir/lists_xvec/augm2clean.scp \ - --num-frames-file $list_dir/utt2num_frames \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $opt_opt $lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $nnet_num_epochs \ - --z-dim $latent_dim $enc_opt $dec_opt $vae_opt \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args - - # train_exec=torch-train-${narch}-${model_type}.py - - # $cuda_cmd --gpu $ngpu $nnet_dir/log/train.log \ - # hyp_utils/conda_env.sh --num-gpus $ngpu \ - # $train_exec \ - # --data-rspec scp:$list_dir/feats.scp \ - # --train-list $list_dir/lists_xvec/train.scp \ - # --train-pair-list $list_dir/lists_xvec/augm2clean.scp \ - # --val-list $list_dir/lists_xvec/val.scp \ - # --val-pair-list $list_dir/lists_xvec/augm2clean.scp \ - # --num-frames-file $list_dir/utt2num_frames \ - # --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - # --iters-per-epoch $ipe \ - # --batch-size $batch_size \ - # --num-workers $num_workers $opt_opt 
$lrs_opt \ - # --grad-acc-steps $grad_acc_steps \ - # --epochs $nnet_num_epochs \ - # --z-dim $latent_dim $enc_opt $dec_opt $vae_opt \ - # --num-gpus $ngpu \ - # --log-interval $log_interval \ - # --exp-path $nnet_dir $args - - else - echo "unknown model type $model_type" - exit 1 - - fi - -fi - - -exit diff --git a/egs/voxceleb/vae.v1/run_012_eval_recons.sh b/egs/voxceleb/vae.v1/run_012_eval_recons.sh deleted file mode 100755 index 961ae68d..00000000 --- a/egs/voxceleb/vae.v1/run_012_eval_recons.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# Copyright -# 2020 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -#xvec_chunk_length=12800 -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - eval_args="--use-gpu true" - eval_cmd="$cuda_eval_cmd" -else - eval_cmd="$train_cmd" -fi - -output_dir=exp/recons_output/$nnet_name -if [[ "$model_type" =~ "vae" ]];then - eval_script=hyp_utils/generative/eval_vae.sh -else - echo "unknown model type $model_type" - exit 1 -fi - -if [ $stage -le 1 ]; then - for name in voxceleb1_test - do - num_utt=$(wc -l data/$name/utt2spk | awk '{ print $1}') - nj=$(($num_utt < 100 ? $num_utt:100)) - $eval_script --cmd "$eval_cmd --mem 6G" --nj $nj ${eval_args} \ - $nnet data/$name $output_dir/$name - done -fi - - - - diff --git a/egs/voxceleb/vae.v1/run_013_eval_xvector_asv.sh b/egs/voxceleb/vae.v1/run_013_eval_xvector_asv.sh deleted file mode 100755 index 11932091..00000000 --- a/egs/voxceleb/vae.v1/run_013_eval_xvector_asv.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# Copyright -# 2018 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -config_file=default_config.sh -use_gpu=false -xvec_chunk_length=12800 -. parse_options.sh || exit 1; -. $config_file - -if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" - xvec_cmd="$cuda_eval_cmd" -else - xvec_cmd="$train_cmd" -fi - -xvector_dir=exp/xvectors/$nnet_name/$xvec_nnet_name -score_be_dir=exp/scores/$nnet_name/$xvec_nnet_name/cosine - - -if [ $stage -le 1 ]; then - # Extracts x-vectors for evaluation - for name in voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 100 ? 
$num_spk:100)) - steps_xvec/extract_xvectors_with_vae_preproc.sh \ - --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ - $xvec_nnet $nnet data/$name \ - $xvector_dir/$name - done -fi - - -if [ $stage -le 2 ];then - - echo "Eval Voxceleb 1 with Cosine scoring" - steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_be_dir/voxceleb1_scores - - $train_cmd --mem 10G --num-threads 6 $score_be_dir/log/score_voxceleb1.log \ - local/score_voxceleb1.sh data/voxceleb1_test $score_be_dir - - for f in $(ls $score_be_dir/*_results); - do - echo $f - cat $f - echo "" - done - -fi - -exit diff --git a/egs/voxceleb/vae.v1/steps b/egs/voxceleb/vae.v1/steps deleted file mode 120000 index aede39fe..00000000 --- a/egs/voxceleb/vae.v1/steps +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/steps_be b/egs/voxceleb/vae.v1/steps_be deleted file mode 120000 index 4958fae7..00000000 --- a/egs/voxceleb/vae.v1/steps_be +++ /dev/null @@ -1 +0,0 @@ -../v1.1/steps_be \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/steps_fe b/egs/voxceleb/vae.v1/steps_fe deleted file mode 120000 index 73ccc1eb..00000000 --- a/egs/voxceleb/vae.v1/steps_fe +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/kaldi/vad \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/steps_pyfe b/egs/voxceleb/vae.v1/steps_pyfe deleted file mode 120000 index 7b9d122a..00000000 --- a/egs/voxceleb/vae.v1/steps_pyfe +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/feats \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/steps_xvec b/egs/voxceleb/vae.v1/steps_xvec deleted file mode 120000 index af66a94d..00000000 --- a/egs/voxceleb/vae.v1/steps_xvec +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/xvectors \ No newline at end of file diff --git a/egs/voxceleb/vae.v1/utils b/egs/voxceleb/vae.v1/utils deleted file mode 120000 index 3d590a1d..00000000 --- a/egs/voxceleb/vae.v1/utils +++ /dev/null @@ -1 +0,0 @@ -hyp_utils/kaldi/utils \ No newline at end of file From 083b210a0acfe60750d398f595eb68f61a7928d4 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 14 Apr 2023 15:52:15 -0400 Subject: [PATCH 094/154] started adapt adv.v2 --- egs/voxceleb/adv.v1.1/run_002_compute_evad.sh | 2 +- egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml | 98 +++++++++++------ .../adv.v2/conf/lresnet34_spknet.yaml | 100 +++++++++++------ egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml | 101 ++++++++++++------ .../adv.v2/run_011_train_victim_xvector.sh | 56 +++++----- .../generate_adv_attacks_xvector_classif.sh | 2 +- .../adv/generate_adv_attacks_xvector_verif.sh | 2 +- .../generate_adv_attacks_xvector_classif.py} | 0 .../generate_adv_attacks_xvector_verif.py} | 0 9 files changed, 233 insertions(+), 128 deletions(-) rename hyperion/{bin_deprec2/torch-generate-adv-attacks-xvector-classif.py => bin/generate_adv_attacks_xvector_classif.py} (100%) rename hyperion/{bin_deprec2/torch-generate-adv-attacks-xvector-verif.py => bin/generate_adv_attacks_xvector_verif.py} (100%) diff --git a/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh b/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh index e854b393..f6b8e62f 100755 --- a/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh +++ b/egs/voxceleb/adv.v1.1/run_002_compute_evad.sh @@ -20,7 +20,7 @@ config_file=default_config.sh if [ $stage -le 1 ]; then # Prepare to distribute data over multiple machines if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + dir_name=$USER/hyp-data/voxceleb/adv.v1.1/$storage_name/vad/storage if [ "$nodes" == "b0" ];then utils/create_split_dir.pl \ utils/create_split_dir.pl \ diff --git a/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml b/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml index 79f33282..d07a2126 100644 --- a/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml +++ b/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml @@ -1,32 +1,68 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 feats: fbank80_stmn_16k.yaml -min_chunk_length: 4 -max_chunk_length: 4 -iters_per_epoch: 6 -resnet_type: lresnet34 -in_feats: 80 -in_channels: 1 -in_kernel_size: 3 -in_stride: 1 -no_maxpool: true -dropout_rate: 0.0 -embed_dim: 10 -margin: 0.2 -margin_warmup_epochs: 6.0 -s: 30 -epochs: 20 -optim: - amsgrad: true - beta1: 0.9 - beta2: 0.95 - lr: 0.01 - opt_type: adam - weight_decay: 1.0e-05 -lrsched: - decay_rate: 0.5 - decay_steps: 8000 - eps: 1.0e-08 - hold_steps: 16000 - lrsch_type: exp_lr - min_lr: 1.0e-05 - update_lr_on_opt_step: true - warmup_steps: 1000 +model: + resnet_type: lresnet34 + in_feats: 80 + in_channels: 1 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + dropout_rate: 0.0 + embed_dim: 10 + margin: 0.2 + margin_warmup_epochs: 6.0 + s: 30 +trainer: + epochs: 20 + eff_batch_size: 512 + optim: + amsgrad: true + beta1: 0.9 + beta2: 0.95 + lr: 0.01 + opt_type: adam + weight_decay: 1.0e-05 + lrsched: + decay_rate: 0.5 + decay_steps: 8000 + eps: 1.0e-08 + hold_steps: 16000 + lrsch_type: exp_lr + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + diff --git a/egs/voxceleb/adv.v2/conf/lresnet34_spknet.yaml b/egs/voxceleb/adv.v2/conf/lresnet34_spknet.yaml index 0a78edb5..a12487ee 100644 --- a/egs/voxceleb/adv.v2/conf/lresnet34_spknet.yaml +++ b/egs/voxceleb/adv.v2/conf/lresnet34_spknet.yaml @@ -1,34 +1,68 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 feats: fbank80_stmn_16k.yaml -train_aug_cfg: conf/reverb_noise_aug.yaml -val_aug_cfg: conf/reverb_noise_aug.yaml -min_chunk_length: 4 -max_chunk_length: 4 -iters_per_epoch: 6 -resnet_type: lresnet34 -in_feats: 80 -in_channels: 1 -in_kernel_size: 3 -in_stride: 1 -no_maxpool: true 
-dropout_rate: 0.0 -embed_dim: 256 -margin: 0.3 -margin_warmup_epochs: 20.0 -s: 30 -epochs: 70 -optim: - amsgrad: true - beta1: 0.9 - beta2: 0.95 - lr: 0.05 - opt_type: adam - weight_decay: 1.0e-05 -lrsched: - decay_rate: 0.5 - decay_steps: 8000 - eps: 1.0e-08 - hold_steps: 40000 - lrsch_type: exp_lr - min_lr: 1.0e-05 - update_lr_on_opt_step: true - warmup_steps: 1000 +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: lrsched_exp_default.yaml + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 512 diff --git a/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml b/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml index 4754206d..94e26f24 100644 --- a/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml +++ b/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml @@ -1,34 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + data_loader: + num_workers: 8 feats: fbank80_stmn_16k.yaml -min_chunk_length: 4 -max_chunk_length: 4 -iters_per_epoch: 6 -resnet_type: res2net50 -in_feats: 80 -in_channels: 1 -in_kernel_size: 3 -in_stride: 1 -no_maxpool: true -res2net_width_factor: 1.625 -res2net_scale: 4 -dropout_rate: 0.0 -embed_dim: 10 -margin: 0.2 -margin_warmup_epochs: 6.0 -s: 30 -epochs: 20 -optim: - amsgrad: true - beta1: 0.9 - beta2: 0.95 - lr: 0.01 - opt_type: adam - weight_decay: 1.0e-05 -lrsched: - decay_rate: 0.5 - decay_steps: 8000 - eps: 1.0e-08 - hold_steps: 16000 - lrsch_type: exp_lr - min_lr: 1.0e-05 - update_lr_on_opt_step: true - warmup_steps: 1000 +model: + resnet_type: res2net50 + in_feats: 80 + in_channels: 1 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + dropout_rate: 0.0 + embed_dim: 10 + margin: 0.2 + margin_warmup_epochs: 6.0 + s: 30 +trainer: + epochs: 20 + eff_batch_size: 256 + optim: + amsgrad: true + beta1: 0.9 + beta2: 0.95 + lr: 0.01 + opt_type: adam + weight_decay: 1.0e-05 + lrsched: + decay_rate: 0.5 + decay_steps: 8000 + eps: 1.0e-08 + hold_steps: 16000 + lrsch_type: exp_lr + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 1000 diff --git a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh index c5d03ee2..971b88a3 100755 --- a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh @@ -10,45 +10,45 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" +use_tb=false +use_wandb=false . 
parse_options.sh || exit 1; . $config_file . datapath.sh -batch_size=$(($spknet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $spknet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${spknet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi +nnet_type=$spknet_command +nnet_data=$spknet_data +nnet_dir=$spknet_dir +nnet_cfg=$spknet_config +list_dir=data/${nnet_data}_proc_audio_no_sil if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi + + # Network Training if [ $stage -le 1 ]; then - - mkdir -p $spknet_dir/log - $cuda_cmd --gpu $ngpu $spknet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $spknet_command --cfg $spknet_config \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $spknet_dir $args - + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg \ + --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu + fi + diff --git a/hyp_utils/adv/generate_adv_attacks_xvector_classif.sh b/hyp_utils/adv/generate_adv_attacks_xvector_classif.sh index 29d762af..f0401c3a 100755 --- a/hyp_utils/adv/generate_adv_attacks_xvector_classif.sh +++ b/hyp_utils/adv/generate_adv_attacks_xvector_classif.sh @@ -75,7 +75,7 @@ echo "$0: generate attacks for $data_dir to $output_dir" if [ $stage -le 1 ];then $cmd JOB=1:$nj $log_dir/generate_attack.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-generate-adv-attacks-xvector-classif.py \ + generate_adv_attacks_xvector_classif.py \ --feats $feat_config ${args} $attacks_opts \ --wav-file $wav \ --list-file $list \ diff --git a/hyp_utils/adv/generate_adv_attacks_xvector_verif.sh b/hyp_utils/adv/generate_adv_attacks_xvector_verif.sh index 4cf99518..e20b03ff 100755 --- a/hyp_utils/adv/generate_adv_attacks_xvector_verif.sh +++ b/hyp_utils/adv/generate_adv_attacks_xvector_verif.sh @@ -73,7 +73,7 @@ echo "$0: generate attacks for $data_dir to $output_dir" if [ $stage -le 1 ];then $cmd JOB=1:$nj $log_dir/generate_attack.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - torch-generate-adv-attacks-xvector-verif.py \ + generate_adv_attacks_xvector_verif.py \ --feats $feat_config ${args} $attacks_opts \ --v-file scp:$vector_file \ --key-file $key_file \ diff --git 
a/hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py similarity index 100% rename from hyperion/bin_deprec2/torch-generate-adv-attacks-xvector-verif.py rename to hyperion/bin/generate_adv_attacks_xvector_verif.py From df273d2a6d6dd294e875db86642334a616f42701 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 3 May 2023 12:42:24 -0400 Subject: [PATCH 095/154] fixed block and local attention with mask in conformers --- .../v1/conf/reverb_noise20dB_aug.yaml | 34 + ...2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml | 74 + ...2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml | 77 + ...base_conf_rnnt_k2_pruned_stage1_v3.10.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml | 76 + ...ase_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml | 76 + ...ase_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml | 76 + ...2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml | 76 + ...2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml | 75 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh | 32 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh | 30 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh | 32 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh | 32 + ...wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh | 32 + ...wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh | 32 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh | 32 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh | 32 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh | 32 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh | 32 + ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh | 32 + ...g_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh | 32 + hyperion/bin/train_wav2vec2rnn_transducer.py | 5 +- hyperion/np/augment/speed_augment.py | 17 +- .../layer_blocks/conformer_encoder_v1.py | 76 +- .../layer_blocks/transducer_predictor.py | 20 +- .../layer_blocks/transformer_encoder_v1.py | 45 +- hyperion/torch/layers/__init__.py | 2 +- hyperion/torch/layers/attention.py | 1366 ++++++++++------- hyperion/torch/layers/pos_encoder.py | 68 +- hyperion/torch/lr_schedulers/factory.py | 86 +- hyperion/torch/models/__init__.py | 1 + hyperion/torch/models/transducer/__init__.py | 1 + .../transducer/conformer_v1_rnn_transducer.py | 87 ++ .../models/transducer/lstm_rnn_transducer.py | 151 -- .../models/transducer/rnn_rnn_transducer.py | 2 +- .../torch/models/transducer/rnn_transducer.py | 2 + .../torch/models/wav2transducer/__init__.py | 2 + .../wav2transducer/hf_wav2rnn_transducer.py | 2 - .../hf_wav2vec2conformer_v1_rnn_transducer.py | 105 ++ .../hf_wav2vec2rnn_rnn_transducer.py | 7 +- hyperion/torch/narchs/conformer_encoder_v1.py | 238 +-- hyperion/torch/narchs/rnn_encoder.py | 17 +- .../torch/narchs/rnn_transducer_decoder.py | 13 +- hyperion/torch/torch_model.py | 7 +- hyperion/torch/utils/masking.py | 7 +- 52 files changed, 2923 insertions(+), 958 deletions(-) create mode 100644 egs/librispeech/v1/conf/reverb_noise20dB_aug.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml create mode 100644 
egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml create mode 100644 egs/librispeech/v1/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh create mode 100644 egs/librispeech/v1/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh create mode 100644 hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py delete mode 100644 hyperion/torch/models/transducer/lstm_rnn_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py diff --git a/egs/librispeech/v1/conf/reverb_noise20dB_aug.yaml b/egs/librispeech/v1/conf/reverb_noise20dB_aug.yaml new file mode 100644 index 00000000..23086ecb --- /dev/null +++ b/egs/librispeech/v1/conf/reverb_noise20dB_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 10 + max_snr: 20 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 10 + max_snr: 20 + babble: + weight: 1 + 
noise_path: data/musan_speech_babble/wav.scp + min_snr: 10 + max_snr: 20 diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml new file mode 100644 index 00000000..fc5b833a --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml new file mode 100644 index 00000000..c16a9e6d --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 64 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml new file mode 100644 index 00000000..9dd6a944 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 64 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + pos_enc_type: abs + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml new file mode 100644 index 00000000..43c2063d --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 1 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml new file mode 100644 index 00000000..3b3a83b4 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 32 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml new file mode 100644 index 00000000..9286657b --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 16 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml new file mode 100644 index 00000000..b4869ed3 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 4 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml new file mode 100644 index 00000000..645f784c --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 2 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml new file mode 100644 index 00000000..fbbac0c2 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: local-scaled-dot-prod-v1 + att_context: 8 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml new file mode 100644 index 00000000..f1f8c414 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 32 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml new file mode 100644 index 00000000..44cb9642 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 16 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml new file mode 100644 index 00000000..031061f9 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 8 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml new file mode 100644 index 00000000..6cb61718 --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 4 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml new file mode 100644 index 00000000..4b5e0e4d --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml @@ -0,0 +1,76 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. 
+ min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + att_type: block-scaled-dot-prod-v1 + att_context: 2 + d_model: 512 + num_heads: 8 + num_blocks: 1 + d_ff: 2048 + in_layer_type: linear + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v1/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml new file mode 100644 index 00000000..91b5fccb --- /dev/null +++ b/egs/librispeech/v1/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + wav_scale: 1 + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: false + data_loader: + num_workers: 4 + val: + dataset: + aug_cfgs: + - conf/reverb_noise20dB_aug.yaml + wav_scale: 1 + return_segment_info: + - text + sampler: + sampler_type: bucketing_seg_sampler + max_batch_length: 28. + min_batch_size: 1 + drop_last: true + data_loader: + num_workers: 4 +model: + hf_feats: + pretrained_model_path: facebook/wav2vec2-base-960h + transducer: + encoder: + rnn_type: lstm + num_layers: 1 + hid_feats: 512 + proj_feats: 0 + out_feats: 512 + decoder: + rnnt_loss: k2_pruned + simple_loss_scale: 0.2 + predictor: + embed_dim: 1024 + num_layers: 2 + hid_feats: 512 + embed_dropout_rate: 0.4 + rnn_dropout_rate: 0.4 + rnn_type: lstm + joiner: + hid_feats: 512 + feat_fusion_method: weighted-avg + feat_fusion_start: 2 +trainer: + optim: + opt_type: adamw + lr: 0.001 + beta1: 0.9 + beta2: 0.99 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4200 + hold_steps: 1500 + min_lr: 4e-5 + warmup_steps: 1500 + update_lr_on_opt_step: true + grad_clip: 100 + use_amp: true + log_interval: 1000 + epochs: 120 + # eff_batch_size: 1024 + eff_batch_size: 128 + train_mode: hf-feats-frozen-nograd diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh new file mode 100644 index 00000000..a0e4f1a9 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v1.2 +nnet_s1_name=$nnet_name.s1 
+ +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0120.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh new file mode 100644 index 00000000..823f50b1 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh @@ -0,0 +1,30 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0115.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh new file mode 100644 index 00000000..16971bcc --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh new file mode 100644 index 00000000..d4b45852 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + 
+nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.3 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh new file mode 100644 index 00000000..3c98fc9b --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.4.1 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh new file mode 100644 index 00000000..187ad022 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.4.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0100.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh new file mode 100644 index 00000000..1538a7d1 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base 
+#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.4 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0104.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh new file mode 100644 index 00000000..0ce9fd99 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.5 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh new file mode 100644 index 00000000..81702305 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.6 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh new file mode 100644 index 00000000..83f7682d --- /dev/null +++ 
b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.7 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0110.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh new file mode 100644 index 00000000..beb92d39 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2conformer_v1_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_conf_rnnt_k2_pruned.v3.9 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0050.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0100.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v1/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh new file mode 100644 index 00000000..8e15e372 --- /dev/null +++ b/egs/librispeech/v1/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh @@ -0,0 +1,32 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wav2vec2base +#vad +# vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=train_clean_100 +dev_data=dev_clean + +bpe_model=data/lang_bpe_1000/bpe.model +# x-vector cfg + +nnet_type=hf_wav2vec2rnn_rnn_transducer + +nnet_s1_base_cfg=conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_lstm_rnnt_k2_pruned.v1.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0120.pth +nnet_s1=$nnet_s1_dir/model_ep0070.pth +nnet_s1=$nnet_s1_dir/model_ep0105.pth + +nnet_s2_base_cfg=conf/train_wav2vec2base_transducer_stage2_v1.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0020.pth diff --git 
a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 67f5c6ba..7018c406 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -17,7 +17,8 @@ from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory -from hyperion.torch.models import (HFWav2Vec2RNNRNNTransducer, +from hyperion.torch.models import (HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, HFWav2Vec2RNNTransducer) from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp @@ -28,6 +29,8 @@ model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, + "hf_wav2vec2conformer_v1_rnn_transducer": + HFWav2Vec2ConformerV1RNNTransducer, # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, # "hf_hubert2rnn_rnn_transducer": Hubert2RNNRNNTransducer, # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, diff --git a/hyperion/np/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py index 4400a4b4..18a15651 100644 --- a/hyperion/np/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -34,10 +34,8 @@ def __init__( rng=None, ): logging.info( - "init speed augment with prob={}, speed_ratios={}, keep_length={}".format( - speed_prob, speed_ratios, keep_length - ) - ) + "init speed augment with prob={}, speed_ratios={}, keep_length={}". + format(speed_prob, speed_ratios, keep_length)) self.speed_prob = speed_prob self.speed_ratios = speed_ratios self.keep_length = keep_length @@ -63,12 +61,12 @@ def create(cls, cfg, random_seed=112358, rng=None): with open(cfg, "r") as f: cfg = yaml.load(f, Loader=yaml.FullLoader) - assert isinstance(cfg, dict), "wrong object type for cfg={}".format(cfg) + assert isinstance(cfg, dict), f"wrong object type for cfg={cfg}" return cls( speed_prob=cfg["speed_prob"], speed_ratios=cfg["speed_ratios"], - keep_length=cfg["keep_length"], + keep_length=cfg["keep_length"] if "keep_length" in cfg else False, random_seed=random_seed, rng=rng, ) @@ -100,11 +98,12 @@ def forward(self, x): # print(f"1 r={r} {x.shape} {y.shape}", flush=True) if self.keep_length: if r > 1: - dither = np.max(x) / 2 ** 15 # we add some dither in the padding - pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype) + dither = np.max(x) / 2**15 # we add some dither in the padding + pad_y = dither * np.ones( + (x.shape[-1] - y.shape[-1], ), dtype=y.dtype) y = np.concatenate((y, pad_y), axis=-1) elif r < 1: - y = y[: x.shape[-1]] + y = y[:x.shape[-1]] # print(f"2 r={r} {x.shape} {y.shape}", flush=True) return y, info diff --git a/hyperion/torch/layer_blocks/conformer_encoder_v1.py b/hyperion/torch/layer_blocks/conformer_encoder_v1.py index b2eab352..5764c85e 100644 --- a/hyperion/torch/layer_blocks/conformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/conformer_encoder_v1.py @@ -94,14 +94,14 @@ def __init__( self.ff_macaron = ff_macaron if ff_macaron: self.ff_scale = 0.5 - self.feed_forward_macaron = self._make_ff( - feed_forward, num_feats, d_ff, ff_kernel_size, hid_act, dropout_rate - ) + self.feed_forward_macaron = self._make_ff(feed_forward, num_feats, + d_ff, ff_kernel_size, + hid_act, dropout_rate) self.norm_ff_macaron = nn.LayerNorm(num_feats) - self.feed_forward = self._make_ff( - feed_forward, num_feats, d_ff, ff_kernel_size, hid_act, dropout_rate - ) + self.feed_forward = 
self._make_ff(feed_forward, num_feats, d_ff, + ff_kernel_size, hid_act, + dropout_rate) conv_blocks = [] for i in range(conv_repeats): @@ -145,7 +145,7 @@ def _make_att( """Creates multihead attention block from att_type string Args: - att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1'] + att_type: string in ['scaled-dot-prod-v1', 'local-scaled-dot-prod-v1', 'block-scaled-dot-prod-v1'] num_feats: input/output feat. dimension (aka d_model) num_heads: number of heads dropout_rate: dropout rate for attention block @@ -170,11 +170,15 @@ def _make_att( d_k, causal_pos_enc, dropout_rate, - time_dim=1, ) return ScaledDotProdAttV1( - num_feats, num_feats, num_heads, d_k, d_k, dropout_rate, time_dim=1 + num_feats, + num_feats, + num_heads, + d_k, + d_k, + dropout_rate, ) if att_type == "local-scaled-dot-prod-v1": @@ -188,7 +192,6 @@ def _make_att( context, causal_pos_enc, dropout_rate, - time_dim=1, ) return LocalScaledDotProdAttV1( @@ -199,11 +202,34 @@ def _make_att( d_k, context, dropout_rate, - time_dim=1, + ) + + if att_type == "block-scaled-dot-prod-v1": + if pos_enc_type == "rel": + return BlockScaledDotProdAttRelPosEncV1( + num_feats, + num_feats, + num_heads, + d_k, + d_k, + context, + causal_pos_enc, + dropout_rate, + ) + + return BlockScaledDotProdAttV1( + num_feats, + num_feats, + num_heads, + d_k, + d_k, + context, + dropout_rate, ) @staticmethod - def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rate): + def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, + dropout_rate): """Creates position-wise feed forward block from ff_type string Args: @@ -219,19 +245,27 @@ def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rat """ if ff_type == "linear": - return PositionwiseFeedForward( - num_feats, hid_feats, activation, dropout_rate, time_dim=1 - ) + return PositionwiseFeedForward(num_feats, + hid_feats, + activation, + dropout_rate, + time_dim=1) if ff_type == "conv1dx2": - return Conv1dx2( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 - ) + return Conv1dx2(num_feats, + hid_feats, + kernel_size, + activation, + dropout_rate, + time_dim=1) if ff_type == "conv1d-linear": - return Conv1dLinear( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 - ) + return Conv1dLinear(num_feats, + hid_feats, + kernel_size, + activation, + dropout_rate, + time_dim=1) def forward(self, x, pos_emb=None, mask=None): """Forward pass function diff --git a/hyperion/torch/layer_blocks/transducer_predictor.py b/hyperion/torch/layer_blocks/transducer_predictor.py index 00339fe7..6f43343a 100644 --- a/hyperion/torch/layer_blocks/transducer_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_predictor.py @@ -85,7 +85,7 @@ def __init__(self, def get_config(self): config = { - "pred_type": "conv", + "pred_type": "rnn", "vocab_size": self.vocab_size, "embed_dim": self.embed_dim, "num_layers": self.num_layers, @@ -187,7 +187,7 @@ def __init__( out_feats = embed_dim self.out_feats = out_feats - if out_feats != embed_feats: + if out_feats != embed_dim: self.output_proj = nn.Linear(embed_dim, out_feats) else: self.output_proj = None @@ -210,7 +210,7 @@ def forward( self, y: torch.Tensor, states: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, None]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]: """ Args: y: @@ -223,19 +223,21 @@ def forward( """ y = y.to(torch.int64) embed = self.embedding(y) - if self.context > 1: + if self.context_size > 1: embed = 
embed.transpose(1, 2) if states is None: - embed = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + embed = nn.functional.pad(embed, + pad=(self.context_size - 1, 0)) else: - raise NotImplementedError() - embed = self.conv(embed).transpose(1, 2) + embed = torch.cat((states[0], embed), dim=-1) + + out = self.conv(embed).transpose(1, 2) - out = self.hid_act(embed) + out = self.hid_act(out) if self.output_proj: out = self.output_proj(out) - return out, None + return out, (embed[:, :, -self.context_size + 1:], ) # # this stuff about clamp() is a temporary fix for a mismatch # # at utterance start, we use negative ids in beam_search.py diff --git a/hyperion/torch/layer_blocks/transformer_encoder_v1.py b/hyperion/torch/layer_blocks/transformer_encoder_v1.py index c8eaaa1b..cfb843b6 100644 --- a/hyperion/torch/layer_blocks/transformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/transformer_encoder_v1.py @@ -67,9 +67,9 @@ def __init__( self.self_attn = self_attn if isinstance(feed_forward, str): - self.feed_forward = self._make_ff( - feed_forward, num_feats, d_ff, ff_kernel_size, ff_act, ff_dropout_rate - ) + self.feed_forward = self._make_ff(feed_forward, num_feats, d_ff, + ff_kernel_size, ff_act, + ff_dropout_rate) else: self.feed_forward = feed_forward @@ -122,11 +122,15 @@ def _make_att( d_k, causal_pos_enc, dropout_rate, - time_dim=1, ) return ScaledDotProdAttV1( - num_feats, num_feats, num_heads, d_k, d_k, dropout_rate, time_dim=1 + num_feats, + num_feats, + num_heads, + d_k, + d_k, + dropout_rate, ) if att_type == "local-scaled-dot-prod-v1": @@ -140,7 +144,6 @@ def _make_att( context, causal_pos_enc, dropout_rate, - time_dim=1, ) return LocalScaledDotProdAttV1( @@ -151,11 +154,11 @@ def _make_att( d_k, context, dropout_rate, - time_dim=1, ) @staticmethod - def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rate): + def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, + dropout_rate): """Creates position-wise feed forward block from ff_type string Args: @@ -171,19 +174,27 @@ def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rat """ if ff_type == "linear": - return PositionwiseFeedForward( - num_feats, hid_feats, activation, dropout_rate, time_dim=1 - ) + return PositionwiseFeedForward(num_feats, + hid_feats, + activation, + dropout_rate, + time_dim=1) if ff_type == "conv1dx2": - return Conv1dx2( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 - ) + return Conv1dx2(num_feats, + hid_feats, + kernel_size, + activation, + dropout_rate, + time_dim=1) if ff_type == "conv1d-linear": - return Conv1dLinear( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 - ) + return Conv1dLinear(num_feats, + hid_feats, + kernel_size, + activation, + dropout_rate, + time_dim=1) def forward(self, x, pos_emb=None, mask=None): """Forward pass function diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index 42b40303..b2aa1692 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -17,6 +17,6 @@ from .mvn import MeanVarianceNorm from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory from .pool_factory import GlobalPool1dFactory -from .pos_encoder import NoPosEncoder, PosEncoder, RelPosEncoder +from .pos_encoder import NoPosEncoder, PosEncoder, RelPosEncoder, ConvPosEncoder from .spec_augment import AxisMasker, SpecAugment, SpecWarper from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d 
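Before the attention.py diff below: that diff introduces BlockScaledDotProdAttV1, the layer selected by att_type: block-scaled-dot-prod-v1 in the conformer YAML configs above. As a rough mental model, here is a minimal single-head sketch of block-diagonal attention; the helper name is mine, not the hyperion API, and unlike the real layer it does not mask the padded frames. It chops the time axis into roughly context-sized chunks and runs ordinary softmax attention independently inside each chunk:

import math
import torch

def block_diag_attention(q, k, v, context=25):
    # q, k, v: (batch, time, dim); same block partition as the real layer:
    # num_blocks = max(1, time // context), block length ctx = ceil(time / num_blocks)
    b, t, d = q.shape
    num_blocks = max(1, t // context)
    ctx = math.ceil(t / num_blocks)
    pad = ctx * num_blocks - t
    if pad > 0:
        # zero-pad the time axis so it splits evenly into blocks
        q, k, v = (torch.nn.functional.pad(x, (0, 0, 0, pad)) for x in (q, k, v))
    # (batch, blocks, ctx, dim): queries in block i only see keys in block i
    q = q.reshape(b, num_blocks, ctx, d)
    k = k.reshape(b, num_blocks, ctx, d)
    v = v.reshape(b, num_blocks, ctx, d)
    scores = q @ k.transpose(-2, -1) / math.sqrt(d)  # (batch, blocks, ctx, ctx)
    probs = torch.softmax(scores, dim=-1)
    return (probs @ v).reshape(b, -1, d)[:, :t]  # drop the padding frames

This brings the score matrix down from O(time^2) to O(time * ctx) memory, which is what makes the encoder practical on long utterances. LocalScaledDotProdAttV1 additionally computes a second score matrix shifted by half a block so neighboring blocks can exchange information; the new block variant keeps only the main diagonal.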
diff --git a/hyperion/torch/layers/attention.py b/hyperion/torch/layers/attention.py index 8ab75111..3e53cec9 100644 --- a/hyperion/torch/layers/attention.py +++ b/hyperion/torch/layers/attention.py @@ -20,20 +20,23 @@ class ScaledDotProdAttV1(nn.Module): d_k: key/query projection dimension d_v: value projection dimension dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input - dimensions are (batch, time, in_feats) """ def __init__( - self, in_feats, out_feats, num_heads, d_k, d_v, dropout_rate=0, time_dim=1 + self, + in_feats, + out_feats, + num_heads, + d_k, + d_v, + dropout_rate=0, ): super().__init__() - # We assume d_v always equals d_k + # We assume d_q always equals d_k self.d_v = d_v self.d_k = d_k self.num_heads = num_heads self.dropout_rate = dropout_rate - self.time_dim = time_dim self.linear_q = nn.Linear(in_feats, num_heads * d_k) self.linear_k = nn.Linear(in_feats, num_heads * d_k) self.linear_v = nn.Linear(in_feats, num_heads * d_v) @@ -54,7 +57,7 @@ def __repr__(self): return self.__str__() def __str__(self): - s = "{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, dropout_rate={}, time_dim={})".format( + s = "{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, dropout_rate={})".format( self.__class__.__name__, self.in_feats, self.out_feats, @@ -62,17 +65,11 @@ def __str__(self): self.d_k, self.d_v, self.dropout_rate, - self.time_dim, ) return s def _compute_qkv(self, query, key, value): batch_size = value.size(0) - if self.time_dim != 1: - query = query.transpose(1, self.time_dim) - key = key.transpose(1, self.time_dim) - value = value.transpose(1, self.time_dim) - q = self.linear_q(query).view(batch_size, -1, self.num_heads, self.d_k) k = self.linear_k(key).view(batch_size, -1, self.num_heads, self.d_k) v = self.linear_v(value).view(batch_size, -1, self.num_heads, self.d_v) @@ -85,8 +82,7 @@ def _compute_qkv(self, query, key, value): def _compute_softmax(self, scores, mask): if mask is not None: mask = mask.unsqueeze(1).eq( - 0 - ) # (batch, 1, time1, time2) or (batch, 1, time) + 0) # (batch, 1, time1, time2) or (batch, 1, time) if scores.dtype == torch.half: min_value = -65504 else: @@ -95,14 +91,14 @@ def _compute_softmax(self, scores, mask): if mask.dim() == 4: scores = scores.masked_fill(mask, min_value) return torch.softmax(scores, dim=-1).masked_fill( - mask, 0.0 - ) # (batch, head, time1, time2) + mask, 0.0) # (batch, head, time1, time2) else: - mask1 = mask.unsqueze(2) + mask1 = mask.unsqueeze(2) mask2 = mask.unsqueeze(-1) scores = scores.masked_fill(mask1, min_value) scores = scores.masked_fill(mask2, min_value) - return torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + return torch.softmax(scores, + dim=-1) # (batch, head, time1, time2) return torch.softmax(scores, dim=-1) # (batch, head, time1, time2) @@ -114,15 +110,13 @@ def _apply_attn(self, v): p_attn = self.attn x = torch.matmul(p_attn, v) # (batch, head, time1, d_k) - x = ( - x.transpose(1, 2) - .contiguous() - .view(batch_size, -1, self.num_heads * self.d_v) - ) # (batch, time1, d_model) + x = (x.transpose(1, 2).contiguous().view(batch_size, -1, + self.num_heads * self.d_v) + ) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model) - ___compute_softmax = _compute_softmax - ___apply_attn = _apply_attn + _base_compute_softmax = _compute_softmax + _base_apply_attn = _apply_attn def forward(self, query, key, value, mask=None): """Computes 'Scaled Dot Product Attention'. 
@@ -141,10 +135,9 @@ def forward(self, query, key, value, mask=None): """ q, k, v = self._compute_qkv(query, key, value) scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt( - self.d_k - ) # (batch, head, time1, time2) - self.attn = self.___compute_softmax(scores, mask) - return self.___apply_attn(v) + self.d_k) # (batch, head, time1, time2) + self.attn = self._base_compute_softmax(scores, mask) + return self._base_apply_attn(v) class LocalScaledDotProdAttV1(ScaledDotProdAttV1): @@ -160,8 +153,6 @@ class LocalScaledDotProdAttV1(ScaledDotProdAttV1): d_v: value projection dimension context: maximum attention temporal context. dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input - dimensions are (batch, time, in_feats) """ def __init__( @@ -173,85 +164,39 @@ def __init__( d_v, context=25, dropout_rate=0, - time_dim=1, ): """Construct an MultiHeadedAttention object.""" - super().__init__( - in_feats, out_feats, num_heads, d_k, d_v, dropout_rate, time_dim - ) + super().__init__(in_feats, out_feats, num_heads, d_k, d_v, + dropout_rate) self.context = context def __repr__(self): return self.__str__() def __str__(self): - s = ( - "{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, " - "context={}, dropout_rate={}, time_dim={})".format( - self.__class__.__name__, - self.in_feats, - self.out_feats, - self.num_heads, - self.d_k, - self.d_v, - self.context, - self.dropout_rate, - self.time_dim, - ) - ) + s = ("{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, " + "context={}, dropout_rate={})".format( + self.__class__.__name__, + self.in_feats, + self.out_feats, + self.num_heads, + self.d_k, + self.d_v, + self.context, + self.dropout_rate, + )) return s - def _compute_qkv00(self, query, key, value): - batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - if self.time_dim != 1: - query = query.transpose(1, self.time_dim) - key = key.transpose(1, self.time_dim) - value = value.transpose(1, self.time_dim) - - context_k = self.context - num_blocks = math.ceil(t2 / context_k) # (t2 + context_k//2)//context_k - context_q = math.ceil(t1 / num_blocks) - num_blocks_q = math.ceil(t1 / context_q) # (t1 + context_q//2)//context_q - assert ( - num_blocks == num_blocks_q - ), "num_blocks_k({})!=num_blocks_q({}), context_k={}, context_q={}, t1={}, t2={}".format( - num_blocks, num_blocks_q, context_k, context_q, t1, t2 - ) - pad1 = context_q * num_blocks - t1 - pad2 = context_k * num_blocks - t2 - # print('1',query.shape,key.shape,value.shape,pad1,pad2, context_q, context_k) - if pad1 > 0: - query = nn.functional.pad(query, (0, 0, 0, pad1)) - - if pad2 > 0: - key = nn.functional.pad(key, (0, 0, 0, pad2)) - value = nn.functional.pad(value, (0, 0, 0, pad2)) - - # print('2',query.shape,key.shape,value.shape) - q0 = self.linear_q(query) # (batch, time1, head*d_k) - k0 = self.linear_k(key) # (batch, time2, head*d_k) - v0 = self.linear_v(value) # (batch, time2, head*d_v) - - return q0, k0, v0, context_q, context_k, num_blocks - def _compute_qkv0(self, query, key, value): batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - if self.time_dim != 1: - query = query.transpose(1, self.time_dim) - key = key.transpose(1, self.time_dim) - value = value.transpose(1, self.time_dim) - - num_blocks = round(t2 / self.context) - # print(num_blocks, t2, self.context) + t1 = query.size(1) + t2 = key.size(1) + + num_blocks = max(1, round(t2 / self.context)) context_k = math.ceil(t2 / num_blocks) 
context_q = math.ceil(t1 / num_blocks) pad1 = context_q * num_blocks - t1 pad2 = context_k * num_blocks - t2 - # print('1',query.shape,key.shape,value.shape,pad1,pad2, context_q, context_k) if pad1 > 0: query = nn.functional.pad(query, (0, 0, 0, pad1)) @@ -259,17 +204,16 @@ def _compute_qkv0(self, query, key, value): key = nn.functional.pad(key, (0, 0, 0, pad2)) value = nn.functional.pad(value, (0, 0, 0, pad2)) - # print('2',query.shape,key.shape,value.shape) q0 = self.linear_q(query) # (batch, time1, head*d_k) k0 = self.linear_k(key) # (batch, time2, head*d_k) v0 = self.linear_v(value) # (batch, time2, head*d_v) return q0, k0, v0, context_q, context_k, num_blocks - def _compute_scores( - self, q0, k0, num_blocks, context_q, context_k, q_left_shift, k_left_shift - ): - + def _compute_scores(self, q0, k0, num_blocks, context_q, context_k, + q_left_shift, k_left_shift): + # q0 (batch, time1, head*d_k) + # k0 (batch, time2, head*d_k) batch_size = q0.size(0) if q_left_shift > 0: # we are computing the shifted block-diag score matrix @@ -278,22 +222,14 @@ def _compute_scores( q0 = q0[:, q_left_shift:-q_right_shift] k0 = k0[:, k_left_shift:-k_right_shift] - q = ( - q0.view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + q = (q0.view(batch_size, -1, self.num_heads, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, head, blocks, time1, d_k) - k = ( - k0.view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + k = (k0.view(batch_size, -1, self.num_heads, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, head, blocks time2, d_k) - # print('4',q.shape,k.shape) - return torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) @staticmethod @@ -331,7 +267,7 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): context2 = scores1.size(4) # set elements in scores2 that overlap with elements in scores1 to -inf - scores2[:, :, :, : context1 - shift1, : context2 - shift2] = min_val + scores2[:, :, :, :context1 - shift1, :context2 - shift2] = min_val scores2[:, :, :, shift1:, shift2:] = min_val # set the padding time steps that we had to add to make integer block-number to -inf @@ -371,9 +307,9 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): scores2 = scores2.view(batch_size, num_heads, -1, context2) # print('aa', scores1.shape, scores2.shape) # pad scores2 to have the same size as scores1 - scores2 = nn.functional.pad( - scores2, (0, 0, shift1, context1 - shift1), mode="constant", value=min_val - ) + scores2 = nn.functional.pad(scores2, (0, 0, shift1, context1 - shift1), + mode="constant", + value=min_val) # print('bb', scores1.shape, scores2.shape) # concat scores1, scores2 and do softmax in time2 dimension # (batch, heads, blocks*time1, 2*time2) @@ -381,17 +317,13 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): # now we separate back probs into probs1, and probs2 # probs1 - probs1 = ( - probs[:, :, :, :context2] - .contiguous() - .view(batch_size, num_heads, num_blocks, -1, context2) - ) + probs1 = (probs[:, :, :, :context2].contiguous().view( + batch_size, num_heads, num_blocks, -1, context2)) # probs2 - probs2 = ( - probs[:, :, shift1 : -(context1 - shift1), context2:] - .contiguous() - .view(batch_size, num_heads, num_blocks - 1, -1, context2) - ) + probs2 = 
(probs[:, :, shift1:-(context1 - shift1), + context2:].contiguous().view(batch_size, num_heads, + num_blocks - 1, -1, + context2)) return probs1, probs2 @@ -406,9 +338,9 @@ def _mask_scores_1d(self, scores, mask, shift1, shift2): context1 = scores.size(3) context2 = scores.size(4) mask_blocks = torch.ones_like(scores, dtype=mask.dtype) - mask_single_block = torch.zeros( - (batch_size, context1, context2), dtype=mask.dtype - ) + mask_single_block = torch.zeros((batch_size, context1, context2), + dtype=mask.dtype, + device=mask.device) t1_start = shift1 t2_start = shift2 @@ -416,9 +348,11 @@ t1_end = t1_start + context1 t2_end = t2_start + context2 mask_single_block.fill_(False) - mask_single_block.masked_fill_(mask[:, 0, t1_start:t1_end], True) - mask_single_block.masked_fill_(mask[:, :, t2_start:t2_end], True) - mask_blocks[:, block] = mask_single_block + mask_single_block.masked_fill_(mask[:, t1_start:t1_end, None], + True) + mask_single_block.masked_fill_(mask[:, None, t2_start:t2_end], + True) + mask_blocks[:, :, block] = mask_single_block.unsqueeze(1) t1_start += context1 t2_start += context2 @@ -437,23 +371,24 @@ mask_blocks = torch.ones_like(scores, dtype=mask.dtype) t1_start = shift1 t2_start = shift2 + mask = mask.unsqueeze(1) for block in range(num_blocks): t1_end = min(t1_start + context1, mask.size(1)) t2_end = min(t2_start + context2, mask.size(2)) - mask_blocks[:, block, : (t1_end - t1_start), : (t2_end - t2_start)] = mask[ - :, t1_start:t1_end, t2_start:t2_end - ] + mask_blocks[:, :, block, :(t1_end - t1_start), :( + t2_end - t2_start)] = mask[:, :, t1_start:t1_end, + t2_start:t2_end] t1_start += context1 t2_start += context2 return scores.masked_fill(mask_blocks, min_value) - def _compute_softmax( - self, scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 - ): + def _compute_softmax(self, scores1, scores2, mask, q_left_shift, + k_left_shift, t1, t2): + if mask is not None: # put to -inf scores in points where mask==0 - if mask.dim() == 4: + if mask.dim() == 3: # case when mask is 2d matrix per batch element mask = mask.eq(0) # (batch, time1, time2) # first, we mask block diagonal blocks scores1 = self._mask_scores_2d(scores1, mask, 0, 0) # second, we mask shifted block diagonal blocks - scores2 = self._mask_scores_2d( - scores2, mask, q_left_shift, k_left_shift - ) + scores2 = self._mask_scores_2d(scores2, mask, q_left_shift, + k_left_shift) - else: + elif mask.dim() == 2: # case when mask is 1d vector per batch element, # meaning that time1 and time2 are the same, so mask is symmetric - pad2 = 0 # fix this + pad2 = scores1.size(2) * scores1.size(3) - mask.size(-1) mask = nn.functional.pad(mask, (0, pad2)) - mask = mask.squeeze(1).eq(0) # (batch, 1, time) + mask = mask.eq(0) # (batch, time) # first, we mask block diagonal blocks scores1 = self._mask_scores_1d(scores1, mask, 0, 0) # second, we mask shifted block diagonal blocks - scores2 = self._mask_scores_1d( - scores2, mask, q_left_shift, k_left_shift - ) + scores2 = self._mask_scores_1d(scores2, mask, q_left_shift, + k_left_shift) + else: + raise ValueError() - self.attn1, self.attn2 = self._softmax( - scores1, scores2, q_left_shift, k_left_shift, t1, t2 - ) + self.attn1, self.attn2 = self._softmax(scores1, scores2, q_left_shift, + k_left_shift, t1, t2) def _apply_attn(self, v0, t1): if self.dropout_rate > 0: @@ -501,51 +435,43 @@ q_right_shift = context_q - 
q_left_shift k_right_shift = context_k - k_left_shift - v = ( - v0.view(batch_size, -1, self.num_heads, self.d_v) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + v = (v0.view(batch_size, -1, self.num_heads, + self.d_v).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, heads, blocks, time2, d_v) # print('8',p_attn1.shape,p_attn2.shape, v.shape) # (batch, head, blocks, time1, time2) x (batch, head, blocks, time2, d_v) x = torch.matmul(p_attn1, v) # (batch, heads, blocks, time1, d_k) # print('9',x.shape) - x = ( - x.view(batch_size, self.num_heads, -1, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, -1, self.num_heads * self.d_v) - ) + x = (x.view(batch_size, self.num_heads, -1, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, -1, self.num_heads * self.d_v)) # (batch, time1, d_model) # print('10',x.shape) - v = ( - v0[:, k_left_shift:-k_right_shift] - .view(batch_size, -1, self.num_heads, self.d_v) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_v) - ) + v = (v0[:, k_left_shift:-k_right_shift].view( + batch_size, -1, self.num_heads, + self.d_v).transpose(1, + 2).contiguous().view(batch_size, + self.num_heads, + num_blocks - 1, -1, + self.d_v)) # (batch, blocks-1, head, time2, d_v) # print('11',p_attn1.shape,p_attn2.shape, v.shape) # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) # print('12',x2.shape) - x2 = ( - x2.view(batch_size, self.num_heads, -1, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, -1, self.num_heads * self.d_v) - ) + x2 = (x2.view(batch_size, self.num_heads, -1, self.d_k).transpose( + 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v)) # (batch, time1, d_model) # print('12',x2.shape) - x[:, q_left_shift:-q_right_shift:] = x[:, q_left_shift:-q_right_shift:] + x2 + x[:, + q_left_shift:-q_right_shift:] = x[:, + q_left_shift:-q_right_shift:] + x2 x = x[:, :t1] return self.linear_out(x) # (batch, time1, d_model) - def forward1(self, query, key, value, mask): + def forward(self, query, key, value, mask): """Computes 'Local Scaled Dot Product Attention'. 
Args: @@ -561,170 +487,91 @@ def forward1(self, query, key, value, mask): Attention weigthed average of the values with size=(batch, time1, out_feats) """ batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - if t2 <= self.context: + t1 = query.size(1) + t2 = key.size(1) + if t2 <= 2 * self.context: return super().forward(query, key, value, mask) q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value - ) - # q0 size=(batch, time1, head * d_k) - # k0 size=(batch, time2, head * d_k) - # v0 size=(batch, time2, head * d_v) + query, key, value) + # q0 size=(batch, time1, head*d_k) + # k0 size=(batch, time2, head*d_k) + # v0 size=(batch, time2, head*d_v) # compute block diagonal affinity matrix - # # print('3',q0.shape,k0.shape,v0.shape) - # q = q0.view( - # batch_size, -1, self.num_heads, self.d_k).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) - # # (batch, head, blocks, time1, d_k) - # k = k0.view( - # batch_size, -1, self.num_heads, self.d_k).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) - # # (batch, head, blocks time2, d_k) - # # print('4',q.shape,k.shape) - - # scores1 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - scores1 = self._compute_scores(q0, k0, num_blocks, context_q, context_k, 0, 0) + scores1 = self._compute_scores(q0, k0, num_blocks, context_q, + context_k, 0, 0) # (batch, head, blocks context_q, context_k) - # print('5',scores1.shape) # compute shifted block diagonal affinity matrix q_left_shift = context_q // 2 k_left_shift = context_k // 2 - # q_right_shift = context_q - q_left_shift - # k_right_shift = context_k - k_left_shift - # q = q0[:,q_left_shift:-q_right_shift].view( - # batch_size, -1, self.num_heads, self.d_k).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_k) - # # (batch, blocks-1, head, time1, d_k) - # k = k0[:,k_left_shift:-k_right_shift].view( - # batch_size, -1, self.num_heads, self.d_k).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_k) - # # (batch, blocks-1, head, d_k) - # # print('6',q.shape,k.shape) - - # scores2 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - scores2 = self._compute_scores( - q0, k0, num_blocks - 1, context_q, context_k, q_left_shift, k_left_shift - ) + scores2 = self._compute_scores(q0, k0, num_blocks - 1, context_q, + context_k, q_left_shift, k_left_shift) # (batch, head, blocks-1 context_q, context_k) - # print('7',scores2.shape) # combine both block diagonal affinity matrix to do the softmax - # if mask is not None: - # # put to -inf scores in points where mask==0 - # if mask.dim() == 4: - # # case when mask is 2d matrix per batch element - # mask = mask.eq(0) # (batch, time1, time2) - - # # first, we mask block diagonal blocks - # scores1 = self._mask_scores_2d(scores1, mask, 0, 0) - - # # second, we mask shifted block diagonal blocks - # scores2 = self._mask_scores_2d(scores2, mask, q_left_shift, k_left_shift) + self._compute_softmax(scores1, scores2, mask, q_left_shift, + k_left_shift, t1, t2) + return self._apply_attn(v0, t1) - # else: - # # case when mask is 1d vector per batch element, - # # meaning that time1 and time2 are the same, so mask is symmetric - # mask = nn.functional.pad(mask, (0, pad2)) - # mask = mask.squeeze(1).eq(0) # (batch, 1, time) - # # first, we mask block diagonal blocks - # scores1 = 
self._mask_scores_1d(scores1, mask, 0, 0) +class BlockScaledDotProdAttV1(ScaledDotProdAttV1): + """Block Scaled dot product multihead attention layer + It calculates self-attention with block diagonal mask - # # second, we mask shifted block diagonal blocks - # scores2 = self._mask_scores_1d(scores2, mask, q_left_shift, k_left_shift) + Attributes: + in_feats: input feature dimension + out_feats: output feature dimension + num_heads: number of heads + d_k: key/query projection dimension + d_v: value projection dimension + context: maximum attention temporal context. + dropout_rate: dropout rate + """ - # self.attn1, self.attn2 = self._softmax( - # scores1, scores2, q_left_shift, k_left_shift, t1, t2) + def __init__( + self, + in_feats, + out_feats, + num_heads, + d_k, + d_v, + context=25, + dropout_rate=0, + ): + """Construct an MultiHeadedAttention object.""" + super().__init__(in_feats, out_feats, num_heads, d_k, d_v, + dropout_rate) + self.context = context - self._compute_softmax( - scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 - ) - return self._apply_attn(v0, t1) + def __repr__(self): + return self.__str__() - # if self.dropout_rate > 0: - # p_attn1 = self.dropout(self.attn1) - # p_attn2 = self.dropout(self.attn2) - # else: - # p_attn1 = self.attn1 - # p_attn2 = self.attn2 - - # v = v0.view( - # batch_size, -1, self.num_heads, self.d_v).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) - # # (batch, heads, blocks, time2, d_v) - # # print('8',p_attn1.shape,p_attn2.shape, v.shape) - # # (batch, blocks, head, time1, time2) x (batch, blocks, head, time2, d_v) - # x = torch.matmul(p_attn1, v) # (batch, heads, blocks, time1, d_k) - # # print('9',x.shape) - # x = x.view(batch_size, self.num_heads, -1, self.d_k).transpose( - # 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) - # # (batch, time1, d_model) - # # print('10',x.shape) - - # v = v0[:,k_left_shift:-k_right_shift].view( - # batch_size, -1, self.num_heads, self.d_v).transpose( - # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_v) - # # (batch, blocks-1, head, time2, d_v) - # # print('11',p_attn1.shape,p_attn2.shape, v.shape) - # # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) - # x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) - # # print('12',x2.shape) - # x2 = x2.view(batch_size, self.num_heads, -1, self.d_k).transpose( - # 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) - # # (batch, time1, d_model) - # # print('12',x2.shape) - # x[:,q_left_shift:-q_right_shift:] = x[:,q_left_shift:-q_right_shift:] + x2 - # x = x[:,:t1] - # return self.linear_out(x) # (batch, time1, d_model) - - def forward2(self, query, key, value, mask): - """Computes 'Local Scaled Dot Product Attention'. + def __str__(self): + s = ("{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, " + "context={}, dropout_rate={})".format( + self.__class__.__name__, + self.in_feats, + self.out_feats, + self.num_heads, + self.d_k, + self.d_v, + self.context, + self.dropout_rate, + )) + return s - Args: - query: query with size=(batch, time1, in_feats), - where time1 is the output time dimension - key: key with size=(batch, time2, in_feats) - where time1 is the input time dimension - value: value with size=(batch, time2, in_feats) - mask: optional mask with size=(batch, time1, time2), - to zero attention between some time steps. 
- or (batch, time) if time1=time2 - Returns: - Attention weigthed average of the values with size=(batch, time1, out_feats) - """ + def _compute_qkv0(self, query, key, value): batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - if t2 <= self.context: - return super().forward(query, key, value, mask) - - if self.time_dim != 1: - query = query.transpose(1, self.time_dim) - key = key.transpose(1, self.time_dim) - value = value.transpose(1, self.time_dim) + t1 = query.size(1) + t2 = key.size(1) - context_k = self.context - num_blocks = math.ceil(t2 / context_k) # (t2 + context_k//2)//context_k + num_blocks = max(1, t2 // self.context) + context_k = math.ceil(t2 / num_blocks) context_q = math.ceil(t1 / num_blocks) - num_blocks_q = math.ceil(t1 / context_q) # (t1 + context_q//2)//context_q - assert ( - num_blocks == num_blocks_q - ), "num_blocks_k({})!=num_blocks_q({}), context_k={}, context_q={}, t1={}, t2={}".format( - num_blocks, num_blocks_q, context_k, context_q, t1, t2 - ) pad1 = context_q * num_blocks - t1 pad2 = context_k * num_blocks - t2 - # print('1',query.shape,key.shape,value.shape,pad1,pad2, context_q, context_k) if pad1 > 0: query = nn.functional.pad(query, (0, 0, 0, pad1)) @@ -732,152 +579,185 @@ def forward2(self, query, key, value, mask): key = nn.functional.pad(key, (0, 0, 0, pad2)) value = nn.functional.pad(value, (0, 0, 0, pad2)) - # print('2',query.shape,key.shape,value.shape) q0 = self.linear_q(query) # (batch, time1, head*d_k) k0 = self.linear_k(key) # (batch, time2, head*d_k) v0 = self.linear_v(value) # (batch, time2, head*d_v) - # # q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - # # query, key, value) - # # # q0 size=(batch, time1, head*d_k) - # # # k0 size=(batch, time2, head*d_k) - # # # v0 size=(batch, time2, head*d_v) + return q0, k0, v0, context_q, context_k, num_blocks - # compute block diagonal affinity matrix - # # print('3',q0.shape,k0.shape,v0.shape) - q = ( - q0.view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + def _compute_scores(self, q0, k0, num_blocks, context_q, context_k): + # q0 (batch, time1, head*d_k) + # k0 (batch, time2, head*d_k) + batch_size = q0.size(0) + q = (q0.view(batch_size, -1, self.num_heads, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, head, blocks, time1, d_k) - k = ( - k0.view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + k = (k0.view(batch_size, -1, self.num_heads, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, head, blocks time2, d_k) - # # print('4',q.shape,k.shape) - scores1 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - # # scores1 = self._compute_scores( - # # q0, k0, num_blocks, context_q, context_k, 0, 0) - # (batch, head, blocks context_q, context_k) - # print('5',scores1.shape) + return torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - # compute shifted block diagonal affinity matrix - q_left_shift = context_q // 2 - k_left_shift = context_k // 2 - q_right_shift = context_q - q_left_shift - k_right_shift = context_k - k_left_shift - q = ( - q0[:, q_left_shift:-q_right_shift] - .view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks - 1, 
-1, self.d_k) ) - # (batch, blocks-1, head, time1, d_k) - k = ( - k0[:, k_left_shift:-k_right_shift] - .view(batch_size, -1, self.num_heads, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_k) - ) - # # (batch, blocks-1, head, d_k) - # # print('6',q.shape,k.shape) + @staticmethod + def _softmax(scores, t1, t2): - scores2 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - # scores2 = self._compute_scores( - # q0, k0, num_blocks-1, context_q, context_k, - # q_left_shift, k_left_shift) - # (batch, head, blocks-1 context_q, context_k) - # print('7',scores2.shape) + """Computes softmax for block diagonal attention maps - # combine both block diagonal affinity matrix to do the softmax - # if mask is not None: - # # put to -inf scores in points where mask==0 - # if mask.dim() == 4: - # # case when mask is 2d matrix per batch element - # mask = mask.eq(0) # (batch, time1, time2) + Args: + scores: attention scores from block-diagonal score matrix + with size=(batch, heads, blocks, t1, t2) + t1: length of time dimension 1 (output time dimension) + t2: length of time dimension 2 (input time dimension), with self-att t1=t2. - # # first, we mask block diagonal blocks - # scores1 = self._mask_scores_2d(scores1, mask, 0, 0) + Returns: + probs: posterior attention scores for the block-diagonal att. matrix + with size=(batch, heads, blocks, t1, t2) + + """ + if scores.dtype == torch.half: + min_val = -65504 + else: + min_val = -1e20 + + batch_size = scores.size(0) + num_heads = scores.size(1) + num_blocks = scores.size(2) + context1 = scores.size(3) + context2 = scores.size(4) + + # set the padding time steps that we had to add to make integer block-number to -inf + # in scores - # # first, we mask block diagonal blocks - # scores1 = self._mask_scores_2d(scores1, mask, 0, 0) + dt1 = max(0, scores.size(2) * scores.size(3) - t1) + if dt1 > 0: + scores[:, :, -1, -dt1:, :] = min_val - # # second, we mask shifted block diagonal blocks - # scores2 = self._mask_scores_2d(scores2, mask, q_left_shift, k_left_shift) + dt2 = max(0, scores.size(2) * scores.size(4) - t2) + if dt2 > 0: + scores[:, :, -1, :, -dt2:] = min_val - # else: - # # case when mask is 1d vector per batch element, - # # meaning that time1 and time2 are the same, so mask is symmetric - # mask = nn.functional.pad(mask, (0, pad2)) - # mask = mask.squeeze(1).eq(0) # (batch, 1, time) + # flatten blocks and time1 dimensions scores = scores.view(batch_size, num_heads, -1, context2) + # softmax over the time2 dimension within each block - # # first, we mask block diagonal blocks - # scores1 = self._mask_scores_1d(scores1, mask, 0, 0) + # (batch, heads, blocks*time1, time2) + probs = torch.softmax(scores, dim=-1).contiguous().view( + batch_size, num_heads, num_blocks, -1, context2) - # # second, we mask shifted block diagonal blocks - # scores2 = self._mask_scores_1d(scores2, mask, q_left_shift, k_left_shift) + return probs - self.attn1, self.attn2 = self._softmax( - scores1, scores2, q_left_shift, k_left_shift, t1, t2 - ) + def _mask_scores_1d(self, scores, mask): + if scores.dtype == torch.half: + min_value = -65504 + else: + min_value = -1e20 + + batch_size = scores.size(0) + num_blocks = scores.size(2) + context1 = scores.size(3) + context2 = scores.size(4) + mask_blocks = torch.ones_like(scores, dtype=mask.dtype) + mask_single_block = torch.zeros((batch_size, context1, context2), + dtype=mask.dtype, + device=mask.device) + + t1_start = 0 + t2_start = 0 + for block in range(num_blocks): + t1_end = t1_start + context1 + t2_end = t2_start + context2 + mask_single_block.fill_(False) + mask_single_block.masked_fill_(mask[:, t1_start:t1_end, None], + True) + mask_single_block.masked_fill_(mask[:, None, t2_start:t2_end], + True) + mask_blocks[:, :, block] = mask_single_block.unsqueeze(1) + t1_start += context1 + t2_start += context2 + + return scores.masked_fill(mask_blocks, min_value) + + def _mask_scores_2d(self, scores, mask): + if scores.dtype == torch.half: + min_value = -65504 + else: + min_value = -1e20 + + batch_size = scores.size(0) + num_blocks = scores.size(2) + context1 = scores.size(3) + context2 = scores.size(4) + mask_blocks = torch.ones_like(scores, dtype=mask.dtype) + t1_start = 0 + t2_start = 0 + mask = mask.unsqueeze(1) + for block in range(num_blocks): + t1_end = min(t1_start + context1, mask.size(1)) + t2_end = min(t2_start + context2, mask.size(2)) + mask_blocks[:, :, block, :(t1_end - t1_start), :( + t2_end - t2_start)] = mask[:, :, t1_start:t1_end, + t2_start:t2_end] + t1_start += context1 + t2_start += context2 + + return scores.masked_fill(mask_blocks, min_value) + + def _compute_softmax(self, scores, mask, t1, t2): + + if mask is not None: + # put to -inf scores in points where mask==0 + if mask.dim() == 3: + # case when mask is 2d matrix per batch element + mask = mask.eq(0) # (batch, time1, time2) + + # first, we mask block diagonal blocks + scores = self._mask_scores_2d(scores, mask) + + elif mask.dim() == 2: + # case when mask is 1d vector per batch element, + # meaning that time1 and time2 are the same, so mask is symmetric + pad2 = scores.size(2) * scores.size(3) - mask.size(-1) + mask = nn.functional.pad(mask, (0, pad2)) + mask = mask.eq(0) # (batch, time) + + # first, we mask block diagonal blocks + scores = self._mask_scores_1d(scores, mask) + + else: + raise ValueError() - # # self._compute_softmax(scores1, scores2, mask, - # # q_left_shift, k_left_shift, t1, t2) - # # return self._apply_attn(v0, t1) + self.attn = self._softmax(scores, t1, t2) + def _apply_attn(self, v0, t1): if self.dropout_rate > 0: - p_attn1 = self.dropout(self.attn1) - p_attn2 = self.dropout(self.attn2) + p_attn = self.dropout(self.attn) else: - p_attn1 = self.attn1 - p_attn2 = self.attn2 + p_attn = self.attn - v = ( - v0.view(batch_size, -1, self.num_heads, self.d_v) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) - ) + batch_size = p_attn.size(0) + num_blocks = p_attn.size(2) + context_q = p_attn.size(3) + context_k = p_attn.size(4) + q_left_shift = context_q // 2 + k_left_shift = context_k // 2 + q_right_shift = context_q - q_left_shift + k_right_shift = context_k - k_left_shift + + v = (v0.view(batch_size, -1, self.num_heads, + self.d_v).transpose(1, 2).contiguous().view( + batch_size, self.num_heads, num_blocks, -1, self.d_k)) # (batch, heads, blocks, time2, d_v) - # print('8',p_attn1.shape,p_attn2.shape, v.shape) # (batch, head, blocks, time1, time2) x (batch, head, blocks, time2, d_v) x = torch.matmul(p_attn, v) # (batch, heads, blocks, time1, d_k) x = (x.view(batch_size, self.num_heads, -1, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, -1, self.num_heads * self.d_v)) # (batch, time1, d_model) - # print('10',x.shape) - v = ( - v0[:, k_left_shift:-k_right_shift] - .view(batch_size, -1, self.num_heads, self.d_v) - .transpose(1, 2) - .contiguous() - .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_v) - ) - # (batch, blocks-1, 
head, time2, d_v) - # print('11',p_attn1.shape,p_attn2.shape, v.shape) - # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) - x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) - # print('12',x2.shape) - x2 = ( - x2.view(batch_size, self.num_heads, -1, self.d_k) - .transpose(1, 2) - .contiguous() - .view(batch_size, -1, self.num_heads * self.d_v) - ) + # (batch, head, blocks, time1, time2) x (batch, head, blocks, time2, d_v) + x = torch.matmul(p_attn, v) # (batch, heads, blocks, time1, d_k) + x = (x.view(batch_size, self.num_heads, -1, + self.d_k).transpose(1, 2).contiguous().view( + batch_size, -1, self.num_heads * self.d_v)) # (batch, time1, d_model) - # print('12',x2.shape) - x[:, q_left_shift:-q_right_shift:] = x[:, q_left_shift:-q_right_shift:] + x2 + x = x[:, :t1] return self.linear_out(x) # (batch, time1, d_model) @@ -897,35 +777,24 @@ def forward(self, query, key, value, mask): Attention weigthed average of the values with size=(batch, time1, out_feats) """ batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) + t1 = query.size(1) + t2 = key.size(1) - if t2 <= 2 * self.context: + if t2 < 2 * self.context: return super().forward(query, key, value, mask) q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value - ) + query, key, value) # q0 size=(batch, time1, head*d_k) # k0 size=(batch, time2, head*d_k) # v0 size=(batch, time2, head*d_v) # compute block diagonal affinity matrix - scores1 = self._compute_scores(q0, k0, num_blocks, context_q, context_k, 0, 0) + scores = self._compute_scores(q0, k0, num_blocks, context_q, context_k) # (batch, head, blocks context_q, context_k) - # compute shifted block diagonal affinity matrix - q_left_shift = context_q // 2 - k_left_shift = context_k // 2 - scores2 = self._compute_scores( - q0, k0, num_blocks - 1, context_q, context_k, q_left_shift, k_left_shift - ) - # (batch, head, blocks-1 context_q, context_k) - # combine both block diagonal affinity matrix to do the softmax - self._compute_softmax( - scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 - ) + self._compute_softmax(scores, mask, t1, t2) return self._apply_attn(v0, t1) @@ -942,8 +811,6 @@ class ScaledDotProdAttRelPosEncV1(ScaledDotProdAttV1): d_v: value projection dimension causal_pos_enc: positional encoder is 0 for attending future frames. 
dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input - dimensions are (batch, time, in_feats) """ def __init__( @@ -955,7 +822,6 @@ def __init__( d_v, causal_pos_enc=False, dropout_rate=0, - time_dim=1, ): super().__init__( in_feats, @@ -964,7 +830,6 @@ def __init__( d_k, d_v, dropout_rate=dropout_rate, - time_dim=time_dim, ) self.linear_pos = nn.Linear(in_feats, num_heads * d_k) @@ -992,19 +857,17 @@ def _apply_tril(self, x): 1 1 1 1 ] """ diag = x.size(3) - x.size(2) - if ( - self._tril is None - or self._tril.size(2) < x.size(2) - or self._tril.size(3) < x.size(3) - or self._tril_diag != diag - ): + if (self._tril is None or self._tril.size(2) < x.size(2) + or self._tril.size(3) < x.size(3) or self._tril_diag != diag): # in these cases we need to recompute the lower triangular mask - ones = torch.ones((x.size(2), x.size(3)), dtype=x.dtype, device=x.device) + ones = torch.ones((x.size(2), x.size(3)), + dtype=x.dtype, + device=x.device) self._tril = torch.tril(ones, diag)[None, None, :, :] self._tril_diag = diag tril = self._tril else: - tril = self._tril[:, :, : x.size(2), : x.size(3)] + tril = self._tril[:, :, :x.size(2), :x.size(3)] return x * tril @@ -1019,19 +882,17 @@ def _apply_triu(self, x): """ # we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice diag = x.size(3) - x.size(2) + 1 - if ( - self._triu is None - or self._triu.size(2) < x.size(2) - or self._triu.size(3) < x.size(3) - or self._triu_diag != diag - ): + if (self._triu is None or self._triu.size(2) < x.size(2) + or self._triu.size(3) < x.size(3) or self._triu_diag != diag): # in these cases we need to recompute the lower triangular mask - ones = torch.ones((x.size(2), x.size(3)), dtype=x.dtype, device=x.device) + ones = torch.ones((x.size(2), x.size(3)), + dtype=x.dtype, + device=x.device) self._triu = torch.triu(ones, diag)[None, None, :, :] self._triu_diag = diag triu = self._triu else: - triu = self._triu[:, :, -x.size(2) :, -x.size(3) :] + triu = self._triu[:, :, -x.size(2):, -x.size(3):] return x * triu @@ -1095,7 +956,8 @@ def forward(self, query, key, value, pos_emb=None, mask=None): q, k, v = self._compute_qkv(query, key, value) pos_batch_size = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, self.d_k) + p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, + self.d_k) p = p.transpose(1, 2) # (batch, head, time2, d_k) q = q.transpose(1, 2) # (batch, time1, head, d_k) @@ -1103,13 +965,14 @@ def forward(self, query, key, value, pos_emb=None, mask=None): q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) # compute A(a) + A(c) in Sec3.3, 2nd Eq. - AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) # (batch, head, time1, time2) + AC = torch.matmul(q_plus_u, + k.transpose(-2, -1)) # (batch, head, time1, time2) # compute A(b) + A(d) in Sec3.3, 2nd Eq. 
for the causal part # This is the sum of Btilde and Dtilde in the Appendix of the paper - BDtilde = torch.matmul( - q_plus_v, p.transpose(-2, -1) - ) # (batch, head, time1, time2) + BDtilde = torch.matmul(q_plus_v, + p.transpose(-2, + -1)) # (batch, head, time1, time2) # apply left shift as indicated in the Appendix to geth B+D BD = self._left_shift(BDtilde) @@ -1119,19 +982,15 @@ def forward(self, query, key, value, pos_emb=None, mask=None): # we assume that t2 >= t1 dt = key.size(1) - query.size(1) pos_emb_noncausal = pos_emb[:, dt:].flip( - dims=(1,) - ) # we flip to get R_0, ..., R_{L-1} - pos_emb_noncausal[ - :, :, 0::2 - ] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + dims=(1, )) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[:, :, 0:: + 2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] p = self.linear_pos(pos_emb_noncausal).view( - pos_batch_size, -1, self.num_heads, self.d_k - ) + pos_batch_size, -1, self.num_heads, self.d_k) p = p.transpose(1, 2) # (batch, head, time2-dt, d_k) - BDtilde = torch.matmul( - q_plus_v, p.transpose(-2, -1) - ) # (batch, head, time1, time2-dt) + BDtilde = torch.matmul(q_plus_v, p.transpose( + -2, -1)) # (batch, head, time1, time2-dt) BD_noncausal = self._right_shift(BDtilde) BD[:, :, :, dt:] += BD_noncausal @@ -1158,8 +1017,6 @@ class LocalScaledDotProdAttRelPosEncV1(LocalScaledDotProdAttV1): context: maximum attention temporal context. causal_pos_enc: positional encoder is 0 for attending future frames. dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input - dimensions are (batch, time, in_feats) """ def __init__( @@ -1172,7 +1029,6 @@ def __init__( context=25, causal_pos_enc=False, dropout_rate=0, - time_dim=1, ): super().__init__( in_feats, @@ -1182,7 +1038,6 @@ def __init__( d_v, context, dropout_rate=dropout_rate, - time_dim=time_dim, ) self.linear_pos = nn.Linear(in_feats, num_heads * d_k) @@ -1210,19 +1065,17 @@ def _apply_tril(self, x): 1 1 1 1 ] """ diag = x.size(4) - x.size(3) - if ( - self._tril is None - or self._tril.size(3) < x.size(3) - or self._tril.size(4) < x.size(4) - or self._tril_diag != diag - ): + if (self._tril is None or self._tril.size(3) < x.size(3) + or self._tril.size(4) < x.size(4) or self._tril_diag != diag): # in these cases we need to recompute the lower triangular mask - ones = torch.ones((x.size(3), x.size(4)), dtype=x.dtype, device=x.device) + ones = torch.ones((x.size(3), x.size(4)), + dtype=x.dtype, + device=x.device) self._tril = torch.tril(ones, diag)[None, None, None, :, :] self._tril_diag = diag tril = self._tril else: - tril = self._tril[:, :, :, : x.size(3), : x.size(4)] + tril = self._tril[:, :, :, :x.size(3), :x.size(4)] return x * tril @@ -1237,19 +1090,17 @@ def _apply_triu(self, x): """ # we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice diag = x.size(4) - x.size(3) + 1 - if ( - self._triu is None - or self._triu.size(3) < x.size(3) - or self._triu.size(4) < x.size(4) - or self._triu_diag != diag - ): + if (self._triu is None or self._triu.size(3) < x.size(3) + or self._triu.size(4) < x.size(4) or self._triu_diag != diag): # in these cases we need to recompute the lower triangular mask - ones = torch.ones((x.size(3), x.size(4)), dtype=x.dtype, device=x.device) + ones = torch.ones((x.size(3), x.size(4)), + dtype=x.dtype, + device=x.device) self._triu = torch.triu(ones, diag)[None, None, None, :, :] self._triu_diag = diag triu = 
self._triu else: - triu = self._triu[:, :, :, -x.size(3) :, -x.size(4) :] + triu = self._triu[:, :, :, -x.size(3):, -x.size(4):] return x * triu @@ -1320,27 +1171,52 @@ def forward(self, query, key, value, pos_emb=None, mask=None): Attention weigthed average of the value with size=(batch, time1, out_feats) """ batch_size = query.size(0) - t1 = query.size(self.time_dim) - t2 = key.size(self.time_dim) - q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value - ) - # q0 size=(batch, time1, head*d_k) - # k0 size=(batch, time2, head*d_k) - # v0 size=(batch, time2, head*d_v) - - q_plus_u0 = q0 + self.u.view(-1, q0.size(-1)) # (batch, time1, head*d_k) - - # q = q.transpose(1, 2) # (batch, time1, head, d_k) - # q_plus_u = (q + self.u).transpose(1, 2) #(batch, head, time1, d_k) - # q_plus_v = (q + self.v).transpose(1, 2) #(batch, head, time1, d_k) + t1 = query.size(1) + t2 = key.size(1) + if round(t2 / self.context) > 1: + return self._forward_nblocks(query, key, value, pos_emb, mask) + else: + return self._forward_1block(query, key, value, pos_emb, mask) - # compute A(a) + A(c) in Sec3.3, 2nd Eq. block diagonals - # 1) compute block diagonal affinity matrix - AC1 = self._compute_scores( - q_plus_u0, k0, num_blocks, context_q, context_k, 0, 0 - ) - # (batch, head, blocks, context_q, context_k) + def _forward_nblocks(self, query, key, value, pos_emb=None, mask=None): + """Computes 'Scaled Dot Product Attention' for the case that we have + more than 1block in the block diagonal attention matrix. + + Args: + query: query with size=(batch, time1, in_feats), + where time1 is the output time dimension + key: key with size=(batch, time2, in_feats) + where time1 is the input time dimension + value: value with size=(batch, time2, in_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 + mask: optional mask with size=(batch, time1, time2), + to zero attention between some time steps + or size=(batch, time) to make time1=time2 + Returns: + Attention weigthed average of the value with size=(batch, time1, out_feats) + """ + batch_size = query.size(0) + t1 = query.size(1) + t2 = key.size(1) + + q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( + query, key, value) + # q0 size=(batch, time1, head*d_k) + # k0 size=(batch, time2, head*d_k) + # v0 size=(batch, time2, head*d_v) + + q_plus_u0 = q0 + self.u.view(-1, + q0.size(-1)) # (batch, time1, head*d_k) + + # q = q.transpose(1, 2) # (batch, time1, head, d_k) + # q_plus_u = (q + self.u).transpose(1, 2) #(batch, head, time1, d_k) + # q_plus_v = (q + self.v).transpose(1, 2) #(batch, head, time1, d_k) + + # compute A(a) + A(c) in Sec3.3, 2nd Eq. 
block diagonals + # 1) compute block diagonal affinity matrix + AC1 = self._compute_scores(q_plus_u0, k0, num_blocks, context_q, + context_k, 0, 0) + # (batch, head, blocks, context_q, context_k) # 2) compute shifted block diagonal matrix q_left_shift = context_q // 2 @@ -1359,28 +1235,27 @@ def forward(self, query, key, value, pos_emb=None, mask=None): pos_emb = pos_emb[:, -context_k:] # (1, context_k, d_model) pos_batch_size = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, self.d_k) + p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, + self.d_k) p = p.transpose(1, 2) # (1, head, context_k, d_k) - q = q0.view( - batch_size, -1, self.num_heads, self.d_k - ) # (batch, time1, head, d_k) + q = q0.view(batch_size, -1, self.num_heads, + self.d_k) # (batch, time1, head, d_k) q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) # compute A(b) + A(d) in Sec3.3, 2nd Eq. for the causal part # This is the sum of Btilde and Dtilde in the Appendix of the paper BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt( - self.d_k - ) # (batch, head, time1, context_k) + self.d_k) # (batch, head, time1, context_k) # apply left shift as indicated in the Appendix to geth B+D # 1) block-diagonal part of BD: BD1 BD1 = self._left_shift( - BDtilde, context_q, 0 - ) # (batch, head, blocks, context_q, context_k) + BDtilde, context_q, + 0) # (batch, head, blocks, context_q, context_k) # 2) shifted block diagonal part of BD: BD2 BD2 = self._left_shift( - BDtilde, context_q, q_left_shift - ) # (batch, head, blocks-1, context_q, context_k) + BDtilde, context_q, + q_left_shift) # (batch, head, blocks-1, context_q, context_k) # print('BD\n',BD1[0,0,0,:10,:10]) # print(BD2[0,0,0,:10,:10]) @@ -1391,22 +1266,18 @@ def forward(self, query, key, value, pos_emb=None, mask=None): # we assume that t2 >= t1, and therefore context_k >= context_q dt = context_k - context_q pos_emb_noncausal = pos_emb[:, dt:].flip( - dims=(1,) - ) # we flip to get R_0, ..., R_{L-1} - pos_emb_noncausal[ - :, :, 0::2 - ] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + dims=(1, )) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[:, :, 0:: + 2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] p = self.linear_pos(pos_emb_noncausal).view( - pos_batch_size, -1, self.num_heads, self.d_k - ) + pos_batch_size, -1, self.num_heads, self.d_k) p = p.transpose(1, 2) # (batch, head, context_k-dt, d_k) BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt( - self.d_k - ) # (batch, head, time1, context_k-dt) + self.d_k) # (batch, head, time1, context_k-dt) BD_noncausal1 = self._right_shift( - BDtilde, context_q, 0 - ) # (batch, head, blocks, context_q, context_k-dt) + BDtilde, context_q, + 0) # (batch, head, blocks, context_q, context_k-dt) BD_noncausal2 = self._right_shift( BDtilde, context_q, q_left_shift ) # (batch, head, blocks-1, context_q, context_k-dt) @@ -1422,7 +1293,394 @@ def forward(self, query, key, value, pos_emb=None, mask=None): # add AC and BD for block-diag s scores1 = AC1 + BD1 # (batch, head, blocks, context_q, context_k) scores2 = AC2 + BD2 # (batch, head, blocks-1, context_q, context_k) - self._compute_softmax( - scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 + self._compute_softmax(scores1, scores2, mask, q_left_shift, + k_left_shift, t1, t2) + return self._apply_attn(v0, t1) + + def _forward_1block(self, query, key, value, 
pos_emb=None, mask=None): + """Computes 'Scaled Dot Product Attention' for the case that + there is only one block in the block-diagonal attention matrix. + + Args: + query: query with size=(batch, time1, in_feats), + where time1 is the output time dimension + key: key with size=(batch, time2, in_feats) + where time1 is the input time dimension + value: value with size=(batch, time2, in_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 + mask: optional mask with size=(batch, time1, time2), + to zero attention between some time steps + or size=(batch, time) to make time1=time2 + Returns: + Attention weigthed average of the value with size=(batch, time1, out_feats) + """ + batch_size = value.size(0) + q, k, v = self._compute_qkv(query, key, value) + context_q = query.size(1) + + pos_batch_size = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, + self.d_k) + p = p.transpose(1, 2) # (batch, head, time2, d_k) + + q = q.transpose(1, 2) # (batch, time1, head, d_k) + q_plus_u = (q + self.u).transpose(1, 2) # (batch, head, time1, d_k) + q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) + + # compute A(a) + A(c) in Sec3.3, 2nd Eq. + AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) + # AC = (batch, head, time1, time2) + + # compute A(b) + A(d) in Sec3.3, 2nd Eq. for the causal part + # This is the sum of Btilde and Dtilde in the Appendix of the paper + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) + # BDtilde = (batch, head, time1, time2) + # apply left shift as indicated in the Appendix to geth B+D + BD = self._left_shift(BDtilde, context_q, 0).squeeze(2) + + if not self.causal_pos_enc: + # compute A(b) + A(d) for the non-causal part, + # this is not included in the paper because it doesn't allow to attent to future postions + # we assume that t2 >= t1 + dt = key.size(1) - query.size(1) + pos_emb_noncausal = pos_emb[:, dt:].flip( + dims=(1, )) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[:, :, 0:: + 2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] + p = self.linear_pos(pos_emb_noncausal).view( + pos_batch_size, -1, self.num_heads, self.d_k) + p = p.transpose(1, 2) # (batch, head, time2-dt, d_k) + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) + # BDtilde = (batch, head, time1, time2-dt) + BD_noncausal = self._right_shift(BDtilde, context_q, 0).squeeze(2) + BD[:, :, :, dt:] += BD_noncausal + + # add and normalize + scores = (AC + BD) / math.sqrt(self.d_k) # (batch, head, time1, time2) + self.attn = self._base_compute_softmax(scores, mask) + return self._base_apply_attn(v) + + +class BlockScaledDotProdAttRelPosEncV1(BlockScaledDotProdAttV1): + """Block Scaled dot product multihead attention layer + It calculates self-attention with block diagonal mask + + It uses relative positional encoders as defined in + https://arxiv.org/pdf/1901.02860.pdf + + Attributes: + in_feats: input feature dimension + out_feats: output feature dimension + num_heads: number of heads + d_k: key/query projection dimension + d_v: value projection dimension + context: maximum attention temporal context. + causal_pos_enc: positional encoder is 0 for attending future frames. 
+      dropout_rate: dropout rate
+    """
+
+    def __init__(
+        self,
+        in_feats,
+        out_feats,
+        num_heads,
+        d_k,
+        d_v,
+        context=25,
+        causal_pos_enc=False,
+        dropout_rate=0,
+    ):
+        super().__init__(
+            in_feats,
+            out_feats,
+            num_heads,
+            d_k,
+            d_v,
+            context,
+            dropout_rate=dropout_rate,
+        )
+
+        self.linear_pos = nn.Linear(in_feats, num_heads * d_k)
+        # u, v in paper, Sec 3.3, 2nd eq.
+        self.u = nn.Parameter(torch.Tensor(num_heads, d_k))
+        self.v = nn.Parameter(torch.Tensor(num_heads, d_k))
+        # we use same init as in espnet
+        nn.init.xavier_uniform_(self.u)
+        nn.init.xavier_uniform_(self.v)
+
+        self.causal_pos_enc = causal_pos_enc
+
+        self._tril = None
+        self._tril_diag = 0
+        self._triu = None
+        self._triu_diag = 0
+
+    def _apply_tril(self, x):
+        """Applies lower triangular mask to (Q + v^T) W R_{i-j} attention matrix
+        to keep causal attention points, i.e., i-j >= 0
+        E.g.,
+        if t1=3, t2=4 this will apply a mask
+        [1 1 0 0;
+         1 1 1 0;
+         1 1 1 1 ]
+        """
+        diag = x.size(4) - x.size(3)
+        if (self._tril is None or self._tril.size(3) < x.size(3)
+                or self._tril.size(4) < x.size(4) or self._tril_diag != diag):
+            # in these cases we need to recompute the lower triangular mask
+            ones = torch.ones((x.size(3), x.size(4)),
+                              dtype=x.dtype,
+                              device=x.device)
+            self._tril = torch.tril(ones, diag)[None, None, None, :, :]
+            self._tril_diag = diag
+            tril = self._tril
+        else:
+            tril = self._tril[:, :, :, :x.size(3), :x.size(4)]
+
+        return x * tril
+
+    def _apply_triu(self, x):
+        """Applies upper triangular mask to (Q + v^T) W R_{i-j} attention matrix
+        to keep non-causal attention points, i.e., i-j < 0
+        E.g.,
+        if t1=3, t2=4 this will apply a mask
+        [0 0 1 1;
+         0 0 0 1;
+         0 0 0 0 ]
+        """
+        # we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice
+        diag = x.size(4) - x.size(3) + 1
+        if (self._triu is None or self._triu.size(3) < x.size(3)
+                or self._triu.size(4) < x.size(4) or self._triu_diag != diag):
+            # in these cases we need to recompute the upper triangular mask
+            ones = torch.ones((x.size(3), x.size(4)),
+                              dtype=x.dtype,
+                              device=x.device)
+            self._triu = torch.triu(ones, diag)[None, None, None, :, :]
+            self._triu_diag = diag
+            triu = self._triu
+        else:
+            triu = self._triu[:, :, :, -x.size(3):, -x.size(4):]
+
+        return x * triu
+
+    def _left_shift(self, x, context):
+        """Applies left shifts to the rows of x
+        to get scores with relative pos encodings R_{i-j}
+        i-j >=0, causal attention
+
+        E.g.
+        [q0 R3, q0 R2, q0 R1, q0 R0;
+         q1 R3, q1 R2, q1 R1, q1 R0;
+         q2 R3, q2 R2, q2 R1, q2 R0]
+
+        becomes:
+        [q0 R1, q0 R0, 0    , 0    ;
+         q1 R2, q1 R1, q1 R0, 0    ;
+         q2 R3, q2 R2, q2 R1, q2 R0]
+        """
+        x = x.view(x.size(0), x.size(1), -1, context, x.size(-1))
+        x_pad = nn.functional.pad(x, (1, 0), mode="constant", value=0)
+        x_pad = x_pad.view(*x.size()[:3], x.size(4) + 1, x.size(3))
+        x = x_pad[:, :, :, 1:].view_as(x)
+        return self._apply_tril(x)
+
+    def _right_shift(self, x, context):
+        """Applies right shifts to the rows of x
+        to get scores with relative pos encodings R_{i-j}
+        i-j < 0, non-causal attention
+
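+        Editorial note: row i is shifted right by i positions, so that
+        column j ends up holding q_i R_{i-j}; _apply_triu then keeps only
+        the non-causal entries (i-j < 0), since _left_shift already covers
+        i-j >= 0.
+
+        E.g.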
+        [q0 R_0, q0 R_{-1}, q0 R_{-2};
+         q1 R_0, q1 R_{-1}, q1 R_{-2};
+         q2 R_0, q2 R_{-1}, q2 R_{-2}]
+
+        becomes:
+        [ 0, q0 R_{-1}, q0 R_{-2};
+          0, 0        , q1 R_{-1};
+          0, 0        , 0        ]
+        """
+        x = x.view(x.size(0), x.size(1), -1, context, x.size(-1))
+        x_pad = nn.functional.pad(x, (0, 1), mode="constant", value=0)
+        x_pad = x_pad.view(*x.size()[:3], x.size(4) + 1, x.size(3))
+        x = x_pad[:, :, :, :-1].view_as(x)
+        return self._apply_triu(x)
+
+    def forward(self, query, key, value, pos_emb=None, mask=None):
+        """Computes 'Scaled Dot Product Attention'.
+
+        Args:
+          query: query with size=(batch, time1, in_feats),
+                 where time1 is the output time dimension
+          key: key with size=(batch, time2, in_feats)
+               where time2 is the input time dimension
+          value: value with size=(batch, time2, in_feats)
+          pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0
+          mask: optional mask with size=(batch, time1, time2),
+                to zero attention between some time steps
+                or size=(batch, time) to make time1=time2
+        Returns:
+          Attention weighted average of the value with size=(batch, time1, out_feats)
+        """
+        batch_size = query.size(0)
+        t1 = query.size(1)
+        t2 = key.size(1)
+        if t2 // self.context > 1:
+            return self._forward_nblocks(query, key, value, pos_emb, mask)
+        else:
+            return self._forward_1block(query, key, value, pos_emb, mask)
+
+    def _forward_nblocks(self, query, key, value, pos_emb=None, mask=None):
+        """Computes 'Scaled Dot Product Attention' for the case that we have
+        more than one block in the block-diagonal attention matrix.
+
+        Args:
+          query: query with size=(batch, time1, in_feats),
+                 where time1 is the output time dimension
+          key: key with size=(batch, time2, in_feats)
+               where time2 is the input time dimension
+          value: value with size=(batch, time2, in_feats)
+          pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0
+          mask: optional mask with size=(batch, time1, time2),
+                to zero attention between some time steps
+                or size=(batch, time) to make time1=time2
+        Returns:
+          Attention weighted average of the value with size=(batch, time1, out_feats)
+        """
+        batch_size = query.size(0)
+        t1 = query.size(1)
+        t2 = key.size(1)
+
+        q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0(
+            query, key, value)
+        # q0 size=(batch, time1, head*d_k)
+        # k0 size=(batch, time2, head*d_k)
+        # v0 size=(batch, time2, head*d_v)
+
+        q_plus_u0 = q0 + self.u.view(-1,
+                                     q0.size(-1))  # (batch, time1, head*d_k)
+
+        # q = q.transpose(1, 2)  # (batch, time1, head, d_k)
+        # q_plus_u = (q + self.u).transpose(1, 2)  #(batch, head, time1, d_k)
+        # q_plus_v = (q + self.v).transpose(1, 2)  #(batch, head, time1, d_k)
+
+        # compute A(a) + A(c) in Sec3.3, 2nd Eq. block diagonals
+        # 1) compute block diagonal affinity matrix
+        AC1 = self._compute_scores(q_plus_u0, k0, num_blocks, context_q,
+                                   context_k)
+        # (batch, head, blocks, context_q, context_k)
+
+        # AC = torch.matmul(q_plus_u, k.transpose(-2, -1))  # (batch, head, time1, time2)
+
+        pos_emb = pos_emb[:, -context_k:]  # (1, context_k, d_model)
+        pos_batch_size = pos_emb.size(0)
+        p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads,
+                                          self.d_k)
+        p = p.transpose(1, 2)  # (1, head, context_k, d_k)
+
+        q = q0.view(batch_size, -1, self.num_heads,
+                    self.d_k)  # (batch, time1, head, d_k)
+        q_plus_v = (q + self.v).transpose(1, 2)  # (batch, head, time1, d_k)
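+
+        # Editorial note: u and v are the learned global bias vectors of
+        # Transformer-XL (Sec 3.3, 2nd Eq. of
+        # https://arxiv.org/pdf/1901.02860.pdf): q+u scores content (terms
+        # A(a)+A(c) above), while q+v scores relative positions (terms
+        # A(b)+A(d) below).
+
+        # compute A(b) + A(d) in Sec3.3, 2nd Eq. for the causal part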
+        # This is the sum of Btilde and Dtilde in the Appendix of the paper
+        BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt(
+            self.d_k)
+        # BDtilde = (batch, head, time1, context_k)
+        # apply left shift as indicated in the Appendix to get B+D
+        # 1) block-diagonal part of BD: BD1
+        BD1 = self._left_shift(BDtilde, context_q)
+        # BD = (batch, head, blocks, context_q, context_k)
+        # print('BD\n',BD1[0,0,0,:10,:10])
+
+        if not self.causal_pos_enc:
+            # compute A(b) + A(d) for the non-causal part,
+            # this is not included in the paper because it doesn't allow attending to future positions
+            # we assume that t2 >= t1, and therefore context_k >= context_q
+            dt = context_k - context_q
+            pos_emb_noncausal = pos_emb[:, dt:].flip(
+                dims=(1, ))  # we flip to get R_0, ..., R_{L-1}
+            pos_emb_noncausal[:, :, 0::
+                              2] *= -1  # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)}
+            assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0]
+            p = self.linear_pos(pos_emb_noncausal).view(
+                pos_batch_size, -1, self.num_heads, self.d_k)
+            p = p.transpose(1, 2)  # (batch, head, context_k-dt, d_k)
+            BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt(
+                self.d_k)  # (batch, head, time1, context_k-dt)
+            BD_noncausal1 = self._right_shift(BDtilde, context_q)
+            # BD = (batch, head, blocks, context_q, context_k-dt)
+            # print(BD_noncausal1[0,0,0,:10,:10])
+            BD1[:, :, :, :, dt:] += BD_noncausal1
+
+        # print(BD1[0,0,0,:10,:10])
+
+        # add AC and BD for the block-diagonal scores
+        scores = AC1 + BD1  # (batch, head, blocks, context_q, context_k)
+        self._compute_softmax(scores, mask, t1, t2)
+        return self._apply_attn(v0, t1)
+
+    def _forward_1block(self, query, key, value, pos_emb=None, mask=None):
+        """Computes 'Scaled Dot Product Attention' for the case that
+        there is only one block in the block-diagonal attention matrix.
+
+        Args:
+          query: query with size=(batch, time1, in_feats),
+                 where time1 is the output time dimension
+          key: key with size=(batch, time2, in_feats)
+               where time2 is the input time dimension
+          value: value with size=(batch, time2, in_feats)
+          pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0
+          mask: optional mask with size=(batch, time1, time2),
+                to zero attention between some time steps
+                or size=(batch, time) to make time1=time2
+        Returns:
+          Attention weighted average of the value with size=(batch, time1, out_feats)
+        """
+        batch_size = value.size(0)
+        q, k, v = self._compute_qkv(query, key, value)
+        context_q = query.size(1)
+
+        pos_batch_size = pos_emb.size(0)
+        p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads,
+                                          self.d_k)
+        p = p.transpose(1, 2)  # (batch, head, time2, d_k)
+
+        q = q.transpose(1, 2)  # (batch, time1, head, d_k)
+        q_plus_u = (q + self.u).transpose(1, 2)  # (batch, head, time1, d_k)
+        q_plus_v = (q + self.v).transpose(1, 2)  # (batch, head, time1, d_k)
+
+        # compute A(a) + A(c) in Sec3.3, 2nd Eq.
+        AC = torch.matmul(q_plus_u, k.transpose(-2, -1))
+        # AC = (batch, head, time1, time2)
+
+        # compute A(b) + A(d) in Sec3.3, 2nd Eq. for the causal part
+        # This is the sum of Btilde and Dtilde in the Appendix of the paper
+        BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1))
+        # BDtilde = (batch, head, time1, time2)
+        # apply left shift as indicated in the Appendix to get B+D
+        BD = self._left_shift(BDtilde, context_q).squeeze(2)
+
+        if not self.causal_pos_enc:
+            # compute A(b) + A(d) for the non-causal part,
+            # this is not included in the paper because it doesn't allow attending to future positions
+            # we assume that t2 >= t1
+            dt = key.size(1) - query.size(1)
+            pos_emb_noncausal = pos_emb[:, dt:].flip(
+                dims=(1, ))  # we flip to get R_0, ..., R_{L-1}
+            pos_emb_noncausal[:, :, 0::
+                              2] *= -1  # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)}
+            assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0]
+            p = self.linear_pos(pos_emb_noncausal).view(
+                pos_batch_size, -1, self.num_heads, self.d_k)
+            p = p.transpose(1, 2)  # (batch, head, time2-dt, d_k)
+            BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1))
+            # BDtilde = (batch, head, time1, time2-dt)
+            BD_noncausal = self._right_shift(BDtilde, context_q).squeeze(2)
+            BD[:, :, :, dt:] += BD_noncausal
+
+        # add and normalize
+        scores = (AC + BD) / math.sqrt(self.d_k)  # (batch, head, time1, time2)
+        self.attn = self._base_compute_softmax(scores, mask)
+        return self._base_apply_attn(v)
diff --git a/hyperion/torch/layers/pos_encoder.py b/hyperion/torch/layers/pos_encoder.py
index f3aa17e9..b6f3672e 100644
--- a/hyperion/torch/layers/pos_encoder.py
+++ b/hyperion/torch/layers/pos_encoder.py
@@ -3,10 +3,13 @@
  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import math
+from typing import Union
 
 import torch
 from torch import nn
 
+from .activation_factory import ActivationFactory as AF
+
 
 class PosEncoder(nn.Module):
     """Positional encoding.
@@ -16,7 +19,7 @@ class PosEncoder(nn.Module):
       dropout_rate: dropout rate
     """
 
-    def __init__(self, num_feats, dropout_rate=0):
+    def __init__(self, num_feats: int, dropout_rate: float = 0):
         super().__init__()
         self.num_feats = num_feats
         self.dropout_rate = dropout_rate
@@ -29,9 +32,9 @@ def __repr__(self):
         return self.__str__()
 
     def __str__(self):
-        s = "{}(num_feats={}, dropout_rate={})".format(
-            self.__class__.__name__, self.num_feats, self.dropout_rate
-        )
+        s = "{}(num_feats={}, dropout_rate={})".format(self.__class__.__name__,
+                                                       self.num_feats,
+                                                       self.dropout_rate)
         return s
 
     def _pe(self, x, relative=False):
@@ -45,22 +48,21 @@ def _pe(self, x, relative=False):
         pe = torch.zeros(x.size(1), self.num_feats)
         if relative:
             # this is for relative positional encoders
-            position = torch.arange(
-                x.size(1) - 1, -1, -1, dtype=torch.float32
-            ).unsqueeze(1)
+            position = torch.arange(x.size(1) - 1, -1, -1,
+                                    dtype=torch.float32).unsqueeze(1)
         else:
-            position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+            position = torch.arange(0, x.size(1),
+                                    dtype=torch.float32).unsqueeze(1)
         div_term = torch.exp(
-            torch.arange(0, self.num_feats, 2, dtype=torch.float32)
-            * -(math.log(10000.0) / self.num_feats)
-        )
+            torch.arange(0, self.num_feats, 2, dtype=torch.float32) *
+            -(math.log(10000.0) / self.num_feats))
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
         pe = pe.unsqueeze(0)
         self.pe = pe.to(device=x.device, dtype=x.dtype)
         return self.pe
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor):
         """Add positional encoding.
 
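         The input is first scaled by self.xscale, and the encodings of the
         first x.size(1) positions are added (see the forward body below).
 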
Args: @@ -70,7 +72,7 @@ def forward(self, x): x-scaled + pos-encoder """ pe = self._pe(x) - x = x * self.xscale + pe[:, : x.size(1)] + x = x * self.xscale + pe[:, :x.size(1)] if self.dropout_rate > 0: return self.dropout(x) return x @@ -88,10 +90,10 @@ class RelPosEncoder(PosEncoder): dropout_rate: dropout rate """ - def __init__(self, num_feats, dropout_rate=0): + def __init__(self, num_feats: int, dropout_rate: float = 0): super().__init__(num_feats, dropout_rate) - def forward(self, x): + def forward(self, x: torch.Tensor): """Add positional encoding. Args: @@ -105,7 +107,7 @@ def forward(self, x): x = x * self.xscale # we want embedding [R_L,..., R_0] # while in non relative we want [R_0, ..., R_L] - pos_emb = self.pe[:, -x.size(1) :] + pos_emb = self.pe[:, -x.size(1):] # this pos_emb is matrix Q in # https://arxiv.org/pdf/1901.02860.pdf Appendix B # I think it should have been denoted as R, @@ -126,7 +128,7 @@ class NoPosEncoder(nn.Module): def __init__(self): super().__init__() - def forward(self, x): + def forward(self, x: torch.Tensor): """Identity map Args: @@ -136,3 +138,35 @@ def forward(self, x): x """ return x + + +class ConvPosEncoder(nn.Module): + """Convolutional positional encoder like the one used in wav2vec2 + + Attributes: + num_feats: number of input/output features + kernel_size: kernel size of convolution + num_groups: number of groups of the convolution + activation: hidden activation + """ + + def __init__(self, num_feats: int, kernel_size: int, num_groups: int, + activation: Union[str, nn.Module]): + super().__init__() + self.conv = nn.Conv1d(num_feats, + num_feats, + kernel_size=kernel_size, + padding=kernel_size // 2, + groups=num_groups) + self.activation = AF.create(activation) + self.num_pad_remove = 1 if kernel_size % 2 == 0 else 0 + + def forward(self, x: torch.Tensor): + x = x.transpose(1, 2) + x = self.conv(x) + if self.num_pad_remove > 0: + x = x[:, :, :-self.num_pad_remove] + + x = self.activation(x).transpose(1, 2) + + return x diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py index 4bd086ad..ab518ad4 100644 --- a/hyperion/torch/lr_schedulers/factory.py +++ b/hyperion/torch/lr_schedulers/factory.py @@ -14,6 +14,7 @@ class LRSchedulerFactory(object): + def create( optimizer, lrsch_type, @@ -168,6 +169,8 @@ def create( eps=eps, ) + raise ValueError(f"invalid lrsch_type={lrsch_type}") + @staticmethod def filter_args(**kwargs): @@ -218,11 +221,9 @@ def add_class_args(parser, prefix=None): "noam_lr", "triangular_lr", ], - help=( - "Learning rate schedulers: None, Exponential," - "Cosine Annealing, Cosine Annealing for Adam," - "Reduce on Plateau" - ), + help=("Learning rate schedulers: None, Exponential," + "Cosine Annealing, Cosine Annealing for Adam," + "Reduce on Plateau"), ) parser.add_argument( @@ -231,22 +232,29 @@ def add_class_args(parser, prefix=None): type=float, help=("LR decay rate in exp lr"), ) - parser.add_argument( - "--decay-steps", default=100, type=int, help=("LR decay steps in exp lr") - ) - parser.add_argument( - "--power", default=0.5, type=float, help=("power in inverse power lr") - ) - - parser.add_argument( - "--hold-steps", default=10, type=int, help=("LR hold steps in exp lr") - ) - parser.add_argument("--t", default=10, type=int, help=("Period in cos lr")) + parser.add_argument("--decay-steps", + default=100, + type=int, + help=("LR decay steps in exp lr")) + parser.add_argument("--power", + default=0.5, + type=float, + help=("power in inverse power lr")) + + 
parser.add_argument("--hold-steps", + default=10, + type=int, + help=("LR hold steps in exp lr")) + parser.add_argument("--t", + default=10, + type=int, + help=("Period in cos lr")) parser.add_argument( "--t-mul", default=1, type=int, - help=("Period multiplicator for each restart in cos/triangular lr"), + help=( + "Period multiplicator for each restart in cos/triangular lr"), ) parser.add_argument( "--gamma", @@ -262,9 +270,9 @@ def add_class_args(parser, prefix=None): help=("Do warm restarts in cos lr"), ) - parser.add_argument( - "--monitor", default="val_loss", help=("Monitor metric to reduce lr") - ) + parser.add_argument("--monitor", + default="val_loss", + help=("Monitor metric to reduce lr")) parser.add_argument( "--mode", default="min", @@ -276,21 +284,24 @@ def add_class_args(parser, prefix=None): "--factor", default=0.1, type=float, - help=("Factor by which the learning rate will be reduced on plateau"), + help=( + "Factor by which the learning rate will be reduced on plateau" + ), ) parser.add_argument( "--patience", default=10, type=int, - help=( - "Number of epochs with no improvement after which learning rate will be reduced" - ), + help= + ("Number of epochs with no improvement after which learning rate will be reduced" + ), ) - parser.add_argument( - "--threshold", default=1e-4, type=float, help=("Minimum metric improvement") - ) + parser.add_argument("--threshold", + default=1e-4, + type=float, + help=("Minimum metric improvement")) parser.add_argument( "--threshold_mode", @@ -303,16 +314,20 @@ def add_class_args(parser, prefix=None): "--cooldown", default=0, type=int, - help=( - "Number of epochs to wait before resuming normal operation after lr has been reduced" - ), + help= + ("Number of epochs to wait before resuming normal operation after lr has been reduced" + ), ) - parser.add_argument( - "--eps", default=1e-8, type=float, help=("Minimum decay applied to lr") - ) + parser.add_argument("--eps", + default=1e-8, + type=float, + help=("Minimum decay applied to lr")) - parser.add_argument("--min-lr", default=0, type=float, help=("Minimum lr")) + parser.add_argument("--min-lr", + default=0, + type=float, + help=("Minimum lr")) parser.add_argument( "--warmup-steps", @@ -341,7 +356,8 @@ def add_class_args(parser, prefix=None): ) if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) # help='learning rate scheduler options') add_argparse_args = add_class_args diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 2df4e047..0ab63adf 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -8,6 +8,7 @@ from .vae.vq_vae import VQVAE from .transducer import RNNTransducer, RNNRNNTransducer from .wav2transducer import (HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2ConformerV1RNNTransducer, HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py index ee3c85f5..984e15ec 100644 --- a/hyperion/torch/models/transducer/__init__.py +++ b/hyperion/torch/models/transducer/__init__.py @@ -4,6 +4,7 @@ """ +from .conformer_v1_rnn_transducer import ConformerV1RNNTransducer from .rnn_rnn_transducer import RNNRNNTransducer from .rnn_transducer import RNNTransducer, RNNTransducerOutput from .transducer import 
Transducer
diff --git a/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py b/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py
new file mode 100644
index 00000000..05a82103
--- /dev/null
+++ b/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py
@@ -0,0 +1,87 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import logging
+from typing import Dict, Optional, Tuple, Union
+
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+try:
+    import k2
+except ModuleNotFoundError:
+    from ...utils import dummy_k2 as k2
+
+import torch
+
+from ...narchs import ConformerEncoderV1
+from .rnn_transducer import RNNTransducer
+
+
+class ConformerV1RNNTransducer(RNNTransducer):
+    """RNN-T with Conformer Encoder
+
+    Attributes:
+      encoder: dictionary of options to initialize a ConformerEncoderV1 class
+               or a ConformerEncoderV1 object
+      decoder: RNN-T Decoder config. dictionary or module.
+
+    """
+
+    def __init__(self, encoder, decoder):
+        if isinstance(encoder, dict):
+            encoder = ConformerEncoderV1(**encoder)
+        else:
+            assert isinstance(encoder, ConformerEncoderV1)
+
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args = RNNTransducer.filter_args(**kwargs)
+        encoder_args = ConformerEncoderV1.filter_args(**kwargs["encoder"])
+        args["encoder"] = encoder_args
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        ConformerEncoderV1.add_class_args(parser, prefix="encoder", skip=skip)
+        RNNTransducer.add_class_args(parser)
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))
+
+    def change_config(
+        self,
+        encoder,
+        decoder,
+    ):
+        logging.info("changing transducer encoder config")
+        self.encoder.change_config(**encoder)
+        super().change_config(**decoder)
+
+    @staticmethod
+    def filter_finetune_args(**kwargs):
+        args = RNNTransducer.filter_finetune_args(**kwargs)
+        encoder_args = ConformerEncoderV1.filter_finetune_args(
+            **kwargs["encoder"])
+        args["encoder"] = encoder_args
+        return args
+
+    @staticmethod
+    def add_finetune_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        ConformerEncoderV1.add_finetune_args(parser, prefix="encoder")
+        RNNTransducer.add_finetune_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix,
+                                      action=ActionParser(parser=parser))
diff --git a/hyperion/torch/models/transducer/lstm_rnn_transducer.py b/hyperion/torch/models/transducer/lstm_rnn_transducer.py
deleted file mode 100644
index 983334d4..00000000
--- a/hyperion/torch/models/transducer/lstm_rnn_transducer.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""
- Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
- Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
-"""
-
-import logging
-from typing import Dict, Optional, Union
-
-from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
-
-try:
-    import k2
-except ModuleNotFoundError:
-    from ...utils import dummy_k2 as k2
-
-import torch
-
-from ...torch_model import TorchModel
-from ..narchs import RNNTransducerDecoder
-
-
-class RNNTransducer(TorchModel):
-    """ Base-class for RNN-T in
-    "Sequence Transduction with Recurrent Neural Networks"
-    https://arxiv.org/pdf/1211.3711.pdf
-
-    Attributes:
-      encoder: Encoder network module
-      decoder: RNN-T Decoder 
config. dictionary or module. - """ - - def __init__( - self, - encoder: TorchModel, - decoder: Union[Dict, RNNTransducerDecoder], - ): - super().__init__() - assert isinstance(encoder, TorchModel) - if isinstance(decoder, dict): - decoder = RNNTransducerDecoder(**decoder) - else: - assert isinstance(decoder, RNNTransducerDecoder) - - self.encoder = encoder - self.decoder = decoder - - def forward( - self, - x: torch.Tensor, - x_lengths: torch.Tensor, - y: k2.RaggedTensor, - ) -> torch.Tensor: - """ - Args: - x: input features with shape = (N, T, C) - x_lengths: feature number for frames with shape = (N,) - y: ragged tensor with 2 axes [utt][label]. It contains labels of each - utterance. - Returns: - - Token logits with shape = (N, vocab_size) - - RNN-T loss. - """ - assert x.ndim == 3, x.shape - assert x_lengths.ndim == 1, x_lengths.shape - assert y.num_axes == 2, y.num_axes - - assert x.size(0) == x_lengths.size(0) == y.dim0 - - x, x_lengths = self.encoder(x, x_lengths) - assert torch.all(x_lengths > 0) - - logits, loss = self.decoder(x, x_lengths, y) - return logits, loss - - def set_train_mode(self, mode): - if mode == self._train_mode: - return - - if mode == "full": - self.unfreeze() - elif mode == "frozen": - self.freeze() - else: - raise ValueError(f"invalid train_mode={mode}") - - self._train_mode = mode - - def _train(self, train_mode: str): - if train_mode in ["full", "frozen"]: - super()._train(train_mode) - else: - raise ValueError(f"invalid train_mode={train_mode}") - - @staticmethod - def valid_train_modes(): - return ["full", "frozen"] - - def get_config(self): - dec_cfg = self.decoder.get_config() - config = { - "decoder": dec_cfg, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @staticmethod - def filter_args(**kwargs): - - # get arguments for pooling - decoder_args = RNNTransducerDecoder.filter_args(**kwargs["decoder"]) - args["decoder"] = decoder_args - return args - - @staticmethod - def add_class_args(parser, prefix=None, skip=set()): - - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - RNNTransducerDecoder.add_class_args(parser, prefix="decoder") - - if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) - - def change_config( - self, - decoder, - ): - logging.info("changing transducer config") - self.decoder.change_config(**decoder) - - @staticmethod - def filter_finetune_args(**kwargs): - # get arguments for pooling - decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"]) - args["decoder"] = decoder_args - return args - - @staticmethod - def add_finetune_args(parser, prefix=None): - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - RNNTransducerDecoder.add_finetune_args(parser, prefix="decoder") - - if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/transducer/rnn_rnn_transducer.py b/hyperion/torch/models/transducer/rnn_rnn_transducer.py index 1c0704f5..02d0c482 100644 --- a/hyperion/torch/models/transducer/rnn_rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_rnn_transducer.py @@ -50,7 +50,7 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - RNNEncoder.add_class_args(parser, prefix="encoder") + RNNEncoder.add_class_args(parser, prefix="encoder", skip=skip) RNNTransducer.add_class_args(parser) if prefix is not None: 
 outer_parser.add_argument("--" + prefix,
diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py
index 0b886fdf..3326ef81 100644
--- a/hyperion/torch/models/transducer/rnn_transducer.py
+++ b/hyperion/torch/models/transducer/rnn_transducer.py
@@ -50,6 +50,8 @@ def __init__(
         if encoder is not None:
             assert isinstance(encoder, TorchModel)
         if isinstance(decoder, dict):
+            if encoder is not None:
+                decoder["in_feats"] = encoder.out_shape()[-1]
             decoder = RNNTransducerDecoder(**decoder)
         else:
             assert isinstance(decoder, RNNTransducerDecoder)
diff --git a/hyperion/torch/models/wav2transducer/__init__.py b/hyperion/torch/models/wav2transducer/__init__.py
index 79af6349..71e82b98 100644
--- a/hyperion/torch/models/wav2transducer/__init__.py
+++ b/hyperion/torch/models/wav2transducer/__init__.py
@@ -5,5 +5,7 @@
 """
 
 from .hf_wav2vec2_transducer import HFWav2Vec2Transducer
+from .hf_wav2vec2conformer_v1_rnn_transducer import \
+    HFWav2Vec2ConformerV1RNNTransducer
 from .hf_wav2vec2rnn_rnn_transducer import HFWav2Vec2RNNRNNTransducer
 from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer
diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py
index f68a6f7a..47dfd910 100644
--- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py
+++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py
@@ -38,7 +38,6 @@ def __init__(self,
         self.hf_feats = hf_feats
         if isinstance(transducer, dict):
             transducer["decoder"]["in_feats"] = hf_feats.hidden_size
-            #transducer["joiner"]["in_feats"] = hf_feats.hidden_size
             if "class_name" in transducer:
                 del transducer["class_name"]
 
@@ -48,7 +47,6 @@ def __init__(self,
             assert isinstance(transducer, RNNTransducer)
             if transducer.encoder is None:
                 assert transducer.decoder.in_feats == hf_feats.hidden_size
-                #assert transducer.joiner.in_feats == hf_feats.hidden_size
 
         self.transducer = transducer
         self.feat_fusion_start = feat_fusion_start
diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py
new file mode 100644
index 00000000..09b0196e
--- /dev/null
+++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py
@@ -0,0 +1,105 @@
+"""
+ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu)
+ Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Dict, Optional, Union
+
+import torch
+import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+
+from ...tpm import HFWav2Vec2
+from ..transducer import ConformerV1RNNTransducer
+from .hf_wav2rnn_transducer import HFWav2RNNTransducer
+
+
+class HFWav2Vec2ConformerV1RNNTransducer(HFWav2RNNTransducer):
+    """Class for Conformer based RNN-T with Wav2Vec2 features
+
+    Attributes:
+      hf_feats: HFWav2Vec configuration dictionary or object.
+                This is a wrapper over a Hugging Face Wav2Vec model.
+      transducer: Transducer configuration dictionary or object.
+      feat_fusion_start: the input to the transducer will fuse the wav2vec layers from "feat_fusion_start" to
+                         the wav2vec "num_layers".
+      feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more
+                          than one layer is used. 
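+
+      Note: when transducer is given as a config dictionary, the conformer
+      encoder input dimension is filled in automatically from the wav2vec2
+      hidden size (see __init__ below).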
+ """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, ConformerV1RNNTransducer], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(transducer, dict): + transducer["encoder"]["in_feats"] = hf_feats.hidden_size + if "class_name" in transducer: + del transducer["class_name"] + + transducer = ConformerV1RNNTransducer(**transducer) + else: + assert isinstance(transducer, ConformerV1RNNTransducer) + + super().__init__(hf_feats, transducer, feat_fusion_start, + feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2RNNTransducer.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1RNNTransducer.filter_args( + **kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + ConformerV1RNNTransducer.add_class_args(parser, + prefix="transducer", + skip={"in_feats"}) + HFWav2RNNTransducer.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1RNNTransducer.filter_finetune_args( + **kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + ConformerV1RNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py index d2b13fb6..a4d2b0cc 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py @@ -44,8 +44,7 @@ def __init__( assert isinstance(hf_feats, HFWav2Vec2) if isinstance(transducer, dict): - transducer["decoder"]["in_feats"] = hf_feats.hidden_size - #transducer["joiner"]["in_feats"] = hf_feats.hidden_size + transducer["encoder"]["in_feats"] = hf_feats.hidden_size if "class_name" in transducer: del transducer["class_name"] @@ -72,7 +71,9 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") HFWav2Vec2.add_class_args(parser, prefix="hf_feats") - RNNRNNTransducer.add_class_args(parser, prefix="transducer") + RNNRNNTransducer.add_class_args(parser, + prefix="transducer", + skip={"in_feats"}) HFWav2RNNTransducer.add_class_args(parser) if prefix is not None: diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index 98160a25..ed328223 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -5,14 +5,15 @@ 
import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ArgumentParser, ActionYesNo +from ...utils.misc import filter_func_args from ..layer_blocks import ConformerEncoderBlockV1 as EBlock from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from ..layers import ActivationFactory as AF -from ..layers import NoPosEncoder from ..layers import NormLayer1dFactory as NLF -from ..layers import PosEncoder, RelPosEncoder +from ..layers import NoPosEncoder, PosEncoder, RelPosEncoder, ConvPosEncoder +from ..utils import seq_lengths_to_mask, scale_seq_lengths from .net_arch import NetArch @@ -37,7 +38,7 @@ class ConformerEncoderV1(NetArch): d_model: encoder blocks feature dimension num_heads: number of heads num_blocks: number of self attn blocks - att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1'] + att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1', 'block-scaled-dot-prod-att-v1'] att_context: maximum context range for local attention conv_repeats: number of conv blocks in each conformer block conv_kernel_sizes: kernel size for conv blocks @@ -49,11 +50,10 @@ class ConformerEncoderV1(NetArch): pos_dropout_rate: dropout rate for positional encoder att_dropout_rate: dropout rate for attention block in_layer_type: input layer block type in ['linear','conv2d-sub', 'embed', None] - pos_enc_type: type of positional encoder ['no', 'abs', 'rel'] + pos_enc_type: type of positional encoder ['no', 'abs', 'rel', 'conv'] causal_pos_enc: if True, use causal positional encodings (when rel_pos_enc=True), it assumes that query q_i only attents to key k_j when j<=i - no_pos_enc: if True, it doesn't use positional encoder. hid_act: hidden activations in ff and input blocks conv_norm_layer: norm layer constructor or str for conv block, if None it uses BatchNorm1d @@ -68,8 +68,6 @@ class ConformerEncoderV1(NetArch): padding_idx: padding idx for embed layer in_time_dim: time dimension in the input Tensor out_time_dim: dimension that we want to be time in the output tensor - rel_pos_enc: if True, use relative postional encodings, absolute encodings otherwise. 
(deprecated) - red_lnorm: (deprecated) """ def __init__( @@ -92,17 +90,17 @@ def __init__( in_layer_type="conv2d-sub", pos_enc_type="rel", causal_pos_enc=False, + pos_kernel_size=128, + pos_num_groups=16, hid_act="swish", conv_norm_layer=None, se_r=None, ff_macaron=True, - red_lnorms=False, + red_lnorms=True, concat_after=False, padding_idx=-1, - in_time_dim=-1, + in_time_dim=1, out_time_dim=1, - rel_pos_enc=True, - red_lnorm=False, ): super().__init__() @@ -115,14 +113,11 @@ def __init__( self.att_context = att_context self.conv_repeats = self._standarize_cblocks_param( - conv_repeats, num_blocks, "conv_repeats" - ) + conv_repeats, num_blocks, "conv_repeats") self.conv_kernel_sizes = self._standarize_cblocks_param( - conv_kernel_sizes, num_blocks, "conv_kernel_sizes" - ) + conv_kernel_sizes, num_blocks, "conv_kernel_sizes") self.conv_strides = self._standarize_cblocks_param( - conv_strides, num_blocks, "conv_strides" - ) + conv_strides, num_blocks, "conv_strides") self.ff_type = ff_type self.d_ff = d_ff @@ -141,6 +136,8 @@ def __init__( self.in_time_dim = in_time_dim self.out_time_dim = out_time_dim self.hid_act = hid_act + self.pos_kernel_size = pos_kernel_size + self.pos_num_groups = pos_num_groups self.conv_norm_layer = conv_norm_layer norm_groups = None @@ -174,8 +171,7 @@ def __init__( ff_macaron=ff_macaron, out_lnorm=self.red_lnorms, concat_after=concat_after, - ) - ) + )) self.blocks = nn.ModuleList(blocks) if not self.red_lnorms: @@ -210,6 +206,9 @@ def _make_in_layer(self): pos_enc = RelPosEncoder(d_model, self.pos_dropout_rate) elif self.pos_enc_type == "abs": pos_enc = PosEncoder(d_model, self.pos_dropout_rate) + elif self.pos_enc_type == "conv": + pos_enc = ConvPosEncoder(d_model, self.pos_kernel_size, + self.pos_num_groups, self.hid_act) else: raise Exception("wrong pos-enc-type={}".format(self.pos_enc_type)) @@ -224,13 +223,15 @@ def _make_in_layer(self): pos_enc, ) elif self.in_layer_type == "conv2d-sub": - self.in_layer = Conv2dSubsampler( - in_feats, d_model, hid_act, pos_enc, time_dim=self.in_time_dim - ) + self.in_layer = Conv2dSubsampler(in_feats, + d_model, + hid_act, + pos_enc, + time_dim=self.in_time_dim) elif self.in_layer_type == "embed": self.in_layer = nn.Sequential( - nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc - ) + nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), + pos_enc) elif isinstance(self.in_layer_type, nn.Module): self.in_layer = nn.Sequential(self.in_layer_type, pos_enc) elif self.in_layer_type is None: @@ -238,7 +239,12 @@ def _make_in_layer(self): else: raise ValueError("unknown in_layer_type: " + self.in_layer_type) - def forward(self, x, x_lengths=None, x_mask=None, target_shape=None): + def forward(self, + x, + x_lengths=None, + x_mask=None, + return_mask=False, + target_shape=None): """Forward pass function Args: @@ -246,14 +252,20 @@ def forward(self, x, x_lengths=None, x_mask=None, target_shape=None): x_lengths: lengths of the input sequences. x_mask: mask to indicate valid time steps for x (batch, time). It overwrites the mask of x_lengths. 
+ return_mask: if True, it also return the output mask + target_shape: unused Returns: Tensor with output features - Tensor with mask + Tensor with output lengths + Tensor with mask if return_mask is True """ + max_in_length = x.size(self.in_time_dim) + if x_mask is None and x_lengths is not None: + x_mask = seq_lengths_to_mask(x_lengths, max_in_length, time_dim=1) if isinstance(self.in_layer, Conv2dSubsampler): - x, mask = self.in_layer(x, x_mask) + x, x_mask = self.in_layer(x, x_mask) else: if self.in_time_dim != 1: x = x.transpose(1, self.in_time_dim).contiguous() @@ -266,7 +278,7 @@ def forward(self, x, x_lengths=None, x_mask=None, target_shape=None): b_args = {} for i in range(len(self.blocks)): - x, mask = self.blocks[i](x, mask=mask, **b_args) + x, x_mask = self.blocks[i](x, mask=x_mask, **b_args) if not self.red_lnorms: x = self.norm_out(x) @@ -274,10 +286,13 @@ def forward(self, x, x_lengths=None, x_mask=None, target_shape=None): if self.out_time_dim != 1: x = x.transpose(1, self.out_time_dim) - if mask is None: - return x + if x_lengths is not None: + x_lengths = scale_seq_lengths(x_lengths, x.size(1), max_in_length) - return x, mask + if return_mask: + return x, x_lengths, x_mask + + return x, x_lengths def get_config(self): """Gets network config @@ -303,6 +318,8 @@ def get_config(self): "in_layer_type": self.in_layer_type, "pos_enc_type": self.pos_enc_type, "causal_pos_enc": self.causal_pos_enc, + "pos_kernel_size": self.pos_kernel_size, + "pos_num_groups": self.pos_num_groups, "hid_act": self.hid_act, "se_r": self.se_r, "ff_macaron": self.ff_macaron, @@ -372,41 +389,11 @@ def filter_args(**kwargs): Returns: args dictionary """ - - if "no_ff_macaron" in kwargs: - kwargs["ff_macaron"] = not kwargs["no_ff_macaron"] - - valid_args = ( - "num_blocks", - "in_feats", - "d_model", - "num_heads", - "att_type", - "att_context", - "conv_repeats", - "conv_kernel_sizes", - "conv_strides", - "ff_type", - "d_ff", - "ff_kernel_size", - "dropout_rate", - "pos_dropout_rate", - "att_dropout_rate", - "in_layer_type", - "hid_act", - "pos_enc_type", - "causal_pos_enc", - "conv_norm_layer", - "se_r", - "ff_macaron", - "red_lnorms", - "concat_after", - ) - - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args = filter_func_args(ConformerEncoderV1.__init__, kwargs) + return args @staticmethod - def add_class_args(parser, prefix=None, in_feats=False): + def add_class_args(parser, prefix=None, skip=set()): """Adds Conformer config parameters to argparser Args: @@ -417,18 +404,21 @@ def add_class_args(parser, prefix=None, in_feats=False): outer_parser = parser parser = ArgumentParser(prog="") - if in_feats: - parser.add_argument( - "--in-feats", type=int, default=80, help=("input feature dimension") - ) + if "in_feats" not in skip: + parser.add_argument("--in-feats", + type=int, + default=80, + help=("input feature dimension")) - parser.add_argument( - "--num-blocks", default=6, type=int, help=("number of tranformer blocks") - ) + parser.add_argument("--num-blocks", + default=6, + type=int, + help=("number of tranformer blocks")) - parser.add_argument( - "--d-model", default=512, type=int, help=("encoder layer sizes") - ) + parser.add_argument("--d-model", + default=512, + type=int, + help=("encoder layer sizes")) parser.add_argument( "--num-heads", @@ -440,7 +430,10 @@ def add_class_args(parser, prefix=None, in_feats=False): parser.add_argument( "--att-type", default="scaled-dot-prod-v1", - choices=["scaled-dot-prod-v1", "local-scaled-dot-prod-v1"], + choices=[ + "scaled-dot-prod-v1", 
"local-scaled-dot-prod-v1", + "block-scaled-dot-prod-v1" + ], help=("type of self-attention"), ) @@ -464,7 +457,9 @@ def add_class_args(parser, prefix=None, in_feats=False): default=[31], nargs="+", type=int, - help=("kernels sizes for the depth-wise convs of each conformer block"), + help=( + "kernels sizes for the depth-wise convs of each conformer block" + ), ) parser.add_argument( @@ -496,10 +491,9 @@ def add_class_args(parser, prefix=None, in_feats=False): help=("kernel size in convolutional feed forward block"), ) - try: - parser.add_argument("--hid-act", default="swish", help="hidden activation") - except: - pass + parser.add_argument("--hid-act", + default="swish", + help="hidden activation") parser.add_argument( "--pos-dropout-rate", @@ -507,12 +501,14 @@ def add_class_args(parser, prefix=None, in_feats=False): type=float, help="positional encoder dropout", ) - parser.add_argument( - "--att-dropout-rate", default=0, type=float, help="self-att dropout" - ) - parser.add_argument( - "--dropout-rate", default=0.1, type=float, help="feed-forward layer dropout" - ) + parser.add_argument("--att-dropout-rate", + default=0, + type=float, + help="self-att dropout") + parser.add_argument("--dropout-rate", + default=0.1, + type=float, + help="feed-forward layer dropout") parser.add_argument( "--in-layer-type", @@ -521,37 +517,45 @@ def add_class_args(parser, prefix=None, in_feats=False): help=("type of input layer"), ) - # parser.add_argument('--abs-pos-enc', default=False, action='store_true', - # help='use absolute positional encoder') parser.add_argument( "--pos-enc-type", default="rel", - choices=["no", "rel", "abs"], + choices=["no", "rel", "abs", "conv"], help=("type of positional encoder"), ) parser.add_argument( "--causal-pos-enc", default=False, - action="store_true", - help="relative positional encodings are zero when attending to the future", + action=ActionYesNo, + help= + "relative positional encodings are zero when attending to the future", + ) + parser.add_argument( + "--pos-kernel-size", + default=128, + type=int, + help="kernel size for conv positional encoder", + ) + parser.add_argument( + "--pos-num-groups", + default=16, + type=int, + help="number of conv groups for conv positional encoder", ) - try: - parser.add_argument( - "--conv-norm-layer", - default=None, - choices=[ - "batch-norm", - "group-norm", - "instance-norm", - "instance-norm-affine", - "layer-norm", - ], - help="type of normalization layer for conv block in conformer", - ) - except: - pass + parser.add_argument( + "--conv-norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer for conv block in conformer", + ) parser.add_argument( "--se-r", @@ -561,30 +565,26 @@ def add_class_args(parser, prefix=None, in_feats=False): ) parser.add_argument( - "--no-ff-macaron", - default=False, - action="store_true", + "--ff-macaron", + default=True, + action=ActionYesNo, help="do not use macaron style ff layers ", ) parser.add_argument( "--red-lnorms", - default=False, - action="store_true", + default=True, + action=ActionYesNo, help="use redundant Lnorm at conformer blocks' outputs", ) parser.add_argument( "--concat-after", default=False, - action="store_true", + action=ActionYesNo, help="concatenate attention input and output instead of adding", ) - # parser.add_argument('--in-norm', default=False, action='store_true', - # help='batch normalization at the input') if prefix is not None: - 
outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='conformer encoder options') - - add_argparse_args = add_class_args + outer_parser.add_argument("--" + prefix, + action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/rnn_encoder.py b/hyperion/torch/narchs/rnn_encoder.py index 593405c5..0c3b623a 100644 --- a/hyperion/torch/narchs/rnn_encoder.py +++ b/hyperion/torch/narchs/rnn_encoder.py @@ -13,6 +13,7 @@ from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from ...utils.misc import filter_func_args from ..layer_blocks import TransformerConv2dSubsampler as Subsampler from ..layers import ActivationFactory as AF #from ..layers import NormLayer1dFactory as NLF @@ -46,7 +47,7 @@ def __init__(self, bidirectional: bool = False, dropout_rate: float = 0.0, subsample_input: bool = False, - subsampling_act: str = "relu6"): + subsampling_act: str = "relu"): super().__init__() if rnn_type != "lstm": proj_feats = 0 @@ -74,7 +75,7 @@ def __init__(self, if rnn_type == "lstm": self.rnn = nn.LSTM( - input_size=hid_feats, + input_size=lstm_in_dim, hidden_size=hid_feats, num_layers=num_layers, bias=True, @@ -85,7 +86,7 @@ def __init__(self, ) else: self.rnn = nn.GRU( - input_size=hid_feats, + input_size=lstm_in_dim, hidden_size=hid_feats, num_layers=num_layers, bias=True, @@ -113,7 +114,7 @@ def forward(self, x: torch.Tensor, batch_first=True, enforce_sorted=True) x, _ = self.rnn(x) - x = pad_packed_sequence(x, batch_first=True) + x, x_lengths = pad_packed_sequence(x, batch_first=True) if self.out_feats > 0: x = self.output(x) @@ -149,7 +150,7 @@ def change_config(self, override_dropouts, dropout_rate): @staticmethod def filter_args(**kwargs): - args = filter_func_args(RNNEncoder.__init__, **kwargs) + args = filter_func_args(RNNEncoder.__init__, kwargs) return args @staticmethod @@ -166,7 +167,7 @@ def add_class_args(parser, prefix=None, skip=set()): parser.add_argument( "--hid-feats", - default=512, + default=1024, type=int, help=("num of hidden dimensions of RNN layers"), ) @@ -182,7 +183,7 @@ def add_class_args(parser, prefix=None, skip=set()): parser.add_argument( "--proj-feats", - default=512, + default=0, type=int, help=("projection features of LSTM layers"), ) @@ -225,7 +226,7 @@ def add_class_args(parser, prefix=None, skip=set()): help="whether to subsaple input features x4", ) parser.add_argument("--subsampling-act", - default="relu6", + default="relu", help="activation for subsampler block") if "dropout_rate" not in skip: diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 265f2c9b..e9c50197 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -30,7 +30,7 @@ class Hypothesis: log_prob: float # log prob of ys # Optional LSTM predictor state. 
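# The predictor state below is kept generic: an LSTM predictor returns an
# (h, c) pair, while GRU or convolutional predictors may carry a different
# number of state tensors, hence the variable-length tuple type that the
# following change introduces.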
- pred_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + pred_state: Optional[Tuple[torch.Tensor, ...]] = None class RNNTransducerDecoder(NetArch): @@ -115,6 +115,7 @@ def _make_predictor(self): pred_args = filter_func_args(ConvPredictor.__init__, self.predictor_args) self.predictor = ConvPredictor(**pred_args) + self.predictor_args["out_feats"] = self.predictor.embed_dim else: raise ValueError(f"Unknown predictor type {pred_type}") @@ -263,7 +264,7 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) pruned_loss_scale = 0.1 + 0.9 * r self.cur_step += 1 - print(simple_loss_scale, pruned_loss_scale) + #print(simple_loss_scale, pruned_loss_scale) loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned @@ -338,7 +339,7 @@ def decode_greedy(self, sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos) + pred_out, state = self.predictor(sos) T = x.size(1) t = 0 hyp = [] @@ -357,7 +358,7 @@ def decode_greedy(self, if y != blank_id: hyp.append(y.item()) y = y.reshape(1, 1) - pred_out, (h, c) = self.predictor(y, (h, c)) + pred_out, state = self.predictor(y, state) sym_per_utt += 1 sym_per_frame += 1 @@ -379,7 +380,7 @@ def decode_time_sync_beam_search(self, device = x.device sos = torch.tensor([blank_id], device=device).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos) + pred_out, state = self.predictor(sos) T = x.size(1) t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] @@ -498,7 +499,7 @@ def decode_align_length_sync_beam_search( device = x.device sos = torch.tensor([blank_id], device=device).reshape(1, 1) - pred_out, (h, c) = self.predictor(sos) + pred_out, state = self.predictor(sos) T = x.size(1) #t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 540697f7..65e5884d 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -11,14 +11,15 @@ import torch import torch.nn as nn -torch_model_registry = {} - class TorchModel(nn.Module): + """Base class for all Pytorch Models and NNet architectures + """ + registry = {} def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) - torch_model_registry[cls.__name__] = cls + TorchModel.registry[cls.__name__] = cls def __init__(self): super().__init__() diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py index 1bb5a644..fb93b439 100644 --- a/hyperion/torch/utils/masking.py +++ b/hyperion/torch/utils/masking.py @@ -14,7 +14,12 @@ def scale_seq_lengths(lengths, max_out_length, max_in_length=None): if max_in_length is None: max_in_length = lengths.max() - return torch.div(lengths * max_out_length, max_in_length, rounding_mode="floor") + if max_in_length == max_out_length: + return lengths + + return torch.div(lengths * max_out_length, + max_in_length, + rounding_mode="floor") def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): From 8b81ffaa8d315eb630a3fafbffa83b316212f671 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 3 May 2023 15:44:38 -0400 Subject: [PATCH 096/154] adv.v2 adapted to persephone branck --- egs/voxceleb/adv.v2/cmd.sh | 3 +- egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml | 4 - egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml | 4 - ...bank80_stmn_lresnet34_attacknet_same.v1.sh | 8 +- .../local/calibrate_voxceleb1_o_clean.sh | 4 +- 
.../run_022_attack_type_classif_allknown.sh | 33 ++-- .../adv.v2/run_023_snr_classif_allknown.sh | 36 ++-- .../run_024_threat_model_classif_allknown.sh | 35 ++-- ...un_031_attack_type_verif_and_noveltydet.sh | 33 ++-- egs/voxceleb/adv.v2/run_032_snr_verif.sh | 29 ++-- .../adv.v2/run_033_threat_model_verif.sh | 30 ++-- .../steps_backend/eval-calibration-v1.py | 69 ++++++++ .../adv.v2/steps_backend/eval-classif-perf.py | 4 +- .../steps_backend/train-calibration-v1.py | 94 ++++++++++ egs/voxceleb/v1.1/README.md | 63 ++++++- .../train_cfwseresnet34_xvec_stage1_v3.0.yaml | 71 ++++++++ .../train_cfwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++++++ .../train_cwseresnet34_xvec_stage1_v3.0.yaml | 71 ++++++++ .../train_cwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++++++ .../train_fwseresnet34_xvec_stage1_v3.0.yaml | 71 ++++++++ .../train_fwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++++++ .../conf/train_resnet34_xvec_stage1_v3.0.yaml | 70 ++++++++ .../train_tseresnet34_xvec_stage1_v3.0.yaml | 71 ++++++++ .../train_tseresnet34_xvec_stage2_v3.0.yaml | 69 ++++++++ .../config_fbank80_stmn_cfwseresnet34.v3.0.sh | 44 +++++ .../config_fbank80_stmn_cwseresnet34.v3.0.sh | 45 +++++ .../config_fbank80_stmn_fwseresnet34.v3.0.sh | 44 +++++ .../config_fbank80_stmn_resnet34.v3.0.sh | 44 +++++ .../config_fbank80_stmn_tseresnet34.v3.0.sh | 44 +++++ .../eval_xvec_logits_from_wav.py} | 0 .../generate_adv_attacks_xvector_classif.py | 10 +- .../bin/generate_adv_attacks_xvector_verif.py | 83 ++++----- hyperion/bin/prepare_data.py | 2 + hyperion/data_prep/data_prep.py | 57 +++++- hyperion/data_prep/voxceleb2.py | 163 +++++++++++++----- .../adv_attacks/random_attack_factory.py | 7 +- hyperion/torch/layer_blocks/se_blocks.py | 10 +- hyperion/utils/__init__.py | 3 + hyperion/utils/class_info.py | 12 +- hyperion/utils/dataset.py | 159 +++++++++++++++++ hyperion/utils/info_table.py | 7 +- hyperion/utils/misc.py | 4 + 42 files changed, 1569 insertions(+), 248 deletions(-) create mode 100755 egs/voxceleb/adv.v2/steps_backend/eval-calibration-v1.py create mode 100755 egs/voxceleb/adv.v2/steps_backend/train-calibration-v1.py create mode 100644 egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_resnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh rename hyperion/{bin_deprec2/torch-eval-xvec-logits-from-wav.py => bin/eval_xvec_logits_from_wav.py} (100%) create mode 100644 hyperion/utils/dataset.py diff --git a/egs/voxceleb/adv.v2/cmd.sh b/egs/voxceleb/adv.v2/cmd.sh index 56b7eeeb..8f2d9b19 100755 
--- a/egs/voxceleb/adv.v2/cmd.sh +++ b/egs/voxceleb/adv.v2/cmd.sh @@ -13,7 +13,8 @@ if [ "$(hostname -d)" == "cm.gemini" ];then #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" - export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 20G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else diff --git a/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml b/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml index d07a2126..03a4b141 100644 --- a/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml +++ b/egs/voxceleb/adv.v2/conf/lresnet34_atnet.yaml @@ -3,8 +3,6 @@ data: dataset: class_names: - class_id - aug_cfgs: - - conf/reverb_noise_aug.yaml return_segment_info: - class_id sampler: @@ -20,8 +18,6 @@ data: dataset: class_names: - class_id - aug_cfgs: - - conf/reverb_noise_aug.yaml return_segment_info: - class_id sampler: diff --git a/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml b/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml index 94e26f24..a617622c 100644 --- a/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml +++ b/egs/voxceleb/adv.v2/conf/res2net50_atnet.yaml @@ -3,8 +3,6 @@ data: dataset: class_names: - class_id - aug_cfgs: - - conf/reverb_noise_aug.yaml return_segment_info: - class_id sampler: @@ -20,8 +18,6 @@ data: dataset: class_names: - class_id - aug_cfgs: - - conf/reverb_noise_aug.yaml return_segment_info: - class_id sampler: diff --git a/egs/voxceleb/adv.v2/global_conf/config_spknet_fbank80_stmn_lresnet34_attacknet_same.v1.sh b/egs/voxceleb/adv.v2/global_conf/config_spknet_fbank80_stmn_lresnet34_attacknet_same.v1.sh index 5ae7f68e..ed10ff0a 100644 --- a/egs/voxceleb/adv.v2/global_conf/config_spknet_fbank80_stmn_lresnet34_attacknet_same.v1.sh +++ b/egs/voxceleb/adv.v2/global_conf/config_spknet_fbank80_stmn_lresnet34_attacknet_same.v1.sh @@ -6,8 +6,8 @@ spknet_command=resnet spknet_data=voxceleb2cat_train spknet_config=conf/lresnet34_spknet.yaml -spknet_batch_size_1gpu=128 -spknet_eff_batch_size=512 # effective batch size +#spknet_batch_size_1gpu=128 +#spknet_eff_batch_size=512 # effective batch size spknet_name=lresnet34 spknet_dir=exp/xvector_nnets/$spknet_name spknet=$spknet_dir/model_ep0070.pth @@ -26,8 +26,8 @@ spkv_attacks_common_opts="--save-failed" #save failed attacks also # Attack model LResNet34 configuration sign_nnet_command=resnet sign_nnet_config=conf/lresnet34_atnet.yaml -sign_nnet_batch_size_1gpu=128 -sign_nnet_eff_batch_size=512 # effective batch size +#sign_nnet_batch_size_1gpu=128 +#sign_nnet_eff_batch_size=512 # effective batch size sign_nnet_name=lresnet34 # SNRs in -100, 100 diff --git a/egs/voxceleb/adv.v2/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/adv.v2/local/calibrate_voxceleb1_o_clean.sh index 736c3fb0..01c06036 100755 --- a/egs/voxceleb/adv.v2/local/calibrate_voxceleb1_o_clean.sh +++ b/egs/voxceleb/adv.v2/local/calibrate_voxceleb1_o_clean.sh @@ -30,7 +30,7 @@ train_scores=$score_dir/voxceleb1_scores train_key=data/voxceleb1_test/trials_o_clean $cmd $cal_score_dir/train_cal_tel.log \ - steps_be/train-calibration-v1.py --score-file $train_scores \ + steps_backend/train-calibration-v1.py --score-file $train_scores \ --key-file $train_key --model-file $model_file --prior $prior --lambda-reg $l2_reg 
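# train-calibration-v1.py (added later in this patch) fits a prior-weighted
# logistic regression on the target/non-target scores, i.e. it learns
# calibrated_llr = A * raw_score + b, with L2 regularization controlled by
# --lambda-reg and the effective target prior set by --prior.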
ndxs=(voxceleb1_test/trials_o_clean) @@ -43,7 +43,7 @@ do scores_out=$cal_score_dir/${scores[$i]}_scores ndx=data/${ndxs[$i]} $cmd $cal_score_dir/eval_cal_${scores[$i]}.log \ - steps_be/eval-calibration-v1.py --in-score-file $scores_in \ + steps_backend/eval-calibration-v1.py --in-score-file $scores_in \ --ndx-file $ndx --model-file $model_file --out-score-file $scores_out & done diff --git a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh index 53e8e5a6..71c0c89f 100755 --- a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=4 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,15 +25,9 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/$attack_type_split_tag args="" -if [ "$resume" == "true" ];then - args="--resume" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl @@ -53,18 +45,17 @@ if [ $stage -le 1 ]; then mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_dir/trainval_wav.scp \ - --time-durs-file $list_dir/trainval_utt2dur \ - --train-list $list_dir/train_utt2attack \ - --val-list $list_dir/val_utt2attack \ - --class-file $list_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_dir/train_utt2attack \ + --data.train.dataset.class-file $list_dir/class_file \ + --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + fi if [ $stage -le 2 ]; then @@ -82,7 +73,7 @@ fi proj_dir=$sign_dir/test/tsne_${attack_type_split_tag} if [ $stage -le 3 ];then echo "Make TSNE plots on all test attacks" - echo "Result will be left in $proj_idr" + echo "Result will be left in $proj_dir" for p in 30 100 250 do for e in 12 64 @@ -112,7 +103,7 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ];then - echo "Compute cofusion matrices" + echo "Compute confusion matrices" echo "Result is left in $logits_dir/test/eval_acc.log" $train_cmd $logits_dir/test/eval_acc.log \ hyp_utils/conda_env.sh steps_backend/eval-classif-perf.py \ diff --git a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh index 79bf810a..a928ae29 100755 --- a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=8 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,15 +25,9 @@ else xvec_cmd="$train_cmd" fi 
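# The retired torch-train-xvec-from-wav.py flags map onto nested
# jsonargparse keys of train_xvector_from_wav.py:
#   --audio-path     -> --data.{train,val}.dataset.audio-file
#   --time-durs-file -> --data.{train,val}.dataset.time-durs-file
#   --train-list     -> --data.train.dataset.segments-file
#   --val-list       -> --data.val.dataset.segments-file
#   --class-file     -> --data.train.dataset.class-file
# Batch size, number of workers and gradient accumulation now come from the
# YAML config, so the shell-level batch-size/grad-acc arithmetic is dropped
# (see the deletions below).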
-batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/$snr_split_tag args="" -if [ "$resume" == "true" ];then - args="--resume" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl @@ -53,19 +45,16 @@ if [ $stage -le 1 ]; then mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_dir/trainval_wav.scp \ - --time-durs-file $list_dir/trainval_utt2dur \ - --train-list $list_dir/train_utt2attack \ - --val-list $list_dir/val_utt2attack \ - --class-file $list_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args - + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_dir/train_utt2attack \ + --data.train.dataset.class-file $list_dir/class_file \ + --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu fi if [ $stage -le 2 ]; then @@ -83,7 +72,7 @@ fi proj_dir=$sign_dir/test/tsne if [ $stage -le 3 ];then echo "Make TSNE plots on all test attacks" - echo "Result will be left in $proj_idr" + echo "Result will be left in $proj_dir" for p in 30 100 250 do for e in 12 64 @@ -101,7 +90,6 @@ if [ $stage -le 3 ];then wait fi - if [ $stage -le 4 ]; then echo "Eval signature network logits on test attacks" mkdir -p $list_dir/test @@ -114,7 +102,7 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ];then - echo "Compute cofusion matrices" + echo "Compute confusion matrices" echo "Result is left in $logits_dir/test/eval_acc.log" $train_cmd $logits_dir/test/eval_acc.log \ hyp_utils/conda_env.sh steps_backend/eval-classif-perf.py \ diff --git a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh index 3a4e9147..bed225a3 100755 --- a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh @@ -10,7 +10,6 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false num_workers=4 xvec_use_gpu=false @@ -27,16 +26,10 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/$threat_model_split_tag list_attack_type_dir=data/$attack_type_split_tag args="" -if [ "$resume" == "true" ];then - args="--resume" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl @@ -53,19 +46,17 @@ if [ $stage -le 1 ]; then echo "Train signature network on all attacks" mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path 
$list_dir/trainval_wav.scp \ - --time-durs-file $list_dir/trainval_utt2dur \ - --train-list $list_dir/train_utt2attack \ - --val-list $list_dir/val_utt2attack \ - --class-file $list_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_dir/train_utt2attack \ + --data.train.dataset.class-file $list_dir/class_file \ + --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu fi if [ $stage -le 2 ]; then @@ -83,7 +74,7 @@ fi proj_dir=$sign_dir/test/tsne_${attack_type_split_tag} if [ $stage -le 3 ];then echo "Make TSNE plots on all test attacks with colors indicating attack type" - echo "Result will be left in $proj_idr" + echo "Result will be left in $proj_dir" for p in 30 100 250 do for e in 12 64 @@ -137,7 +128,7 @@ if [ $stage -le 5 ]; then fi if [ $stage -le 6 ];then - echo "Compute cofusion matrices" + echo "Compute confusion matrices" echo "Result is left in $logits_dir/test/eval_acc.log" $train_cmd $logits_dir/test/eval_acc.log \ hyp_utils/conda_env.sh steps_backend/eval-classif-perf.py \ diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 31cd6139..55cb8459 100755 --- a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=4 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,18 +25,12 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) #list with only the known attacks list_someknown_dir=data/$sk_attack_type_split_tag # list with all the attacks list_all_dir=data/$attack_type_split_tag args="" -if [ "$resume" == "true" ];then - args="--resume" -fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl @@ -55,19 +47,18 @@ if [ $stage -le 1 ]; then echo "Train attack signature network on known attacks only" mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_someknown_dir/trainval_wav.scp \ - --time-durs-file $list_someknown_dir/trainval_utt2dur \ - --train-list $list_someknown_dir/train_utt2attack \ - --val-list $list_someknown_dir/val_utt2attack \ - --class-file $list_someknown_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $sign_nnet_command --cfg 
$sign_nnet_config \ + --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ + --data.train.dataset.class-file $list_someknown_dir/class_file \ + --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu + fi if [ $stage -le 2 ]; then diff --git a/egs/voxceleb/adv.v2/run_032_snr_verif.sh b/egs/voxceleb/adv.v2/run_032_snr_verif.sh index 8e4f0d41..3886c339 100755 --- a/egs/voxceleb/adv.v2/run_032_snr_verif.sh +++ b/egs/voxceleb/adv.v2/run_032_snr_verif.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=4 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,9 +25,6 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) #list with only the known attacks list_someknown_dir=data/$sk_snr_split_tag # list with all the attacks @@ -55,19 +50,17 @@ if [ $stage -le 1 ]; then echo "Train attack signature network on known attacks only" mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_someknown_dir/trainval_wav.scp \ - --time-durs-file $list_someknown_dir/trainval_utt2dur \ - --train-list $list_someknown_dir/train_utt2attack \ - --val-list $list_someknown_dir/val_utt2attack \ - --class-file $list_someknown_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ + --data.train.dataset.class-file $list_someknown_dir/class_file \ + --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu fi if [ $stage -le 2 ]; then diff --git a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh index 1e87d749..392bffb5 100755 --- a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh +++ b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh @@ -10,9 +10,7 @@ set -e stage=1 ngpu=1 config_file=default_config.sh -resume=false interactive=false -num_workers=4 xvec_use_gpu=false xvec_chunk_length=12800 @@ -27,9 +25,6 @@ else xvec_cmd="$train_cmd" fi -batch_size=$(($sign_nnet_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $sign_nnet_eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) #list with only the known 
attacks list_someknown_dir=data/$sk_threat_model_split_tag # list with all the attacks @@ -56,19 +51,18 @@ if [ $stage -le 1 ]; then echo "Train attack signature network on known attacks only" mkdir -p $sign_nnet_dir/log $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-train-xvec-from-wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --audio-path $list_someknown_dir/trainval_wav.scp \ - --time-durs-file $list_someknown_dir/trainval_utt2dur \ - --train-list $list_someknown_dir/train_utt2attack \ - --val-list $list_someknown_dir/val_utt2attack \ - --class-file $list_someknown_dir/class_file \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $sign_nnet_dir $args + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ + --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ + --data.train.dataset.class-file $list_someknown_dir/class_file \ + --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ + --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ + --trainer.exp-path $sign_nnet_dir $args \ + --num-gpus $ngpu + fi if [ $stage -le 2 ]; then diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-calibration-v1.py b/egs/voxceleb/adv.v2/steps_backend/eval-calibration-v1.py new file mode 100755 index 00000000..fdd5516f --- /dev/null +++ b/egs/voxceleb/adv.v2/steps_backend/eval-calibration-v1.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Evals calibration +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.trial_scores import TrialScores +from hyperion.utils.trial_key import TrialKey +from hyperion.utils.trial_ndx import TrialNdx +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR + + +def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): + + logging.info("load ndx: %s" % ndx_file) + try: + ndx = TrialNdx.load_txt(ndx_file) + except: + ndx = TrialKey.load_txt(ndx_file) + + logging.info("load scores: %s" % in_score_file) + scr = TrialScores.load_txt(in_score_file) + scr = scr.align_with_ndx(ndx) + + logging.info("load model: %s" % model_file) + lr = LR.load(model_file) + logging.info("apply calibration") + s_cal = lr.predict(scr.scores.ravel()) + scr.scores = np.reshape(s_cal, scr.scores.shape) + + logging.info("save scores: %s" % out_score_file) + scr.save_txt(out_score_file) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Evals linear calibration") + + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + 
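    # The ndx file may be either a TrialNdx or a TrialKey; eval_calibration
    # above falls back to TrialKey when TrialNdx parsing fails.
    # Example invocation (hypothetical paths), matching the call in
    # local/calibrate_voxceleb1_o_clean.sh:
    #   steps_backend/eval-calibration-v1.py --in-score-file scores \
    #       --ndx-file trials --model-file cal.h5 --out-score-file scores_cal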
parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_calibration(**namespace_to_dict(args)) diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py index c174cb3b..d3d828a5 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py @@ -52,7 +52,7 @@ def eval_classif_perf(score_file, key_file, class_file, output_path=None, **kwar acc = compute_accuracy(y_true, y_pred) logging.info("Classification accuracy %.2f %%" % (acc * 100)) - labels = np.arange(len(classes), dtype=np.int) + labels = np.arange(len(classes), dtype=int) C = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=False) logging.info("Unnormalized Confusion Matrix:") print_confusion_matrix(C, labels_true=classes) @@ -69,8 +69,6 @@ def eval_classif_perf(score_file, key_file, class_file, output_path=None, **kwar parser.add_argument("--score-file", required=True) parser.add_argument("--key-file", required=True) parser.add_argument("--class-file", required=True) - - # parser.add_argument('--output-path', dest='output_path', required=True) parser.add_argument( "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int ) diff --git a/egs/voxceleb/adv.v2/steps_backend/train-calibration-v1.py b/egs/voxceleb/adv.v2/steps_backend/train-calibration-v1.py new file mode 100755 index 00000000..489ceed9 --- /dev/null +++ b/egs/voxceleb/adv.v2/steps_backend/train-calibration-v1.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Trains calibration for SRE18 tel condition +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.trial_scores import TrialScores +from hyperion.utils.trial_key import TrialKey +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR + + +def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): + + logging.info("load key: %s" % key_file) + key = TrialKey.load_txt(key_file) + logging.info("load scores: %s" % score_file) + scr = TrialScores.load_txt(score_file) + tar, non = scr.get_tar_non(key) + ntar = len(tar) + nnon = len(non) + + min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + logging.info("train calibration") + x = np.concatenate((tar, non)) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + print(lr.A) + print(lr.b) + logging.info("save calibration at %s" % model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = 
compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Trains llr calibration") + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", dest="lambda_reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_calibration(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index b8a17dc6..23e0a26f 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -95,10 +95,26 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | | | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | | | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.40 | 0.065 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 | +| | | | Cosine + QMF | 0.62 | 0.034 | 0.042 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.76 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.041 | 0.061 | +| | | | Cosine + QMF | 0.62 | 0.037 | 0.056 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.48 | 0.077 | +| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062| +| | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | | | | | Cosine + AS-Norm | 0.52 | 0.33 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | + ### VoxCeleb 1 Entire-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -109,6 +125,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | | | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | | | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | +| | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | +| | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.89 | 0.058 | 0.098 | +| | | | Cosine + AS-Norm | 0.84 | 0.053 | 0.087| +| | | | Cosine + QMF | 0.80 | 0.050 | 0.081 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | +| | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | +| | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | @@ -123,6 +154,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | | | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | | | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 | +| | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 | +| | | | Cosine + QMF | 1.36 | 0.082 | 0.137 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.70 | 0.1 | 0.165 | +| | | | Cosine + AS-Norm | 1.50 | 0.086 | 0.138 | +| | | | Cosine + QMF | 1.44 | 0.085 | 0.139 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | +| | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | +| | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | @@ -137,6 +183,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | | | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | | | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | +| | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | +| | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.34 | 0.145 | 0.246 | +| | | | Cosine + AS-Norm | 2.10 | 0.135 | 0.248 | +| | | | Cosine + QMF | 2.01 | 0.127 | 0.218 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | +| | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | +| | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | +| | | | Cosine + AS-Norm | | | | +| | | | Cosine + QMF | | | | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | diff --git a/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..727f40a3 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: cfwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml new 
file mode 100644 index 00000000..04665cac --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..e2fb4c40 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: seresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..04665cac --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: 
class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..db559c14 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..04665cac --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_resnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_resnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..bff4a00b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_resnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..1d864080 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: tseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 
0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 128 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256
diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..04665cac --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh new file mode 100644 index 00000000..32c91da2 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# CFwSE-ResNet34 x-vector + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cfwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + 
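+# note: with do_plda/do_snorm/do_qmf set to false above, scoring presumably falls back to plain cosine similarity; the PLDA settings above only take effect if do_plda is re-enabled.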
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh new file mode 100644 index 00000000..dbbf6fa7 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh @@ -0,0 +1,45 @@ +# CwSE-ResNet34 x-vector + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + + +nnet_s2_base_cfg=conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + 
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh new file mode 100644 index 00000000..62b02c28 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# FwSE-ResNet34 x-vector + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + 
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh new file mode 100644 index 00000000..c49936e0 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -0,0 +1,44 @@ +# ResNet34 x-vector + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34.v3.0 + +nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + 
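+# note: two-stage recipe: stage 1 trains the x-vector from scratch (epochs: 35, hence model_ep0035.pth); stage 2 fine-tunes it with a larger margin and SWA, so the final checkpoint is the SWA-averaged swa_model_ep0016.pth.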
diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh new file mode 100644 index 00000000..42af2d52 --- /dev/null +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# TSE-ResNet34 x-vector + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_tseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + 
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py similarity index 100% rename from hyperion/bin_deprec2/torch-eval-xvec-logits-from-wav.py rename to hyperion/bin/eval_xvec_logits_from_wav.py
diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 88b0b1d9..27a7e573 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -160,7 +164,7 @@ def generate_attacks( logging.info("opening audio read stream: %s" % (wav_file)) audio_args = AR.filter_args(**kwargs) - audio_reader = AR(wav_file) + audio_reader = AR(wav_file, **audio_args) wav_scale = audio_reader.wav_scale logging.info("opening audio write stream: %s" % (output_wav_dir))
diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index a4df5091..882a36a6 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -102,20 +106,6 @@ def init_model(model_path, embed_layer, cal_file, threshold, **kwargs): xvector_model.freeze() logging.info("xvector-model={}".format(xvector_model)) - # feat_args = AFF.filter_args(prefix='feats', **kwargs) - # logging.info('initializing feature extractor args={}'.format(feat_args)) - # feat_extractor = AFF.create(**feat_args) - - # 
mvn_args = MVN.filter_args(prefix='mvn', **kwargs) - # mvn = None - # if mvn_args['norm_mean'] or mvn_args['norm_var']: - # logging.info('initializing short-time mvn args={}'.format(mvn_args)) - # mvn = MVN(**mvn_args) - - # logging.info('loading model {}'.format(model_path)) - # xvector_model = TML.load(model_path) - # xvector_model.freeze() - calibrator = None if cal_file is not None: logging.info("loading calibration params {}".format(cal_file)) @@ -200,16 +190,17 @@ def generate_attacks( key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) - logging.info("opening audio read stream: %s" % (test_wav_file)) + logging.info("opening audio read stream: %s", test_wav_file) audio_args = AR.filter_args(**kwargs) - audio_reader = AR(test_wav_file) + audio_reader = AR(test_wav_file, **audio_args) wav_scale = audio_reader.wav_scale + kwargs["wav_scale"] = wav_scale - logging.info("opening audio write stream: %s" % (output_wav_dir)) + logging.info("opening audio write stream: %s", output_wav_dir) audio_writer = AW(output_wav_dir, audio_format="flac") if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") attack_factory = init_attack_factory(**kwargs) @@ -217,7 +208,7 @@ def generate_attacks( for j in range(key.num_tests): t1 = time.time() - logging.info("scoring test utt %s" % (key.seg_set[j])) + logging.info("scoring test utt %s", key.seg_set[j]) s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] @@ -235,13 +226,11 @@ def generate_attacks( ) model.vad_t = vad logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key.seg_set[j], - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, ) t2 = time.time() @@ -253,23 +242,23 @@ def generate_attacks( if key.tar[i, j] or key.non[i, j]: t3 = time.time() if skip_attack(key.tar[i, j], p_tar_attack, p_non_attack): - logging.info("skipping attack for tar trial %s" % (trial_id)) + logging.info("skipping attack for tar trial %s", trial_id) continue - model.x_e = x_e[i].to(device) + model.x_e = x_e[i : i + 1].to(device) with torch.no_grad(): score_benign = model(s) if key.tar[i, j] and score_benign < 0: logging.info( - "target trial %s failed benign classification, skipping..." - % (trial_id) + "target trial %s failed benign classification, skipping...", + trial_id, ) continue elif key.non[i, j] and score_benign > 0: logging.info( - "non-target trial %s failed benign classification, skipping..." - % (trial_id) + "non-target trial %s failed benign classification, skipping...", + trial_id, ) continue @@ -293,19 +282,19 @@ def generate_attacks( success = False if not save_failed: logging.info( - "attack on target trial %s failed, skipping..." % (trial_id) + "attack on target trial %s failed, skipping...", trial_id ) continue elif key.non[i, j] and score_adv < 0: success = False if not save_failed: logging.info( - "attack on non-target trial %s failed benign classification, skipping..." 
- % (trial_id) + "attack on non-target trial %s failed, skipping...", + trial_id, ) continue if success: - logging.info("attack on trial %s successful" % (trial_id)) + logging.info("attack on trial %s successful", trial_id) stats_ij = compute_stats_adv_attack(s, s_adv) stats_ij = [float(stat.detach().cpu().numpy()[0]) for stat in stats_ij] @@ -344,18 +333,16 @@ ( "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " "rt-factor=%.4f" - ) - % ( - key.seg_set[j], - t7 - t1, - t2 - t1, - trial_time, - num_trials, - num_trials * len(s) / fs / (t7 - t1), - ) + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + num_trials * len(s) / fs / (t7 - t1), ) - logging.info("saving attack info to %s" % (attack_info_file)) + logging.info("saving attack info to %s", attack_info_file) Path(attack_info_file).parent.mkdir(parents=True, exist_ok=True) with open(attack_info_file, "w") as f:
diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index b7370b9b..df212a94 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -14,6 +14,7 @@ ) from hyperion.data_prep import DataPrep +from hyperion.hyp_defs import config_logger def make_parser(data_prep_class): @@ -34,6 +35,7 @@ def make_parser(data_prep_class): subcommands.add_subcommand(k, parser_k) args = parser.parse_args()
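+    # configure logging before the prep runs (verbose level 1, presumably INFO in hyperion's convention); otherwise the logging.info progress messages in the DataPrep classes would not be shown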
+    config_logger(1) data_prep_class = DataPrep.registry[args.subcommand] args = namespace_to_dict(args)[args.subcommand]
diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index 966adeef..bb91e3a5 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -4,6 +4,9 @@ """ from jsonargparse import ActionYesNo from pathlib import Path +from concurrent.futures import ThreadPoolExecutor +from tqdm import tqdm +from ..utils import PathLike class DataPrep: @@ -14,15 +17,24 @@ class DataPrep: output_dir: output data directory use_kaldi_ids: puts speaker-id in front of segment id like kaldi target_sample_freq: target sampling frequency to convert the audios to. + num_threads: number of parallel threads """ registry = {} - def __init__(self, corpus_dir, output_dir, use_kaldi_ids, target_sample_freq): + def __init__( + self, + corpus_dir: PathLike, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, + ): self.corpus_dir = Path(corpus_dir) self.output_dir = Path(output_dir) self.use_kaldi_ids = use_kaldi_ids self.target_sample_freq = target_sample_freq + self.num_threads = num_threads self.output_dir.mkdir(exist_ok=True, parents=True) @@ -34,6 +46,42 @@ def __init_subclass__(cls, **kwargs): def dataset_name(): raise NotImplementedError() + @staticmethod + def _get_recording_duration(scp, i, n): + from ..io import SequentialAudioReader as AR + + durations = [] + fss = [] + with AR(scp, part_idx=i, num_parts=n) as reader: + for data in reader: + key, x, fs = data + duration = x.shape[0] / fs + fss.append(fs) + durations.append(duration) + + return fss, durations + + def get_recording_duration(self, recording_set): + + from ..utils import SCPList + import itertools + + scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) + futures = [] + with ThreadPoolExecutor(max_workers=self.num_threads) as pool: + for i in range(self.num_threads): + future = pool.submit( + DataPrep._get_recording_duration, scp, i, self.num_threads + ) + futures.append(future) + + res = [f.result() for f in tqdm(futures)] + fss = list(itertools.chain(*[r[0] for r in res])) + durations = list(itertools.chain(*[r[1] for r in res])) + + recording_set["duration"] = durations + recording_set["sample_freq"] = fss + @staticmethod def add_class_args(parser): parser.add_argument( @@ -54,3 +102,10 @@ def add_class_args(parser): type=int, help="""target sampling frequency to convert the audios to""", ) + + parser.add_argument( + "--num-threads", + default=10, + type=int, + help="""number of parallel threads""", + )
diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index 25692349..5c04f86c 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -5,12 +5,15 @@ from jsonargparse import ActionYesNo from pathlib import Path import re +import logging +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor import pandas as pd import numpy as np -from ..utils.misc import urlretrieve_progress -from ..utils import RecordingSet, SegmentSet, ClassInfo +from ..utils.misc import urlretrieve_progress, PathLike +from ..utils import RecordingSet, SegmentSet, ClassInfo, Dataset from .data_prep import DataPrep @@ -28,14 +31,20 @@ class VoxCeleb2DataPrep(DataPrep): def __init__( self, - corpus_dir, - subset, - cat_videos, - output_dir, - use_kaldi_ids, - target_sample_freq, + corpus_dir: PathLike, + subset: str, + cat_videos: bool, + output_dir: PathLike, + use_kaldi_ids: bool, + target_sample_freq: int, + num_threads: int = 10, ): - super().__init__(corpus_dir, output_dir, use_kaldi_ids, target_sample_freq) + if cat_videos: + use_kaldi_ids = True + super().__init__( + corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads + ) + + self.subset = subset self.cat_videos = cat_videos @@ -69,8 +78,9 @@ def _get_metadata(self): file_path, _ = urlretrieve_progress(url, file_path, desc=file_name) df_meta = pd.read_csv(file_path, sep="\t") - print(df_meta.head()) - df_meta.set_index("VoxCeleb2 ID") + df_meta.rename(columns=str.strip, inplace=True) + df_meta = df_meta.applymap(lambda x: str.strip(x) if isinstance(x, str) else x) + df_meta.set_index("VoxCeleb2 
ID", inplace=True) return df_meta def _get_langs_est(self): @@ -84,49 +94,83 @@ def _get_langs_est(self): df_lang = pd.read_csv(file_path, sep=",") - def get_video(x): - x = re.sub("/.*.wav$", "", x) - x = re.sub("^.*/", "", x) - return x + if self.cat_videos: + + def get_video(x): + x = re.sub("/[^/]*.wav$", "", x) + return re.sub("/", "-", x) - df_lang["video"] = df_lang["filename"].apply(get_video) - df_lang["filename"].drop(["filename"], axis=1, inplace=True) + elif self.use_kaldi_ids: + + def get_video(x): + x = re.sub(".wav$", "", x) + return re.sub("/", "-", x) + + else: + + def get_video(x): + x = re.sub(".wav$", "", x) + x = re.sub("^[^/]*/", "", x) + return re.sub("/", "-", x) + + df_lang["id"] = df_lang["filename"].apply(get_video) + df_lang.drop(["filename"], axis=1, inplace=True) df_lang.drop_duplicates(inplace=True) - df_lang.set_index("video") + df_lang.set_index("id", inplace=True) + df_lang["lang"] = df_lang["lang"].apply(str.lower) return df_lang + @staticmethod + def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i): + list_file = lists_cat_dir / f"{rec_id}.txt" + with open(list_file, "w") as fw: + rec_idx = (video_idx == i).nonzero()[0] + recs_i = [f"file {rec_files[j]}" for j in rec_idx] + recs_i.sort() + recs_i = "\n".join(recs_i) + fw.write(f"{recs_i}\n") + + file_path = ( + f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|" + ) + return file_path + def prepare(self): + logging.info("getting audio meta-data") df_meta = self._get_metadata() + logging.info("getting language estimations") df_lang = self._get_langs_est() rec_dir = self.corpus_dir / self.subset + logging.info("searching audio files in %s", str(rec_dir)) rec_files = list(rec_dir.glob("**/*.m4a")) speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] - if self.concat_videos: + if self.cat_videos: lists_cat_dir = self.output_dir / "lists_cat" lists_cat_dir.mkdir(exist_ok=True, parents=True) uniq_video_ids, uniq_video_idx, video_idx = np.unique( video_ids, return_index=True, return_inverse=True ) rec_ids = uniq_video_ids - speakers = speakers[uniq_video_idx] - if self.use_kaldi_ids: - rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] - else: - rec_ids = uniq_video_ids + speakers = [speakers[i] for i in uniq_video_idx] + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] file_paths = [] - for i, video_id in enumerate(uniq_video_ids): - list_file = lists_cat_dir / f"{video_id}.txt" - with open(list_file, "w") as fw: - rec_mask = video_idx == i - recs_i = rec_files[rec_mask] - for rec in recs_i: - fw.write(f"{rec}\n") - - file_path = f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|" - file_paths.append(file_path) - + futures = [] + logging.info("making video cat lists") + with ThreadPoolExecutor(max_workers=self.num_threads) as pool: + for i, rec_id in enumerate(rec_ids): + future = pool.submit( + VoxCeleb2DataPrep.make_cat_list, + lists_cat_dir, + rec_id, + rec_files, + video_idx, + i, + ) + futures.append(future) + + file_paths = [f.result() for f in tqdm(futures)] video_ids = uniq_video_ids else: @@ -139,21 +183,48 @@ def prepare(self): rec_ids = [f"{v}-{f}" for v, f in zip(video_ids, file_names)] file_paths = [] - for rec_file in rec_files: + logging.info("making pipe commands") + for rec_file in tqdm(rec_files): file_path = f"ffmpeg -v 8 -i {rec_file} -f wav -acodec pcm_s16le - |" file_paths.append(file_path) - recs = pd.DataFrame({"id": rec_ids, "file_path": file_paths}) + 
logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": file_paths}) recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") segments = pd.DataFrame( { "id": rec_ids, "video_ids": video_ids, "speaker": speakers, "gender": df_meta.loc[speakers, "Gender"], + "language_est": [ + df_lang.loc[r, "lang"] if r in df_lang.index else "N/A" + for r in rec_ids + ], + "language_est_conf": [ + df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A" + for r in rec_ids + ], + # "duration": recs.loc[rec_ids, "duration"], } ) + print( + recs.loc[rec_ids, "duration"], + len(segments), + len(recs.loc[rec_ids, "duration"]), + ) segments = SegmentSet(segments) + segments.sort() + + logging.info("making speaker info file") uniq_speakers = np.unique(speakers) speakers = pd.DataFrame( { @@ -164,6 +235,18 @@ def prepare(self): ) speakers = ClassInfo(speakers) - print(recs) - print(segments) - print(speakers) + logging.info("making language info file") + languages = np.unique(df_lang["lang"]) + languages = ClassInfo(pd.DataFrame({"id": languages})) + + logging.info("making dataset") + dataset = Dataset( + segments, + {"speaker": speakers, "languages": languages}, + {"recordings": recs}, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments, %d speakers", len(segments), len(speakers) + ) diff --git a/hyperion/torch/adv_attacks/random_attack_factory.py b/hyperion/torch/adv_attacks/random_attack_factory.py index 0c83bc56..a91c99ac 100644 --- a/hyperion/torch/adv_attacks/random_attack_factory.py +++ b/hyperion/torch/adv_attacks/random_attack_factory.py @@ -128,7 +128,7 @@ def _sample_attack_args(self): ) attack_args["max_iter"] = self._randint(self.min_iter, self.max_iter) attack_args["abort_early"] = self.abort_early - attack_args["c"] = self._uniform(self.min_c, self.max_c) + attack_args["initial_c"] = self._uniform(self.min_c, self.max_c) attack_args["reduce_c"] = self.reduce_c attack_args["c_incr_factor"] = self.c_incr_factor attack_args["tau_decr_factor"] = self.tau_decr_factor @@ -220,10 +220,9 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--norms", - type=float, - default=[float("inf")], + default=["inf"], nargs="+", - choices=[float("inf"), 1, 2], + choices=["inf", "1", "2"], help=("Attack perturbation norms"), ) diff --git a/hyperion/torch/layer_blocks/se_blocks.py b/hyperion/torch/layer_blocks/se_blocks.py index b14c2b60..fd98db2e 100644 --- a/hyperion/torch/layer_blocks/se_blocks.py +++ b/hyperion/torch/layer_blocks/se_blocks.py @@ -43,7 +43,7 @@ def _standardize_mask(self, mask): return mask - def compute_scale_logit(self, x, x_mask=None): + def compute_scale_logits(self, x, x_mask=None): """comptue the scale before the sigmoid Args: @@ -74,8 +74,8 @@ def forward(self, x, x_mask=None): Returns: Tensor with shape = (batch, channels, heigh, width). 
""" - scale_logit = self.compute_scale_logit(x, x_mask) - scale = self.sigmoid(scale_logit) + scale_logits = self.compute_scale_logits(x, x_mask) + scale = self.sigmoid(scale_logits) y = scale * x return y @@ -201,6 +201,10 @@ def __init__( ): super().__init__() self.cw_se = SEBlock2d(num_channels, r, activation) + # the bottlenet features will have at least dimension 4 + if num_feats // r < 4: + r = num_feats // 4 + self.fw_se = SEBlock2d(num_feats, r, activation) def forward(self, x, x_mask=None): diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index 67f492f9..ecde6139 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,12 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .misc import PathLike +from .dataset import Dataset from .class_info import ClassInfo from .feature_set import FeatureSet from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from .recording_set import RecordingSet from .rttm import RTTM from .scp_list import SCPList + # from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList from .segment_set import SegmentSet diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index 9e158d87..70ee82c8 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -22,6 +22,7 @@ def __init__(self, df): self.df["weights"] /= self.df["weights"].sum() def add_class_idx(self): + self.sort() self.df["class_idx"] = [i for i in range(len(self.df))] def set_uniform_weights(self): @@ -38,18 +39,21 @@ def exp_weights(self, x): weights = self.df["weights"] ** x self.set_weights(weights) - def set_zero_weight(self, id): - self.df.loc[id, "weights"] = 0 + def set_zero_weight(self, ids): + self.df.loc[ids, "weights"] = 0 self.df["weights"] /= self.df["weights"].sum() @property - def weights(self, id): - return self.df.loc[id, "weights"] + def weights(self, ids): + return self.df.loc[ids, "weights"] @property def num_classes(self): return self.df["class_idx"].values.max() + 1 + def sort_by_idx(self, ascending=True): + self.sort("class_idx", ascending) + @classmethod def load(cls, file_path, sep=None): """Loads utt2info list from text file. 
+ +        self.fw_se = SEBlock2d(num_feats, r, activation) def forward(self, x, x_mask=None):
diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index 67f492f9..ecde6139 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,12 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .misc import PathLike +from .dataset import Dataset from .class_info import ClassInfo from .feature_set import FeatureSet from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from .recording_set import RecordingSet from .rttm import RTTM from .scp_list import SCPList + # from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList from .segment_set import SegmentSet
diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index 9e158d87..70ee82c8 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -22,6 +22,7 @@ def __init__(self, df): self.df["weights"] /= self.df["weights"].sum() def add_class_idx(self): + self.sort() self.df["class_idx"] = [i for i in range(len(self.df))] def set_uniform_weights(self): @@ -38,18 +39,21 @@ def exp_weights(self, x): weights = self.df["weights"] ** x self.set_weights(weights) - def set_zero_weight(self, id): - self.df.loc[id, "weights"] = 0 + def set_zero_weight(self, ids): + self.df.loc[ids, "weights"] = 0 self.df["weights"] /= self.df["weights"].sum() @property - def weights(self, id): - return self.df.loc[id, "weights"] + def weights(self, ids): + return self.df.loc[ids, "weights"] @property def num_classes(self): return self.df["class_idx"].values.max() + 1 + def sort_by_idx(self, ascending=True): + self.sort("class_idx", ascending) + @classmethod def load(cls, file_path, sep=None): """Loads utt2info list from text file. 
diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py new file mode 100644 index 00000000..546dd715 --- /dev/null +++ b/hyperion/utils/dataset.py @@ -0,0 +1,159 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Dict, Optional +from pathlib import Path +import yaml + +from .segment_set import SegmentSet +from .recording_set import RecordingSet +from .feature_set import FeatureSet +from .class_info import ClassInfo +from .misc import PathLike + + +class Dataset: +    """ Class that contains all objects +    (segments, recordings, features, class_infos) that +    make up a dataset +    """ + +    def __init__( +        self, +        segments: SegmentSet, +        classes: Optional[Dict[str, ClassInfo]] = None, +        recordings: Optional[Dict[str, RecordingSet]] = None, +        features: Optional[Dict[str, FeatureSet]] = None, +    ): +        self._segments = segments +        self._classes = classes +        self._recordings = recordings +        self._features = features + +    @property +    def segments(self): +        return self._segments + +    @property +    def recordings(self): +        return self._recordings + +    @property +    def features(self): +        return self._features + +    @property +    def classes(self): +        return self._classes + +    @staticmethod +    def resolve_dataset_path(dataset_path): +        dataset_path = Path(dataset_path) +        ext = dataset_path.suffix +        if ext in [".yaml", ".yml"]: +            dataset_file = dataset_path +            dataset_dir = dataset_path.parent +        else: +            dataset_file = dataset_path / "dataset.yaml" +            dataset_dir = dataset_path + +        return dataset_dir, dataset_file + +    @staticmethod +    def resolve_file_path(dataset_dir, file_path): +        file_path = Path(file_path) +        if file_path.is_file(): +            return file_path + +        return dataset_dir / file_path + +    def save(self, dataset_path: PathLike): +        """Saves all the dataset objects. + +        Args: +          dataset_path: str/Path indicating directory +            to save the dataset or .yaml file to save +            the dataset info. + +        """ +        dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) +        dataset = {} +        if self.segments is not None: +            file_name = "segments.csv" +            dataset["segments"] = file_name +            file_path = dataset_dir / file_name +            self.segments.save(file_path) + +        if self.recordings is not None: +            file_names = {} +            for k, v in self.recordings.items(): +                file_name = k + ".csv" +                file_names[k] = file_name +                file_path = dataset_dir / file_name +                v.save(file_path) + +            dataset["recordings"] = file_names + +        if self.features is not None: +            file_names = {} +            for k, v in self.features.items(): +                file_name = k + ".csv" +                file_names[k] = file_name +                file_path = dataset_dir / file_name +                v.save(file_path) + +            dataset["features"] = file_names + +        if self.classes is not None: +            file_names = {} +            for k, v in self.classes.items(): +                file_name = k + ".csv" +                file_names[k] = file_name +                file_path = dataset_dir / file_name +                v.save(file_path) + +            dataset["classes"] = file_names + +        with open(dataset_file, "w") as f: +            yaml.dump(dataset, f)
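+        # the resulting dataset.yaml just maps table names to csv files, e.g. segments: segments.csv, recordings: {recordings: recordings.csv}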
+ + """ + dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + with open(dataset_file, "w") as f: + dataset = yaml.safe_load(f) + + assert "segments" in dataset + segments = SegmentSet.load( + Dataset.resolve_file_path(dataset_dir, dataset["segments"]) + ) + classes = None + recordings = None + features = None + if "classes" in dataset: + classes = {} + for k, v in dataset["classes"]: + classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) + + if "recordings" in dataset: + recordings = {} + for k, v in dataset["recordings"]: + recordings[k] = RecordingSet.load( + Dataset.resolve_file_path(dataset_dir, v) + ) + + if "features" in dataset: + features = {} + for k, v in dataset["features"]: + features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) + + return cls(segments, classes, recordings, features) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index f2262217..fdf854c1 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -14,7 +14,7 @@ from .list_utils import split_list, split_list_group_by_key -class InfoTable(object): +class InfoTable: """This is a base class to store information about recordings, segments, features, etc. @@ -131,7 +131,10 @@ def load(cls, file_path, sep=None): def sort(self, column="id", ascending=True): """Sorts the table by column""" - self.df.sort_values(by=column, inplace=True, ascending=ascending) + if column == "id": + self.df.sort_index(inplace=True, ascending=ascending) + else: + self.df.sort_values(by=column, inplace=True, ascending=ascending) def split(self, idx, num_parts, group_by=None): """Splits SCPList into num_parts and return part idx. diff --git a/hyperion/utils/misc.py b/hyperion/utils/misc.py index 369962fd..4ab3ce0a 100644 --- a/hyperion/utils/misc.py +++ b/hyperion/utils/misc.py @@ -4,10 +4,14 @@ Miscellaneous functions """ +from typing import TypeVar from inspect import signature +from pathlib import Path import numpy as np +PathLike = TypeVar("PathLike", str, Path, None) + def generate_data(g): while 1: From e44eb755e1da333c401f5f003cfa110e470a0a40 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 3 May 2023 16:44:00 -0400 Subject: [PATCH 097/154] isort everything again --- hyperion/bin/adv_finetune_xvector_from_wav.py | 10 +++------- hyperion/bin/apply_mvn_select_frames.py | 5 +++-- hyperion/bin/audio_to_duration.py | 5 +++-- hyperion/bin/compute_energy_vad.py | 5 +++-- hyperion/bin/compute_mfcc_feats.py | 5 +++-- hyperion/bin/copy_feats.py | 1 + hyperion/bin/decode_wav2transducer.py | 5 +++-- hyperion/bin/decode_wav2vec2rnn_transducer.py | 5 +++-- .../eval_xvec_cosine_scoring_from_adv_test_wav.py | 5 +++-- ...ec_cosine_scoring_from_adv_test_wav_wavegan.py | 5 +++-- .../eval_xvec_cosine_scoring_from_art_test_wav.py | 9 +++++---- .../bin/eval_xvec_cosine_scoring_from_test_wav.py | 5 +++-- ...c_cosine_scoring_from_transfer_adv_test_wav.py | 5 +++-- ...c_cosine_scoring_from_transfer_art_test_wav.py | 9 +++++---- hyperion/bin/eval_xvec_logits_from_wav.py | 5 +++-- hyperion/bin/extract_wav2vec2xvectors.py | 7 ++++--- hyperion/bin/extract_xvectors_from_feats.py | 5 +++-- hyperion/bin/extract_xvectors_from_wav.py | 5 +++-- .../bin/extract_xvectors_slidwin_from_feats.py | 7 ++++--- hyperion/bin/extract_xvectors_slidwin_from_wav.py | 7 ++++--- hyperion/bin/finetune_wav2vec2transducer.py | 5 +++-- hyperion/bin/finetune_wav2vec2xvector.py | 5 +++-- hyperion/bin/finetune_xvector_dfr_from_feats.py | 5 +++-- 
hyperion/bin/finetune_xvector_dfr_from_wav.py | 5 +++-- hyperion/bin/finetune_xvector_from_feats.py | 5 +++-- hyperion/bin/finetune_xvector_from_wav.py | 5 +++-- .../bin/generate_adv_attacks_xvector_classif.py | 4 ++-- .../bin/generate_adv_attacks_xvector_verif.py | 4 ++-- hyperion/bin/make_babble_noise_audio_files.py | 7 ++++--- hyperion/bin/pack_wav_rirs.py | 5 +++-- hyperion/bin/plot_embedding_tsne.py | 5 +++-- hyperion/bin/plot_embedding_tsne_per_class.py | 5 +++-- hyperion/bin/prepare_data.py | 8 ++------ hyperion/bin/preprocess_audio_files.py | 7 ++++--- hyperion/bin/train_wav2rnn_transducer.py | 5 +++-- hyperion/bin/train_wav2vec2rnn_transducer.py | 5 +++-- hyperion/bin/train_wav2vec2transducer.py | 5 +++-- hyperion/bin/train_wav2vec2xvector.py | 5 +++-- hyperion/bin/train_xvector_from_feats.py | 5 +++-- hyperion/bin/train_xvector_from_wav.py | 2 +- hyperion/bin_deprec/ark2hyp.py | 1 + hyperion/bin_deprec/arkvad2nist.py | 1 + hyperion/bin_deprec/compute-gmm-post.py | 3 ++- hyperion/bin_deprec/eval-2class-performance.py | 1 + hyperion/bin_deprec/eval-elbo-ubm.py | 1 + hyperion/bin_deprec/eval-q-scoring-homo-gbe.py | 1 + hyperion/bin_deprec/eval-score-norm.py | 1 + hyperion/bin_deprec/h5vad2nist.py | 1 + hyperion/bin_deprec/init-ubm.py | 3 ++- hyperion/bin_deprec/scores2lre_format.py | 1 + .../torch-train-conformer-enc-v1-vq-dvae.py | 1 + .../torch-train-conformer-enc-v1-vq-vae.py | 1 + hyperion/bin_deprec/torch-train-dc1d-dvae.py | 1 + hyperion/bin_deprec/torch-train-dc1d-vae.py | 1 + hyperion/bin_deprec/torch-train-dc2d-dvae.py | 1 + hyperion/bin_deprec/torch-train-dc2d-vae.py | 1 + hyperion/bin_deprec/torch-train-resnet1d-dvae.py | 1 + hyperion/bin_deprec/torch-train-resnet1d-vae.py | 1 + .../bin_deprec/torch-train-resnet1d-vq-dvae.py | 1 + .../bin_deprec/torch-train-resnet1d-vq-vae.py | 1 + hyperion/bin_deprec/torch-train-resnet2d-dvae.py | 1 + hyperion/bin_deprec/torch-train-resnet2d-vae.py | 5 +++-- .../bin_deprec/torch-train-resnet2d-vq-dvae.py | 1 + .../bin_deprec/torch-train-resnet2d-vq-vae.py | 1 + .../torch-train-transformer-enc-v1-dvae.py | 1 + .../torch-train-transformer-enc-v1-vae.py | 1 + .../torch-train-transformer-enc-v1-vq-dvae.py | 1 + .../torch-train-transformer-enc-v1-vq-vae.py | 1 + hyperion/bin_deprec/torch-train-xvector.py | 1 + hyperion/bin_deprec/train-q-scoring-homo-gbe.py | 1 + hyperion/bin_deprec/vectors2scores.py | 1 + hyperion/bin_deprec2/apply-mvn-select-frames.py | 5 +++-- hyperion/bin_deprec2/compute-mfcc-feats.py | 5 +++-- hyperion/bin_deprec2/copy-feats.py | 1 + hyperion/bin_deprec2/eval-cos-1vs1.py | 1 + hyperion/bin_deprec2/eval-linear-gbe-up.py | 1 + hyperion/bin_deprec2/eval-linear-gbe.py | 1 + hyperion/bin_deprec2/eval-linear-svmc.py | 1 + hyperion/bin_deprec2/eval-logistic-regression.py | 1 + hyperion/bin_deprec2/eval-plda-1vs1.py | 1 + hyperion/bin_deprec2/eval-plda-nvs1.py | 1 + hyperion/bin_deprec2/merge-h5-files.py | 1 + hyperion/bin_deprec2/pack-audio-files.py | 3 ++- hyperion/bin_deprec2/plot-vector-hist.py | 1 + hyperion/bin_deprec2/rttm-to-bin-vad.py | 1 + hyperion/bin_deprec2/segments-to-bin-vad.py | 5 +++-- .../torch-adv-finetune-xvec-from-wav.py | 5 +++-- hyperion/bin_deprec2/torch-adv-finetune-xvec.py | 5 +++-- hyperion/bin_deprec2/torch-compute-mfcc-feats.py | 5 +++-- hyperion/bin_deprec2/torch-eval-vae.py | 1 + ...ec-cosine-scoring-from-adv-test-wav-wavegan.py | 5 +++-- ...-eval-xvec-cosine-scoring-from-adv-test-wav.py | 5 +++-- ...-eval-xvec-cosine-scoring-from-art-test-wav.py | 7 ++++--- 
...orch-eval-xvec-cosine-scoring-from-test-wav.py | 5 +++-- ...c-cosine-scoring-from-transfer-adv-test-wav.py | 5 +++-- ...c-cosine-scoring-from-transfer-art-test-wav.py | 7 ++++--- .../torch-extract-xvectors-from-wav-with-rttm.py | 5 +++-- .../torch-extract-xvectors-slidwin-from-wav.py | 7 ++++--- .../bin_deprec2/torch-extract-xvectors-slidwin.py | 5 +++-- .../torch-extract-xvectors-vae-preproc.py | 5 +++-- hyperion/bin_deprec2/torch-extract-xvectors.py | 5 +++-- hyperion/bin_deprec2/torch-train-dc1d-ae.py | 1 + hyperion/bin_deprec2/torch-train-dvae.py | 5 +++-- .../torch-train-efficientnet-xvec-from-wav.py | 5 +++-- .../bin_deprec2/torch-train-efficientnet-xvec.py | 5 +++-- .../torch-train-resnet-xvec-from-wav.py | 5 +++-- hyperion/bin_deprec2/torch-train-resnet-xvec.py | 5 +++-- .../torch-train-resnet1d-xvec-from-wav.py | 5 +++-- .../torch-train-spinenet-xvec-from-wav.py | 5 +++-- .../bin_deprec2/torch-train-tdnn-xvec-from-wav.py | 5 +++-- hyperion/bin_deprec2/torch-train-tdnn-xvec.py | 5 +++-- .../torch-train-transformer-xvec-v1-from-wav.py | 5 +++-- .../torch-train-transformer-xvec-v1.py | 5 +++-- hyperion/bin_deprec2/torch-train-vae.py | 5 +++-- hyperion/bin_deprec2/torch-train-vq-dvae.py | 5 +++-- hyperion/bin_deprec2/torch-train-vq-vae.py | 5 +++-- hyperion/bin_deprec2/train-cw-up.py | 1 + hyperion/bin_deprec2/train-cw.py | 1 + hyperion/bin_deprec2/train-gaussianizer.py | 1 + hyperion/bin_deprec2/train-lda.py | 1 + hyperion/bin_deprec2/train-linear-gbe-up.py | 1 + hyperion/bin_deprec2/train-linear-gbe.py | 1 + hyperion/bin_deprec2/train-linear-svmc.py | 1 + hyperion/bin_deprec2/train-logistic-regression.py | 1 + hyperion/bin_deprec2/train-mvn.py | 1 + hyperion/bin_deprec2/train-nda.py | 1 + hyperion/bin_deprec2/train-pca.py | 1 + hyperion/bin_deprec2/train-plda.py | 1 + hyperion/data_prep/data_prep.py | 9 ++++++--- hyperion/data_prep/voxceleb2.py | 14 +++++++------- hyperion/torch/adv_attacks/art_attack_factory.py | 2 +- .../torch/adv_attacks/random_attack_factory.py | 3 ++- hyperion/torch/adv_defenses/wave_gan_white.py | 4 +++- hyperion/torch/data/audio_dataset.py | 5 +++-- hyperion/torch/data/bucketing_seg_sampler.py | 1 + .../torch/data/class_weighted_embed_sampler.py | 3 ++- .../data/class_weighted_seg_chunk_sampler.py | 3 ++- hyperion/torch/data/embed_dataset.py | 3 ++- hyperion/torch/data/embed_sampler.py | 3 ++- hyperion/torch/data/feat_seq_dataset.py | 3 ++- hyperion/torch/data/hyp_sampler.py | 3 ++- hyperion/torch/data/paired_feat_seq_dataset.py | 1 + hyperion/torch/data/seg_chunk_sampler.py | 3 ++- hyperion/torch/data/seg_sampler.py | 3 ++- hyperion/torch/data/weighted_embed_sampler.py | 1 + hyperion/torch/data/weighted_seq_sampler.py | 3 ++- hyperion/torch/layer_blocks/__init__.py | 3 ++- hyperion/torch/layer_blocks/etdnn_blocks.py | 1 + hyperion/torch/layer_blocks/resetdnn_blocks.py | 1 + hyperion/torch/layer_blocks/transducer_joiner.py | 3 ++- .../torch/layer_blocks/transducer_predictor.py | 3 ++- hyperion/torch/layers/__init__.py | 3 ++- hyperion/torch/layers/activation_factory.py | 2 +- hyperion/torch/layers/global_pool.py | 1 + hyperion/torch/layers/mvn.py | 3 ++- hyperion/torch/layers/pool_factory.py | 3 ++- hyperion/torch/layers/spec_augment.py | 3 ++- hyperion/torch/loggers/logger.py | 1 + hyperion/torch/loggers/logger_list.py | 1 + hyperion/torch/lr_schedulers/factory.py | 3 ++- hyperion/torch/models/__init__.py | 6 +++--- hyperion/torch/models/transducer/conformer.py | 3 ++- hyperion/torch/models/transducer/decoder.py | 3 ++- 
hyperion/torch/models/transducer/joiner.py | 3 ++- hyperion/torch/models/transducer/transducer.py | 5 +++-- hyperion/torch/models/tvector/tvector.py | 3 ++- .../wav2transducer/hf_wav2rnn_transducer.py | 3 ++- .../models/wav2transducer/hf_wav2transducer.py | 3 ++- .../wav2transducer/hf_wav2vec2_transducer.py | 3 ++- .../hf_wav2vec2conformer_v1_rnn_transducer.py | 3 ++- .../hf_wav2vec2rnn_rnn_transducer.py | 3 ++- .../wav2transducer/hf_wav2vec2rnn_transducer.py | 3 ++- .../models/wav2transducer/wav2rnn_transducer.py | 3 ++- .../wav2xvectors/hf_hubert2resnet1d_xvector.py | 3 ++- .../wav2xvectors/hf_wav2vec2resnet1d_xvector.py | 3 ++- .../torch/models/wav2xvectors/hf_wav2xvector.py | 3 ++- .../wav2xvectors/hf_wavlm2resnet1d_xvector.py | 3 ++- .../models/wav2xvectors/wav2resnet1d_xvector.py | 3 ++- .../models/wav2xvectors/wav2resnet_xvector.py | 3 ++- hyperion/torch/models/wav2xvectors/wav2xvector.py | 3 ++- .../models/xvectors/efficient_net_xvector.py | 3 ++- .../torch/models/xvectors/resnet1d_xvector.py | 3 ++- hyperion/torch/models/xvectors/resnet_xvector.py | 3 ++- .../torch/models/xvectors/spinenet_xvector.py | 3 ++- hyperion/torch/models/xvectors/tdnn_xvector.py | 3 ++- .../models/xvectors/transformer_xvector_v1.py | 3 ++- hyperion/torch/models/xvectors/xvector.py | 4 ++-- hyperion/torch/narchs/audio_feats_mvn.py | 3 ++- hyperion/torch/narchs/classif_head.py | 3 ++- hyperion/torch/narchs/conformer_encoder_v1.py | 8 +++++--- hyperion/torch/narchs/dc1d_decoder.py | 3 ++- hyperion/torch/narchs/dc1d_encoder.py | 3 ++- hyperion/torch/narchs/dc2d_decoder.py | 3 ++- hyperion/torch/narchs/dc2d_encoder.py | 3 ++- hyperion/torch/narchs/efficient_net.py | 3 ++- hyperion/torch/narchs/etdnn.py | 1 + hyperion/torch/narchs/net_arch.py | 1 + hyperion/torch/narchs/resetdnn.py | 1 + hyperion/torch/narchs/resnet.py | 15 +++++---------- hyperion/torch/narchs/resnet1d_decoder.py | 3 ++- hyperion/torch/narchs/resnet1d_encoder.py | 3 ++- hyperion/torch/narchs/resnet2d_decoder.py | 3 ++- hyperion/torch/narchs/resnet2d_encoder.py | 3 ++- hyperion/torch/narchs/rnn_encoder.py | 3 ++- hyperion/torch/narchs/rnn_transducer_decoder.py | 8 +++++--- hyperion/torch/narchs/spinenet.py | 1 + hyperion/torch/narchs/tdnn.py | 1 + hyperion/torch/narchs/transformer_encoder_v1.py | 3 ++- hyperion/torch/optim/factory.py | 3 ++- hyperion/torch/tpm/hf/hf_hubert.py | 5 +++-- hyperion/torch/tpm/hf/hf_wav2vec2.py | 5 +++-- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 5 +++-- hyperion/torch/tpm/hf/hf_wavlm.py | 5 +++-- hyperion/torch/trainers/ae_trainer.py | 3 ++- hyperion/torch/trainers/dvae_trainer.py | 3 ++- hyperion/torch/trainers/torch_trainer.py | 5 +++-- hyperion/torch/trainers/transducer_trainer.py | 5 +++-- hyperion/torch/trainers/vae_trainer.py | 3 ++- hyperion/torch/trainers/vq_dvae_trainer.py | 3 ++- hyperion/torch/trainers/vq_vae_trainer.py | 3 ++- hyperion/torch/trainers/xvector_adv_trainer.py | 3 ++- .../trainers/xvector_adv_trainer_from_wav.py | 3 ++- .../trainers/xvector_trainer_deep_feat_reg.py | 3 ++- hyperion/torch/utils/ddp.py | 7 ++++--- hyperion/torch/utils/metric_acc.py | 1 + hyperion/utils/__init__.py | 5 ++--- hyperion/utils/dataset.py | 9 +++++---- hyperion/utils/lexicon.py | 1 + hyperion/utils/misc.py | 2 +- hyperion/utils/text.py | 1 + requirements.txt | 2 ++ 231 files changed, 500 insertions(+), 290 deletions(-) diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py index f387c7ac..7be882e0 100755 --- a/hyperion/bin/adv_finetune_xvector_from_wav.py +++ 
b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -11,12 +11,8 @@ from pathlib import Path import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) import torch import torch.nn as nn @@ -24,8 +20,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.adv_attacks import AttackFactory from hyperion.torch.data import AudioDataset as AD -from hyperion.torch.data import SegSamplerFactory from hyperion.torch.data import ClassWeightedSeqSampler as Sampler +from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import EfficientNetXVector as EXVec from hyperion.torch.models import ResNet1dXVector as R1dXVec diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index 53a01d6d..a2456dc9 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,6 +10,9 @@ import time import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import RandomAccessDataReaderFactory as RDRF @@ -18,8 +21,6 @@ from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.utils import Utt2Info from hyperion.utils.kaldi_matrix import compression_methods -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def process_feats( diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index ac8852a4..38e8dff2 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -9,11 +9,12 @@ import time import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR from hyperion.utils import SegmentSet -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def audio_to_duration(audio_file, output_file, **kwargs): diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index e4d47ef0..15d74f3a 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,12 +9,13 @@ import time import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def compute_vad(input_path, output_path, write_num_frames, **kwargs): diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index c8193e5c..a83f95d1 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,14 +9,15 @@ import time import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF from 
hyperion.io import compression_methods
 from hyperion.np.feats import MFCC
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def compute_mfcc_feats(
diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py
index 4549caec..0385cc55 100755
--- a/hyperion/bin/copy_feats.py
+++ b/hyperion/bin/copy_feats.py
@@ -12,6 +12,7 @@ import time
 
 import numpy as np
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import CopyFeats as CF
diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py
index 420f8a9f..81fa8803 100755
--- a/hyperion/bin/decode_wav2transducer.py
+++ b/hyperion/bin/decode_wav2transducer.py
@@ -13,6 +13,9 @@ import numpy as np
 import pandas as pd
 import sentencepiece as spm
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -25,8 +28,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py
index 4fdc3140..8ef8d414 100755
--- a/hyperion/bin/decode_wav2vec2rnn_transducer.py
+++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py
@@ -13,6 +13,9 @@ import numpy as np
 import pandas as pd
 import sentencepiece as spm
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -26,8 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py
index 19ba6546..bb01162f 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py
@@ -10,6 +10,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -26,8 +29,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py
index 2f7506c7..c483ce39 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py
@@ -12,6 +12,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -29,8 +32,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 torch.backends.cudnn.enabled = False
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py
index 4a654212..fba182c4 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py
@@ -11,10 +11,13 @@ import numpy as np
 import pandas as pd
-import torch
-import torch.nn as nn
 from art.classifiers import PyTorchClassifier
 from art.estimators.classification import PyTorchClassifier
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
+import torch
+import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -30,8 +33,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
index 7ab46d11..3cfde93e 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -24,8 +27,6 @@ from hyperion.torch.utils.misc import l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
index b2f6736d..44bdf59d 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
@@ -10,6 +10,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -26,8 +29,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
index 0973d3ff..676575fd 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
@@ -11,10 +11,13 @@ import numpy as np
 import pandas as pd
-import torch
-import torch.nn as nn
 from art.classifiers import PyTorchClassifier
 from art.estimators.classification import PyTorchClassifier
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
+import torch
+import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -30,8 +33,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py
index b95b2a7c..da6389fb 100755
--- a/hyperion/bin/eval_xvec_logits_from_wav.py
+++ b/hyperion/bin/eval_xvec_logits_from_wav.py
@@ -11,6 +11,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -21,8 +24,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py
index de0a8637..37d6a2a6 100755
--- a/hyperion/bin/extract_wav2vec2xvectors.py
+++ b/hyperion/bin/extract_wav2vec2xvectors.py
@@ -11,8 +11,11 @@ import numpy as np
 import pandas as pd
-import torch
 import torchaudio.transforms as tat
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
+import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
@@ -21,8 +24,6 @@ from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 resamplers = {}
diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py
index 13ad4277..926e0bcc 100755
--- a/hyperion/bin/extract_xvectors_from_feats.py
+++ b/hyperion/bin/extract_xvectors_from_feats.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -19,8 +22,6 @@ from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py
index 4f48bbdc..addabbcf 100755
--- a/hyperion/bin/extract_xvectors_from_wav.py
+++ b/hyperion/bin/extract_xvectors_from_wav.py
@@ -11,6 +11,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -21,8 +24,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py
index fb6583e2..e3d2fcbb 100755
--- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py
+++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py
@@ -10,8 +10,11 @@ import time
 
 import numpy as np
-import torch
 import yaml
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
+import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialDataReaderFactory as DRF
@@ -20,8 +23,6 @@ from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py
index 9f1728eb..2b1bba3b 100755
--- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py
+++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py
@@ -11,8 +11,11 @@ import numpy as np
 import pandas as pd
-import torch
 import yaml
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
+import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
@@ -22,8 +25,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py
index 6f17f800..df267e72 100755
--- a/hyperion/bin/finetune_wav2vec2transducer.py
+++ b/hyperion/bin/finetune_wav2vec2transducer.py
@@ -12,6 +12,9 @@ import k2
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -22,8 +25,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer
 from hyperion.torch.trainers import TransducerTrainer as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 from torch.nn.utils.rnn import pad_sequence
 
 model_dict = {
diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py
index d9d9c281..b3edd9b5 100755
--- a/hyperion/bin/finetune_wav2vec2xvector.py
+++ b/hyperion/bin/finetune_wav2vec2xvector.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -23,8 +26,6 @@ HFWavLM2ResNet1dXVector)
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 model_dict = {
     "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector,
diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py
index 17cafb85..2ac01025 100755
--- a/hyperion/bin/finetune_xvector_dfr_from_feats.py
+++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py
@@ -12,6 +12,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -22,8 +25,6 @@ from hyperion.torch.models import XVector as XVec
 from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py
index f7832a47..ff97d3ca 100755
--- a/hyperion/bin/finetune_xvector_dfr_from_wav.py
+++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -21,8 +24,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py
index ac9c2d0b..7a1fb5a9 100755
--- a/hyperion/bin/finetune_xvector_from_feats.py
+++ b/hyperion/bin/finetune_xvector_from_feats.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch import TorchModelLoader as TML
@@ -20,8 +23,6 @@ from hyperion.torch.models import XVector as XVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py
index 2e120815..227892ea 100755
--- a/hyperion/bin/finetune_xvector_from_wav.py
+++ b/hyperion/bin/finetune_xvector_from_wav.py
@@ -10,6 +10,9 @@ import time
 from pathlib import Path
 
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch import TorchModelLoader as TML
@@ -25,8 +28,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 xvec_dict = {
     "resnet": RXVec,
diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py
index 092a5029..a058893d 100755
--- a/hyperion/bin/generate_adv_attacks_xvector_classif.py
+++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py
@@ -11,12 +11,12 @@ import numpy as np
 import pandas as pd
-import torch
-import torch.nn as nn
 import yaml
 from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
                           namespace_to_dict)
 
+import torch
+import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py
index 5fae0bbb..83375cb6 100755
--- a/hyperion/bin/generate_adv_attacks_xvector_verif.py
+++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py
@@ -11,12 +11,12 @@ import numpy as np
 import pandas as pd
-import torch
-import torch.nn as nn
 import yaml
 from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
                           namespace_to_dict)
 
+import torch
+import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py
index 4a356037..972ff01f 100755
--- a/hyperion/bin/make_babble_noise_audio_files.py
+++ b/hyperion/bin/make_babble_noise_audio_files.py
@@ -10,14 +10,15 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+from scipy import ndimage, signal
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import AudioWriter as Writer
 from hyperion.io import RandomAccessAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-from scipy import ndimage, signal
 
 
 def make_noise(xs):
diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py
index c5ddd25c..dccf58da 100755
--- a/hyperion/bin/pack_wav_rirs.py
+++ b/hyperion/bin/pack_wav_rirs.py
@@ -10,11 +10,12 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def pack_wav_rirs(input_path, output_spec, **kwargs):
diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py
index e2157e3e..e011dfe8 100755
--- a/hyperion/bin/plot_embedding_tsne.py
+++ b/hyperion/bin/plot_embedding_tsne.py
@@ -13,12 +13,13 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo,
+                          ArgumentParser, namespace_to_dict)
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.np.transforms import PCA, LNorm, SklTSNE
 from hyperion.utils import SegmentSet
-from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo,
-                          ArgumentParser, namespace_to_dict)
 
 matplotlib.use("Agg")
 colors = ["b", "g", "r", "c", "m", "y", "k"]
diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py
index 6af0202c..6f35f074 100755
--- a/hyperion/bin/plot_embedding_tsne_per_class.py
+++ b/hyperion/bin/plot_embedding_tsne_per_class.py
@@ -13,14 +13,15 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo,
+                          ArgumentParser, namespace_to_dict)
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import RandomAccessDataReaderFactory as DRF
 from hyperion.np.clustering import AHC
 from hyperion.np.transforms import PCA, LNorm, SklTSNE
 from hyperion.utils import SegmentSet
 from hyperion.utils.math import cosine_scoring
-from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo,
-                          ArgumentParser, namespace_to_dict)
 
 matplotlib.use("Agg")
 colors = ["b", "g", "r", "c", "m", "y", "k"]
diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py
index df212a94..4105f482 100755
--- a/hyperion/bin/prepare_data.py
+++ b/hyperion/bin/prepare_data.py
@@ -6,12 +6,8 @@ import logging
 from pathlib import Path
 
-from jsonargparse import (
-    ActionConfigFile,
-    ActionParser,
-    ArgumentParser,
-    namespace_to_dict,
-)
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
 
 from hyperion.data_prep import DataPrep
 from hyperion.hyp_defs import config_logger
diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py
index e8adfd16..2f4e5cbc 100755
--- a/hyperion/bin/preprocess_audio_files.py
+++ b/hyperion/bin/preprocess_audio_files.py
@@ -10,14 +10,15 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+from scipy import ndimage, signal
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import AudioWriter as Writer
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
-from scipy import ndimage, signal
 
 
 def process_vad(vad, length, fs, dilation, erosion):
diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py
index 8930b299..26fcf72c 100755
--- a/hyperion/bin/train_wav2rnn_transducer.py
+++ b/hyperion/bin/train_wav2rnn_transducer.py
@@ -12,6 +12,9 @@ import k2
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -20,8 +23,6 @@ from hyperion.torch.models import Wav2RNNRNNTransducer
 from hyperion.torch.trainers import TransducerTrainer as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 from torch.nn.utils.rnn import pad_sequence
 
 model_dict = {
diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py
index 7018c406..5daffb6d 100755
--- a/hyperion/bin/train_wav2vec2rnn_transducer.py
+++ b/hyperion/bin/train_wav2vec2rnn_transducer.py
@@ -12,6 +12,9 @@ import k2
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -22,8 +25,6 @@ HFWav2Vec2RNNTransducer)
 from hyperion.torch.trainers import TransducerTrainer as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 from torch.nn.utils.rnn import pad_sequence
 
 model_dict = {
diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py
index 55f3b996..ce53be86 100755
--- a/hyperion/bin/train_wav2vec2transducer.py
+++ b/hyperion/bin/train_wav2vec2transducer.py
@@ -12,6 +12,9 @@ import k2
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -21,8 +24,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer
 from hyperion.torch.trainers import TransducerTrainer as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 from torch.nn.utils.rnn import pad_sequence
 
 model_dict = {
diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py
index 8e1653b1..5e7ecafa 100755
--- a/hyperion/bin/train_wav2vec2xvector.py
+++ b/hyperion/bin/train_wav2vec2xvector.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -22,8 +25,6 @@ HFWavLM2ResNet1dXVector)
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 model_dict = {
     "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector,
diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py
index 71bba080..7f4ab0fa 100755
--- a/hyperion/bin/train_xvector_from_feats.py
+++ b/hyperion/bin/train_xvector_from_feats.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -25,8 +28,6 @@ from hyperion.torch.models import TransformerXVectorV1 as TFXVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 xvec_dict = {
     "resnet": RXVec,
diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py
index a979b56b..5c999dd1 100755
--- a/hyperion/bin/train_xvector_from_wav.py
+++ b/hyperion/bin/train_xvector_from_wav.py
@@ -10,10 +10,10 @@ import time
 from pathlib import Path
 
-import torch
 from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
                           namespace_to_dict)
 
+import torch
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch.data import AudioDataset as AD
 from hyperion.torch.data import SegSamplerFactory
diff --git a/hyperion/bin_deprec/ark2hyp.py b/hyperion/bin_deprec/ark2hyp.py
index a25c561b..abcb4457 100755
--- a/hyperion/bin_deprec/ark2hyp.py
+++ b/hyperion/bin_deprec/ark2hyp.py
@@ -13,6 +13,7 @@ import time
 
 import numpy as np
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import HypDataWriter, KaldiDataReader
diff --git a/hyperion/bin_deprec/arkvad2nist.py b/hyperion/bin_deprec/arkvad2nist.py
index 15a04f67..559371be 100755
--- a/hyperion/bin_deprec/arkvad2nist.py
+++ b/hyperion/bin_deprec/arkvad2nist.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.io import KaldiDataReader
diff --git a/hyperion/bin_deprec/compute-gmm-post.py b/hyperion/bin_deprec/compute-gmm-post.py
index 45d17623..58675336 100755
--- a/hyperion/bin_deprec/compute-gmm-post.py
+++ b/hyperion/bin_deprec/compute-gmm-post.py
@@ -14,12 +14,13 @@ import time
 
 import numpy as np
+from keras import backend as K
+
 from hyperion.helpers import SequenceReader as SR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter
 from hyperion.pdfs import DiagGMM
 from hyperion.transforms import TransformList
-from keras import backend as K
 
 
 def to_sparse(r, num_comp):
diff --git a/hyperion/bin_deprec/eval-2class-performance.py b/hyperion/bin_deprec/eval-2class-performance.py
index d149deb2..eff16830 100755
--- a/hyperion/bin_deprec/eval-2class-performance.py
+++ b/hyperion/bin_deprec/eval-2class-performance.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.hyp_defs import config_logger
 from hyperion.metrics import compute_eer
 from hyperion.utils.trial_key import TrialKey
diff --git a/hyperion/bin_deprec/eval-elbo-ubm.py b/hyperion/bin_deprec/eval-elbo-ubm.py
index 5f2eab28..bf4839db 100755
--- a/hyperion/bin_deprec/eval-elbo-ubm.py
+++ b/hyperion/bin_deprec/eval-elbo-ubm.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.helpers import SequenceReader as SR
 from hyperion.hyp_defs import config_logger, float_cpu
 from hyperion.pdfs import DiagGMM
diff --git a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py
index 7817b570..4548e49b 100755
--- a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py
+++ b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.classifiers import QScoringHomoGBE as GBE
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger
diff --git a/hyperion/bin_deprec/eval-score-norm.py b/hyperion/bin_deprec/eval-score-norm.py
index 4f66a8e4..4b620518 100755
--- a/hyperion/bin_deprec/eval-score-norm.py
+++ b/hyperion/bin_deprec/eval-score-norm.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.hyp_defs import config_logger
 from hyperion.score_norm import *
 from hyperion.utils.trial_ndx import TrialNdx
diff --git a/hyperion/bin_deprec/h5vad2nist.py b/hyperion/bin_deprec/h5vad2nist.py
index 21d61d3a..fb45c22b 100755
--- a/hyperion/bin_deprec/h5vad2nist.py
+++ b/hyperion/bin_deprec/h5vad2nist.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import HypDataReader
diff --git a/hyperion/bin_deprec/init-ubm.py b/hyperion/bin_deprec/init-ubm.py
index 77aed464..204ca855 100755
--- a/hyperion/bin_deprec/init-ubm.py
+++ b/hyperion/bin_deprec/init-ubm.py
@@ -15,11 +15,12 @@ import time
 
 import numpy as np
+from keras import backend as K
+
 from hyperion.helpers import SequenceReader as SR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.pdfs import DiagGMM
 from hyperion.utils.multithreading import threadsafe_generator
-from keras import backend as K
 
 
 @threadsafe_generator
diff --git a/hyperion/bin_deprec/scores2lre_format.py b/hyperion/bin_deprec/scores2lre_format.py
index fcba8804..717c1535 100755
--- a/hyperion/bin_deprec/scores2lre_format.py
+++ b/hyperion/bin_deprec/scores2lre_format.py
@@ -12,6 +12,7 @@ import time
 
 import numpy as np
+
 from hyperion.hyp_defs import config_logger
 from hyperion.utils.trial_ndx import TrialNdx
 from hyperion.utils.trial_scores import TrialScores
diff --git a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py
index 5c1b19fc..608a5271 100755
--- a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-dvae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py
index 326175ab..a4cc54e6 100755
--- a/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-conformer-enc-v1-vq-vae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-dc1d-dvae.py b/hyperion/bin_deprec/torch-train-dc1d-dvae.py
index 7a4f9634..1b88beba 100755
--- a/hyperion/bin_deprec/torch-train-dc1d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-dc1d-dvae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-dc1d-vae.py b/hyperion/bin_deprec/torch-train-dc1d-vae.py
index 1de4560a..dd5d2e72 100755
--- a/hyperion/bin_deprec/torch-train-dc1d-vae.py
+++ b/hyperion/bin_deprec/torch-train-dc1d-vae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-dc2d-dvae.py b/hyperion/bin_deprec/torch-train-dc2d-dvae.py
index 5bbc53bf..3f7cb17d 100755
--- a/hyperion/bin_deprec/torch-train-dc2d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-dc2d-dvae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-dc2d-vae.py b/hyperion/bin_deprec/torch-train-dc2d-vae.py
index b073c4c0..5b97f55c 100755
--- a/hyperion/bin_deprec/torch-train-dc2d-vae.py
+++ b/hyperion/bin_deprec/torch-train-dc2d-vae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-dvae.py b/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
index c10c6fe7..ca6f6996 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-dvae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vae.py b/hyperion/bin_deprec/torch-train-resnet1d-vae.py
index cf460a0a..a6218567 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py b/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
index a1b13d95..89448754 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vq-dvae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py b/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
index a773d9aa..4a84bbff 100755
--- a/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet1d-vq-vae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-dvae.py b/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
index a3857701..3f6cd6ba 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-dvae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vae.py b/hyperion/bin_deprec/torch-train-resnet2d-vae.py
index 695472cb..4e853230 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vae.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -21,8 +24,6 @@ from hyperion.torch.narchs import ResNet2dEncoder as Encoder
 from hyperion.torch.trainers import VAETrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py b/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
index fdcc0c47..5e0add50 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vq-dvae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py b/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
index 17d4c474..6398d959 100755
--- a/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-resnet2d-vq-vae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
index ff8ef4dc..0137e101 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-dvae.py
@@ -11,6 +11,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
index 92dad725..71021825 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
index 18888706..a6908c4f 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-dvae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
index 566ea106..b3b07682 100755
--- a/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
+++ b/hyperion/bin_deprec/torch-train-transformer-enc-v1-vq-vae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec/torch-train-xvector.py b/hyperion/bin_deprec/torch-train-xvector.py
index 88147d37..4c69eb25 100755
--- a/hyperion/bin_deprec/torch-train-xvector.py
+++ b/hyperion/bin_deprec/torch-train-xvector.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.torch.data import ClassWeightedSeqSampler as Sampler
diff --git a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
index 48967f58..8a348728 100755
--- a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
+++ b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.classifiers import QScoringHomoGBE as GBE
 from hyperion.helpers import VectorClassReader as VCR
 from hyperion.hyp_defs import config_logger
diff --git a/hyperion/bin_deprec/vectors2scores.py b/hyperion/bin_deprec/vectors2scores.py
index 2ff635c2..ab4be8ac 100755
--- a/hyperion/bin_deprec/vectors2scores.py
+++ b/hyperion/bin_deprec/vectors2scores.py
@@ -11,6 +11,7 @@ import time
 
 import numpy as np
+
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.utils.trial_scores import TrialScores
diff --git a/hyperion/bin_deprec2/apply-mvn-select-frames.py b/hyperion/bin_deprec2/apply-mvn-select-frames.py
index 53a01d6d..a2456dc9 100755
--- a/hyperion/bin_deprec2/apply-mvn-select-frames.py
+++ b/hyperion/bin_deprec2/apply-mvn-select-frames.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import RandomAccessDataReaderFactory as RDRF
@@ -18,8 +21,6 @@ from hyperion.np.feats import MeanVarianceNorm as MVN
 from hyperion.utils import Utt2Info
 from hyperion.utils.kaldi_matrix import compression_methods
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def process_feats(
diff --git a/hyperion/bin_deprec2/compute-mfcc-feats.py b/hyperion/bin_deprec2/compute-mfcc-feats.py
index c8193e5c..a83f95d1 100755
--- a/hyperion/bin_deprec2/compute-mfcc-feats.py
+++ b/hyperion/bin_deprec2/compute-mfcc-feats.py
@@ -9,14 +9,15 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import compression_methods
 from hyperion.np.feats import MFCC
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def compute_mfcc_feats(
diff --git a/hyperion/bin_deprec2/copy-feats.py b/hyperion/bin_deprec2/copy-feats.py
index 4549caec..0385cc55 100755
--- a/hyperion/bin_deprec2/copy-feats.py
+++ b/hyperion/bin_deprec2/copy-feats.py
@@ -12,6 +12,7 @@ import time
 
 import numpy as np
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import CopyFeats as CF
diff --git a/hyperion/bin_deprec2/eval-cos-1vs1.py b/hyperion/bin_deprec2/eval-cos-1vs1.py
index f60fdd4b..de508333 100755
--- a/hyperion/bin_deprec2/eval-cos-1vs1.py
+++ b/hyperion/bin_deprec2/eval-cos-1vs1.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.helpers import TrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.np.transforms import LNorm, TransformList
diff --git a/hyperion/bin_deprec2/eval-linear-gbe-up.py b/hyperion/bin_deprec2/eval-linear-gbe-up.py
index ba646498..d82bf967 100755
--- a/hyperion/bin_deprec2/eval-linear-gbe-up.py
+++ b/hyperion/bin_deprec2/eval-linear-gbe-up.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter as HDW
diff --git a/hyperion/bin_deprec2/eval-linear-gbe.py b/hyperion/bin_deprec2/eval-linear-gbe.py
index 9828944d..cf788392 100755
--- a/hyperion/bin_deprec2/eval-linear-gbe.py
+++ b/hyperion/bin_deprec2/eval-linear-gbe.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter as HDW
diff --git a/hyperion/bin_deprec2/eval-linear-svmc.py b/hyperion/bin_deprec2/eval-linear-svmc.py
index 3b8b644b..ba4c5e81 100755
--- a/hyperion/bin_deprec2/eval-linear-svmc.py
+++ b/hyperion/bin_deprec2/eval-linear-svmc.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter as HDW
diff --git a/hyperion/bin_deprec2/eval-logistic-regression.py b/hyperion/bin_deprec2/eval-logistic-regression.py
index 56507a9a..992ca7b8 100755
--- a/hyperion/bin_deprec2/eval-logistic-regression.py
+++ b/hyperion/bin_deprec2/eval-logistic-regression.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.helpers import ClassifTrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import HypDataWriter as HDW
diff --git a/hyperion/bin_deprec2/eval-plda-1vs1.py b/hyperion/bin_deprec2/eval-plda-1vs1.py
index 1a966f57..5a810cf7 100755
--- a/hyperion/bin_deprec2/eval-plda-1vs1.py
+++ b/hyperion/bin_deprec2/eval-plda-1vs1.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.helpers import PLDAFactory as F
 from hyperion.helpers import TrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
diff --git a/hyperion/bin_deprec2/eval-plda-nvs1.py b/hyperion/bin_deprec2/eval-plda-nvs1.py
index 5ead954a..5c5d200c 100755
--- a/hyperion/bin_deprec2/eval-plda-nvs1.py
+++ b/hyperion/bin_deprec2/eval-plda-nvs1.py
@@ -14,6 +14,7 @@ import time
 
 import numpy as np
+
 from hyperion.helpers import PLDAFactory as F
 from hyperion.helpers import TrialDataReader as TDR
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
diff --git a/hyperion/bin_deprec2/merge-h5-files.py b/hyperion/bin_deprec2/merge-h5-files.py
index 51207343..aeda3bab 100755
--- a/hyperion/bin_deprec2/merge-h5-files.py
+++ b/hyperion/bin_deprec2/merge-h5-files.py
@@ -12,6 +12,7 @@ import time
 
 import numpy as np
+
 from hyperion.io import H5Merger
diff --git a/hyperion/bin_deprec2/pack-audio-files.py b/hyperion/bin_deprec2/pack-audio-files.py
index a843825a..5d544df4 100755
--- a/hyperion/bin_deprec2/pack-audio-files.py
+++ b/hyperion/bin_deprec2/pack-audio-files.py
@@ -11,12 +11,13 @@ import time
 
 import numpy as np
+from scipy import ndimage, signal
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import PackedAudioWriter as Writer
 from hyperion.io import SequentialAudioReader as AR
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.io import WSpecifier as WS
-from scipy import ndimage, signal
 
 
 def process_vad(vad, length, fs, dilation, erosion):
diff --git a/hyperion/bin_deprec2/plot-vector-hist.py b/hyperion/bin_deprec2/plot-vector-hist.py
index a4d842c0..75236726 100755
--- a/hyperion/bin_deprec2/plot-vector-hist.py
+++ b/hyperion/bin_deprec2/plot-vector-hist.py
@@ -15,6 +15,7 @@ matplotlib.use("Agg")
 import matplotlib.pyplot as plt
+
 from hyperion.helpers import VectorReader as VR
 from hyperion.hyp_defs import config_logger
 from hyperion.np.transforms import TransformList
diff --git a/hyperion/bin_deprec2/rttm-to-bin-vad.py b/hyperion/bin_deprec2/rttm-to-bin-vad.py
index 610a0019..19e98d8f 100755
--- a/hyperion/bin_deprec2/rttm-to-bin-vad.py
+++ b/hyperion/bin_deprec2/rttm-to-bin-vad.py
@@ -11,6 +11,7 @@ import numpy as np
 import pandas as pd
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.utils import RTTM, SegmentList
diff --git a/hyperion/bin_deprec2/segments-to-bin-vad.py b/hyperion/bin_deprec2/segments-to-bin-vad.py
index 56e6bf9f..24021a4b 100755
--- a/hyperion/bin_deprec2/segments-to-bin-vad.py
+++ b/hyperion/bin_deprec2/segments-to-bin-vad.py
@@ -10,11 +10,12 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.utils import SegmentList
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def segments_to_bin_vad(
diff --git a/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
index 9dde434d..ad33515c 100755
--- a/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-adv-finetune-xvec-from-wav.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -23,8 +26,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-adv-finetune-xvec.py b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
index 88d21cdb..850233e2 100755
--- a/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
+++ b/hyperion/bin_deprec2/torch-adv-finetune-xvec.py
@@ -12,6 +12,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, set_float_cpu
 from hyperion.torch import TorchModelLoader as TML
@@ -22,8 +25,6 @@ from hyperion.torch.models import XVector as XVec
 from hyperion.torch.trainers import XVectorAdvTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec2/torch-compute-mfcc-feats.py b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
index 17565a3c..07f71bfb 100755
--- a/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
+++ b/hyperion/bin_deprec2/torch-compute-mfcc-feats.py
@@ -8,6 +8,9 @@ import sys
 import time
 
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
@@ -15,8 +18,6 @@ from hyperion.io import SequentialDataReaderFactory as DRF
 from hyperion.io import compression_methods
 from hyperion.torch.layers import AudioFeatsFactory as AFF
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def compute_mfcc_feats(
diff --git a/hyperion/bin_deprec2/torch-eval-vae.py b/hyperion/bin_deprec2/torch-eval-vae.py
index bf99dddd..d676b0f1 100755
--- a/hyperion/bin_deprec2/torch-eval-vae.py
+++ b/hyperion/bin_deprec2/torch-eval-vae.py
@@ -16,6 +16,7 @@ matplotlib.use("Agg")
 # matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
 import matplotlib.pyplot as plt
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
index 1c00ed2a..aaa91214 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py
@@ -12,6 +12,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -29,8 +32,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 torch.backends.cudnn.enabled = False
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
index 27d36d6f..437127b2 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py
@@ -10,6 +10,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -26,8 +29,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
index f9b77f11..8d4add76 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-art-test-wav.py
@@ -11,9 +11,12 @@ import numpy as np
 import pandas as pd
+from art.classifiers import PyTorchClassifier
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
-from art.classifiers import PyTorchClassifier
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -29,8 +32,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
index 9f6801ef..0e9493c0 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-test-wav.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -24,8 +27,6 @@ from hyperion.torch.utils.misc import l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
index 6fdca983..e0754498 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py
@@ -10,6 +10,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
@@ -26,8 +29,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
index 7ef4815c..0f9f375d 100755
--- a/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
+++ b/hyperion/bin_deprec2/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py
@@ -11,9 +11,12 @@ import numpy as np
 import pandas as pd
+from art.classifiers import PyTorchClassifier
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
-from art.classifiers import PyTorchClassifier
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import AudioWriter as AW
 from hyperion.io import RandomAccessAudioReader as AR
@@ -29,8 +32,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm
 from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info
 from hyperion.utils.list_utils import ismember
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 class MyModel(nn.Module):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
index a9785a61..fc494448 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-from-wav-with-rttm.py
@@ -11,6 +11,9 @@ import numpy as np
 import pandas as pd
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -21,8 +24,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import RTTM, Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
index 7453e0ba..c85fe4c9 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin-from-wav.py
@@ -11,8 +11,11 @@ import numpy as np
 import pandas as pd
-import torch
 import yaml
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
+import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
 from hyperion.io import SequentialAudioReader as AR
@@ -22,8 +25,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
index 3153b312..6da57e16 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-slidwin.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -19,8 +22,6 @@ from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
index 347c80f8..6edf60ed 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors-vae-preproc.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -19,8 +22,6 @@ from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-extract-xvectors.py b/hyperion/bin_deprec2/torch-extract-xvectors.py
index 83d21692..76d941e0 100755
--- a/hyperion/bin_deprec2/torch-extract-xvectors.py
+++ b/hyperion/bin_deprec2/torch-extract-xvectors.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu
 from hyperion.io import DataWriterFactory as DWF
@@ -19,8 +22,6 @@ from hyperion.torch import TorchModelLoader as TML
 from hyperion.torch.utils import open_device
 from hyperion.utils import Utt2Info
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_device(use_gpu):
diff --git a/hyperion/bin_deprec2/torch-train-dc1d-ae.py b/hyperion/bin_deprec2/torch-train-dc1d-ae.py
index cdba46b3..50ac7d42 100755
--- a/hyperion/bin_deprec2/torch-train-dc1d-ae.py
+++ b/hyperion/bin_deprec2/torch-train-dc1d-ae.py
@@ -10,6 +10,7 @@ import time
 
 import numpy as np
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/bin_deprec2/torch-train-dvae.py b/hyperion/bin_deprec2/torch-train-dvae.py
index 6c21bbcf..808bfbba 100755
--- a/hyperion/bin_deprec2/torch-train-dvae.py
+++ b/hyperion/bin_deprec2/torch-train-dvae.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -24,8 +27,6 @@ TransformerEncoderV1)
 from hyperion.torch.trainers import DVAETrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 enc_dict = {
     "dc1d": DC1dEncoder,
diff --git a/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
index 9db86225..f256f735 100755
--- a/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-efficientnet-xvec-from-wav.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -20,8 +23,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
index 124e9cb3..622ac62e 100755
--- a/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
+++ b/hyperion/bin_deprec2/torch-train-efficientnet-xvec.py
@@ -12,6 +12,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -21,8 +24,6 @@ from hyperion.torch.models import EfficientNetXVector as XVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
index 6b9455df..3d135b18 100755
--- a/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-resnet-xvec-from-wav.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -25,8 +28,6 @@
 # from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 # import torch.multiprocessing as mp
diff --git a/hyperion/bin_deprec2/torch-train-resnet-xvec.py b/hyperion/bin_deprec2/torch-train-resnet-xvec.py
index f035032a..f976cc6e 100755
--- a/hyperion/bin_deprec2/torch-train-resnet-xvec.py
+++ b/hyperion/bin_deprec2/torch-train-resnet-xvec.py
@@ -12,6 +12,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -21,8 +24,6 @@ from hyperion.torch.models import ResNetXVector as XVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
index 616e2cd3..3ee6bf18 100755
--- a/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-resnet1d-xvec-from-wav.py
@@ -11,6 +11,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -21,8 +24,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
index f579a807..0857ce5c 100755
--- a/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-spinenet-xvec-from-wav.py
@@ -12,6 +12,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -22,8 +25,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
index 486b1d92..7bbbff03 100755
--- a/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-tdnn-xvec-from-wav.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -20,8 +23,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(
diff --git a/hyperion/bin_deprec2/torch-train-tdnn-xvec.py b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py
index be429344..5614f1b9 100755
--- a/hyperion/bin_deprec2/torch-train-tdnn-xvec.py
+++ b/hyperion/bin_deprec2/torch-train-tdnn-xvec.py
@@ -12,6 +12,9 @@ from pathlib import Path
 
 import numpy as np
+from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
+                          namespace_to_dict)
+
 import torch
 import torch.nn as nn
 from hyperion.hyp_defs import config_logger, set_float_cpu
@@ -21,8 +24,6 @@ from hyperion.torch.models import TDNNXVector as XVec
 from hyperion.torch.trainers import XVectorTrainer as Trainer
 from hyperion.torch.utils import ddp, open_device
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
 
 
 def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs):
diff --git a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py
index 3e91da90..6b361583 100755
--- a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py
+++ b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1-from-wav.py
@@ -10,6 +10,9 @@ import time
 
 import numpy as np
+from jsonargparse import (ActionConfigFile,
ActionParser, ArgumentParser, + namespace_to_dict) + import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -20,8 +23,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data( diff --git a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py index d08a58a3..62164f15 100755 --- a/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py +++ b/hyperion/bin_deprec2/torch-train-transformer-xvec-v1.py @@ -12,6 +12,9 @@ from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -21,8 +24,6 @@ from hyperion.torch.models import TransformerXVectorV1 as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin_deprec2/torch-train-vae.py b/hyperion/bin_deprec2/torch-train-vae.py index 6f545795..4c41d49c 100755 --- a/hyperion/bin_deprec2/torch-train-vae.py +++ b/hyperion/bin_deprec2/torch-train-vae.py @@ -11,6 +11,9 @@ from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,8 +27,6 @@ TransformerEncoderV1) from hyperion.torch.trainers import VAETrainer as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/torch-train-vq-dvae.py b/hyperion/bin_deprec2/torch-train-vq-dvae.py index 449c3b49..5de1bbd4 100755 --- a/hyperion/bin_deprec2/torch-train-vq-dvae.py +++ b/hyperion/bin_deprec2/torch-train-vq-dvae.py @@ -11,6 +11,9 @@ from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,8 +27,6 @@ TransformerEncoderV1) from hyperion.torch.trainers import VQDVAETrainer as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/torch-train-vq-vae.py b/hyperion/bin_deprec2/torch-train-vq-vae.py index 17dea6aa..2a95f853 100755 --- a/hyperion/bin_deprec2/torch-train-vq-vae.py +++ b/hyperion/bin_deprec2/torch-train-vq-vae.py @@ -11,6 +11,9 @@ from pathlib import Path import numpy as np +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) + import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,8 +27,6 @@ TransformerEncoderV1) from hyperion.torch.trainers import VQVAETrainer as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import 
(ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) enc_dict = { "dc1d": DC1dEncoder, diff --git a/hyperion/bin_deprec2/train-cw-up.py b/hyperion/bin_deprec2/train-cw-up.py index bab22ce7..c1c372ad 100755 --- a/hyperion/bin_deprec2/train-cw-up.py +++ b/hyperion/bin_deprec2/train-cw-up.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal diff --git a/hyperion/bin_deprec2/train-cw.py b/hyperion/bin_deprec2/train-cw.py index e8c693c1..cabca7c2 100755 --- a/hyperion/bin_deprec2/train-cw.py +++ b/hyperion/bin_deprec2/train-cw.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal diff --git a/hyperion/bin_deprec2/train-gaussianizer.py b/hyperion/bin_deprec2/train-gaussianizer.py index 4718d3df..aeb51e46 100755 --- a/hyperion/bin_deprec2/train-gaussianizer.py +++ b/hyperion/bin_deprec2/train-gaussianizer.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal diff --git a/hyperion/bin_deprec2/train-lda.py b/hyperion/bin_deprec2/train-lda.py index 9dfe394f..1887a72f 100755 --- a/hyperion/bin_deprec2/train-lda.py +++ b/hyperion/bin_deprec2/train-lda.py @@ -13,6 +13,7 @@ import time import numpy as np + from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.transforms import LDA, SbSw, TransformList diff --git a/hyperion/bin_deprec2/train-linear-gbe-up.py b/hyperion/bin_deprec2/train-linear-gbe-up.py index 9435d0ad..9986b6bc 100755 --- a/hyperion/bin_deprec2/train-linear-gbe-up.py +++ b/hyperion/bin_deprec2/train-linear-gbe-up.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LinearGBEUP as GBE diff --git a/hyperion/bin_deprec2/train-linear-gbe.py b/hyperion/bin_deprec2/train-linear-gbe.py index 75fe0b67..e9455cb8 100755 --- a/hyperion/bin_deprec2/train-linear-gbe.py +++ b/hyperion/bin_deprec2/train-linear-gbe.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LinearGBE as GBE diff --git a/hyperion/bin_deprec2/train-linear-svmc.py b/hyperion/bin_deprec2/train-linear-svmc.py index f48a573e..90ff8768 100755 --- a/hyperion/bin_deprec2/train-linear-svmc.py +++ b/hyperion/bin_deprec2/train-linear-svmc.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LinearSVMC as SVM diff --git a/hyperion/bin_deprec2/train-logistic-regression.py b/hyperion/bin_deprec2/train-logistic-regression.py index f7036879..1aa128a3 100755 --- a/hyperion/bin_deprec2/train-logistic-regression.py +++ b/hyperion/bin_deprec2/train-logistic-regression.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.classifiers import LogisticRegression as LR diff --git a/hyperion/bin_deprec2/train-mvn.py b/hyperion/bin_deprec2/train-mvn.py index 
ff03175b..2d10b116 100755 --- a/hyperion/bin_deprec2/train-mvn.py +++ b/hyperion/bin_deprec2/train-mvn.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.pdfs.core import Normal diff --git a/hyperion/bin_deprec2/train-nda.py b/hyperion/bin_deprec2/train-nda.py index ec73db2a..946a8baa 100755 --- a/hyperion/bin_deprec2/train-nda.py +++ b/hyperion/bin_deprec2/train-nda.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger from hyperion.np.transforms import NDA, NSbSw, TransformList diff --git a/hyperion/bin_deprec2/train-pca.py b/hyperion/bin_deprec2/train-pca.py index 9d9ae7a9..25dcb366 100755 --- a/hyperion/bin_deprec2/train-pca.py +++ b/hyperion/bin_deprec2/train-pca.py @@ -13,6 +13,7 @@ import time import numpy as np + from hyperion.helpers import VectorReader as VR from hyperion.hyp_defs import config_logger from hyperion.np.transforms import PCA, TransformList diff --git a/hyperion/bin_deprec2/train-plda.py b/hyperion/bin_deprec2/train-plda.py index f8d24366..520f4cd7 100755 --- a/hyperion/bin_deprec2/train-plda.py +++ b/hyperion/bin_deprec2/train-plda.py @@ -14,6 +14,7 @@ import time import numpy as np + from hyperion.helpers import PLDAFactory as F from hyperion.helpers import VectorClassReader as VCR from hyperion.hyp_defs import config_logger diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index bb91e3a5..d9f6b238 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -2,10 +2,12 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionYesNo -from pathlib import Path from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +from jsonargparse import ActionYesNo from tqdm import tqdm + from ..utils import PathLike @@ -63,9 +65,10 @@ def _get_recording_duration(scp, i, n): def get_recording_duration(self, recording_set): - from ..utils import SCPList import itertools + from ..utils import SCPList + scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] with ThreadPoolExecutor(max_workers=self.num_threads) as pool: diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index 5c04f86c..d8b9dd99 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -2,18 +2,18 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionYesNo -from pathlib import Path -import re import logging -from tqdm import tqdm +import re from concurrent.futures import ThreadPoolExecutor +from pathlib import Path -import pandas as pd import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm -from ..utils.misc import urlretrieve_progress, PathLike -from ..utils import RecordingSet, SegmentSet, ClassInfo, Dataset +from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress from .data_prep import DataPrep diff --git a/hyperion/torch/adv_attacks/art_attack_factory.py b/hyperion/torch/adv_attacks/art_attack_factory.py index 801ba948..678470f5 100644 --- a/hyperion/torch/adv_attacks/art_attack_factory.py +++ 
b/hyperion/torch/adv_attacks/art_attack_factory.py @@ -4,7 +4,7 @@ """ import numpy as np -from jsonargparse import ActionParser, ArgumentParser, ActionYesNo +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser try: from art.attacks import evasion as attacks diff --git a/hyperion/torch/adv_attacks/random_attack_factory.py b/hyperion/torch/adv_attacks/random_attack_factory.py index 133e5e2b..a91c99ac 100644 --- a/hyperion/torch/adv_attacks/random_attack_factory.py +++ b/hyperion/torch/adv_attacks/random_attack_factory.py @@ -5,9 +5,10 @@ import math -import torch from jsonargparse import ActionParser, ArgumentParser +import torch + from .attack_factory import AttackFactory as AF diff --git a/hyperion/torch/adv_defenses/wave_gan_white.py b/hyperion/torch/adv_defenses/wave_gan_white.py index af51dc00..5d045f08 100644 --- a/hyperion/torch/adv_defenses/wave_gan_white.py +++ b/hyperion/torch/adv_defenses/wave_gan_white.py @@ -8,9 +8,10 @@ import librosa import numpy as np -import torch import yaml +import torch + try: # import parallel_wavegan.models from parallel_wavegan.layers import PQMF @@ -20,6 +21,7 @@ pass from sklearn.preprocessing import StandardScaler + from torch import nn diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index cc9a3a5e..1e42a1c3 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -11,10 +11,11 @@ import pandas as pd #import k2 import sentencepiece as spm -import torch -import torch.distributed as dist import torchaudio.transforms as tat from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +import torch +import torch.distributed as dist from torch.utils.data import Dataset from ...io import RandomAccessAudioReader as AR diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 78bf89b6..c890627e 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -7,6 +7,7 @@ import math import numpy as np + import torch import torch.distributed as dist diff --git a/hyperion/torch/data/class_weighted_embed_sampler.py b/hyperion/torch/data/class_weighted_embed_sampler.py index aed9105d..edf1c00d 100644 --- a/hyperion/torch/data/class_weighted_embed_sampler.py +++ b/hyperion/torch/data/class_weighted_embed_sampler.py @@ -9,9 +9,10 @@ import numpy as np import pandas as pd -import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +import torch + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index b551f342..7fbfbd71 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -9,9 +9,10 @@ import numpy as np import pandas as pd -import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +import torch + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/embed_dataset.py b/hyperion/torch/data/embed_dataset.py index 31fd00fd..519f498d 100644 --- a/hyperion/torch/data/embed_dataset.py +++ b/hyperion/torch/data/embed_dataset.py @@ -10,9 +10,10 @@ import numpy as np import pandas as pd +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.utils.data import Dataset from ...io import 
RandomAccessDataReaderFactory as RF diff --git a/hyperion/torch/data/embed_sampler.py b/hyperion/torch/data/embed_sampler.py index 8836fe2a..65adcba6 100644 --- a/hyperion/torch/data/embed_sampler.py +++ b/hyperion/torch/data/embed_sampler.py @@ -7,9 +7,10 @@ import math import numpy as np -import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +import torch + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/feat_seq_dataset.py b/hyperion/torch/data/feat_seq_dataset.py index 68dea5c3..bb487dda 100644 --- a/hyperion/torch/data/feat_seq_dataset.py +++ b/hyperion/torch/data/feat_seq_dataset.py @@ -12,9 +12,10 @@ import numpy as np import pandas as pd +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.utils.data import Dataset from ...io import RandomAccessDataReaderFactory as RF diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py index c5097723..d1bcb0a8 100644 --- a/hyperion/torch/data/hyp_sampler.py +++ b/hyperion/torch/data/hyp_sampler.py @@ -2,9 +2,10 @@ import math import numpy as np +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse import ActionParser, ArgumentParser from torch.utils.data import Sampler diff --git a/hyperion/torch/data/paired_feat_seq_dataset.py b/hyperion/torch/data/paired_feat_seq_dataset.py index fc17593e..eff2ed58 100644 --- a/hyperion/torch/data/paired_feat_seq_dataset.py +++ b/hyperion/torch/data/paired_feat_seq_dataset.py @@ -6,6 +6,7 @@ import logging import numpy as np + import torch from ...utils.utt2info import Utt2Info diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py index 76054cd8..2933dcc6 100644 --- a/hyperion/torch/data/seg_chunk_sampler.py +++ b/hyperion/torch/data/seg_chunk_sampler.py @@ -8,9 +8,10 @@ import numpy as np import pandas as pd +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse import ActionParser, ArgumentParser from ...utils.segment_set import SegmentSet from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 74726f63..39d1eed2 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -7,9 +7,10 @@ import math import numpy as np -import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +import torch + from .hyp_sampler import HypSampler diff --git a/hyperion/torch/data/weighted_embed_sampler.py b/hyperion/torch/data/weighted_embed_sampler.py index 22da93f9..5870512a 100644 --- a/hyperion/torch/data/weighted_embed_sampler.py +++ b/hyperion/torch/data/weighted_embed_sampler.py @@ -7,6 +7,7 @@ import math import numpy as np + import torch from torch.utils.data import Sampler diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py index 345c2429..b6f0b670 100644 --- a/hyperion/torch/data/weighted_seq_sampler.py +++ b/hyperion/torch/data/weighted_seq_sampler.py @@ -7,9 +7,10 @@ import math import numpy as np +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.distributed as dist -from jsonargparse import ActionParser, ArgumentParser from torch.utils.data import Sampler diff --git a/hyperion/torch/layer_blocks/__init__.py 
b/hyperion/torch/layer_blocks/__init__.py index 0487ae4f..22cc629d 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -31,7 +31,8 @@ from .spine_blocks import BlockSpec, SpineConv, SpineEndpoints, SpineResample from .tdnn_blocks import TDNNBlock from .transducer_joiner import TransducerJoiner -from .transducer_predictor import TransducerRNNPredictor, TransducerConvPredictor +from .transducer_predictor import (TransducerConvPredictor, + TransducerRNNPredictor) from .transformer_conv2d_subsampler import TransformerConv2dSubsampler from .transformer_encoder_v1 import TransformerEncoderBlockV1 from .transformer_feedforward import (Conv1dLinear, Conv1dx2, diff --git a/hyperion/torch/layer_blocks/etdnn_blocks.py b/hyperion/torch/layer_blocks/etdnn_blocks.py index 10fd09b3..b6afdd29 100644 --- a/hyperion/torch/layer_blocks/etdnn_blocks.py +++ b/hyperion/torch/layer_blocks/etdnn_blocks.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear diff --git a/hyperion/torch/layer_blocks/resetdnn_blocks.py b/hyperion/torch/layer_blocks/resetdnn_blocks.py index 1af632fb..dfea3720 100644 --- a/hyperion/torch/layer_blocks/resetdnn_blocks.py +++ b/hyperion/torch/layer_blocks/resetdnn_blocks.py @@ -5,6 +5,7 @@ # import numpy as np + import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear diff --git a/hyperion/torch/layer_blocks/transducer_joiner.py b/hyperion/torch/layer_blocks/transducer_joiner.py index 738c0cda..d2a7310d 100644 --- a/hyperion/torch/layer_blocks/transducer_joiner.py +++ b/hyperion/torch/layer_blocks/transducer_joiner.py @@ -5,9 +5,10 @@ import logging from typing import Optional, Tuple +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser class TransducerJoiner(nn.Module): diff --git a/hyperion/torch/layer_blocks/transducer_predictor.py b/hyperion/torch/layer_blocks/transducer_predictor.py index 6f43343a..256753c5 100644 --- a/hyperion/torch/layer_blocks/transducer_predictor.py +++ b/hyperion/torch/layer_blocks/transducer_predictor.py @@ -5,9 +5,10 @@ import logging from typing import Optional, Tuple +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ...utils.misc import filter_func_args from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index b2aa1692..6b508b0e 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -17,6 +17,7 @@ from .mvn import MeanVarianceNorm from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory from .pool_factory import GlobalPool1dFactory -from .pos_encoder import NoPosEncoder, PosEncoder, RelPosEncoder, ConvPosEncoder +from .pos_encoder import (ConvPosEncoder, NoPosEncoder, PosEncoder, + RelPosEncoder) from .spec_augment import AxisMasker, SpecAugment, SpecWarper from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d diff --git a/hyperion/torch/layers/activation_factory.py b/hyperion/torch/layers/activation_factory.py index 9d972f95..f2467962 100644 --- a/hyperion/torch/layers/activation_factory.py +++ b/hyperion/torch/layers/activation_factory.py @@ -6,7 +6,7 @@ import torch.nn as nn -from .swish import Swish, DoubleSwish, Swish6, DoubleSwish6 +from .swish 
import DoubleSwish, DoubleSwish6, Swish, Swish6 act_dict = { "elu": nn.ELU, diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 85ba92f6..5e38494f 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -6,6 +6,7 @@ import math import numpy as np + import torch import torch.nn as nn import torch.nn.functional as nnf diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py index b90a65c8..4b4c5927 100644 --- a/hyperion/torch/layers/mvn.py +++ b/hyperion/torch/layers/mvn.py @@ -2,9 +2,10 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser class MeanVarianceNorm(nn.Module): diff --git a/hyperion/torch/layers/pool_factory.py b/hyperion/torch/layers/pool_factory.py index c0e573af..84d0cbf1 100644 --- a/hyperion/torch/layers/pool_factory.py +++ b/hyperion/torch/layers/pool_factory.py @@ -2,9 +2,10 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser +import torch.nn as nn + from .global_pool import * diff --git a/hyperion/torch/layers/spec_augment.py b/hyperion/torch/layers/spec_augment.py index 761a4e31..f4e03842 100644 --- a/hyperion/torch/layers/spec_augment.py +++ b/hyperion/torch/layers/spec_augment.py @@ -4,10 +4,11 @@ """ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn import torch.nn.functional as nnf -from jsonargparse import ActionParser, ArgumentParser count = 0 diff --git a/hyperion/torch/loggers/logger.py b/hyperion/torch/loggers/logger.py index 46c1130d..7e9c91f2 100644 --- a/hyperion/torch/loggers/logger.py +++ b/hyperion/torch/loggers/logger.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch.distributed as dist diff --git a/hyperion/torch/loggers/logger_list.py b/hyperion/torch/loggers/logger_list.py index 20ae58ec..0291a01f 100644 --- a/hyperion/torch/loggers/logger_list.py +++ b/hyperion/torch/loggers/logger_list.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch.distributed as dist from .tensorboard_logger import TensorBoardLogger as TBL diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py index ab518ad4..cf003ca7 100644 --- a/hyperion/torch/lr_schedulers/factory.py +++ b/hyperion/torch/lr_schedulers/factory.py @@ -2,9 +2,10 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import torch from jsonargparse import ActionParser, ArgumentParser +import torch + from .cos_lr import AdamCosineLR, CosineLR from .exp_lr import ExponentialLR from .invpow_lr import InvPowLR diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 0ab63adf..06838ddd 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -4,11 +4,11 @@ """ +from .transducer import RNNRNNTransducer, RNNTransducer from .vae.vae import VAE from .vae.vq_vae import VQVAE -from .transducer import RNNTransducer, RNNRNNTransducer -from .wav2transducer import (HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2ConformerV1RNNTransducer, +from .wav2transducer import (HFWav2Vec2ConformerV1RNNTransducer, + 
HFWav2Vec2RNNRNNTransducer, HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer) from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector, HFWavLM2ResNet1dXVector) diff --git a/hyperion/torch/models/transducer/conformer.py b/hyperion/torch/models/transducer/conformer.py index f62621af..511cc178 100644 --- a/hyperion/torch/models/transducer/conformer.py +++ b/hyperion/torch/models/transducer/conformer.py @@ -20,9 +20,10 @@ import warnings from typing import List, Optional, Tuple +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch from hyperion.utils.text import make_pad_mask, subsequent_chunk_mask -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch import Tensor, nn from .transformer import Transformer diff --git a/hyperion/torch/models/transducer/decoder.py b/hyperion/torch/models/transducer/decoder.py index 7f3698d7..484f6f38 100644 --- a/hyperion/torch/models/transducer/decoder.py +++ b/hyperion/torch/models/transducer/decoder.py @@ -17,9 +17,10 @@ import logging from typing import Optional, Tuple +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser # TODO(fangjun): Support switching between LSTM and GRU diff --git a/hyperion/torch/models/transducer/joiner.py b/hyperion/torch/models/transducer/joiner.py index 983f064a..a7c2e35b 100644 --- a/hyperion/torch/models/transducer/joiner.py +++ b/hyperion/torch/models/transducer/joiner.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser class Joiner(nn.Module): diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer.py index c9ba365e..938149ec 100644 --- a/hyperion/torch/models/transducer/transducer.py +++ b/hyperion/torch/models/transducer/transducer.py @@ -26,10 +26,11 @@ import logging -import torch -import torch.nn as nn import torchaudio import torchaudio.functional + +import torch +import torch.nn as nn from hyperion.utils.text import add_sos from ...torch_model import TorchModel diff --git a/hyperion/torch/models/tvector/tvector.py b/hyperion/torch/models/tvector/tvector.py index a4e4d148..a46fc324 100644 --- a/hyperion/torch/models/tvector/tvector.py +++ b/hyperion/torch/models/tvector/tvector.py @@ -5,9 +5,10 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...narchs import ClassifHead, ConformerEncoderV1, TorchNALoader from ..layer_blocks import TDNNBlock diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py index 47dfd910..1d16675c 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py @@ -7,9 +7,10 @@ from dataclasses import dataclass from typing import Dict, List, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...torch_model import TorchModel from ...utils import remove_silence diff --git a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py 
b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py index 4cebfd66..4f1c500d 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2transducer.py @@ -5,9 +5,10 @@ import contextlib import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...torch_model import TorchModel from ...utils import remove_silence diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py index bd58e2cd..123c9de8 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2_transducer.py @@ -5,9 +5,10 @@ import logging from typing import Dict, Optional, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWav2Vec2 from ..transducer import Transducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py index 09b0196e..3b18de3a 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2conformer_v1_rnn_transducer.py @@ -5,9 +5,10 @@ import logging from typing import Dict, Optional, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWav2Vec2 from ..transducer import ConformerV1RNNTransducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py index a4d2b0cc..d9eeaebe 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_rnn_transducer.py @@ -5,9 +5,10 @@ import logging from typing import Dict, Optional, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWav2Vec2 from ..transducer import RNNRNNTransducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py index f4e02a23..fe82f734 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py @@ -5,9 +5,10 @@ import logging from typing import Dict, Optional, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWav2Vec2 from ..transducer import RNNTransducer diff --git a/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py index 458e7cae..4b2f235b 100644 --- a/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py @@ -5,9 +5,10 @@ import logging from typing import Dict, Optional, Tuple, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...narchs import 
AudioFeatsMVN from ...torch_model import TorchModel diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py index fb528809..b75ac53f 100644 --- a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py @@ -5,9 +5,10 @@ import logging from typing import Dict, Optional, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFHubert from ..xvectors import ResNet1dXVector diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py index 739213b4..8a17379c 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py @@ -5,9 +5,10 @@ import logging from typing import Dict, Optional, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWav2Vec2 from ..xvectors import ResNet1dXVector diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 6ff8f8b4..5599fa1e 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -5,9 +5,10 @@ import contextlib import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...torch_model import TorchModel from ...utils import remove_silence diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py index 87e9a6f8..56a19130 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py @@ -5,9 +5,10 @@ import logging from typing import Dict, Optional, Union +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWavLM from ..xvectors import ResNet1dXVector diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index b545bfaf..0d9f1bc4 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -5,9 +5,10 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ..xvectors import ResNet1dXVector from .wav2xvector import Wav2XVector diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py index 51e045da..1f7283a0 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -5,9 +5,10 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ..xvectors import ResNetXVector from .wav2xvector import Wav2XVector diff --git 
a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 838f3342..4c21f478 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -4,9 +4,10 @@ """ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...narchs import AudioFeatsMVN from ...torch_model import TorchModel diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py b/hyperion/torch/models/xvectors/efficient_net_xvector.py index dc864f1c..132bb51d 100644 --- a/hyperion/torch/models/xvectors/efficient_net_xvector.py +++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py @@ -5,9 +5,10 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...narchs import EfficientNet as EN from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py index bd22f1ae..20865880 100644 --- a/hyperion/torch/models/xvectors/resnet1d_xvector.py +++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py @@ -5,9 +5,10 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...narchs import ResNet1dEncoder as Encoder from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index 21c4ce81..0e9eba22 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -5,9 +5,10 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...narchs import ResNetFactory as RNF from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index e2dbbb2c..0b27a840 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -5,9 +5,10 @@ """ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...narchs import SpineNetFactory as SNF from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index af8731d5..38262cc3 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -5,9 +5,10 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...narchs import TDNNFactory as TF from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index 913c9550..25e9c894 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -5,9 +5,10 @@ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from 
...narchs import TransformerEncoderV1 as TE from .xvector import XVector diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index fb5fab46..8556104a 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -6,9 +6,10 @@ from enum import Enum from typing import Optional +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ....utils.misc import filter_func_args from ...layer_blocks import TDNNBlock @@ -17,7 +18,6 @@ from ...torch_model import TorchModel from ...utils import eval_nnet_by_chunks, scale_seq_lengths - # class XVectorTrainMode(Enum): # full = 0 # frozen = 1 diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 8a877d5e..160ee61b 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -2,9 +2,10 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ..layers import AudioFeatsFactory as AFF from ..layers import MeanVarianceNorm as MVN diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index b5008f28..9f9b280b 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -4,9 +4,10 @@ """ +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.nn import Linear from ...utils.misc import filter_func_args diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index ed328223..97cb6d5b 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -3,17 +3,19 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser, ActionYesNo from ...utils.misc import filter_func_args from ..layer_blocks import ConformerEncoderBlockV1 as EBlock from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from ..layers import ActivationFactory as AF +from ..layers import ConvPosEncoder, NoPosEncoder from ..layers import NormLayer1dFactory as NLF -from ..layers import NoPosEncoder, PosEncoder, RelPosEncoder, ConvPosEncoder -from ..utils import seq_lengths_to_mask, scale_seq_lengths +from ..layers import PosEncoder, RelPosEncoder +from ..utils import scale_seq_lengths, seq_lengths_to_mask from .net_arch import NetArch diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index 57d9adec..f5ab74d5 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -5,9 +5,10 @@ import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import DC1dDecBlock from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/narchs/dc1d_encoder.py 
b/hyperion/torch/narchs/dc1d_encoder.py index aaf1bb2d..0c331a5e 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -4,9 +4,10 @@ """ import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks.dc1d_blocks import DC1dEncBlock from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 87a18bfe..4106cbfd 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -5,9 +5,10 @@ import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import DC2dDecBlock from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index 70eeac3c..ce7b9677 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -5,9 +5,10 @@ import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import DC2dEncBlock from ..layers import ActivationFactory as AF diff --git a/hyperion/torch/narchs/efficient_net.py b/hyperion/torch/narchs/efficient_net.py index 1eddc3ff..b9efdcef 100644 --- a/hyperion/torch/narchs/efficient_net.py +++ b/hyperion/torch/narchs/efficient_net.py @@ -5,9 +5,10 @@ import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.nn import Dropout, Linear from ..layer_blocks import MBConvBlock, MBConvInOutBlock diff --git a/hyperion/torch/narchs/etdnn.py b/hyperion/torch/narchs/etdnn.py index d2b2d298..a73439b7 100644 --- a/hyperion/torch/narchs/etdnn.py +++ b/hyperion/torch/narchs/etdnn.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch import torch.nn as nn from torch.nn import Conv1d, Linear diff --git a/hyperion/torch/narchs/net_arch.py b/hyperion/torch/narchs/net_arch.py index 4b39804c..9a3fc65f 100644 --- a/hyperion/torch/narchs/net_arch.py +++ b/hyperion/torch/narchs/net_arch.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch.nn as nn from ..torch_model import TorchModel diff --git a/hyperion/torch/narchs/resetdnn.py b/hyperion/torch/narchs/resetdnn.py index c4dc7784..eb964fa5 100644 --- a/hyperion/torch/narchs/resetdnn.py +++ b/hyperion/torch/narchs/resetdnn.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index bccd0646..858cf4ea 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -5,20 +5,15 @@ import logging import numpy as np + import torch import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import ( - Res2NetBasicBlock, - Res2NetBNBlock, - ResNetBasicBlock, - ResNetBNBlock, - ResNetEndpointBlock, - ResNetInputBlock, - SEResNetBasicBlock, - SEResNetBNBlock, -) +from ..layer_blocks import (Res2NetBasicBlock, Res2NetBNBlock, + ResNetBasicBlock, ResNetBNBlock, + ResNetEndpointBlock, ResNetInputBlock, + 
SEResNetBasicBlock, SEResNetBNBlock) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import scale_seq_lengths, seq_lengths_to_mask diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index d65bab00..0c577174 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -4,9 +4,10 @@ """ import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import (DC1dDecBlock, ResNet1dBasicDecBlock, ResNet1dBNDecBlock, SEResNet1dBasicDecBlock, diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index ab184467..5bdad186 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -7,9 +7,10 @@ import math import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import (DC1dEncBlock, Res2Net1dBasicBlock, Res2Net1dBNBlock, ResNet1dBasicBlock, diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 50369c8d..426b37f5 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -5,9 +5,10 @@ import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import (DC2dDecBlock, ResNet2dBasicDecBlock, ResNet2dBNDecBlock, SEResNet2dBasicDecBlock, diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index 8a76e348..84e6599e 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -6,9 +6,10 @@ import logging import math +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layer_blocks import (DC2dEncBlock, Res2Net2dBasicBlock, Res2Net2dBNBlock, ResNet2dBasicBlock, diff --git a/hyperion/torch/narchs/rnn_encoder.py b/hyperion/torch/narchs/rnn_encoder.py index 0c3b623a..7df33274 100644 --- a/hyperion/torch/narchs/rnn_encoder.py +++ b/hyperion/torch/narchs/rnn_encoder.py @@ -8,9 +8,10 @@ from typing import Dict, Optional, Tuple, Union import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from ...utils.misc import filter_func_args diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index e9c50197..8db6c23a 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -6,12 +6,13 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple -import torch -import torch.nn as nn import torchaudio import torchaudio.functional from jsonargparse import ActionParser, ArgumentParser +import torch +import torch.nn as nn + try: import k2 except ModuleNotFoundError: @@ -19,8 +20,9 @@ from ...utils.misc 
import filter_func_args from ...utils.text import add_sos +from ..layer_blocks import TransducerConvPredictor as ConvPredictor from ..layer_blocks import TransducerJoiner as Joiner -from ..layer_blocks import TransducerRNNPredictor as RNNPredictor, TransducerConvPredictor as ConvPredictor +from ..layer_blocks import TransducerRNNPredictor as RNNPredictor from .net_arch import NetArch diff --git a/hyperion/torch/narchs/spinenet.py b/hyperion/torch/narchs/spinenet.py index da47ffe5..117c0733 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -6,6 +6,7 @@ import logging import numpy as np + import torch import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear diff --git a/hyperion/torch/narchs/tdnn.py b/hyperion/torch/narchs/tdnn.py index 8ac9be79..55e47e6a 100644 --- a/hyperion/torch/narchs/tdnn.py +++ b/hyperion/torch/narchs/tdnn.py @@ -4,6 +4,7 @@ """ import numpy as np + import torch import torch.nn as nn from torch.nn import Linear diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index fd3de235..4468185e 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -3,9 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from ..layer_blocks import TransformerEncoderBlockV1 as EBlock diff --git a/hyperion/torch/optim/factory.py b/hyperion/torch/optim/factory.py index aa1acdc8..95117b05 100644 --- a/hyperion/torch/optim/factory.py +++ b/hyperion/torch/optim/factory.py @@ -4,9 +4,10 @@ """ import logging +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.optim as optim -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_args from .radam import RAdam diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index 659f9dde..b2198924 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -6,11 +6,12 @@ import os from typing import Callable, List, Optional, Tuple, Union -import torch -import torch.nn as nn from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from transformers import HubertConfig, HubertModel +import torch +import torch.nn as nn + from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 76d80aa4..e1f21153 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -6,11 +6,12 @@ import os from typing import Callable, List, Optional, Tuple, Union -import torch -import torch.nn as nn from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from transformers import Wav2Vec2Config, Wav2Vec2Model +import torch +import torch.nn as nn + from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 5dd6a539..b0a815c7 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -8,11 +8,12 @@ from turtle import right from typing import List, Optional, Tuple, Union -import 
torch -import torch.nn as nn from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor +import torch +import torch.nn as nn + from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index eec88dec..0d5c5ad3 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -6,11 +6,12 @@ import os from typing import Callable, List, Optional, Tuple, Union -import torch -import torch.nn as nn from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from transformers import WavLMConfig, WavLMModel +import torch +import torch.nn as nn + from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 69e97cc6..9f5fafe6 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index 0523ad44..e2d2d1f6 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 00a218f9..a6f20a8e 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -11,12 +11,13 @@ from enum import Enum from pathlib import Path +from fairscale.optim.grad_scaler import ShardedGradScaler +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.distributed as dist import torch.nn as nn -from fairscale.optim.grad_scaler import ShardedGradScaler -from jsonargparse import ActionParser, ArgumentParser from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 3f0b3f1f..3a9cc288 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -6,10 +6,11 @@ import os from collections import OrderedDict as ODict -import torch -import torch.nn as nn import torchaudio from jsonargparse import ActionParser, ArgumentParser + +import torch +import torch.nn as nn from torch.distributed.elastic.multiprocessing.errors import record from ...utils.misc import filter_func_args diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index ba401cb7..f4877dc6 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py 
@@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index 03800e0d..fc9d98f1 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index 40b6b10d..35946e96 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -7,10 +7,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index af915d6b..303427de 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -7,10 +7,11 @@ import time from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 1e1b1778..2a012dde 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -7,10 +7,11 @@ import time from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 4e791347..9d04af42 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -6,10 +6,11 @@ import os from collections import OrderedDict as ODict +from jsonargparse import ActionParser, ArgumentParser + import torch import torch.cuda.amp as amp import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index ad9c825c..1aefb3d4 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -6,13 +6,14 @@ import logging import os -import torch -import 
torch.distributed as dist -import torch.nn as nn from fairscale.nn.data_parallel import \ FullyShardedDataParallel as FullyShardedDDP from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP +import torch +import torch.distributed as dist +import torch.nn as nn + from .devices import open_device diff --git a/hyperion/torch/utils/metric_acc.py b/hyperion/torch/utils/metric_acc.py index 7b423a3e..a82c174a 100644 --- a/hyperion/torch/utils/metric_acc.py +++ b/hyperion/torch/utils/metric_acc.py @@ -6,6 +6,7 @@ from collections import OrderedDict as ODict import numpy as np + import torch import torch.distributed as dist diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index ffffc0b4..db035987 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,16 +3,15 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .misc import PathLike -from .dataset import Dataset from .class_info import ClassInfo +from .dataset import Dataset from .feature_set import FeatureSet from .hyp_dataclass import HypDataClass from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix +from .misc import PathLike from .recording_set import RecordingSet from .rttm import RTTM from .scp_list import SCPList - # from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList from .segment_set import SegmentSet diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index 546dd715..efb7c114 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -3,15 +3,16 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from typing import Dict, Optional from pathlib import Path +from typing import Dict, Optional + import yaml -from .segment_set import SegmentSet -from .recording_set import RecordingSet -from .feature_set import FeatureSet from .class_info import ClassInfo +from .feature_set import FeatureSet from .misc import PathLike +from .recording_set import RecordingSet +from .segment_set import SegmentSet class Dataset: diff --git a/hyperion/utils/lexicon.py b/hyperion/utils/lexicon.py index 80bd7c1e..6128c0ff 100644 --- a/hyperion/utils/lexicon.py +++ b/hyperion/utils/lexicon.py @@ -22,6 +22,7 @@ from typing import List, Tuple import k2 + import torch diff --git a/hyperion/utils/misc.py b/hyperion/utils/misc.py index 4ab3ce0a..6fb7d24b 100644 --- a/hyperion/utils/misc.py +++ b/hyperion/utils/misc.py @@ -4,9 +4,9 @@ Miscellaneous functions """ -from typing import TypeVar from inspect import signature from pathlib import Path +from typing import TypeVar import numpy as np diff --git a/hyperion/utils/text.py b/hyperion/utils/text.py index be70f638..2846fdbf 100644 --- a/hyperion/utils/text.py +++ b/hyperion/utils/text.py @@ -12,6 +12,7 @@ import numpy as np import pandas as pd + import torch diff --git a/requirements.txt b/requirements.txt index 6f1c8bc1..c3410829 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,9 @@ yapf jsonargparse>=3.5.0 wandb>=0.10.30 librosa>=0.8.1 +isort black twine wheel transformers>=4.16.2 +sentencepiece>=0.1.97 From 35391de52990806d4802a7e034abe0dc84d675ff Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 4 May 2023 09:55:06 -0400 Subject: [PATCH 098/154] new vox2 dataprep --- hyperion/data_prep/data_prep.py | 8 +- hyperion/data_prep/voxceleb2.py | 16 +- .../torch/narchs/rnn_transducer_decoder.py | 407 +++++++++--------- 3 files changed, 224 insertions(+), 207 deletions(-) diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py 
b/hyperion/data_prep/data_prep.py index d9f6b238..fb6fc6c5 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -2,6 +2,7 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -66,21 +67,22 @@ def _get_recording_duration(scp, i, n): def get_recording_duration(self, recording_set): import itertools - from ..utils import SCPList scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values) futures = [] + logging.info("submitting threads...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: - for i in range(self.num_threads): + for i in tqdm(range(self.num_threads)): future = pool.submit( DataPrep._get_recording_duration, scp, i, self.num_threads ) futures.append(future) + logging.info("waiting for threads...") res = [f.result() for f in tqdm(futures)] fss = list(itertools.chain(*[r[0] for r in res])) - durations = list(itertools.chain(*[r[0] for r in res])) + durations = list(itertools.chain(*[r[1] for r in res])) recording_set["duration"] = durations recording_set["sample_freq"] = fss diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index d8b9dd99..a1a9f0c3 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -158,8 +158,9 @@ def prepare(self): file_paths = [] futures = [] logging.info("making video cat lists") + logging.info("submitting threads...") with ThreadPoolExecutor(max_workers=self.num_threads) as pool: - for i, rec_id in enumerate(rec_ids): + for i, rec_id in tqdm(enumerate(rec_ids)): future = pool.submit( VoxCeleb2DataPrep.make_cat_list, lists_cat_dir, @@ -170,6 +171,7 @@ def prepare(self): ) futures.append(future) + logging.info("waiting for threads...") file_paths = [f.result() for f in tqdm(futures)] video_ids = uniq_video_ids @@ -213,14 +215,14 @@ def prepare(self): df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A" for r in rec_ids ], - # "duration": recs.loc[rec_ids, "duration"], + "duration": recs.loc[rec_ids, "duration"].values, } ) - print( - recs.loc[rec_ids, "duration"], - len(segments), - len(recs.loc[rec_ids, "duration"]), - ) + # print( + # recs.loc[rec_ids, "duration"], + # len(segments), + # len(recs.loc[rec_ids, "duration"]), + # ) segments = SegmentSet(segments) segments.sort() diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 8db6c23a..763ec67c 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -16,7 +16,7 @@ try: import k2 except ModuleNotFoundError: - from ...utils import dummy_k2 as k2 + from ..utils import dummy_k2 as k2 from ...utils.misc import filter_func_args from ...utils.text import add_sos @@ -99,10 +99,8 @@ def __init__( if self.rnnt_loss == "k2_pruned": self.simple_am_proj = nn.Linear(in_feats, vocab_size) - self.simple_lm_proj = nn.Linear(self.predictor.out_feats, - vocab_size) - self.register_buffer("cur_step", torch.as_tensor(0, - dtype=torch.int)) + self.simple_lm_proj = nn.Linear(self.predictor.out_feats, vocab_size) + self.register_buffer("cur_step", torch.as_tensor(0, dtype=torch.int)) def _make_predictor(self): pred_type = self.predictor_args["pred_type"] @@ -110,12 +108,10 @@ def _make_predictor(self): self.predictor_args["vocab_size"] = self.vocab_size self.predictor_args["blank_id"] = self.blank_id if pred_type == "rnn": - pred_args = 
filter_func_args(RNNPredictor.__init__, - self.predictor_args) + pred_args = filter_func_args(RNNPredictor.__init__, self.predictor_args) self.predictor = RNNPredictor(**pred_args) elif pred_type == "conv": - pred_args = filter_func_args(ConvPredictor.__init__, - self.predictor_args) + pred_args = filter_func_args(ConvPredictor.__init__, self.predictor_args) self.predictor = ConvPredictor(**pred_args) self.predictor_args["out_feats"] = self.predictor.embed_dim else: @@ -127,8 +123,7 @@ def _make_joiner(self): if joiner_type == "basic": pred_feats = self.predictor_args["out_feats"] hid_feats = self.joiner_args["hid_feats"] - self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, - self.vocab_size) + self.joiner = Joiner(self.in_feats, pred_feats, hid_feats, self.vocab_size) else: raise ValueError(f"Unknown joiner type {joiner_type}") @@ -152,9 +147,14 @@ def get_config(self): base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_torchaudio( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): logits = self.joiner(x, pred_out) # rnnt_loss requires 0 padded targets # Note: y does not start with SOS @@ -170,14 +170,17 @@ def _rnnt_loss_torchaudio(self, x: torch.Tensor, x_lengths: torch.Tensor, ) return loss - def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_k2( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) - boundary = torch.zeros((x.size(0), 4), - dtype=torch.int64, - device=x.device) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths @@ -195,15 +198,18 @@ def _rnnt_loss_k2(self, x: torch.Tensor, x_lengths: torch.Tensor, ) return loss - def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, - y: torch.Tensor, y_lengths: torch.Tensor, - pred_out: torch.Tensor): + def _rnnt_loss_k2_pruned( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + pred_out: torch.Tensor, + ): y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) - boundary = torch.zeros((x.size(0), 4), - dtype=torch.int64, - device=x.device) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) boundary[:, 2] = y_lengths boundary[:, 3] = x_lengths @@ -266,7 +272,7 @@ def _rnnt_loss_k2_pruned(self, x: torch.Tensor, x_lengths: torch.Tensor, simple_loss_scale = 1.0 - r * (1.0 - self.simple_loss_scale) pruned_loss_scale = 0.1 + 0.9 * r self.cur_step += 1 - #print(simple_loss_scale, pruned_loss_scale) + # print(simple_loss_scale, pruned_loss_scale) loss = simple_loss_scale * loss_simple + pruned_loss_scale * loss_pruned @@ -288,44 +294,48 @@ def forward( loss_simple = loss_pruned = None if self.rnnt_loss == "k2_pruned": loss, loss_simple, loss_pruned = self._rnnt_loss_k2_pruned( - x, x_lengths, y, y_lengths, pred_out) + x, x_lengths, y, y_lengths, pred_out + ) elif self.rnnt_loss == "k2": loss = self._rnnt_loss_k2(x, x_lengths, y, y_lengths, pred_out) elif self.rnnt_loss == 
"torchaudio": loss_simple = loss_pruned = None - loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, - pred_out) + loss = self._rnnt_loss_torchaudio(x, x_lengths, y, y_lengths, pred_out) return loss, loss_simple, loss_pruned - def decode(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - method="time_sync_beam_search", - beam_width: int = 5, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000) -> List[int]: + def decode( + self, + x: torch.Tensor, + x_lengths: torch.Tensor = None, + method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[int]: if method == "time_sync_beam_search": - return self.decode_time_sync_beam_search(x, - x_lengths, - beam_width=beam_width) + return self.decode_time_sync_beam_search( + x, x_lengths, beam_width=beam_width + ) elif method == "align_length_sync_beam_search": return self.decode_align_length_sync_beam_search( + x, x_lengths, beam_width=beam_width, max_sym_per_utt=max_sym_per_utt + ) + elif method == "greedy": + return self.decode_greedy( x, x_lengths, - beam_width=beam_width, - max_sym_per_utt=max_sym_per_utt) - elif method == "greedy": - return self.decode_greedy(x, - x_lengths, - max_sym_per_frame=max_sym_per_frame, - max_sym_per_utt=max_sym_per_utt) - - def decode_greedy(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000) -> List[int]: + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt, + ) + + def decode_greedy( + self, + x: torch.Tensor, + x_lengths: torch.Tensor = None, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ) -> List[int]: """ Args: x: encoder embeddings with shape = (N, T, C) @@ -339,8 +349,7 @@ def decode_greedy(self, blank_id = self.blank_id device = x.device - sos = torch.tensor([blank_id], device=device, - dtype=torch.int64).reshape(1, 1) + sos = torch.tensor([blank_id], device=device, dtype=torch.int64).reshape(1, 1) pred_out, state = self.predictor(sos) T = x.size(1) t = 0 @@ -350,7 +359,7 @@ def decode_greedy(self, sym_per_utt = 0 while t < T and sym_per_utt < max_sym_per_utt: - x_t = x[:, t:t + 1, :] + x_t = x[:, t : t + 1, :] logits = self.joiner(x_t, pred_out) # (1, 1, 1, vocab_size) # logits is @@ -371,10 +380,9 @@ def decode_greedy(self, return hyp - def decode_time_sync_beam_search(self, - x: torch.Tensor, - x_lengths: torch.Tensor = None, - beam_width: int = 5) -> List[int]: + def decode_time_sync_beam_search( + self, x: torch.Tensor, x_lengths: torch.Tensor = None, beam_width: int = 5 + ) -> List[int]: assert x.ndim == 3 assert x.size(0) == 1, x.size(0) @@ -389,11 +397,10 @@ def decode_time_sync_beam_search(self, max_u = 20000 # terminate after this number of steps u = 0 - cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, - torch.Tensor]]] = {} + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} while t < T and u < max_u: - x_t = x[:, t:t + 1, :] + x_t = x[:, t : t + 1, :] A = B B = [] @@ -406,13 +413,9 @@ def decode_time_sync_beam_search(self, cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.tensor([y_star.ys[-1]], - device=device).reshape(1, 1) + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( - pred_in, - y_star.pred_state, - ) + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] 
@@ -443,7 +446,7 @@ def decode_time_sync_beam_search(self, topk_log_prob = log_prob.topk(beam_width, dim=-1) # Second, choose other labels - #for i, v in enumerate(log_prob.tolist()): + # for i, v in enumerate(log_prob.tolist()): for v, i in zip(*topk_log_prob): v = v.item() i = i.item() @@ -452,9 +455,7 @@ def decode_time_sync_beam_search(self, new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - pred_state=pred_state, + ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, ) A.append(new_hyp) @@ -462,12 +463,9 @@ def decode_time_sync_beam_search(self, # check whether B contains more than "beam" elements more probable # than the most probable in A A_most_probable = max(A, key=lambda hyp: hyp.log_prob) - #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B = sorted( - [ - hyp - for hyp in B if hyp.log_prob > A_most_probable.log_prob - ], + [hyp for hyp in B if hyp.log_prob > A_most_probable.log_prob], key=lambda hyp: hyp.log_prob, reverse=True, ) @@ -483,17 +481,17 @@ def decode_time_sync_beam_search(self, break t += 1 - best_hyp = max(B, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + best_hyp = max(B, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) ys = best_hyp.ys[1:] # [1:] to remove the blank return ys def decode_align_length_sync_beam_search( - self, - x: torch.Tensor, - x_lengths: torch.Tensor, - beam_width: int = 5, - max_sym_per_utt: int = 1000) -> List[int]: + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + beam_width: int = 5, + max_sym_per_utt: int = 1000, + ) -> List[int]: assert x.ndim == 3 assert x.size(0) == 1, x.size(0) @@ -503,39 +501,34 @@ def decode_align_length_sync_beam_search( sos = torch.tensor([blank_id], device=device).reshape(1, 1) pred_out, state = self.predictor(sos) T = x.size(1) - #t = 0 + # t = 0 B = [Hypothesis(ys=[blank_id], log_prob=0.0, pred_state=None)] - #max_u = 20000 # terminate after this number of steps - #u = 0 + # max_u = 20000 # terminate after this number of steps + # u = 0 - cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, - torch.Tensor]]] = {} + cache: Dict[str, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = {} F = [] - #for t < T and u < max_u: + # for t < T and u < max_u: for i in range(T + max_sym_per_utt): A = [] for y_star in B: - #while u < max_u: + # while u < max_u: u = len(y_star.ys) - 1 t = i - u if t >= T: continue - #y_star = max(A, key=lambda hyp: hyp.log_prob) - #A.remove(y_star) - x_t = x[:, t:t + 1, :] + # y_star = max(A, key=lambda hyp: hyp.log_prob) + # A.remove(y_star) + x_t = x[:, t : t + 1, :] # Note: y_star.ys is unhashable, i.e., cannot be used # as a key into a dict cached_key = "_".join(map(str, y_star.ys)) if cached_key not in cache: - pred_in = torch.tensor([y_star.ys[-1]], - device=device).reshape(1, 1) + pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor( - pred_in, - y_star.pred_state, - ) + pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -563,7 +556,7 @@ def decode_align_length_sync_beam_search( topk_log_prob = log_prob.topk(beam_width, dim=-1) # Second, choose other labels - #for i, v in enumerate(log_prob.tolist()): + # for i, v in enumerate(log_prob.tolist()): for v, i in zip(*topk_log_prob): v = v.item() i = i.item() @@ -572,20 +565,16 @@ def 
decode_align_length_sync_beam_search( new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - pred_state=pred_state, + ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, ) A.append(new_hyp) # check whether B contains more than "beam_width" elements more probable # than the most probable in A - #A_most_probable = max(A, key=lambda hyp: hyp.log_prob) - #print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) + # A_most_probable = max(A, key=lambda hyp: hyp.log_prob) + # print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B0 = sorted( - [hyp for hyp in A], - key=lambda hyp: hyp.log_prob, - reverse=True, + [hyp for hyp in A], key=lambda hyp: hyp.log_prob, reverse=True, ) B = [] B_ys = set() @@ -605,8 +594,7 @@ def decode_align_length_sync_beam_search( B = B[:beam_width] break - best_hyp = max(F, - key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) + best_hyp = max(F, key=lambda hyp: hyp.log_prob / max(1, len(hyp.ys[1:]))) ys = best_hyp.ys[1:] # [1:] to remove the blank return ys @@ -617,8 +605,9 @@ def change_config( rnn_dropout_rate: float = 0.0, ): logging.info("changing decoder config") - self.predictor.change_config(override_dropouts, embed_dropout_rate, - rnn_dropout_rate) + self.predictor.change_config( + override_dropouts, embed_dropout_rate, rnn_dropout_rate + ) @staticmethod def filter_args(**kwargs): @@ -638,49 +627,58 @@ def add_pred_args(parser): "--pred-type", default="rnn", choices=["rnn", "conv"], - help= - """type of predictor between RNN and Convolutional [rnn, conv]""") - pred_parser.add_argument("--embed-dim", - default=1024, - type=int, - help=("token embedding dimension")) + help="""type of predictor between RNN and Convolutional [rnn, conv]""", + ) + pred_parser.add_argument( + "--embed-dim", default=1024, type=int, help=("token embedding dimension") + ) pred_parser.add_argument( "--embed-dropout-rate", default=0.0, type=float, - help=("dropout prob for predictor input embeddings")) - pred_parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help="""dropout prob for decoder RNN """) + help=("dropout prob for predictor input embeddings"), + ) + pred_parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help="""dropout prob for decoder RNN """, + ) pred_parser.add_argument( "--rnn-type", default="lstm", choices=["lstm", "gru"], - help= - """type of recurrent network for thep predictor in [lstm, gru]""") - - pred_parser.add_argument("--num-layers", - default=2, - type=int, - help="""number of layers of the predictor """) - - pred_parser.add_argument("--hid-feats", - default=512, - type=int, - help="""hidden features of the predictor""") - pred_parser.add_argument("--out-feats", - default=512, - type=int, - help="""output features of the predictor""") - pred_parser.add_argument("--context-size", - default=2, - type=int, - help="""context length of the convolutional - predictor, 1->bigram, 2-> trigram,...""") - - parser.add_argument("--predictor", - action=ActionParser(parser=pred_parser)) + help="""type of recurrent network for thep predictor in [lstm, gru]""", + ) + + pred_parser.add_argument( + "--num-layers", + default=2, + type=int, + help="""number of layers of the predictor """, + ) + + pred_parser.add_argument( + "--hid-feats", + default=512, + type=int, + help="""hidden features of the predictor""", + ) + pred_parser.add_argument( + "--out-feats", + default=512, + type=int, + help="""output features of the predictor""", + ) + 
pred_parser.add_argument( + "--context-size", + default=2, + type=int, + help="""context length of the convolutional + predictor, 1->bigram, 2-> trigram,...""", + ) + + parser.add_argument("--predictor", action=ActionParser(parser=pred_parser)) @staticmethod def add_joiner_args(parser): @@ -690,39 +688,43 @@ def add_joiner_args(parser): "--joiner-type", default="basic", choices=["basic"], - help= - """type of joiner network, there is only basic joiner for now""") - pred_parser.add_argument("--hid-feats", - default=512, - type=int, - help="""hidden features of the joiner""") - parser.add_argument("--joiner", - action=ActionParser(parser=pred_parser)) + help="""type of joiner network, there is only basic joiner for now""", + ) + pred_parser.add_argument( + "--hid-feats", + default=512, + type=int, + help="""hidden features of the joiner""", + ) + parser.add_argument("--joiner", action=ActionParser(parser=pred_parser)) @staticmethod - def add_class_args(parser, - prefix=None, - skip=set(["in_feats", "blank_id", "vocab_size"])): + def add_class_args( + parser, prefix=None, skip=set(["in_feats", "blank_id", "vocab_size"]) + ): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") if "in_feats" not in skip: - parser.add_argument("--in-feats", - type=int, - required=True, - help=("input feature dimension")) + parser.add_argument( + "--in-feats", type=int, required=True, help=("input feature dimension") + ) if "blank_id" not in skip: - parser.add_argument("--blank-id", - type=int, - default=0, - help=("blank id from tokenizer model")) + parser.add_argument( + "--blank-id", + type=int, + default=0, + help=("blank id from tokenizer model"), + ) if "vocab_size" not in skip: - parser.add_argument("--vocab-size", - type=int, - required=True, - help=("output prediction dimension")) + parser.add_argument( + "--vocab-size", + type=int, + required=True, + help=("output prediction dimension"), + ) RNNTransducerDecoder.add_pred_args(parser) RNNTransducerDecoder.add_joiner_args(parser) @@ -730,56 +732,62 @@ def add_class_args(parser, "--rnnt-loss", default="k2_pruned", choices=["torchaudio", "k2", "k2_pruned"], - help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""") + help="""type of rnn-t loss between torchaudio, k2 or k2_pruned.""", + ) parser.add_argument( "--rnnt-type", default="regular", choices=["regular", "modified", "constrained"], - help= - """type of rnn-t loss between regular, modified or constrained.""") + help="""type of rnn-t loss between regular, modified or constrained.""", + ) parser.add_argument( "--delay-penalty", default=0.0, type=float, - help= - """penalize symbol delay, which is used to make symbol emit earlier - for streaming models.""") + help="""penalize symbol delay, which is used to make symbol emit earlier + for streaming models.""", + ) parser.add_argument( "--reduction", default="sum", choices=["sum", "mean"], - help="""type of reduction for rnn-t loss between sum or mean""") + help="""type of reduction for rnn-t loss between sum or mean""", + ) parser.add_argument( "--prune-range", default=5, type=int, help="""how many symbols to keep for each frame in k2 rnn-t - pruned loss.""") + pruned loss.""", + ) parser.add_argument( "--lm-scale", default=0.25, type=float, - help="""language model scale in rnn-t smoothed loss""") + help="""language model scale in rnn-t smoothed loss""", + ) parser.add_argument( "--am-scale", default=0.0, type=float, - help="""acoustic model scale in rnn-t smoothed loss""") + help="""acoustic model scale in rnn-t 
smoothed loss""", + ) parser.add_argument( "--simple-loss-scale", default=0.5, type=float, - help="""weight of rnn-t simple loss when using k2 pruned loss""") + help="""weight of rnn-t simple loss when using k2 pruned loss""", + ) parser.add_argument( "--pruned-warmup-steps", default=2000, type=int, help="""number of steps to warm up the k2 rnn-t pruned loss - from 0.1 to 1""") + from 0.1 to 1""", + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): @@ -794,16 +802,21 @@ def add_finetune_args(parser, prefix=None, skip=set()): action=ActionYesNo, help=( "whether to use the dropout probabilities passed in the " - "arguments instead of the defaults in the pretrained model.")) - parser.add_argument("--embed-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder input embeddings")) - parser.add_argument("--rnn-dropout-rate", - default=0.0, - type=float, - help=("dropout prob for decoder RNN ")) + "arguments instead of the defaults in the pretrained model." + ), + ) + parser.add_argument( + "--embed-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder input embeddings"), + ) + parser.add_argument( + "--rnn-dropout-rate", + default=0.0, + type=float, + help=("dropout prob for decoder RNN "), + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) From cf861bc7b30f9c318ed20308588c71856a545933 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 8 May 2023 14:49:09 -0400 Subject: [PATCH 099/154] fix new vox2 dataprep durations, scp -> RecordingSet --- egs/librispeech/v1/run_011_train_asr.sh | 12 +- egs/librispeech/v1/run_011_train_asr_old.sh | 12 +- .../adv.v1.1/run_005_train_victim_xvector.sh | 4 +- .../run_007_train_transfer_xvector.sh | 4 +- .../run_008_adv_finetune_victim_xvector.sh | 4 +- .../adv.v2/run_011_train_victim_xvector.sh | 4 +- .../run_022_attack_type_classif_allknown.sh | 4 +- .../adv.v2/run_023_snr_classif_allknown.sh | 4 +- .../run_024_threat_model_classif_allknown.sh | 4 +- ...un_031_attack_type_verif_and_noveltydet.sh | 4 +- egs/voxceleb/adv.v2/run_032_snr_verif.sh | 4 +- .../adv.v2/run_033_threat_model_verif.sh | 4 +- .../config_fbank80_stmn_cfwseresnet34.v3.0.sh | 4 +- .../config_fbank80_stmn_fwseresnet34.v3.0.sh | 4 +- .../config_fbank80_stmn_resnet34.v3.0.sh | 2 +- egs/voxceleb/v1.1/run_011_train_xvector.sh | 8 +- egs/voxceleb/v2/run_011_train_xvector.sh | 12 +- hyperion/bin/apply_mvn_select_frames.py | 36 +- hyperion/bin/compute_energy_vad.py | 21 +- hyperion/bin/compute_mfcc_feats.py | 20 +- hyperion/bin/decode_wav2transducer.py | 110 ++-- ...l_xvec_cosine_scoring_from_adv_test_wav.py | 21 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 22 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 26 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 15 +- ...sine_scoring_from_transfer_adv_test_wav.py | 3 +- ...sine_scoring_from_transfer_art_test_wav.py | 20 +- hyperion/bin/eval_xvec_logits_from_wav.py | 19 +- hyperion/bin/extract_wav2vec2xvectors.py | 17 +- hyperion/bin/extract_xvectors_from_wav.py | 16 +- .../extract_xvectors_slidwin_from_feats.py | 15 +- .../bin/extract_xvectors_slidwin_from_wav.py | 18 +- .../generate_adv_attacks_xvector_classif.py | 15 +- 
.../bin/generate_adv_attacks_xvector_verif.py | 10 +- hyperion/bin/pack_wav_rirs.py | 10 +- hyperion/data_prep/data_prep.py | 8 +- hyperion/io/ark_data_reader.py | 179 ++++--- hyperion/io/ark_data_writer.py | 42 +- hyperion/io/audio_reader.py | 409 ++++++++------- hyperion/io/audio_writer.py | 84 +-- hyperion/io/bin_vad_reader.py | 3 +- hyperion/io/data_reader.py | 62 ++- hyperion/io/data_rw_factory.py | 51 +- hyperion/io/data_writer.py | 51 +- hyperion/io/h5_data_reader.py | 204 +++++--- hyperion/io/h5_data_writer.py | 32 +- hyperion/io/old_audio_reader.py | 477 ++++++++++++++++++ hyperion/io/vad_rw_factory.py | 10 +- hyperion/torch/data/audio_dataset.py | 160 +++--- hyperion/utils/feature_set.py | 16 +- hyperion/utils/info_table.py | 27 +- hyperion/utils/segment_set.py | 27 + hyperion/utils/utt2info.py | 2 +- 53 files changed, 1525 insertions(+), 827 deletions(-) create mode 100644 hyperion/io/old_audio_reader.py diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v1/run_011_train_asr.sh index 99b0065e..81ebbeae 100755 --- a/egs/librispeech/v1/run_011_train_asr.sh +++ b/egs/librispeech/v1/run_011_train_asr.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2rnn_transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ @@ -75,11 +75,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ @@ -103,11 +103,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/librispeech/v1/run_011_train_asr_old.sh b/egs/librispeech/v1/run_011_train_asr_old.sh index 3d0e6eb1..3c9f4f5b 100755 --- a/egs/librispeech/v1/run_011_train_asr_old.sh +++ b/egs/librispeech/v1/run_011_train_asr_old.sh @@ 
-49,11 +49,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu --max-split-size-mb 512 \ train_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s1_dir $args \ @@ -75,11 +75,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s2_dir $args \ @@ -103,11 +103,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2transducer.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $train_dir/wav.scp \ + --data.train.dataset.recordings-file $train_dir/wav.scp \ --data.train.dataset.segments-file $train_dir/utt2spk \ --data.train.dataset.bpe-model $bpe_model \ --data.train.dataset.text-file $train_dir/text \ - --data.val.dataset.audio-file $val_dir/wav.scp \ + --data.val.dataset.recordings-file $val_dir/wav.scp \ --data.val.dataset.segments-file $val_dir/utt2spk \ --data.val.dataset.text-file $val_dir/text \ --trainer.exp-path $nnet_s3_dir $args \ diff --git a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh index 37a91211..aa779902 100755 --- a/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_005_train_victim_xvector.sh @@ -44,11 +44,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh index 70bab280..420ac59d 100755 --- a/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_007_train_transfer_xvector.sh @@ -54,11 +54,11 @@ if [ $stage -le 1 ]; then --gpu 
$ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh index 12f1e5fd..4f2c137b 100755 --- a/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh +++ b/egs/voxceleb/adv.v1.1/run_008_adv_finetune_victim_xvector.sh @@ -53,11 +53,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ adv_finetune_xvector_from_wav.py $nnet_type --cfg $nnet_cfg $nnet_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh index 971b88a3..a1acb1f6 100755 --- a/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh +++ b/egs/voxceleb/adv.v2/run_011_train_victim_xvector.sh @@ -40,11 +40,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_cfg \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_dir \ diff --git a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh index 71c0c89f..b453260f 100755 --- a/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_022_attack_type_classif_allknown.sh @@ -46,11 +46,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ 
--data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh index a928ae29..de811505 100755 --- a/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_023_snr_classif_allknown.sh @@ -46,11 +46,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh index bed225a3..aa17a1ae 100755 --- a/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh +++ b/egs/voxceleb/adv.v2/run_024_threat_model_classif_allknown.sh @@ -48,11 +48,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_dir/train_utt2attack \ --data.train.dataset.class-file $list_dir/class_file \ - --data.val.dataset.audio-file $list_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 55cb8459..4ce703ba 100755 --- a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -49,11 +49,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file 
$list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_032_snr_verif.sh b/egs/voxceleb/adv.v2/run_032_snr_verif.sh index 3886c339..12d42c99 100755 --- a/egs/voxceleb/adv.v2/run_032_snr_verif.sh +++ b/egs/voxceleb/adv.v2/run_032_snr_verif.sh @@ -52,11 +52,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh index 392bffb5..cbfaaa81 100755 --- a/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh +++ b/egs/voxceleb/adv.v2/run_033_threat_model_verif.sh @@ -53,11 +53,11 @@ if [ $stage -le 1 ]; then $cuda_cmd --gpu $ngpu $sign_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $sign_nnet_command --cfg $sign_nnet_config \ - --data.train.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.train.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.train.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.train.dataset.segments-file $list_someknown_dir/train_utt2attack \ --data.train.dataset.class-file $list_someknown_dir/class_file \ - --data.val.dataset.audio-file $list_someknown_dir/trainval_wav.scp \ + --data.val.dataset.recordings-file $list_someknown_dir/trainval_wav.scp \ --data.val.dataset.time-durs-file $list_someknown_dir/trainval_utt2dur \ --data.val.dataset.segments-file $list_someknown_dir/val_utt2attack \ --trainer.exp-path $sign_nnet_dir $args \ diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh index 32c91da2..fdb3147f 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=false #true -do_qmf=false #true +do_snorm=true +do_qmf=true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh index 62b02c28..7aa61f00 100644 --- 
a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=true -do_qmf=true +do_snorm=false #true +do_qmf=false #true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh index c49936e0..b194d1bd 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -16,7 +16,7 @@ nnet_name=${feat_type}_resnet34.v3.0 nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 -nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 nnet_s1=$nnet_s1_dir/model_ep0035.pth nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml diff --git a/egs/voxceleb/v1.1/run_011_train_xvector.sh b/egs/voxceleb/v1.1/run_011_train_xvector.sh index a051c136..c8ab552e 100755 --- a/egs/voxceleb/v1.1/run_011_train_xvector.sh +++ b/egs/voxceleb/v1.1/run_011_train_xvector.sh @@ -44,11 +44,11 @@ if [ $stage -le 1 ]; then --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir \ @@ -67,11 +67,11 @@ if [ $stage -le 2 ]; then --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ diff --git a/egs/voxceleb/v2/run_011_train_xvector.sh b/egs/voxceleb/v2/run_011_train_xvector.sh index 0eddb1a6..bc3b5420 100755 --- a/egs/voxceleb/v2/run_011_train_xvector.sh +++ b/egs/voxceleb/v2/run_011_train_xvector.sh @@ -47,11 +47,11 @@ if [ $stage -le 1 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ train_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files 
$list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --trainer.exp-path $nnet_s1_dir $args \ @@ -71,11 +71,11 @@ if [ $stage -le 2 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s1 \ @@ -96,11 +96,11 @@ if [ $stage -le 3 ]; then hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ finetune_wav2vec2xvector.py $nnet_type \ --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.audio-file $list_dir/wav.scp \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ --data.train.dataset.time-durs-file $list_dir/utt2dur \ --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ - --data.val.dataset.audio-file $list_dir/wav.scp \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.time-durs-file $list_dir/utt2dur \ --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ --in-model-file $nnet_s2 \ diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index a2456dc9..f5a3ce15 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,8 +10,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -28,7 +32,6 @@ def process_feats( output_spec, vad_spec, write_num_frames_spec, - scp_sep, path_prefix, vad_path_prefix, part_idx, @@ -51,25 +54,16 @@ def process_feats( logging.info("opening output stream: %s" % (output_spec)) with DWF.create( - output_spec, - compress=compress, - compression_method=compression_method, - scp_sep=scp_sep, + output_spec, compress=compress, compression_method=compression_method, ) as writer: logging.info("opening input stream: %s" % (output_spec)) with DRF.create( - input_spec, - path_prefix=path_prefix, - scp_sep=scp_sep, - part_idx=part_idx, - num_parts=num_parts, + input_spec, path_prefix=path_prefix, part_idx=part_idx, num_parts=num_parts, ) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = RDRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = RDRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): key, data = reader.read(1) @@ -112,28 +106,20 @@ def process_feats( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument( - "--scp-sep", dest="scp_sep", default=" 
", help=("scp file field separator") - ) parser.add_argument( "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") ) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument( "--part-idx", - dest="part_idx", type=int, default=1, help=("splits the list of files in num-parts and process part_idx"), ) parser.add_argument( "--num-parts", - dest="num_parts", type=int, default=1, help=("splits the list of files in num-parts and process part_idx"), @@ -141,14 +127,12 @@ def process_feats( parser.add_argument( "--compress", - dest="compress", default=False, action="store_true", help="Lossy compress the features", ) parser.add_argument( "--compression-method", - dest="compression_method", default="auto", choices=compression_methods, help=( diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 15d74f3a..058f982a 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -26,14 +30,14 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): input_args = AR.filter_args(**kwargs) reader = AR(input_path, **input_args) - writer = DWF.create(output_path, scp_sep=" ") + writer = DWF.create(output_path) if write_num_frames is not None: f_num_frames = open(write_num_frames, "w") for data in reader: key, x, fs = data - logging.info("Extracting VAD for %s" % (key)) + logging.info("Extracting VAD for %s", key) t1 = time.time() y = vad.compute(x) dt = (time.time() - t1) * 1000 @@ -41,8 +45,13 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): num_speech_frames = np.sum(y) prob_speech = num_speech_frames / y.shape[0] * 100 logging.info( - "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. real-time-factor=%.2f" - % (key, num_speech_frames, y.shape[0], prob_speech, dt, rtf) + "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. 
real-time-factor=%.2f", + key, + num_speech_frames, + y.shape[0], + prob_speech, + dt, + rtf, ) writer.write([key], [y]) if write_num_frames is not None: diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index a83f95d1..ca6e26f7 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,8 +9,12 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF @@ -35,10 +39,7 @@ def compute_mfcc_feats( reader = DRF.create(input_path, **input_args) writer = DWF.create( - output_path, - scp_sep=" ", - compress=compress, - compression_method=compression_method, + output_path, compress=compress, compression_method=compression_method, ) if write_num_frames is not None: @@ -55,8 +56,11 @@ def compute_mfcc_feats( dt = (time.time() - t1) * 1000 rtf = dt / (mfcc.frame_shift * y.shape[0]) logging.info( - "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f" - % (key, y.shape[0], dt, rtf) + "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f", + key, + y.shape[0], + dt, + rtf, ) writer.write([key], [y]) diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index 81fa8803..c7de38f1 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -13,8 +13,12 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -23,8 +27,7 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info @@ -48,10 +51,11 @@ def load_model(model_path, device): def decode_one_batch( - model: nn.Module, - sp: spm.SentencePieceProcessor, - x: torch.Tensor, - decoding_method="beam_search") -> Dict[str, List[List[str]]]: + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> Dict[str, List[List[str]]]: """Decode one batch and return the result in a dict. The dict has the following format: - key: It indicates the setting used for decoding. For example, @@ -77,7 +81,7 @@ def decode_one_batch( the returned dict. 
""" device = model.device - feature = x #batch["inputs"] + feature = x # batch["inputs"] assert x.shape[0] == 1 assert feature.ndim == 2 @@ -87,7 +91,8 @@ def decode_one_batch( feature_lens = torch.Tensor([x.shape[1]]).int() encoder_out, hid_feats, encoder_out_lens = model.forward_feats( - x=feature, x_lengths=feature_lens) + x=feature, x_lengths=feature_lens + ) hyps = [] batch_size = encoder_out.size(0) @@ -114,8 +119,9 @@ def decode_one_batch( return hyps[0] -def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, - use_gpu, **kwargs): +def decode_transducer( + input_spec, output_spec, model_path, bpe_model, use_gpu, **kwargs +): device = init_device(use_gpu) model = load_model(model_path, device) @@ -129,10 +135,10 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, ar_args = AR.filter_args(**kwargs) logging.info("opening output: %s" % (output_spec)) - # with DWF.create(output_spec, scp_sep=scp_sep) as writer: with open(output_spec, "w") as writer: - logging.info("opening input stream: {} with args={}".format( - input_spec, ar_args)) + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) with AR(input_spec, **ar_args) as reader: while not reader.eof(): t1 = time.time() @@ -147,65 +153,69 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, logging.info("processing utt %s" % (key0)) for aug_id in range(num_augs): t3 = time.time() - key, x = key0, x0 #augment(key0, x0, augmenter, aug_df, aug_id) + key, x = key0, x0 # augment(key0, x0, augmenter, aug_df, aug_id) t4 = time.time() with torch.no_grad(): x = torch.tensor( - x[None, :], - dtype=torch.get_default_dtype()).to(device) + x[None, :], dtype=torch.get_default_dtype() + ).to(device) t5 = time.time() tot_frames = x.shape[1] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" % ( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( key, x.shape[1], tot_frames, x.shape[1] / tot_frames * 100, - )) + ) + ) t6 = time.time() if x.shape[1] == 0: - y = np.zeros((model.embed_dim, ), - dtype=float_cpu()) + y = np.zeros((model.embed_dim,), dtype=float_cpu()) else: y = decode_one_batch(model=model, sp=sp, x=x) t7 = time.time() - writer.write(key + ' ' + ' '.join(y) + "\n") + writer.write(key + " " + " ".join(y) + "\n") t8 = time.time() read_time = t2 - t1 tot_time = read_time + t8 - t3 logging.info( - ("utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f") % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - )) + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + ) if __name__ == "__main__": parser = ArgumentParser( - description=("Extracts x-vectors from waveform computing " - "acoustic features on the fly")) + description=( + "Extracts x-vectors from waveform computing " "acoustic features on the fly" + ) + ) parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--scp-sep", - default=" ", - help=("scp file field separator")) AR.add_class_args(parser) @@ -216,16 +226,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, 
model_path, bpe_model, parser.add_argument("--bpe-model", required=True) parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument("--use-gpu", - default=False, - action="store_true", - help="extract xvectors in gpu") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index bb01162f..10ea491c 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -10,8 +10,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -188,7 +192,7 @@ def eval_cosine_scoring( attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -327,9 +331,9 @@ def eval_cosine_scoring( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -337,10 +341,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index c483ce39..a6f535b3 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,13 +7,18 @@ import os import sys import time + # [Added Sonal May21] from pathlib import Path import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -243,7 +248,7 @@ def eval_cosine_scoring_wavegan( attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) 
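The scp_sep removal above repeats across all of the eval and extract binaries in this patch: the reader/writer factories now always assume whitespace-separated scp files. A minimal sketch of the resulting call sites, assuming hypothetical specifier strings and the VRF/DWF aliases these scripts import:

    from hyperion.io import DataWriterFactory as DWF
    from hyperion.io import VADReaderFactory as VRF

    # before this patch: VRF.create(vad_spec, path_prefix=prefix, scp_sep=" ")
    v_reader = VRF.create("scp:data/vad.scp", path_prefix="data")
    # before this patch: DWF.create(output_spec, scp_sep=" ", compress=False)
    writer = DWF.create("ark,scp:feats.ark,feats.scp", compress=False)
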
scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -384,9 +389,9 @@ def eval_cosine_scoring_wavegan( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -394,10 +399,7 @@ def eval_cosine_scoring_wavegan( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index fba182c4..5ba42477 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -13,8 +13,12 @@ import pandas as pd from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -25,8 +29,9 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device @@ -195,7 +200,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -354,9 +359,9 @@ def eval_cosine_scoring( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--v-file", dest="v_file", required=True) - parser.add_argument("--key-file", dest="key_file", default=None) - parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) @@ -364,10 +369,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git 
a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
index 3cfde93e..c3732bd3 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py
@@ -10,8 +10,12 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
 
 import torch
 import torch.nn as nn
@@ -122,7 +126,7 @@ def eval_cosine_scoring(
 
     if vad_spec is not None:
         logging.info("opening VAD stream: %s" % (vad_spec))
-        v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ")
+        v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix)
 
     scores = np.zeros((ndx.num_models, ndx.num_tests), dtype="float32")
     with torch.no_grad():
@@ -217,10 +221,7 @@ def eval_cosine_scoring(
     parser.add_argument("--vad", dest="vad_spec", default=None)
     parser.add_argument(
-        "--vad-path-prefix",
-        dest="vad_path_prefix",
-        default=None,
-        help=("scp file_path prefix for vad"),
+        "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"),
     )
 
     parser.add_argument("--model-path", required=True)
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
index 44bdf59d..c00cf286 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py
@@ -205,7 +205,7 @@ def eval_cosine_scoring(
 
     if vad_spec is not None:
         logging.info("opening VAD stream: %s", vad_spec)
-        v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ")
+        v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix)
 
     scores = np.zeros((key.num_models, key.num_tests), dtype="float32")
     attack_stats = pd.DataFrame(
@@ -361,7 +361,6 @@ def eval_cosine_scoring(
     parser.add_argument("--vad", dest="vad_spec", default=None)
     parser.add_argument(
         "--vad-path-prefix",
-        dest="vad_path_prefix",
         default=None,
         help=("scp file_path prefix for vad"),
     )
diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
index 676575fd..4f2b82ab 100755
--- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
+++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py
@@ -13,8 +13,12 @@
 import pandas as pd
 from art.classifiers import PyTorchClassifier
 from art.estimators.classification import PyTorchClassifier
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
 
 import torch
 import torch.nn as nn
@@ -25,8 +29,9 @@
 from hyperion.io import VADReaderFactory as VRF
 from hyperion.np.classifiers import BinaryLogisticRegression as LR
 from hyperion.torch import TorchModelLoader as TML
-from hyperion.torch.adv_attacks.art_attack_factory import \
-    ARTAttackFactory as AttackFactory
+from hyperion.torch.adv_attacks.art_attack_factory import (
+    ARTAttackFactory as AttackFactory,
+)
 from hyperion.torch.layers import LinBinCalibrator as Calibrator
 from hyperion.torch.narchs import AudioFeatsMVN as AF
 from hyperion.torch.utils import open_device
@@ -213,7 +218,7 @@ def eval_cosine_scoring(
 
     if vad_spec is not None:
         logging.info("opening VAD stream: %s" %
(vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -386,10 +391,7 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index da6389fb..2f5cf3da 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -11,8 +11,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -93,7 +97,6 @@ def eval_xvec( output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -125,8 +128,8 @@ def eval_xvec( num_augs = 1 ar_args = AR.filter_args(**kwargs) - logging.info("opening output stream: %s" % (output_spec)) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + logging.info("opening output stream: %s", output_spec) + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -135,9 +138,7 @@ def eval_xvec( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -243,7 +244,7 @@ def eval_xvec( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 37d6a2a6..c4c4676f 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import torchaudio.transforms as tat -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -44,6 +48,7 @@ def get_resampler(source_fs, target_fs): resamplers[source_fs] = resampler_f return resampler_f + resamplers = {} @@ -122,7 +127,6 @@ def extract_xvectors( output_spec, vad_spec, write_speech_dur, - scp_sep, vad_path_prefix, model_path, hf_chunk_length, @@ -157,16 +161,14 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) ar_args["wav_scale"] = 1.0 logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info(f"opening input stream: {input_spec} with args={ar_args}") with AR(input_spec, **ar_args) as reader: 
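All of these binaries share the same streaming skeleton around SequentialAudioReader. A minimal sketch of that loop, assuming a hypothetical input list and that read(1) returns parallel key/waveform/sample-rate lists, as the loops in these scripts suggest:

    from hyperion.io import SequentialAudioReader as AR

    with AR("data/wav.scp") as reader:  # hypothetical recordings list
        while not reader.eof():
            keys, xs, fs = reader.read(1)  # one recording per call
            if not keys:
                break
            key, x = keys[0], xs[0]
            # VAD selection / feature extraction / embedding extraction go here
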
if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -283,7 +285,6 @@ def extract_xvectors( parser.add_argument("--input", dest="input_spec", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument("--write-speech-dur", default=None) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index addabbcf..1da1ac05 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -11,8 +11,12 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -95,7 +99,6 @@ def extract_xvectors( output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -129,7 +132,7 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -138,9 +141,7 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) while not reader.eof(): t1 = time.time() @@ -249,7 +250,6 @@ def extract_xvectors( parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None ) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index e3d2fcbb..eaf0a5cc 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -11,8 +11,12 @@ import numpy as np import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -78,7 +82,7 @@ def extract_xvectors( model = load_model(model_path, device) if write_timestamps_spec is not None: - time_writer = DWF.create(write_timestamps_spec, scp_sep=" ") + time_writer = DWF.create(write_timestamps_spec) dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) @@ -205,10 +209,7 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) 
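The argument-parsing boilerplate being reformatted here is identical across the binaries. A condensed sketch of the shared jsonargparse pattern, with hypothetical option names standing in for each script's specifics:

    from jsonargparse import ActionConfigFile, ArgumentParser, namespace_to_dict

    from hyperion.hyp_defs import config_logger

    parser = ArgumentParser(description="hyperion-style binary")
    parser.add_argument("--cfg", action=ActionConfigFile)  # optional YAML config
    parser.add_argument("--input", dest="input_spec", required=True)
    parser.add_argument(
        "--vad-path-prefix", default=None, help="scp file_path prefix for vad"
    )
    parser.add_argument("-v", "--verbose", default=1, choices=[0, 1, 2, 3], type=int)
    args = parser.parse_args()
    config_logger(args.verbose)
    kwargs = namespace_to_dict(args)  # forwarded into the worker function
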
MVN.add_class_args(parser, prefix="mvn") diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 2b1bba3b..a31bd614 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -83,7 +87,6 @@ def extract_xvectors( vad_spec, write_timestamps_spec, slidwin_params_path, - scp_sep, vad_path_prefix, model_path, chunk_length, @@ -109,7 +112,7 @@ def extract_xvectors( feat_snip_edges = feat_args["snip_edges"] if write_timestamps_spec is not None: - time_writer = DWF.create(write_timestamps_spec, scp_sep=scp_sep) + time_writer = DWF.create(write_timestamps_spec) if aug_cfg is not None: augmenter = SpeechAugment.create(aug_cfg, rng=rng) @@ -121,7 +124,7 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec, scp_sep=scp_sep) as writer: + with DWF.create(output_spec) as writer: logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) @@ -130,9 +133,7 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -275,7 +276,6 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument( "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") ) diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index a058893d..8c6f38a6 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -12,8 +12,12 @@ import numpy as np import pandas as pd import yaml -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch import torch.nn as nn @@ -168,7 +172,7 @@ def generate_attacks( if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) keys, class_names, class_ids = read_utt_list( list_file, class2int_file, part_idx, num_parts @@ -329,10 +333,7 @@ def generate_attacks( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - dest="vad_path_prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index 83375cb6..fbd3a5fb 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ 
b/hyperion/bin/generate_adv_attacks_xvector_verif.py
@@ -12,8 +12,12 @@
 import numpy as np
 import pandas as pd
 import yaml
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
 
 import torch
 import torch.nn as nn
@@ -197,7 +201,7 @@ def generate_attacks(
 
     if vad_spec is not None:
         logging.info("opening VAD stream: %s", vad_spec)
-        v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ")
+        v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix)
 
     attack_factory = init_attack_factory(**kwargs)
     attacks_info = {}
diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py
index dccf58da..4aafa075 100755
--- a/hyperion/bin/pack_wav_rirs.py
+++ b/hyperion/bin/pack_wav_rirs.py
@@ -10,8 +10,12 @@
 import time
 
 import numpy as np
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
 
 from hyperion.hyp_defs import config_logger
 from hyperion.io import DataWriterFactory as DWF
@@ -20,7 +24,7 @@
 
 
 def pack_wav_rirs(input_path, output_spec, **kwargs):
 
-    writer = DWF.create(output_spec, scp_sep=" ", compress=False)
+    writer = DWF.create(output_spec, compress=False)
     t1 = time.time()
     with AR(input_path, wav_scale=1) as reader:
         for data in reader:
diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py
index fb6fc6c5..19420761 100644
--- a/hyperion/data_prep/data_prep.py
+++ b/hyperion/data_prep/data_prep.py
@@ -50,12 +50,12 @@ def dataset_name():
         raise NotImplementedError()
 
     @staticmethod
-    def _get_recording_duration(scp, i, n):
+    def _get_recording_duration(recordings, i, n):
         from ..io import SequentialAudioReader as AR
 
         durations = []
         fss = []
-        with AR(scp, part_idx=i, num_parts=n) as reader:
+        with AR(recordings, part_idx=i + 1, num_parts=n) as reader:
             for data in reader:
                 key, x, fs = data
                 duration = x.shape[0] / fs
@@ -69,13 +69,13 @@ def get_recording_duration(self, recording_set):
         import itertools
 
         from ..utils import SCPList
 
-        scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values)
+        # scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values)
         futures = []
         logging.info("submitting threads...")
         with ThreadPoolExecutor(max_workers=self.num_threads) as pool:
             for i in tqdm(range(self.num_threads)):
                 future = pool.submit(
-                    DataPrep._get_recording_duration, scp, i, self.num_threads
+                    DataPrep._get_recording_duration, recording_set, i, self.num_threads
                 )
                 futures.append(future)
diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py
index 3919ddfa..6cf22d5f 100644
--- a/hyperion/io/ark_data_reader.py
+++ b/hyperion/io/ark_data_reader.py
@@ -4,15 +4,15 @@
 """
 
 import multiprocessing as threading
-import sys
+from typing import Union, Optional, List, Callable, Tuple
 
 import numpy as np
 
 from ..hyp_defs import float_cpu
-from ..utils.kaldi_io_funcs import (init_kaldi_input_stream, is_token, peek,
-                                    read_token)
+from ..utils.kaldi_io_funcs import init_kaldi_input_stream, is_token, peek, read_token
 from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix
-from ..utils.scp_list import SCPList
+
+from ..utils import FeatureSet, PathLike
 from .data_reader import RandomAccessDataReader, SequentialDataReader
 
 
@@ -27,10 +27,9 @@ class SequentialArkDataReader(SequentialDataReader):
         part_idx: It splits the
input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.f = None self.lock = threading.Lock() @@ -42,7 +41,7 @@ def close(self): self.f.close() self.f = None - def _seek(self, offset): + def _seek(self, offset: int): """Moves the pointer of the input file. Args: @@ -52,7 +51,7 @@ def _seek(self, offset): delta = offset - cur_pos self.f.seek(delta, 1) - def _open_archive(self, file_path, offset=0): + def _open_archive(self, file_path: PathLike, offset: int = 0): """Opens the current file if it is not open and moves the file pointer to a given position. Closes previous open Ark files. @@ -69,7 +68,7 @@ def _open_archive(self, file_path, offset=0): if offset > 0: self._seek(offset) - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -86,7 +85,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -120,10 +119,8 @@ class SequentialArkFileDataReader(SequentialArkDataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): - super(SequentialArkFileDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) + def __init__(self, file_path: PathLike, **kwargs): + super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._eof = False self._keys = None @@ -151,7 +148,7 @@ def keys(self): return self._keys - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -188,7 +185,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -206,12 +209,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): key: List of recording names. data: List of feature matrices/vectors or 3D/2D numpy array. """ - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] count = 0 @@ -264,28 +263,25 @@ class SequentialArkScriptDataReader(SequentialArkDataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. 
""" - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): - super(SequentialArkScriptDataReader, self).__init__( - file_path, permissive=False, **kwargs - ) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): + super().__init__(file_path, permissive=False, **kwargs) + self.feature_set = FeatureSet.load(self.file_path, sep=scp_sep) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) self.cur_item = 0 @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open Ark files and puts the read pointer pointing @@ -295,9 +291,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -318,15 +314,18 @@ def read_shapes(self, num_records=0, assert_same_dim=True): for i in range(num_records): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + offset = feature_spec["storage_byte"] + file_path = feature_spec["storage_path"] self._open_archive(file_path, offset) binary = init_kaldi_input_stream(self.f) shape_i = KaldiMatrix.read_shape(self.f, binary, sequential_mode=True) - - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -338,7 +337,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. 
Args:
@@ -359,12 +364,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0):
 
         if num_records == 0:
-            num_records = len(self.scp) - self.cur_item
+            num_records = len(self.feature_set) - self.cur_item
 
-        row_offset_is_list = isinstance(row_offset, list) or isinstance(
-            row_offset, np.ndarray
-        )
-        num_rows_is_list = isinstance(num_rows, list) or isinstance(
-            num_rows, np.ndarray
-        )
+        row_offset_is_list = isinstance(row_offset, (list, np.ndarray))
+        num_rows_is_list = isinstance(num_rows, (list, np.ndarray))
 
         keys = []
         data = []
@@ -373,7 +374,14 @@
             if self.eof():
                 break
 
-            key, file_path, offset, range_spec = self.scp[self.cur_item]
+            feature_spec = self.feature_set.iloc[self.cur_item]
+            key = feature_spec["id"]
+            offset = feature_spec["storage_byte"]
+            file_path = feature_spec["storage_path"]
+            if "start" in feature_spec and "num_frames" in feature_spec:
+                range_spec = [feature_spec["start"], feature_spec["num_frames"]]
+            else:
+                range_spec = None
 
             row_offset_i = row_offset[i] if row_offset_is_list else row_offset
             num_rows_i = num_rows[i] if num_rows_is_list else num_rows
@@ -417,21 +425,24 @@ class RandomAccessArkDataReader(RandomAccessDataReader):
             features after reading them from disk.
         permissive: If True, if the data that we want to read is not in the file
             it returns an empty matrix, if False it raises an exception.
-        scp_sep: Separator for scp files (default ' ').
     """
 
     def __init__(
-        self, file_path, path_prefix=None, transform=None, permissive=False, scp_sep=" "
+        self,
+        file_path: PathLike,
+        path_prefix: Optional[PathLike] = None,
+        transform: Optional[Callable[[np.array], np.array]] = None,
+        permissive: bool = False,
     ):
-        super(RandomAccessArkDataReader, self).__init__(
-            file_path, transform, permissive
-        )
+        super().__init__(file_path, transform, permissive)
 
-        self.scp = SCPList.load(self.file_path, sep=scp_sep)
+        self.feature_set = FeatureSet.load(self.file_path)
         if path_prefix is not None:
-            self.scp.add_prefix_to_filepath(path_prefix)
+            self.feature_set.add_prefix_to_storage_path(path_prefix)
 
-        archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True)
+        archives, archive_idx = np.unique(
+            self.feature_set["storage_path"], return_inverse=True
+        )
         self.archives = archives
         self.archive_idx = archive_idx
         self.f = [None] * len(self.archives)
@@ -448,7 +459,7 @@ def close(self):
                 f.close()
         self.f = [None] * len(self.f)
 
-    def _open_archive(self, key_idx, offset=0):
+    def _open_archive(self, key_idx: int, offset: int = 0):
         """Opens the Ark file corresponding to a given feature/matrix
         if it is not already open and moves the file pointer to the
         point where we can read that feature matrix.
@@ -473,7 +484,9 @@ def _open_archive(self, key_idx, offset=0):
 
         return f, self.locks[archive_idx]
 
-    def read_num_rows(self, keys, assert_same_dim=True):
+    def read_num_rows(
+        self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True
+    ):
         """Reads the number of rows in the feature matrices of the dataset.
 
         Args:
@@ -489,7 +502,9 @@ def read_num_rows(self, keys, assert_same_dim=True):
         num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=np.int)
         return num_rows
 
-    def read_dims(self, keys, assert_same_dim=True):
+    def read_dims(
+        self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True
+    ):
         """Reads the number of columns in the feature matrices of the dataset.
Args: @@ -507,7 +522,9 @@ def read_dims(self, keys, assert_same_dim=True): assert np.all(dims == dims[0]) return dims - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. Args: @@ -525,25 +542,26 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] f, lock = self._open_archive(index) with lock: f.seek(offset, 0) binary = init_kaldi_input_stream(f) shape_i = KaldiMatrix.read_shape(f, binary, sequential_mode=False) - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) shapes.append(shape_i) @@ -553,7 +571,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -574,12 +598,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -588,15 +608,20 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + offset = feature_spec["storage_byte"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 58f5c0a1..6adf78b2 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -3,15 +3,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +from typing import Union, Optional, List import numpy as np from ..hyp_defs import float_save -from ..utils.kaldi_io_funcs import (init_kaldi_output_stream, is_token, 
- write_token) +from ..utils.kaldi_io_funcs import init_kaldi_output_stream, is_token, write_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList +from ..utils import PathLike from .data_writer import DataWriter @@ -28,11 +27,17 @@ class ArkDataWriter(DataWriter): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). + """ - def __init__(self, archive_path, script_path=None, binary=True, **kwargs): - super(ArkDataWriter, self).__init__(archive_path, script_path, **kwargs) + def __init__( + self, + archive_path: PathLike, + script_path: Optional[PathLike] = None, + binary: bool = True, + **kwargs, + ): + super().__init__(archive_path, script_path, **kwargs) self.binary = binary if binary: @@ -40,10 +45,9 @@ def __init__(self, archive_path, script_path=None, binary=True, **kwargs): else: self.f = open(archive_path, "w") - if script_path is not None: - self.f_script = open(script_path, "w") - else: - self.f_script = None + if script_path is not None and not self.script_is_scp: + row = self.script_sep.join(["id", "storage_path", "storage_byte"]) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -67,7 +71,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts the feature matrix from numpy array to KaldiMatrix or KaldiCompressedMatrix. """ @@ -89,7 +93,11 @@ def _convert_data(self, data): raise ValueError("Data is not ndarray or KaldiMatrix") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + ): """Writes data to file. Args: @@ -114,9 +122,11 @@ def write(self, keys, data): data_i.write(self.f, self.binary) if self.f_script is not None: - self.f_script.write( - "%s%s%s:%d\n" % (key_i, self.scp_sep, self.archive_path, pos) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}:{pos}\n") + else: + row = self.script_sep.join([key_i, self.archive_path, str(pos)]) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 69cfa65b..1052ce8c 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -10,11 +10,13 @@ import subprocess import numpy as np +import pandas as pd import soundfile as sf from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List from ..hyp_defs import float_cpu -from ..utils import SCPList, SegmentList +from ..utils import RecordingSet, SegmentSet, PathLike valid_ext = [ ".wav", @@ -34,7 +36,7 @@ ".sds", ".sf", ".voc", - "w64", + ".w64", ".wve", ".xi", ] @@ -44,38 +46,36 @@ class AudioReader(object): """Class to read audio files from wav, flac or pipe Attributes: - file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. 
- segments_path: segments file with format: segment_id file_id tbeg tend + recordings: RecordingSet or file path to RecordingSet + segments: SegmentSet or file path to SegmentSet wav_scale: multiplies signal by scale factor """ - def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): - self.file_path = file_path - if isinstance(file_path, SCPList): - self.scp = file_path - else: - self.scp = SCPList.load(file_path, sep=" ", is_wav=True) - - self.segments_path = segments_path - if segments_path is None: - self.segments = None - self.with_segments = False - else: + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + ): + if not isinstance(recordings, RecordingSet): + recordings = RecordingSet.load(recordings) + + self.recordings = recordings + + self.with_segments = False + if segments is not None: self.with_segments = True - if isinstance(file_path, SegmentList): - self.segments = segments_path - else: - self.segments = SegmentList.load(segments_path, - sep=" ", - index_by_file=False) + if not isinstance(segments, SegmentSet): + segments = SegmentSet.load(segments) + self.segments = segments self.wav_scale = wav_scale @property def keys(self): if self.with_segments: - return np.asarray(self.segments["segment_id"]) - return self.scp.key + return self.segments["id"].values + return self.recordings["id"].values def __enter__(self): """Function required when entering contructions of type @@ -94,10 +94,12 @@ def __exit__(self, exc_type, exc_value, traceback): pass @staticmethod - def read_wavspecifier(wavspecifier, - scale=2**15, - time_offset=0, - time_dur=0): + def read_wavspecifier( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0.0, + time_dur: float = 0.0, + ): """Reads an audiospecifier (audio_file/pipe) It reads from pipe or from all the files that can be read by `libsndfile ` @@ -113,59 +115,123 @@ def read_wavspecifier(wavspecifier, wavspecifier = wavspecifier.strip() if wavspecifier[-1] == "|": wavspecifier = wavspecifier[:-1] - x, fs = AudioReader.read_pipe(wavspecifier, scale) - if time_offset == 0 and time_dur == 0: - return x, fs - - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - if num_samples == 0: - return x[start_sample:], fs - - end_sample = start_sample + num_samples - assert end_sample <= len(x) - return x[start_sample:end_sample], fs + return AudioReader.read_pipe(wavspecifier, scale, time_offset, time_dur) ext = os.path.splitext(wavspecifier)[1] if ext in valid_ext: - if time_offset == 0 and time_dur == 0: - x, fs = sf.read(wavspecifier, dtype=float_cpu()) - x *= scale - return x, fs - - with sf.SoundFile(wavspecifier, "r") as f: - fs = f.samplerate - start_sample = int(math.floor(time_offset * fs)) - num_samples = int(math.floor(time_dur * fs)) - f.seek(start_sample) - if num_samples > 0: - x = scale * f.read(num_samples, dtype=float_cpu()) - else: - x = scale * f.read(dtype=float_cpu()) - return x, fs + return AudioReader.read_file(wavspecifier, scale, time_offset, time_dur) raise Exception("Unknown format for %s" % (wavspecifier)) @staticmethod - def read_pipe(wavspecifier, scale=2**15): + def read_pipe( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): """Reads wave file from a pipe Args: wavspecifier: Shell command with pipe output scale: Multiplies signal by scale factor """ - # proc = 
subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - proc = subprocess.Popen(wavspecifier, - shell=True, - stdout=subprocess.PIPE) + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) pipe = proc.communicate()[0] if proc.returncode != 0: - raise Exception("Wave read pipe command %s returned code %d" % - (wavspecifier, proc.returncode)) + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) x *= scale - return x, fs + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + @staticmethod + def read_file_sf( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + + return x, fs + + @staticmethod + def read_file( + wavspecifier: PathLike, + scale: float = 2 ** 15, + time_offset: float = 0, + time_dur: float = 0, + ): + try: + return AudioReader.read_file_sf(wavspecifier, scale, time_offset, time_dur) + except: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading keys=%s offset=%f duration=%f" + "retrying reading until end-of-file ..." + ), + wavspecifier, + time_offset, + time_dur, + ) + try: + x, fs = AudioReader.read_file_sf(wavspecifier, scale, time_offset) + num_samples = int(math.floor(time_dur * fs)) + x = x[:num_samples] + return x, fs + except: + logging.info( + ( + "error-2 reading keys=%s offset=%f duration=%f" + "retrying reading full file ..." 
+ ), + wavspecifier, + time_offset, + time_dur, + ) + + x, fs = AudioReader.read_file_sf(wavspecifier, scale) + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + x = x[start_sample : start_sample + num_samples] + return x, fs - def _read_segment(self, segment, time_offset=0, time_dur=0): + def _read_segment( + self, segment: pd.Series, time_offset: float = 0, time_dur: float = 0 + ): """Reads a wave segment Args: @@ -173,28 +239,11 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): Returns: Wave, sampling frequency """ - file_id = segment["file_id"] - t_beg = segment["tbeg"] + time_offset - t_end = segment["tend"] - if time_dur > 0: - t_end_new = t_beg + time_dur - assert t_end_new <= t_end - t_end = t_end_new - - file_path, _, _ = self.scp[file_id] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) - num_samples_i = len(x_i) - s_beg = int(t_beg * fs_i) - if s_beg >= num_samples_i: - raise Exception( - "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" - % (file_id, t_beg, s_beg, file_id, num_samples_i)) - - s_end = int(t_end * fs_i) - if s_end > num_samples_i or t_end < 0: - s_end = num_samples_i - - x_i = x_i[s_beg:s_end] + recording_id = segment["recording_id"] + t_start = segment["start"] + time_offset + t_dur = segment["duration"] + storage_path = self.recordings.loc[recording_id, "storage_path"] + x_i, fs_i = self.read_wavspecifier(storage_path, self.wav_scale, t_start, t_dur) return x_i, fs_i def read(self): @@ -202,27 +251,23 @@ def read(self): class SequentialAudioReader(AudioReader): - def __init__( self, - file_path, - segments_path=None, - wav_scale=2**15 - 1, - part_idx=1, - num_parts=1, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + part_idx: int = 1, + num_parts: int = 1, ): - super().__init__(file_path, segments_path, wav_scale=wav_scale) + super().__init__(recordings, segments, wav_scale=wav_scale) self.cur_item = 0 self.part_idx = part_idx self.num_parts = num_parts if self.num_parts > 1: if self.with_segments: - self.segments = self.segments.split(self.part_idx, - self.num_parts) + self.segments = self.segments.split(self.part_idx, self.num_parts) else: - self.scp = self.scp.split(self.part_idx, - self.num_parts, - group_by_key=False) + self.recordings = self.recordings.split(self.part_idx, self.num_parts) def __iter__(self): """Needed to build an iterator, e.g.: @@ -262,9 +307,9 @@ def eof(self): """ if self.with_segments: return self.cur_item == len(self.segments) - return self.cur_item == len(self.scp) + return self.cur_item == len(self.recordings) - def read(self, num_records=0, time_offset=0, time_durs=0): + def read(self, num_records: int = 0, time_offset: float = 0, time_durs: float = 0): """Reads next num_records audio files Args: @@ -281,7 +326,7 @@ def read(self, num_records=0, time_offset=0, time_durs=0): if self.with_segments: num_records = len(self.segments) - self.cur_item else: - num_records = len(self.scp) - self.cur_item + num_records = len(self.recordings) - self.cur_item offset_is_list = isinstance(time_offset, (list, np.ndarray)) dur_is_list = isinstance(time_durs, (list, np.ndarray)) @@ -297,13 +342,14 @@ def read(self, num_records=0, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - segment = self.segments[self.cur_item] - key = segment["segment_id"] + segment = self.segments.iloc[self.cur_item] + key = 
segment["id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - key, file_path, _, _ = self.scp[self.cur_item] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, - offset_i, dur_i) + key, file_path = self.recordings.iloc[self.cur_item] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) keys.append(key) data.append(x_i) @@ -318,14 +364,14 @@ def filter_args(**kwargs): return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") parser.add_argument( "--wav-scale", - default=2**15 - 1, + default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -334,38 +380,50 @@ def add_class_args(parser, prefix=None): "--part-idx", type=int, default=1, - help=("splits the list of files into num-parts and " - "processes part-idx"), + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), ) parser.add_argument( "--num-parts", type=int, default=1, - help=("splits the list of files into num-parts and " - "processes part-idx"), + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), ) except: pass if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args class RandomAccessAudioReader(AudioReader): + def __init__( + self, + recordings: Union[RecordingSet, PathLike], + segments: Union[SegmentSet, PathLike, None] = None, + wav_scale: float = 2 ** 15 - 1, + ): + super().__init__(recordings, segments, wav_scale) - def __init__(self, file_path, segments_path=None, wav_scale=2**15 - 1): - super().__init__(file_path, segments_path, wav_scale) - - def _read(self, keys, time_offset=0, time_durs=0): + def read( + self, + keys: Union[str, List, np.array], + time_offset: float = 0, + time_durs: float = 0, + ): """Reads the waveforms for the recordings in keys. Args: keys: List of recording/segment_ids names. + time_offset: float or float list with time-offsets + time_durs: float or float list with durations Returns: data: List of waveforms @@ -384,93 +442,92 @@ def _read(self, keys, time_offset=0, time_durs=0): dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: - if not (key in self.segments): + if not (key in self.segments.index): raise Exception("Key %s not found" % key) - segment = self.segments[key] + segment = self.segments.loc[key] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - if not (key in self.scp): + if not (key in self.recordings.index): raise Exception("Key %s not found" % key) - file_path, _, _ = self.scp[key] - x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale, - offset_i, dur_i) + file_path = self.recordings.loc[key, "storage_path"] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) data.append(x_i) fs.append(fs_i) return data, fs - def read(self, keys, time_offset=0, time_durs=0): - """Reads the waveforms for the recordings in keys. - - Args: - keys: List of recording/segment_ids names. - - Returns: - data: List of waveforms - fs: List of sampling freq. 
- """ - try: - x, fs = self._read(keys, - time_offset=time_offset, - time_durs=time_durs) - except: - if isinstance(keys, str): - keys = [keys] - - if not isinstance(time_offset, (list, np.ndarray)): - time_offset = [time_offset] * len(keys) - if not isinstance(time_durs, (list, np.ndarray)): - time_durs = [time_durs] * len(keys) - - try: - # some files produce error in the fseek after reading the data, - # this seems an issue from pysoundfile or soundfile lib itself - # we try to read from - # time-offset to the end of the file, and remove the extra frames later, - # this solves the problem in most cases - logging.info(("error-1 reading at keys={} offset={} " - "retrying reading until end-of-file ...").format( - keys, time_offset)) - x, fs = self._read(keys, time_offset=time_offset) - for i in range(len(x)): - end_sample = int(time_durs[i] * fs[i]) - x[i] = x[i][:end_sample] - except: - # try to read the full file - logging.info(("error-2 reading at key={}, " - "retrying reading full file ...").format(keys)) - x, fs = self._read(keys) - for i in range(len(x)): - start_sample = int(time_offset[i] * fs[i]) - end_sample = start_sample + int(time_durs[i] * fs[i]) - x[i] = x[i][start_sample:end_sample] - - return x, fs + # def read(self, keys, time_offset=0, time_durs=0): + # """Reads the waveforms for the recordings in keys. + + # Args: + # keys: List of recording/segment_ids names. + + # Returns: + # data: List of waveforms + # fs: List of sampling freq. + # """ + # try: + # x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + # except: + # if isinstance(keys, str): + # keys = [keys] + + # if not isinstance(time_offset, (list, np.ndarray)): + # time_offset = [time_offset] * len(keys) + # if not isinstance(time_durs, (list, np.ndarray)): + # time_durs = [time_durs] * len(keys) + + # try: + # logging.info( + # ( + # "error-1 reading at keys={} offset={} " + # "retrying reading until end-of-file ..." + # ).format(keys, time_offset) + # ) + # x, fs = self._read(keys, time_offset=time_offset) + # for i in range(len(x)): + # end_sample = int(time_durs[i] * fs[i]) + # x[i] = x[i][:end_sample] + # except: + # # try to read the full file + # logging.info( + # ( + # "error-2 reading at key={}, " "retrying reading full file ..." 
+ # ).format(keys) + # ) + # x, fs = self._read(keys) + # for i in range(len(x)): + # start_sample = int(time_offset[i] * fs[i]) + # end_sample = start_sample + int(time_durs[i] * fs[i]) + # x[i] = x[i][start_sample:end_sample] + + # return x, fs @staticmethod def filter_args(**kwargs): - valid_args = ("wav_scale", ) + valid_args = ("wav_scale",) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") parser.add_argument( "--wav-scale", - default=2**15 - 1, + default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index f98a3251..e416c209 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -8,12 +8,16 @@ import numpy as np import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from typing import Union, Optional, List +from pathlib import Path from ..hyp_defs import float_cpu from ..utils.kaldi_io_funcs import is_token -from ..utils.scp_list import SCPList +from ..utils import PathLike from .audio_reader import valid_ext + subtype_to_npdtype = { "PCM_32": "int32", "ALAW": "int16", @@ -37,25 +41,23 @@ class AudioWriter(object): Attributes: output_path: output data file path. - script_path: optional output scp file. + script_path: optional output kaldi .scp or pandas .csv file. audio_format: audio file format audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) - scp_sep: Separator for scp files (default ' '). """ def __init__( self, - output_path, - script_path=None, - audio_format="wav", - audio_subtype=None, - scp_sep=" ", + output_path: PathLike, + script_path: Optional[PathLike] = None, + audio_format: str = "wav", + audio_subtype: Optional[str] = None, ): - self.output_path = output_path - self.script_path = script_path + self.output_path = Path(output_path) + self.script_path = Path(script_path) if script_path is not None else None self.audio_format = audio_format - self.scp_sep = scp_sep + self.output_path.mkdir(exist_ok=True, parents=True) assert "." 
+ self.audio_format in valid_ext

         if audio_subtype is None:
@@ -64,16 +66,23 @@ def __init__(
             self.subtype = audio_subtype
             assert sf.check_format(self.audio_format, self.subtype)

-        if not os.path.exists(output_path):
-            try:
-                os.makedirs(output_path)
-            except FileExistsError:
-                pass
-
+        self.script_is_scp = False
+        self.script_sep = None
+        self.f_script = None
         if script_path is not None:
-            self.f_script = open(script_path, "w")
-        else:
-            self.f_script = None
+            self.script_path.parent.mkdir(exist_ok=True, parents=True)
+            script_ext = self.script_path.suffix
+            self.script_is_scp = script_ext == ".scp"
+
+            if self.script_is_scp:
+                self.f_script = open(self.script_path, "w")
+            else:
+                self.script_sep = "," if script_ext == ".csv" else "\t"
+                self.f_script = open(self.script_path, "w", encoding="utf-8")
+                row = self.script_sep.join(
+                    ["id", "storage_path", "duration", "sample_freq"]
+                )
+                self.f_script.write(f"{row}\n")

     def __enter__(self):
         """Function required when entering contructions of type
@@ -96,7 +105,12 @@ def close(self):
         if self.f_script is not None:
             self.f_script.close()

-    def write(self, keys, data, fs):
+    def write(
+        self,
+        keys: Union[str, List[str], np.array],
+        data: Union[np.array, List[np.array]],
+        fs: Union[int, float, List[int], List[float], np.array],
+    ):
         """Writes waveform to audio file.

         Args:
@@ -120,14 +134,21 @@ def write(self, keys, data, fs):
                 file_basename,
                 self.audio_format,
             )
-            fs_i = fs[i] if fs_is_list else fs
+            fs_i = int(fs[i]) if fs_is_list else fs
             data_i = data[i].astype(dtype, copy=False)
             sf.write(output_file, data_i, fs_i, subtype=self.subtype)
             output_files.append(output_file)
             if self.f_script is not None:
-                self.f_script.write("%s%s%s\n" % (key_i, self.scp_sep, output_file))
+                if self.script_is_scp:
+                    self.f_script.write(f"{key_i} {output_file}\n")
+                else:
+                    duration_i = data_i.shape[-1] / fs_i
+                    row = self.script_sep.join(
+                        [key_i, output_file, str(duration_i), str(fs_i)]
+                    )
+                    self.f_script.write(f"{row}\n")
                 self.f_script.flush()

         return output_files
@@ -146,29 +167,30 @@ def filter_args(**kwargs):

     @staticmethod
     def add_class_args(parser, prefix=None):
-        if prefix is None:
-            p1 = "--"
-        else:
-            p1 = "--" + prefix + "." 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") # parser.add_argument(p1+'output-wav-scale', default=1, type=float, # help=('scale to divide the waveform before writing')) parser.add_argument( - p1 + "output-audio-format", + "--output-audio-format", default="flac", choices=["flac", "ogg", "wav"], help=("ouput audio format"), ) parser.add_argument( - p1 + "output-audio-subtype", + "--output-audio-subtype", default=None, choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], help=("coding format for audio file"), ) - # parser.add_argument(p1+'output-fs', default=16000, type=int, - # help=('output sample frequency')) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) add_argparse_args = add_class_args diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index e4e64777..82e2a0c5 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -18,13 +18,12 @@ def __init__( self, rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, ): - r = DRF.create(rspecifier, path_prefix, scp_sep=scp_sep) + r = DRF.create(rspecifier, path_prefix) super().__init__(r.file_path, r.permissive) self.r = r self.frame_shift = frame_shift diff --git a/hyperion/io/data_reader.py b/hyperion/io/data_reader.py index bbefa62d..73c120b5 100644 --- a/hyperion/io/data_reader.py +++ b/hyperion/io/data_reader.py @@ -6,18 +6,24 @@ import logging import multiprocessing from abc import ABCMeta, abstractmethod +from typing import Union, Optional, List, Callable, Tuple import numpy as np from ..hyp_defs import float_cpu from ..np.transforms import TransformList -from ..utils.scp_list import SCPList +from ..utils import PathLike class DataReader(object): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files. Attributes: @@ -57,7 +63,7 @@ def close(self): pass @staticmethod - def _squeeze(data, permissive=False): + def _squeeze(data: np.array, permissive: bool = False): """Converts list of matrices to 3D numpy array or list of vectors to 2D numpy array. @@ -121,7 +127,7 @@ def _combine_ranges(read_range, row_offset, num_rows): return row_offset, num_rows @staticmethod - def _apply_range_to_shape(shape, row_offset, num_rows): + def _apply_range_to_shape(shape: Tuple[int, int], row_offset: int, num_rows: int): """Modifies shape given the user defined row_offset and num_rows to read. If we are reading a matrix of shape (100,4) and row_offset=10, num_rows=20, it returns (20,4). @@ -158,25 +164,22 @@ class SequentialDataReader(DataReader): part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. 
""" __metaclass__ = ABCMeta def __init__( self, - file_path, - transform=None, - permissive=False, - part_idx=1, - num_parts=1, - split_by_key=False, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + part_idx: int = 1, + num_parts: int = 1, ): super().__init__(file_path, transform, permissive) self.lock = multiprocessing.Lock() self.part_idx = part_idx self.num_parts = num_parts - self.split_by_key = split_by_key def __iter__(self): """Needed to build an iterator, e.g.: @@ -218,7 +221,7 @@ def eof(self): return False @abstractmethod - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -234,7 +237,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -250,7 +253,7 @@ def read_dims(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -266,7 +269,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): pass @abstractmethod - def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -290,7 +299,12 @@ def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): class RandomAccessDataReader(DataReader): __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): """Abstract base class to read Ark or hdf5 feature files in random order. @@ -305,7 +319,7 @@ def __init__(self, file_path, transform=None, permissive=False): super().__init__(file_path, transform, permissive) @abstractmethod - def read_num_rows(self, keys=None, assert_same_dim=True): + def read_num_rows(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -320,7 +334,7 @@ def read_num_rows(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_dims(self, keys=None, assert_same_dim=True): + def read_dims(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -335,7 +349,7 @@ def read_dims(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read_shapes(self, keys=None, assert_same_dim=True): + def read_shapes(self, keys: Union[str, List[str]], assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -350,7 +364,13 @@ def read_shapes(self, keys=None, assert_same_dim=True): pass @abstractmethod - def read(self, keys, squeeze=False, offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str]], + squeeze: bool = False, + offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index 7868baae..b56e8c27 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -4,10 +4,13 @@ """ import logging +from typing import Union, Optional, List, Callable, Tuple from jsonargparse import ActionParser, ArgumentParser +import numpy as np from ..utils.kaldi_matrix import compression_methods +from ..utils import PathLike from .ark_data_reader import RandomAccessArkDataReader as RADR from .ark_data_reader import SequentialArkFileDataReader as SAFDR from .ark_data_reader import SequentialArkScriptDataReader as SASDR @@ -17,8 +20,7 @@ from .h5_data_reader import SequentialH5FileDataReader as SH5FDR from .h5_data_reader import SequentialH5ScriptDataReader as SH5SDR from .h5_data_writer import H5DataWriter as H5DW -from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, - WSpecType) +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType class DataWriterFactory(object): @@ -27,7 +29,9 @@ class DataWriterFactory(object): """ @staticmethod - def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): + def create( + wspecifier: PathLike, compress: bool = False, compression_method: str = "auto" + ): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) @@ -43,7 +47,6 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, ) else: return ADW( @@ -53,21 +56,19 @@ def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): flush=wspecifier.flush, compress=compress, compression_method=compression_method, - scp_sep=scp_sep, ) @staticmethod def filter_args(**kwargs): - valid_args = ("scp_sep", "compress", "compression_method") + valid_args = ("compress", "compression_method") return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod - def add_class_args(parser, prefix=None): + def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") - parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) parser.add_argument("--compress", default=False, action="store_true") parser.add_argument( "--compression-method", default="auto", choices=compression_methods @@ -80,7 +81,7 @@ def add_class_args(parser, prefix=None): class SequentialDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): + def create(rspecifier: PathLike, path_prefix: Optional[PathLike] = None, **kwargs): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) @@ -92,27 +93,21 @@ def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): return SAFDR(rspecifier.archive, **kwargs) else: if rspecifier.archive_type == ArchiveType.H5: - return SH5SDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return SH5SDR(rspecifier.script, path_prefix, **kwargs) else: - return SASDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) + return 
SASDR(rspecifier.script, path_prefix, **kwargs)

     @staticmethod
     def filter_args(**kwargs):
-        valid_args = ("scp_sep", "path_prefix", "part_idx", "num_parts")
+        valid_args = ("path_prefix", "part_idx", "num_parts")
         return dict((k, kwargs[k]) for k in valid_args if k in kwargs)

     @staticmethod
-    def add_class_args(parser, prefix=None):
+    def add_class_args(parser, prefix: Optional[PathLike] = None):
         if prefix is not None:
             outer_parser = parser
             parser = ArgumentParser(prog="")

-        try:
-            parser.add_argument(
-                "--scp-sep", default=" ", help=("scp file field separator")
-            )
-        except:
-            pass
         parser.add_argument(
             "--path-prefix", default=None, help=("scp file_path prefix")
         )
@@ -139,7 +134,11 @@ def add_class_args(parser, prefix=None):

 class RandomAccessDataReaderFactory(object):
     @staticmethod
-    def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "):
+    def create(
+        rspecifier: PathLike,
+        path_prefix: Optional[PathLike] = None,
+        transform: Optional[Callable[[np.array], np.array]] = None,
+    ):
         if isinstance(rspecifier, str):
             rspecifier = RSpecifier.create(rspecifier)
         logging.debug(rspecifier.__dict__)
@@ -162,7 +161,6 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "):
                     path_prefix,
                     transform=transform,
                     permissive=rspecifier.permissive,
-                    scp_sep=scp_sep,
                 )
             else:
                 return RADR(
@@ -170,26 +168,19 @@ def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "):
                     path_prefix,
                     transform=transform,
                     permissive=rspecifier.permissive,
-                    scp_sep=scp_sep,
                 )

     @staticmethod
     def filter_args(**kwargs):
-        valid_args = ("scp_sep", "path_prefix")
+        valid_args = ("path_prefix",)
         return dict((k, kwargs[k]) for k in valid_args if k in kwargs)

     @staticmethod
-    def add_class_args(parser, prefix=None):
+    def add_class_args(parser, prefix: Optional[PathLike] = None):
         if prefix is not None:
             outer_parser = parser
             parser = ArgumentParser(prog="")

-        try:
-            parser.add_argument(
-                "--scp-sep", default=" ", help=("scp file field separator")
-            )
-        except:
-            pass
         parser.add_argument(
             "--path-prefix", default=None, help=("scp file_path prefix")
         )
diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py
index cf2bb4f9..8adbf87a 100644
--- a/hyperion/io/data_writer.py
+++ b/hyperion/io/data_writer.py
@@ -5,9 +5,13 @@

 import os
 from abc import ABCMeta, abstractmethod
+from typing import Union, Optional, List
+from pathlib import Path

+import numpy as np

+from ..utils import PathLike

-class DataWriter(object):
+class DataWriter:
     """Abstract base class to write Ark or hdf5 feature files.

     Attributes:
@@ -19,35 +23,42 @@ class DataWriter(object):
       {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer,
       1byte-auto, 1byte-unsigned-integer, 1byte-0-1}.
-      scp_sep: Separator for scp files (default ' ').
     """

     __metaclass__ = ABCMeta

     def __init__(
         self,
-        archive_path,
-        script_path=None,
-        flush=False,
-        compress=False,
-        compression_method="auto",
-        scp_sep=" ",
+        archive_path: PathLike,
+        script_path: Optional[PathLike] = None,
+        flush: bool = False,
+        compress: bool = False,
+        compression_method: str = "auto",
     ):
-        self.archive_path = archive_path
-        self.script_path = script_path
+        self.archive_path = Path(archive_path)
+        self.script_path = Path(script_path) if script_path is not None else None
         self._flush = flush
         self.compress = compress
         self.compression_method = compression_method
-        self.scp_sep = scp_sep

-        archive_dir = os.path.dirname(archive_path)
-        if not os.path.exists(archive_dir):
-            os.makedirs(archive_dir)
+        archive_dir = self.archive_path.parent
+        archive_dir.mkdir(exist_ok=True, parents=True)

+        self.script_is_scp = False
+        self.script_sep = None
+        self.f_script = None
         if script_path is not None:
-            script_dir = os.path.dirname(script_path)
-            if not os.path.exists(script_dir):
-                os.makedirs(script_dir)
+            self.script_path.parent.mkdir(exist_ok=True, parents=True)
+            script_ext = self.script_path.suffix
+            self.script_is_scp = script_ext == ".scp"
+
+            if self.script_is_scp:
+                self.f_script = open(self.script_path, "w")
+            else:
+                self.script_sep = "," if script_ext == ".csv" else "\t"
+                self.f_script = open(self.script_path, "w", encoding="utf-8")
+                row = self.script_sep.join(["id", "storage_path"])
+                self.f_script.write(f"{row}\n")

     def __enter__(self):
         """Function required when entering contructions of type
@@ -77,7 +88,11 @@ def flush(self):
         pass

     @abstractmethod
-    def write(self, key, data):
+    def write(
+        self,
+        keys: Union[str, List[str], np.array],
+        data: Union[np.array, List[np.array]],
+    ):
         """Writes data to file.

         Args:
diff --git a/hyperion/io/h5_data_reader.py b/hyperion/io/h5_data_reader.py
index dfefbec3..d509504d 100644
--- a/hyperion/io/h5_data_reader.py
+++ b/hyperion/io/h5_data_reader.py
@@ -6,8 +6,8 @@
 """

 import multiprocessing
-import sys
 import time
+from typing import Union, Optional, List, Callable, Tuple

 import h5py
 import numpy as np
@@ -16,11 +16,18 @@
 from ..utils.kaldi_io_funcs import is_token
 from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix
 from ..utils.list_utils import split_list, split_list_group_by_key
-from ..utils.scp_list import SCPList
+
+# from ..utils.scp_list import SCPList
+from ..utils import FeatureSet, PathLike
 from .data_reader import RandomAccessDataReader, SequentialDataReader


-def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None):
+def _read_h5_data(
+    dset,
+    row_offset: int = 0,
+    num_rows: int = 0,
+    transform: Optional[Callable[[np.array], np.array]] = None,
+):
     """Auxiliary function to read the feature matrix from hdf5 dataset.
        It decompresses the data if it was compressed.
@@ -74,7 +81,7 @@ class SequentialH5DataReader(SequentialDataReader):
       split_by_key: If True, all the elements with the same key go to the same part.
     """

-    def __init__(self, file_path, **kwargs):
+    def __init__(self, file_path: PathLike, **kwargs):
         super().__init__(file_path, **kwargs)
         self.f = None
         self.cur_file = None
@@ -86,7 +93,7 @@ def close(self):
             self.f.close()
             self.f = None

-    def _open_archive(self, file_path):
+    def _open_archive(self, file_path: PathLike):
        """Opens the hdf5 file where the next matrix/vector is if it is not open.
           If there was another hdf5 file open, it closes it.
@@ -96,7 +103,7 @@ def _open_archive(self, file_path): self.cur_file = file_path self.f = h5py.File(file_path, "r") - def read_num_rows(self, num_records=0, assert_same_dim=True): + def read_num_rows(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -113,7 +120,7 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - def read_dims(self, num_records=0, assert_same_dim=True): + def read_dims(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -147,7 +154,7 @@ class SequentialH5FileDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._keys = list(self.f.keys()) @@ -172,7 +179,7 @@ def eof(self): """Returns True when it reaches the end of the ark file.""" return self.cur_item == len(self._keys) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -204,7 +211,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -225,12 +238,8 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self._keys) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] with self.lock: @@ -268,7 +277,6 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): the scp file. This is useful when data is read from a different directory of that it was created. - scp_sep: Separator for scp files (default ' '). transform: TransformList object, applies a transformation to the features after reading them from disk. part_idx: It splits the input into num_parts and writes only @@ -277,20 +285,20 @@ class SequentialH5ScriptDataReader(SequentialH5DataReader): split_by_key: If True, all the elements with the same key go to the same part. 
""" - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, permissive=False, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: - self.scp = self.scp.split( - self.part_idx, self.num_parts, group_by_key=self.split_by_key - ) + self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) @property def keys(self): - return self.scp.key + return self.feature_set["id"] def reset(self): """Closes all the open hdf5 files and puts the read pointer pointing @@ -300,9 +308,9 @@ def reset(self): def eof(self): """Returns True when all the elements in the scp have been read.""" - return self.cur_item == len(self.scp) + return self.cur_item == len(self.feature_set) - def read_shapes(self, num_records=0, assert_same_dim=True): + def read_shapes(self, num_records: int = 0, assert_same_dim: bool = True): """Reads the shapes in the feature matrices of the dataset. Args: @@ -316,7 +324,7 @@ def read_shapes(self, num_records=0, assert_same_dim=True): List of tuples with num_records shapes. """ if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item keys = [] shapes = [] @@ -324,14 +332,15 @@ def read_shapes(self, num_records=0, assert_same_dim=True): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - - self._open_archive(file_path) + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + self._open_archive(feature_spec["storage_path"]) shape_i = self.f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) keys.append(key) shapes.append(shape_i) @@ -343,7 +352,13 @@ def read_shapes(self, num_records=0, assert_same_dim=True): return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + num_records: int = 0, + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads next num_records feature matrices/vectors. Args: @@ -362,14 +377,10 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): data: List of feature matrices/vectors or 3D/2D numpy array. 
""" if num_records == 0: - num_records = len(self.scp) - self.cur_item + num_records = len(self.feature_set) - self.cur_item - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) keys = [] data = [] @@ -378,7 +389,13 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key, file_path, offset, range_spec = self.scp[self.cur_item] + feature_spec = self.feature_set.iloc[self.cur_item] + key = feature_spec["id"] + file_path = feature_spec["storage_path"] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -413,11 +430,18 @@ class RandomAccessH5DataReader(RandomAccessDataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, transform=None, permissive=False): + def __init__( + self, + file_path: PathLike, + transform: Optional[Callable[[np.array], np.array]] = None, + permissive: bool = False, + ): super().__init__(file_path, transform, permissive) self.f = None - def read_num_rows(self, keys, assert_same_dim=True): + def read_num_rows( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of rows in the feature matrices of the dataset. Args: @@ -433,7 +457,9 @@ def read_num_rows(self, keys, assert_same_dim=True): num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return num_rows - def read_dims(self, keys, assert_same_dim=True): + def read_dims( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the number of columns in the feature matrices of the dataset. Args: @@ -463,7 +489,7 @@ class RandomAccessH5FileDataReader(RandomAccessH5DataReader): it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, **kwargs): + def __init__(self, file_path: PathLike, **kwargs): super().__init__(file_path, **kwargs) self.lock = multiprocessing.Lock() self._open_archive(file_path) @@ -474,7 +500,7 @@ def close(self): self.f.close() self.f = None - def _open_archive(self, file_path): + def _open_archive(self, file_path: PathLike): """Open the hdf5 file it it is not open.""" if self.f is None: self.close() @@ -484,7 +510,9 @@ def _open_archive(self, file_path): def keys(self): return list(self.f.keys()) - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. Args: @@ -518,7 +546,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. 
Args: @@ -539,12 +573,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -589,17 +619,20 @@ class RandomAccessH5ScriptDataReader(RandomAccessH5DataReader): features after reading them from disk. permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + def __init__( + self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs + ): super().__init__(file_path, **kwargs) - self.scp = SCPList.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if path_prefix is not None: - self.scp.add_prefix_to_filepath(path_prefix) + self.feature_set.add_prefix_to_storage_path(path_prefix) - archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique( + self.feature_set["storage_path"], return_inverse=True + ) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) @@ -614,9 +647,9 @@ def close(self): @property def keys(self): - return self.scp.key + return self.feature_set["id"] - def _open_archive(self, key_idx): + def _open_archive(self, key_idx: int): """Opens the hdf5 file correspoding to a given feature/matrix if it is not already open. @@ -633,7 +666,9 @@ def _open_archive(self, key_idx): return self.f[archive_idx], self.locks[archive_idx] - def read_shapes(self, keys, assert_same_dim=True): + def read_shapes( + self, keys: Union[str, List[str], np.array], assert_same_dim: bool = True + ): """Reads the shapes in the feature matrices of the dataset. 
Args: @@ -651,18 +686,15 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: shapes.append((0,)) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] - - row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) - + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] f, lock = self._open_archive(index) if not (key in f): if self.permissive: @@ -673,8 +705,12 @@ def read_shapes(self, keys, assert_same_dim=True): with lock: shape_i = f[key].shape - shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) - # print('%s %d %.2f' % (key,time.time()-t1, len(shapes)/len(keys)*100.)) + + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + shapes.append(shape_i) if assert_same_dim: @@ -683,7 +719,13 @@ def read_shapes(self, keys, assert_same_dim=True): return shapes - def read(self, keys, squeeze=False, row_offset=0, num_rows=0): + def read( + self, + keys: Union[str, List[str], np.array], + squeeze: bool = False, + row_offset: int = 0, + num_rows: int = 0, + ): """Reads the feature matrices/vectors for the recordings in keys. Args: @@ -704,12 +746,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = isinstance(row_offset, list) or isinstance( - row_offset, np.ndarray - ) - num_rows_is_list = isinstance(num_rows, list) or isinstance( - num_rows, np.ndarray - ) + row_offset_is_list = isinstance(row_offset, (list, np.ndarray)) + num_rows_is_list = isinstance(num_rows, (list, np.ndarray)) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -718,15 +756,19 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - if not (key in self.scp): + if not (key in self.feature_set.index): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: raise Exception("Key %s not found" % key) - index = self.scp.get_index(key) - _, file_path, offset, range_spec = self.scp[index] + index = self.feature_set.get_loc(key) + feature_spec = self.feature_set.loc[key] + if "start" in feature_spec and "num_frames" in feature_spec: + range_spec = [feature_spec["start"], feature_spec["num_frames"]] + else: + range_spec = None row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index fed91d1e..c34aa0ca 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import sys +from typing import Union, Optional, List import h5py import numpy as np @@ -11,7 +11,7 @@ from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import is_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix -from ..utils.scp_list import SCPList +from ..utils import PathLike from .data_writer import DataWriter @@ -27,18 +27,18 @@ class H5DataWriter(DataWriter): {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 
1byte-unsigned-integer, 1byte-0-1}. - scp_sep: Separator for scp files (default ' '). """ - def __init__(self, archive_path, script_path=None, **kwargs): + def __init__( + self, archive_path: PathLike, script_path: Optional[PathLike] = None, **kwargs + ): super().__init__(archive_path, script_path, **kwargs) self.f = h5py.File(archive_path, "w") - if script_path is None: - self.f_script = None - else: - self.f_script = open(script_path, "w") + if script_path is not None and not self.script_is_scp: + row = self.script_sep.join(["id", "storage_path"]) + self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type @@ -64,7 +64,7 @@ def flush(self): if self.f_script is not None: self.f_script.flush() - def _convert_data(self, data): + def _convert_data(self, data: np.array): """Converts data to the format for saving. Compresses the data it needed. Args: @@ -85,7 +85,11 @@ def _convert_data(self, data): else: raise ValueError("Data is not ndarray") - def write(self, keys, data): + def write( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + ): """Writes data to file. Args: @@ -108,9 +112,11 @@ def write(self, keys, data): dset.attrs[k] = v if self.f_script is not None: - self.f_script.write( - "%s%s%s\n" % (key_i, self.scp_sep, self.archive_path) - ) + if self.script_is_scp: + self.f_script.write(f"{key_i} {self.archive_path}\n") + else: + row = self.script_sep.join([key_i, self.archive_path]) + self.f_script.write(f"{row}\n") if self._flush: self.flush() diff --git a/hyperion/io/old_audio_reader.py b/hyperion/io/old_audio_reader.py new file mode 100644 index 00000000..341f04a4 --- /dev/null +++ b/hyperion/io/old_audio_reader.py @@ -0,0 +1,477 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import io +import logging +import math +import os +import subprocess + +import numpy as np +import soundfile as sf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ..hyp_defs import float_cpu +from ..utils import SCPList, SegmentList + +valid_ext = [ + ".wav", + ".flac", + ".ogg", + ".au", + ".avr", + ".caf", + ".htk", + ".iff", + ".mat", + ".mpc", + ".oga", + ".pvf", + ".rf64", + ".sd2", + ".sds", + ".sf", + ".voc", + "w64", + ".wve", + ".xi", +] + + +class AudioReader(object): + """Class to read audio files from wav, flac or pipe + + Attributes: + file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. 
+ segments_path: segments file with format: segment_id file_id tbeg tend + wav_scale: multiplies signal by scale factor + """ + + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + self.file_path = file_path + if isinstance(file_path, SCPList): + self.scp = file_path + else: + self.scp = SCPList.load(file_path, sep=" ", is_wav=True) + + self.segments_path = segments_path + if segments_path is None: + self.segments = None + self.with_segments = False + else: + self.with_segments = True + if isinstance(file_path, SegmentList): + self.segments = segments_path + else: + self.segments = SegmentList.load( + segments_path, sep=" ", index_by_file=False + ) + + self.wav_scale = wav_scale + + @property + def keys(self): + if self.with_segments: + return np.asarray(self.segments["segment_id"]) + return self.scp.key + + def __enter__(self): + """Function required when entering contructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + return self + + def __exit__(self, exc_type, exc_value, traceback): + """Function required when exiting from contructions of type + + with AudioReader('file.h5') as f: + keys, data = f.read() + """ + pass + + @staticmethod + def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): + """Reads an audiospecifier (audio_file/pipe) + It reads from pipe or from all the files that can be read + by `libsndfile ` + + Args: + wavspecifier: A pipe, wav, flac, ogg file etc. + scale: Multiplies signal by scale factor + time_offset: float indicating the start time to read in the utterance. + time_durs: floats indicating the number of seconds to read from the utterance, + if 0 it reads untils the end + + """ + wavspecifier = wavspecifier.strip() + if wavspecifier[-1] == "|": + wavspecifier = wavspecifier[:-1] + x, fs = AudioReader.read_pipe(wavspecifier, scale) + if time_offset == 0 and time_dur == 0: + return x, fs + + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + if num_samples == 0: + return x[start_sample:], fs + + end_sample = start_sample + num_samples + assert end_sample <= len(x) + return x[start_sample:end_sample], fs + + ext = os.path.splitext(wavspecifier)[1] + if ext in valid_ext: + if time_offset == 0 and time_dur == 0: + x, fs = sf.read(wavspecifier, dtype=float_cpu()) + x *= scale + return x, fs + + with sf.SoundFile(wavspecifier, "r") as f: + fs = f.samplerate + start_sample = int(math.floor(time_offset * fs)) + num_samples = int(math.floor(time_dur * fs)) + f.seek(start_sample) + if num_samples > 0: + x = scale * f.read(num_samples, dtype=float_cpu()) + else: + x = scale * f.read(dtype=float_cpu()) + return x, fs + + raise Exception("Unknown format for %s" % (wavspecifier)) + + @staticmethod + def read_pipe(wavspecifier, scale=2 ** 15): + """Reads wave file from a pipe + Args: + wavspecifier: Shell command with pipe output + scale: Multiplies signal by scale factor + """ + # proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) + pipe = proc.communicate()[0] + if proc.returncode != 0: + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) + x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) + x *= scale + return x, fs + + def _read_segment(self, segment, time_offset=0, time_dur=0): + """Reads a wave segment + + Args: + segment: pandas DataFrame (segment_id , file_id, 
tbeg, tend) + Returns: + Wave, sampling frequency + """ + file_id = segment["file_id"] + t_beg = segment["tbeg"] + time_offset + t_end = segment["tend"] + if time_dur > 0: + t_end_new = t_beg + time_dur + assert t_end_new <= t_end + t_end = t_end_new + + file_path, _, _ = self.scp[file_id] + x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale) + num_samples_i = len(x_i) + s_beg = int(t_beg * fs_i) + if s_beg >= num_samples_i: + raise Exception( + "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" + % (file_id, t_beg, s_beg, file_id, num_samples_i) + ) + + s_end = int(t_end * fs_i) + if s_end > num_samples_i or t_end < 0: + s_end = num_samples_i + + x_i = x_i[s_beg:s_end] + return x_i, fs_i + + def read(self): + pass + + +class SequentialAudioReader(AudioReader): + def __init__( + self, + file_path, + segments_path=None, + wav_scale=2 ** 15 - 1, + part_idx=1, + num_parts=1, + ): + super().__init__(file_path, segments_path, wav_scale=wav_scale) + self.cur_item = 0 + self.part_idx = part_idx + self.num_parts = num_parts + if self.num_parts > 1: + if self.with_segments: + self.segments = self.segments.split(self.part_idx, self.num_parts) + else: + self.scp = self.scp.split( + self.part_idx, self.num_parts, group_by_key=False + ) + + def __iter__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key, s, fs in r: + print(key) + process(s) + """ + return self + + def __next__(self): + """Needed to build an iterator, e.g.: + r = SequentialAudioReader(...) + for key , s, fs in r: + process(s) + """ + key, x, fs = self.read(1) + if len(key) == 0: + raise StopIteration + return key[0], x[0], fs[0] + + def next(self): + """__next__ for Python 2""" + return self.__next__() + + def reset(self): + """Returns the file pointer to the begining of the dataset, + then we can start reading the features again. + """ + self.cur_item = 0 + + def eof(self): + """End of file. + + Returns: + True, when we have read all the recordings in the dataset. + """ + if self.with_segments: + return self.cur_item == len(self.segments) + return self.cur_item == len(self.scp) + + def read(self, num_records=0, time_offset=0, time_durs=0): + """Reads next num_records audio files + + Args: + num_records: Number of audio files to read. + time_offset: List of floats indicating the start time to read in the utterance. + time_durs: List of floats indicating the number of seconds to read from each utterance + + Returns: + key: List of recording names. 
+ data: List of waveforms + fs: list of sample freqs + """ + if num_records == 0: + if self.with_segments: + num_records = len(self.segments) - self.cur_item + else: + num_records = len(self.scp) - self.cur_item + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + keys = [] + data = [] + fs = [] + for i in range(num_records): + if self.eof(): + break + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + segment = self.segments[self.cur_item] + key = segment["segment_id"] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + key, file_path, _, _ = self.scp[self.cur_item] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + keys.append(key) + data.append(x_i) + fs.append(fs_i) + self.cur_item += 1 + + return keys, data, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("part_idx", "num_parts", "wav_scale") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + try: + parser.add_argument( + "--part-idx", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + parser.add_argument( + "--num-parts", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args + + +class RandomAccessAudioReader(AudioReader): + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): + super().__init__(file_path, segments_path, wav_scale) + + def _read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + """ + if isinstance(keys, str): + keys = [keys] + + offset_is_list = isinstance(time_offset, (list, np.ndarray)) + dur_is_list = isinstance(time_durs, (list, np.ndarray)) + + data = [] + fs = [] + for i, key in enumerate(keys): + + offset_i = time_offset[i] if offset_is_list else time_offset + dur_i = time_durs[i] if dur_is_list else time_durs + + if self.with_segments: + if not (key in self.segments): + raise Exception("Key %s not found" % key) + + segment = self.segments[key] + x_i, fs_i = self._read_segment(segment, offset_i, dur_i) + else: + if not (key in self.scp): + raise Exception("Key %s not found" % key) + + file_path, _, _ = self.scp[key] + x_i, fs_i = self.read_wavspecifier( + file_path, self.wav_scale, offset_i, dur_i + ) + + data.append(x_i) + fs.append(fs_i) + + return data, fs + + def read(self, keys, time_offset=0, time_durs=0): + """Reads the waveforms for the recordings in keys. + + Args: + keys: List of recording/segment_ids names. + + Returns: + data: List of waveforms + fs: List of sampling freq. 
+ """ + try: + x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) + except: + if isinstance(keys, str): + keys = [keys] + + if not isinstance(time_offset, (list, np.ndarray)): + time_offset = [time_offset] * len(keys) + if not isinstance(time_durs, (list, np.ndarray)): + time_durs = [time_durs] * len(keys) + + try: + # some files produce error in the fseek after reading the data, + # this seems an issue from pysoundfile or soundfile lib itself + # we try to read from + # time-offset to the end of the file, and remove the extra frames later, + # this solves the problem in most cases + logging.info( + ( + "error-1 reading at keys={} offset={} " + "retrying reading until end-of-file ..." + ).format(keys, time_offset) + ) + x, fs = self._read(keys, time_offset=time_offset) + for i in range(len(x)): + end_sample = int(time_durs[i] * fs[i]) + x[i] = x[i][:end_sample] + except: + # try to read the full file + logging.info( + ( + "error-2 reading at key={}, " "retrying reading full file ..." + ).format(keys) + ) + x, fs = self._read(keys) + for i in range(len(x)): + start_sample = int(time_offset[i] * fs[i]) + end_sample = start_sample + int(time_durs[i] * fs[i]) + x[i] = x[i][start_sample:end_sample] + + return x, fs + + @staticmethod + def filter_args(**kwargs): + valid_args = ("wav_scale",) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, action=ActionParser(parser=parser), + ) + + add_argparse_args = add_class_args diff --git a/hyperion/io/vad_rw_factory.py b/hyperion/io/vad_rw_factory.py index 32032d1d..fff1ab4a 100644 --- a/hyperion/io/vad_rw_factory.py +++ b/hyperion/io/vad_rw_factory.py @@ -6,8 +6,7 @@ import logging from .bin_vad_reader import BinVADReader as BVR -from .rw_specifiers import (ArchiveType, RSpecifier, RSpecType, WSpecifier, - WSpecType) +from .rw_specifiers import ArchiveType, RSpecifier, RSpecType, WSpecifier, WSpecType from .segment_vad_reader import SegmentVADReader as SVR @@ -16,7 +15,6 @@ class VADReaderFactory(object): def create( rspecifier, path_prefix=None, - scp_sep=" ", frame_length=25, frame_shift=10, snip_edges=False, @@ -33,7 +31,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -48,7 +45,6 @@ def create( return BVR( rspecifier, path_prefix, - scp_sep, frame_length=frame_length, frame_shift=frame_shift, snip_edges=snip_edges, @@ -57,7 +53,6 @@ def create( @staticmethod def filter_args(**kwargs): valid_args = ( - "scp_sep", "path_prefix", "frame_shift", "frame_length", @@ -72,9 +67,6 @@ def add_class_args(parser, prefix=None): else: p1 = "--" + prefix + "." 
-        parser.add_argument(
-            p1 + "scp-sep", default=" ", help=("scp file field separator")
-        )
         parser.add_argument(
             p1 + "path-prefix", default=None, help=("scp file_path prefix")
         )
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py
index 1e42a1c3..fa675fdb 100644
--- a/hyperion/torch/data/audio_dataset.py
+++ b/hyperion/torch/data/audio_dataset.py
@@ -9,7 +9,8 @@

 import numpy as np
 import pandas as pd
-#import k2
+
+# import k2
 import sentencepiece as spm
 import torchaudio.transforms as tat
 from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
@@ -25,16 +26,11 @@
 from ...utils.text import read_text
 from ..torch_defs import floatstr_torch

-#from torch.nn.utils.rnn import pad_sequence
-
-
-
 class AudioDataset(Dataset):
-
     def __init__(
         self,
-        audio_file,
+        recordings_file,
         segments_file,
         class_names=None,
         class_files=None,
@@ -46,7 +42,7 @@ def __init__(
         return_segment_info=None,
         return_orig=False,
         target_sample_freq=None,
-        wav_scale=2**15 - 1,
+        wav_scale=2 ** 15 - 1,
         is_val=False,
     ):

@@ -61,12 +57,6 @@ def __init__(
         self.rank = rank
         self.world_size = world_size
         self.epoch = 0
-
-        if rank == 0:
-            logging.info("opening audio reader %s", audio_file)
-
-        self.r = AR(audio_file, wav_scale=wav_scale)
-
         if rank == 0:
             logging.info("loading segments file %s", segments_file)

@@ -74,17 +64,17 @@ def __init__(
         if rank == 0:
             logging.info("dataset contains %d seqs", len(self.seg_set))

+        if rank == 0:
+            logging.info("opening audio reader %s", recordings_file)
+
+        audio_seg_set = self.seg_set if self.seg_set.has_time_marks else None
+        self.r = AR(recordings_file, segments=audio_seg_set, wav_scale=wav_scale)
+
         self.is_val = is_val
         if time_durs_file is not None:
-            if rank == 0:
-                logging.info("loading durations file %s", time_durs_file)
+            self._load_legacy_durations(time_durs_file)

-            time_durs = SegmentSet.load(time_durs_file)
-            self.seg_set["duration"] = time_durs.loc[
-                self.seg_set["id"]].class_id.values.astype(np.float,
-                                                           copy=False)
-        else:
-            assert "duration" in self.seg_set
+        assert "duration" in self.seg_set

         logging.info("loading class-info files")
         self._load_class_infos(class_names, class_files, is_val)
@@ -96,8 +86,9 @@ def __init__(
         if text_file is not None:
             logging.info("loading text files")
             self._load_text_infos(text_file, is_val)
-        self.return_segment_info = ([] if return_segment_info is None else
-                                    return_segment_info)
+        self.return_segment_info = (
+            [] if return_segment_info is None else return_segment_info
+        )
         self.return_orig = return_orig

         self.num_augs = num_augs
@@ -106,9 +97,18 @@ def __init__(
         self.target_sample_freq = target_sample_freq
         self.resamplers = {}

+    def _load_legacy_durations(self, time_durs_file):
+        if self.rank == 0:
+            logging.info("loading durations file %s", time_durs_file)
+
+        time_durs = SegmentSet.load(time_durs_file)
+        self.seg_set["duration"] = time_durs.loc[
+            self.seg_set["id"]
+        ].class_id.values.astype(float, copy=False)
+
     def _load_bpe_model(self, bpe_model, is_val):
         if self.rank == 0:
-            logging.info("loading bpe file %s" % bpe_model)
+            logging.info("loading bpe file %s", bpe_model)
         self.sp = spm.SentencePieceProcessor()
         self.sp.load(bpe_model)
         blank_id = self.sp.piece_to_id("<blk>")
@@ -118,7 +118,7 @@ def _load_text_infos(self, text_file, is_val):
         if text_file is None:
             return
         if self.rank == 0:
-            logging.info("loading text file %s" % text_file)
+            logging.info("loading text file %s", text_file)

         text = read_text(text_file)
         self.seg_set["text"] = text.loc[self.seg_set["id"]].text
@@ -131,8 +131,9 @@ def 
_load_class_infos(self, class_names, class_files, is_val): assert len(class_names) == len(class_files) for name, file in zip(class_names, class_files): - assert (name in self.seg_set - ), f"class_name {name} not present in the segment set" + assert ( + name in self.seg_set + ), f"class_name {name} not present in the segment set" if self.rank == 0: logging.info("loading class-info file %s" % file) table = ClassInfo.load(file) @@ -143,8 +144,9 @@ def _load_class_infos(self, class_names, class_files, is_val): segment_class_ids = self.seg_set[name].unique() for c_id in class_ids: if c_id not in segment_class_ids: - logging.warning("%s class: %s not present in dataset", - name, c_id) + logging.warning( + "%s class: %s not present in dataset", name, c_id + ) def _create_augmenters(self, aug_cfgs): self.augmenters = [] @@ -154,12 +156,11 @@ def _create_augmenters(self, aug_cfgs): for aug_cfg in aug_cfgs: logging.info(f"loading augmentation={aug_cfg}") - augmenter = SpeechAugment.create(aug_cfg, - random_seed=112358 + - 1000 * self.rank) + augmenter = SpeechAugment.create( + aug_cfg, random_seed=112358 + 1000 * self.rank + ) self.augmenters.append(augmenter) - self.reverb_context = max(augmenter.max_reverb_context, - self.reverb_context) + self.reverb_context = max(augmenter.max_reverb_context, self.reverb_context) def set_epoch(self, epoch): self.epoch = epoch @@ -201,12 +202,13 @@ def _parse_segment_item(self, segment): assert duration <= self.seg_set.loc[seg_id].duration, ( f"{seg_id} with start={start} duration " f"({self.seg_set.loc[seg_id].duration}) < " - f"chunk duration ({duration})") + f"chunk duration ({duration})" + ) else: seg_id, start, duration = segment, 0, 0 - if "start" in self.seg_set: - start += self.seg_set.loc[seg_id].start + # if "start" in self.seg_set: + # start += self.seg_set.loc[seg_id].start return seg_id, start, duration @@ -217,14 +219,23 @@ def _read_audio(self, seg_id, start, duration): start -= reverb_context read_duration = duration + reverb_context + # read audio + x, fs = self.r.read([seg_id], time_offset=start, time_durs=read_duration) + return x[0].astype(floatstr_torch(), copy=False), fs[0] + + def _read_audio0(self, seg_id, start, duration): + # how much extra audio we need to load to + # calculate the reverb of the first part of the audio + reverb_context = min(self.reverb_context, start) + start -= reverb_context + read_duration = duration + reverb_context + # read audio recording_id = self.seg_set.recording_ids(seg_id) - x, fs = self.r.read([recording_id], - time_offset=start, - time_durs=read_duration) + x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration) return x[0].astype(floatstr_torch(), copy=False), fs[0] - def _apply_augs(self, x, num_samples, reverb_context_samples): + def _apply_augs(self, x, reverb_context_samples): x_augs = {} # for each type of augmentation for i, augmenter in enumerate(self.augmenters): @@ -233,7 +244,7 @@ def _apply_augs(self, x, num_samples, reverb_context_samples): # augment x x_aug, aug_info = augmenter(x) # remove the extra left context used to compute the reverberation. 
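The reverb-context bookkeeping around this hunk (read `reverb_context` extra seconds on the left so the RIR tail is realistic at the chunk start, then slice the extra samples off after augmentation) is easy to get wrong, so here is a toy numeric walk-through of the same arithmetic; the sample rate, durations, and zero signal are invented stand-ins:

```python
# Toy walk-through of the reverb-context logic (invented numbers).
import numpy as np

fs = 16000
start, duration = 3.0, 4.0       # requested chunk, in seconds
max_reverb_context = 0.5         # left context the RIR augmenter needs

reverb_context = min(max_reverb_context, start)  # cannot read before t=0
read_duration = duration + reverb_context        # 4.5 s actually read
x = np.zeros(int(read_duration * fs))            # stands in for the decoded audio

# after augmentation, drop the extra left context, as in _apply_augs
reverb_context_samples = len(x) - int(duration * fs)
x_aug = x[reverb_context_samples:len(x)]
assert len(x_aug) == int(duration * fs)          # exactly the requested 4 s chunk
```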
- x_aug = x_aug[reverb_context_samples:len(x)] + x_aug = x_aug[reverb_context_samples : len(x)] x_aug = x_aug.astype(floatstr_torch(), copy=False) x_augs[f"x_aug_{i}_{j}"] = x_aug @@ -300,7 +311,7 @@ def __getitem__(self, segment): else: num_samples = int(duration * fs) reverb_context_samples = len(x) - num_samples - x_augs = self._apply_augs(x, num_samples, reverb_context_samples) + x_augs = self._apply_augs(x, reverb_context_samples) data.update(x_augs) # add original non augmented audio @@ -311,15 +322,6 @@ def __getitem__(self, segment): else: data["x"] = x - # try: - # import soundfile as sf - - # for i, z in enumerate(r): - # sf.write(f"file_{seg_id}.wav", z, fs, "PCM_16") - # except: - # print("soundfile failed", flush=True) - - # adds the segment labels seg_info = self._get_segment_info(seg_id) data.update(seg_info) return data @@ -329,7 +331,7 @@ def filter_args(**kwargs): ar_args = AR.filter_args(**kwargs) valid_args = ( - "audio_file", + "recordings_file", "segments_file", "aug_cfgs", "num_augs", @@ -352,48 +354,43 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - if "audio_file" not in skip: + if "recordings_file" not in skip: parser.add_argument( - "--audio-file", + "--recordings-file", required=True, - help=("audio manifest file"), + help=("recordings manifest file (kaldi .scp or pandas .csv)"), ) if "segments_file" not in skip: parser.add_argument( "--segments-file", required=True, - help=("segments manifest file"), + help=("segments manifest file (kaldi .scp or pandas .csv)"), ) parser.add_argument( "--class-names", default=None, nargs="+", - help= - ("list with the names of the types of classes in the datasets, e.g., speaker, language" - ), + help=( + "list with the names of the types of classes in the datasets, e.g., speaker, language" + ), ) parser.add_argument( - "--class-files", - default=None, - nargs="+", - help=("list of class info files"), + "--class-files", default=None, nargs="+", help=("list of class info files"), ) parser.add_argument( "--time-durs-file", default=None, - help= - ("segment to duration in secs file, if durations are not in segments_file" - ), + help=( + "(deprecated) segment to duration in secs file, if durations are not in segments_file" + ), ) parser.add_argument( - "--bpe-model", - default=None, - help=("bpe model for the text label"), + "--bpe-model", default=None, help=("bpe model for the text label"), ) parser.add_argument( @@ -418,32 +415,31 @@ def add_class_args(parser, prefix=None, skip=set()): "--return-segment-info", default=None, nargs="+", - help= - ("list of columns of the segment file which should be returned as supervisions" - ), + help=( + "list of columns of the segment file which should be returned as supervisions" + ), ) parser.add_argument( "--return-orig", default=False, action=ActionYesNo, - help= - ("when using augmentation, whether or not to return also the original audio" - ), + help=( + "when using augmentation, whether or not to return also the original audio" + ), ) parser.add_argument( "--target-sample-freq", default=None, type=int, - help= - ("target sampling frequencey, if not None all audios are converted to this sample freq" - ), + help=( + "target sampling frequencey, if not None all audios are converted to this sample freq" + ), ) AR.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='audio 
dataset options') add_argparse_args = add_class_args diff --git a/hyperion/utils/feature_set.py b/hyperion/utils/feature_set.py index 2b2f0aaf..7e40dfd6 100644 --- a/hyperion/utils/feature_set.py +++ b/hyperion/utils/feature_set.py @@ -9,6 +9,7 @@ import pandas as pd from .info_table import InfoTable +from .misc import PathLike class FeatureSet(InfoTable): @@ -16,6 +17,9 @@ def __init__(self, df): super().__init__(df) assert "storage_path" in df + def add_prefix_to_storage_path(self, prefix: PathLike): + self.df["storage_path"] = self.df["storage_path"].apply(lambda x: f"{prefix}{x}") + def save(self, file_path, sep=None): """Saves info table to file @@ -31,14 +35,14 @@ def save(self, file_path, sep=None): from .scp_list import SCPList offset = self.df["storage_byte"] if "storage_byte" in self.df else None - range = None + range_spec = None if "start" and "num_frames" in self.df: - range = [ + range_spec = [ np.array([s, n], dtype=np.int64) for s, n in self.df[["start", "num_frames"]] ] scp = SCPList( - self.df["id"].values, self.df["storage_path"].values, offset, range + self.df["id"].values, self.df["storage_path"].values, offset, range_spec ) scp.save(file_path) return @@ -67,9 +71,9 @@ def load(cls, file_path, sep=None): if scp.offset is not None: df["storage_byte"] = scp.offset - if scp.range is not None: - df["start"] = [r[0] for r in scp.range] - df["num_frames"] = [r[0] for r in scp.range] + if scp.range_spec is not None: + df["start"] = [r[0] for r in scp.range_spec] + df["num_frames"] = [r[1] for r in scp.range_spec] return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index a3a1da27..5a4f27d2 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -22,6 +22,7 @@ class InfoTable: Attributes: df: pandas dataframe. """ + def __init__(self, df): self.df = df assert "id" in df, f"info_table={df}" @@ -137,10 +138,7 @@ def load(cls, file_path, sep=None, name="class_id"): sep=" ", header=None, names=["id", name], - dtype={ - "id": np.str, - name: np.str - }, + dtype={"id": np.str, name: np.str}, ) else: if sep is None: @@ -163,17 +161,16 @@ def split(self, idx, num_parts, group_by=None): Args: idx: Part to return from 1 to num_parts. num_parts: Number of parts to split the list.
- group_by_field: All the lines with the same value in column + group_by: All the lines with the same value in column group_by go to the same part Returns: - Sub Utt2Info object + Sub InfoTable object """ - if group_by is None: + if group_by is None or group_by == "id": _, idx1 = split_list(self.df["id"], idx, num_parts) else: - _, idx1 = split_list_group_by_key(self.df[group_by], idx, - num_parts) + _, idx1 = split_list_group_by_key(self.df[group_by], idx, num_parts) df = self.df.iloc[idx1] return self.__class__(df) @@ -192,14 +189,10 @@ def merge(cls, tables): df = pd.concat(df_list) return cls(df) - def filter(self, - items=None, - iindex=None, - columns=None, - by="id", - keep=True): - assert (items is None or iindex is None - ), "items and iindex cannot be not None at the same time" + def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): + assert ( + items is None or iindex is None + ), "items and iindex cannot be not None at the same time" df = self.df if not keep: diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index f9da69fa..d51edc34 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -9,9 +9,36 @@ class SegmentSet(InfoTable): def __init__(self, df): super().__init__(df) + if "start" in df and "recording_id" not in df: + df["recording_id"] = df["id"] + + if "start" not in df and "recording_id" in df: + df["start"] = 0.0 + + @property + def has_time_marks(self): + return ( + "recording_id" in self.df and "start" in self.df and "duration" in self.df + ) + + @property + def has_recording_ids(self): + return "recording_id" in self.df def recording_ids(self, ids): if "recording_id" in self.df: return self.df.loc[ids, "recording_id"] return ids + + def recording_time_marks(self, ids): + if "recording_id" in self.df: + rec_col = "recording_id" + else: + rec_col = "id" + + assert "duration" in self.df + if "start" not in self.df: + self.df["start"] = 0.0 + + return self.df.loc[ids, [rec_col, "start", "duration"]] diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 9785d021..edf2c23a 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -142,7 +142,7 @@ def save(self, file_path, sep=" "): self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) @classmethod - def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str_, 1: np.str_}): """Loads utt2info list from text file.
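The `SegmentSet` changes above are what let `AudioDataset` choose between whole-recording reads and `(recording_id, start, duration)` chunk reads. A toy table (invented data) showing when `has_time_marks` holds:

```python
# Toy SegmentSet-style table (invented data) with the new time-mark columns.
import pandas as pd

df = pd.DataFrame(
    {
        "id": ["seg1", "seg2"],
        "recording_id": ["rec1", "rec1"],
        "start": [0.0, 4.0],
        "duration": [4.0, 3.5],
    }
)
# With recording_id, start, and duration all present, has_time_marks is True,
# so AudioDataset hands the segments to the AudioReader and each read maps
# seg_id -> (recording_id, start, duration) instead of a full recording.
has_time_marks = all(c in df for c in ("recording_id", "start", "duration"))
assert has_time_marks
```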
Args: From c408f7428b7443761a0142a7b010dacf16aeaf2b Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 11 May 2023 14:15:47 -0400 Subject: [PATCH 100/154] some fixes in sre21 --- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 102 +++++++++++++ ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 66 +++++++++ ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 47 +----- .../v1.16k/local/make_voxceleb1cat_v2.pl | 4 +- egs/sre21-av-a/v1.16k/run_002_compute_evad.sh | 39 ----- .../v1.16k/run_011_train_xvector.sh | 53 ++++++- egs/voxceleb/v1.1/README.md | 52 ++++--- ...train_res2net50w26s4_xvec_stage1_v3.0.yaml | 72 +++++++++ ...train_res2net50w26s4_xvec_stage2_v3.0.yaml | 69 +++++++++ ...train_res2net50w26s8_xvec_stage1_v3.0.yaml | 72 +++++++++ ...train_res2net50w26s8_xvec_stage2_v3.0.yaml | 69 +++++++++ .../train_tseresnet34_xvec_stage1_v3.0.yaml | 4 +- .../config_fbank80_stmn_tseresnet34.v3.0.sh | 2 +- hyperion/np/classifiers/svmc.py | 138 +++++++++--------- hyperion/np/np_model.py | 5 + hyperion/torch/layer_blocks/res2net_blocks.py | 3 - hyperion/torch/trainers/xvector_trainer.py | 29 ---- 17 files changed, 608 insertions(+), 218 deletions(-) create mode 100644 egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..01cfa082 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,102 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: 
exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..24b1c081 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index c8732c36..1b7c3764 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,72 +9,34 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 80 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - 
--pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth - +nnet=$nnet_dir/model_ep0004.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -88,7 +50,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl index 27b1f152..18b6d40c 100755 --- a/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl +++ b/egs/sre21-av-a/v1.16k/local/make_voxceleb1cat_v2.pl @@ -31,7 +31,7 @@ my $meta_path = "$data_base/vox1_meta.csv"; if (! -e "$meta_path") { $meta_path = "$out_dir/vox1_meta.csv"; - system("wget -O $meta_path $meta_url"); + system("wget --no-check-certificate -O $meta_path $meta_url"); } open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; @@ -53,7 +53,7 @@ my $lid_path = "$data_base/lang_vox1_final.csv"; if (! -e "$lid_path") { $lid_path = "$out_dir/lang_vox1_final.csv"; - system("wget -O $lid_path $lid_url"); + system("wget --no-check-certificate -O $lid_path $lid_url"); } open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; my %utt2lang = (); diff --git a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh index f7aa7828..08f655ea 100755 --- a/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh +++ b/egs/sre21-av-a/v1.16k/run_002_compute_evad.sh @@ -9,7 +9,6 @@ set -e nodes=fs01 storage_name=$(date +'%m_%d_%H_%M') vaddir=`pwd`/exp/vad_e -vad_config=conf/vad_16k.yaml stage=1 config_file=default_config.sh @@ -75,41 +74,3 @@ if [ $stage -le 3 ];then done fi -# #Enroll multi-speaker Datasets with time marks -# if [ $stage -le 3 ];then -# for name in sre18_dev_enroll_vast sre18_eval_enroll_vast sre19_av_a_dev_enroll sre19_av_a_eval_enroll -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? 
$num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# local/sre18_diar_to_vad.sh data/${name} exp/make_vad $vaddir -# utils/fix_data_dir.sh data/${name} -# done -# fi - -# #Dihard Datasets -# if [ $stage -le 4 ];then -# for name in dihard2_train_dev dihard2_train_eval -# do -# num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') -# nj=$(($num_spk < 40 ? $num_spk:40)) -# # we just run energy vad to get the utt2num_frames file -# hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ -# --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ -# data/${name} exp/make_vad/$name $vaddir -# hyp_utils/rttm_to_bin_vad.sh --nj 5 data/$name/vad.rttm data/$name $vaddir -# utils/fix_data_dir.sh data/${name} -# done - -# fi - -# if [ $stage -le 5 ];then -# utils/combine_data.sh --extra-files "utt2num_frames" data/dihard2_train data/dihard2_train_dev data/dihard2_train_eval -# utils/fix_data_dir.sh data/dihard2_train -# fi - - diff --git a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh index 0608929c..7f405952 100755 --- a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh @@ -10,28 +10,67 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + 
--in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + # Network Training if [ $stage -le 1 ]; then diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 23e0a26f..73b9bb4e 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -104,12 +104,12 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.48 | 0.077 | | | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062| | | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | +| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | +| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | | | | | Cosine + AS-Norm | 0.52 | 0.33 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | @@ -134,16 +134,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | | | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | | | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | + + ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -163,16 +165,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | | | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | | | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | + + ### VoxSRC2022 dev | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -192,12 +196,12 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | | | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | | | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | -| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | -| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..5dda7913 --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..469e166b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml 
b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..40fb362e --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +feats: fbank80_specaug1_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..469e166b --- /dev/null +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml index 1d864080..31dcaf9a 100644 --- a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -47,7 +47,7 @@ model: dropout_rate: 0.1 
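The two new Res2Net50 configs differ only in `res2net_scale` and `res2net_width_factor`. Under our reading (an assumption; the 64-channel base width is not stated in the patch), the width factor rescales the standard ResNet width so that each scale branch keeps 26 channels, matching the 26w×s4 / 26w×s8 naming:

```python
# Sanity check of the res2net width arithmetic (our reading of the configs;
# the 64-channel base width is an assumption).
base_width = 64
for scale, width_factor in [(4, 1.625), (8, 3.25)]:
    per_branch = base_width * width_factor / scale
    print(f"scale={scale}: width per branch = {per_branch:g}")  # -> 26 in both cases
```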
norm_before: false hid_act: swish - se_r: 128 + se_r: 256 trainer: optim: opt_type: adam @@ -67,5 +67,5 @@ trainer: grad_clip: 250 use_amp: true log_interval: 1000 - epochs: 35 + epochs: 25 eff_batch_size: 256 diff --git a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh index 42af2d52..00622772 100644 --- a/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh +++ b/egs/voxceleb/v1.1/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -17,7 +17,7 @@ nnet_name=${feat_type}_tseresnet34.v3.0 nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0035.pth +nnet_s1=$nnet_s1_dir/model_ep0025.pth nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml nnet_s2_name=${nnet_name}.s2 diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 9311b8e8..6b54034b 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -9,20 +9,24 @@ import numpy as np from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from sklearn.svm import SVC as SVC +from sklearn.svm import SVC from ...hyp_defs import float_cpu from ...utils.math import softmax +from ...utils.misc import filter_func_args from ..np_model import NPModel -class GaussianSVMC(NPModel): +class SVMC(NPModel): """Gaussian Support Vector Machine for Classification.""" def __init__( self, C=1.0, + kernel="rbf", + degree=3, gamma="scale", + coef0=0.0, shrinking=True, probability=True, tol=0.0001, @@ -32,7 +36,6 @@ def __init__( class_weight=None, random_state=None, max_iter=100, - model=None, verbose=0, balance_class_weight=True, lr_seed=1024, @@ -48,25 +51,38 @@ def __init__( if random_state is None: random_state = np.random.RandomState(seed=lr_seed) + self.C = C + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.coef0 = coef0 + self.shrinking = shrinking + self.probability = probability + self.tol = tol + self.cache_size = cache_size + self.multi_class = multi_class + self.break_ties = break_ties + self.class_weight = class_weight + self.balance_class_weight = balance_class_weight - if model is None: - self.svm = SVC( - C=C, - kernel="rbf", - gamma=gamma, - shrinking=shrinking, - probability=probability, - tol=tol, - cache_size=cache_size, - class_weight=class_weight, - verbose=verbose, - max_iter=max_iter, - decision_function_shape=multi_class, - break_ties=break_ties, - random_state=random_state, - ) - else: - self.svm = model + self.svm = SVC( + C=C, + kernel=kernel, + gamma=gamma, + degree=degree, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=multi_class, + break_ties=break_ties, + random_state=random_state, + ) + self.set_labels(labels) @property @@ -84,6 +100,18 @@ def get_config(self): Dictionary with config hyperparams. 
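The refactored `SVMC` forwards its constructor arguments straight to sklearn's `SVC` instead of hard-coding an RBF kernel. A minimal fit-and-score sketch with toy data and the same parameters the wrapper passes through (this illustrates the wrapped estimator, not the wrapper's exact API):

```python
# Toy fit/score with the sklearn SVC parameters the SVMC wrapper forwards.
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
x = rng.standard_normal((120, 10))
y = rng.integers(0, 3, size=120)

svm = SVC(
    C=1.0,
    kernel="rbf",
    gamma="scale",
    probability=True,
    tol=1e-4,
    decision_function_shape="ovr",
    max_iter=100,          # may stop before convergence, as fit() warns above
    random_state=0,
)
svm.fit(x, y)
scores = svm.decision_function(x)  # (120, 3) one-vs-rest scores
probs = svm.predict_proba(x)       # (120, 3) posteriors, needs probability=True
```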
""" config = { + "C": self.C, + "kernel": self.kernel, + "gamma": self.gamma, + "degree": self.degree, + "coef0": self.coef0, + "shrinking": self.shrinking, + "probability": self.probability, + "tol": self.tol, + "cache_size": self.cache_size, + "multi_class": self.multi_class, + "break_ties": self.break_ties, + "class_weight": self.class_weight, "balance_class_weight": self.balance_class_weight, "labels": self.labels, } @@ -135,7 +163,6 @@ def fit(self, x, class_ids, sample_weight=None): class_ids: class integer [0, num_classes-1] identifier (num_samples,) sample_weight: weight of each sample in the estimation (num_samples,) """ - print("--------------", type(x[3, 2]), type(class_ids[20]), "--------------") self.svm.fit(x, class_ids) if self.svm.fit_status_: logging.warning("SVM did not converge") @@ -153,9 +180,6 @@ def save(self, file_path): if not split_path[-1] == "sav": file_path = "".join(split_path[0] + ".sav") with open(file_path, "wb") as f: - # with h5py.File(file_path, "w") as f: - # config = self.to_json() - # f.create_dataset("config", data=np.array(config, dtype="S")) self.save_params(f) @classmethod @@ -169,27 +193,17 @@ def load(cls, file_path): Model object. """ split_path = os.path.splitext(file_path) - if not split_path[-1] == "sav": - file_path = "".join(split_path[0] + ".sav") + if not split_path[-1] == "pkl": + file_path = "".join(split_path[0] + ".pkl") - # with h5py.File(file_path, "r") as f: with open(file_path, "rb") as f: - # json_str = str(np.asarray(f["config"]).astype("U")) - # config = cls.load_config_from_json(json_str) - config = None - return cls.load_params(f, config) + return pickle.load(f) def save_params(self, f): - # params = {"A": self.A, "b": self.b} - # self._save_params_from_dict(f, params) pickle.dump(self, f) @classmethod - def load_params(cls, f, config): - # param_list = ["A", "b"] - # params = cls._load_params_to_dict(f, config["name"], param_list) - # kwargs = dict(list(config.items()) + list(params.items())) - # return cls(**kwargs) + def load_params(cls, f): svmc = pickle.load(f) return svmc @@ -200,27 +214,7 @@ def filter_class_args(**kwargs): Returns: Hyperparamter dictionary to initialize the class. 
""" - valid_args = ( - "nu", - "gamma", - "shrinking", - "probability", - "tol", - "cache_size", - "multi_class", - "break_ties", - "class_weight", - "random_state", - "max_iter", - "verbose", - "balance_class_weight", - "lr_seed", - "model", - "labels", - ) - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - filter_train_args = filter_class_args + return filter_func_args(SVMC.__init__, **kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -240,17 +234,27 @@ def add_class_args(parser, prefix=None): type=float, help="inverse of regularization strength", ) - # parser.add_argument( - # "--class_weight", - # default=None, - # help="Class weights", - # ) + parser.add_argument( + "--kernel", + default="rbf", + choices=["linear", "poly", "rbf", "sigmoid", "precomputed"], + help="kernel for svm", + ) + parser.add_argument( + "--degree", defaut=3, type=int, help="degree of polynomial kernel" + ) parser.add_argument( "--gamma", default="scale", choices=["scale", "auto"], help="Kernel coefficient for ‘rbf’", ) + parser.add_argument( + "--coef0", + default=0.0, + type=float, + help="independent term of poly and sigmoid kernels", + ) parser.add_argument( "--shrinking", default=True, @@ -264,7 +268,7 @@ def add_class_args(parser, prefix=None): help="Whether to enable probability estimates", ) parser.add_argument( - "--break_ties", + "--break-ties", default=True, type=bool, help="If true, predict will break ties according to the confidence values of decision_function; otherwise \ @@ -293,7 +297,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - "--cache_size", + "--cache-size", default=600, type=int, help="Specify the size of the kernel cache (in MB)", diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py index ee464161..aa635fc5 100644 --- a/hyperion/np/np_model.py +++ b/hyperion/np/np_model.py @@ -99,6 +99,8 @@ def _save_params_from_dict(self, f, params, dtypes=None): """ if dtypes is None: dtypes = dict((k, float_save()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) if self.name is None: prefix = "" @@ -174,6 +176,9 @@ def _load_params_to_dict(f, name, params, dtypes=None): """ if dtypes is None: dtypes = dict((k, float_cpu()) for k in params) + elif isinstance(dtypes, type): + dtypes = dict((k, dtypes) for k in params) + if name is None: prefix = "" else: diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 73255a24..8de700c4 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -410,9 +410,6 @@ def forward(self, x, x_mask=None): x += residual - if not self.norm_before: - x = self.bn3(x) - if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index a9a9d98f..eddf47a7 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -88,35 +88,6 @@ def __init__( super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # loss, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # 
use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - @record def train_epoch(self, data_loader): """Training epoch loop From 27d579cb7247bba1983cd4abac42e836e796355a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sat, 20 May 2023 17:42:13 -0400 Subject: [PATCH 101/154] sre21 16k recipe finished --- egs/sre21-av-a/v1.16k/README.md | 51 ++- .../v1.16k/conf/lresnet34_lid_v1.yaml | 59 --- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 2 + ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 2 +- .../v1.16k/conf/train_lresnet34_lid_v1.yaml | 78 ++++ ...train_res2net50w26s4_xvec_stage1_v1.0.yaml | 80 ++++ ...train_res2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ ...train_res2net50w26s8_xvec_stage1_v1.0.yaml | 82 ++++ ...train_res2net50w26s8_xvec_stage2_v1.0.yaml | 66 +++ ...in_tseres2net50w26s4_xvec_stage1_v1.0.yaml | 83 ++++ ...in_tseres2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 2 +- ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 32 +- ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 67 ---- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 71 +--- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 25 +- ...et50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh | 0 ...cs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh | 49 +++ ...et50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh | 0 .../v1.16k/run_011_train_xvector.sh | 1 - .../v1.16k/run_012_finetune_xvector.sh | 61 --- egs/sre21-av-a/v1.16k/run_014_train_lid.sh | 34 +- egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh | 2 +- ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 104 +++++ ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 66 +++ ...train_res2net50w26s8_xvec_stage1_v1.0.yaml | 82 ++++ ...train_res2net50w26s8_xvec_stage2_v1.0.yaml | 65 +++ ...in_tseres2net50w26s4_xvec_stage1_v1.0.yaml | 83 ++++ ...in_tseres2net50w26s4_xvec_stage2_v1.0.yaml | 66 +++ egs/sre21-av-a/v1.8k/default_config.sh | 2 +- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 50 +-- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 48 +++ ...statsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh | 68 ---- ...statsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | 58 +++ ...statsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh | 76 ---- egs/sre21-av-a/v1.8k/run_011_train_xvector.sh | 54 ++- .../v1.8k/run_012_finetune_xvector.sh | 61 --- egs/voxceleb/v1.1/run_002_compute_evad.sh | 1 - egs/voxceleb/v1.2/run_001_prepare_data.sh | 14 +- hyp_utils/create_data_link.pl | 132 ++++++ hyp_utils/create_data_split_dirs.sh | 46 +++ hyp_utils/create_data_split_links.sh | 23 ++ hyp_utils/create_split_dir.pl | 92 +++++ ...l_xvec_cosine_scoring_from_adv_test_wav.py | 8 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 14 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 2 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 4 +- ...sine_scoring_from_transfer_adv_test_wav.py | 2 +- ...sine_scoring_from_transfer_art_test_wav.py | 3 +- hyperion/bin/eval_xvec_logits_from_wav.py | 48 +-- hyperion/bin/extract_xvectors_from_wav.py | 2 +- .../bin/extract_xvectors_slidwin_from_wav.py | 14 +- hyperion/bin/finetune_xvector_from_wav.py | 77 +--- hyperion/data_prep/__init__.py | 2 +- hyperion/data_prep/voxceleb1.py | 338 ++++++++++++++++ hyperion/data_prep/voxceleb2.py | 12 +- hyperion/np/classifiers/__init__.py | 2 +- hyperion/torch/layers/global_pool.py | 7 +- hyperion/torch/models/xvectors/xvector.py | 6 +- 
hyperion/torch/narchs/audio_feats_mvn.py | 9 +- hyperion/torch/narchs/classif_head.py | 2 +- hyperion/torch/narchs/dc1d_decoder.py | 4 +- hyperion/torch/narchs/dc1d_encoder.py | 4 +- hyperion/torch/narchs/dc2d_decoder.py | 4 +- hyperion/torch/narchs/dc2d_encoder.py | 4 +- hyperion/torch/narchs/fcnet.py | 2 +- hyperion/torch/narchs/resnet.py | 16 +- hyperion/torch/narchs/resnet1d_decoder.py | 14 +- hyperion/torch/narchs/resnet1d_encoder.py | 20 +- hyperion/torch/narchs/resnet2d_decoder.py | 14 +- hyperion/torch/narchs/resnet2d_encoder.py | 19 +- hyperion/torch/narchs/resnet_factory.py | 4 +- hyperion/torch/narchs/spinenet.py | 16 +- hyperion/torch/narchs/spinenet_factory.py | 4 +- hyperion/torch/narchs/tdnn_factory.py | 4 +- .../torch/narchs/transformer_encoder_v1.py | 4 +- .../trainers/xvector_trainer_from_wav.py | 8 +- hyperion/torch/utils/masking.py | 7 +- hyperion/utils/__init__.py | 2 + hyperion/utils/dataset.py | 379 +++++++++++++++--- hyperion/utils/enrollment_map.py | 86 ++++ hyperion/utils/info_table.py | 12 +- hyperion/utils/segment_set.py | 4 + hyperion/utils/sparse_trial_key.py | 58 +++ hyperion/utils/trial_key.py | 82 +++- 85 files changed, 2625 insertions(+), 868 deletions(-) delete mode 100644 egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml delete mode 100644 egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh rename egs/sre21-av-a/v1.16k/global_conf/{ => deprecated}/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh (100%) create mode 100644 egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh rename egs/sre21-av-a/v1.16k/global_conf/{ => deprecated}/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh (100%) delete mode 100755 egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh create mode 100644 egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml create mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh delete mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh create mode 100644 egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh delete mode 100644 
egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh delete mode 100755 egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh create mode 100755 hyp_utils/create_data_link.pl create mode 100755 hyp_utils/create_data_split_dirs.sh create mode 100755 hyp_utils/create_data_split_links.sh create mode 100755 hyp_utils/create_split_dir.pl create mode 100644 hyperion/data_prep/voxceleb1.py create mode 100644 hyperion/utils/enrollment_map.py diff --git a/egs/sre21-av-a/v1.16k/README.md b/egs/sre21-av-a/v1.16k/README.md index e35577d7..0f5d09ad 100644 --- a/egs/sre21-av-a/v1.16k/README.md +++ b/egs/sre21-av-a/v1.16k/README.md @@ -88,8 +88,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_013_prepare_langid_train_data.sh` @@ -110,8 +108,8 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_040_eval_be_v1.sh, run_041_eval_be_v2.sh, run_042_eval_be_v3.sh, run_042b_eval_be_v3.sh` - Evals different back-end versions: - V1: Back-end trained on all data without adaptation - - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, global PLDA adapted to SRE-Vox-CHN - - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and langauge, source dependent PLDA adapted to SRE-CHN or Vox-CHN + - V2: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, global PLDA adapted to SRE-Vox-CHN + - V3: Centering + PCA + LNorm + PLDA (+S-Norm), Centering adapted to source and language, source dependent PLDA adapted to SRE-CHN or Vox-CHN - V3b: V3 with hyperparmeters tuned for x-vectors trained on VoxCeleb only - `run_fus*.sh` @@ -120,4 +118,47 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.57 | 0.135 | 0.237 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.23 | 0.136 | 0.187 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.38 | 0.147 | 0.189 | + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.37 | 0.076 | 0.106 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.064 | 0.089 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.061 | 0.102 | +
+## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.91 | 0.393 | 0.409 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 5.22 | 0.370 | 0.377 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.79 | 0.309 | 0.325 | + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.68 | 0.395 | 0.401 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.92 | 0.405 | 0.412 | +| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.80 | 0.357 | 0.360 | diff --git a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml deleted file mode 100644 index 5451702f..00000000 --- a/egs/sre21-av-a/v1.16k/conf/lresnet34_lid_v1.yaml +++ /dev/null @@ -1,59 +0,0 @@ -min_chunk_length: 4.0 -max_chunk_length: 4.0 -return_fullseqs: false -wav_scale: 32767 -batch_size: 512 -var_batch_size: false -iters_per_epoch: 6.0 -train_aug_cfg: conf/reverb_noise_aug.yaml -val_aug_cfg: conf/reverb_noise_aug.yaml -feats: fbank64_stmn_nb_16k.yaml -pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 32 -embed_dim: 32 -num_embed_layers: 1 -hid_act: relu6 -loss_type: arc-softmax -s: 30.0 -margin: 0.3 -margin_warmup_epochs: 30.0 -dropout_rate: 0.0 -in_feats: 64 -resnet_type: lresnet34 -in_channels: 1 -conv_channels: 64 -base_channels: 64 -in_kernel_size: 3 -in_stride: 1 -in_norm: false -no_maxpool: true -optim: - opt_type: adam - lr: 0.02 - # lr: 0.01 - beta1: 0.9 - beta2: 0.95 - amsgrad: true - weight_decay: 1e-5 -lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 8000 - hold_steps: 10000 - min_lr: 1.0e-05 - warmup_steps: 1000 - update_lr_on_opt_step: true -grad_acc_steps: 1 -epochs: 70 -log_interval: 100 -use_tensorboard: false -use_wandb: false -wandb: - mode: online -ddp_type: ddp -use_amp: true -swa_start: 0 -swa_lr: 0.001 -swa_anneal_epochs: 10 -num_gpus: 4 diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml index 01cfa082..d68ea26e 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -68,6 +68,7 @@ model: multilayer_concat: true endpoint_channels: 8192 dropout_rate: 0.0 + hid_act: relu6 pool_net: pool_type: ch-wise-att-mean+stddev inner_feats: 128 @@ -76,6 +77,7 @@ model: margin: 0.3 margin_warmup_epochs: 20.0 dropout_rate: 0.0 + hid_act: relu6 trainer: optim: opt_type: adam diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml index 24b1c081..e7f9969b 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -46,7 +46,7 @@ model: trainer: optim: opt_type: sgd - lr: 1e-3 + lr: 0.01 momentum: 0.9 weight_decay: 1e-5 lrsched: diff --git a/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml new file mode 100644 index 00000000..c46365db --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_lresnet34_lid_v1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + data_loader: + num_workers: 8 +feats: 
fbank64_stmn_nb_16k.yaml +model: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 64 + conv_channels: 64 + in_kernel_size: 3 + in_stride: 1 + in_norm: false + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 32 + embed_dim: 32 + num_embed_layers: 1 + hid_act: relu6 + loss_type: arc-softmax + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 30.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.02 + beta1: 0.9 + beta2: 0.95 + amsgrad: true + weight_decay: 1e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + epochs: 70 + log_interval: 100 + use_amp: true + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..7a9234b6 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,80 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + pool_net: + pool_type: mean+stddev + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 +trainer: + optim: + opt_type: adam + lr: 0.05 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 50 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..9884bb4c --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + 
min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 21 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..4c427202 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + 
seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..10607607 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 + se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 
+ data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 1b7c3764..1da68697 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -27,7 +27,7 @@ nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0004.pth + # xvector full net finetuning with out-of-domain ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh index 1903369e..6d14f27d 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -1,4 +1,4 @@ -# LResNet34 x-vector with mixed precision training +# Res2Net50 w26s4 x-vector with mixed precision training # acoustic features feat_config=conf/fbank80_stmn_16k.yaml @@ -9,50 +9,33 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.05 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 +nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s4_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0071.pth - +nnet=$nnet_dir/model_ep0061.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # 
effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=21 ft_margin=0.5 -ft_margin_warmup=5 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0021.pth @@ -61,7 +44,4 @@ ft_nnet=$ft_nnet_dir/model_ep0021.pth plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh deleted file mode 100644 index 344e1288..00000000 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh +++ /dev/null @@ -1,67 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_16k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxcelebcat -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 50 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1.$nnet_data -nnet_num_epochs=60 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -#nnet=$nnet_dir/swa_model_ep0061.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end 
-plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index cae32b57..0b62008e 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,103 +9,40 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=res2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 s=30 margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=10 -ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# xvector last-layer finetuning in-domain -reg_layers_classif=0 -reg_layers_enc="0 1 2 3 4" -nnet_adapt_data=voxcelebcat_sre_alllangs_mixfs_chnspks - -# ft2_batch_size_1gpu=4 -# ft2_eff_batch_size=128 # effective batch size -# ft2_ipe=4 -# ft2_lr=0.01 -# ft2_nnet_num_epochs=12 -# ft2_margin_warmup=3 -# ft2_reg_weight_embed=0.1 -# ft2_min_chunk=10 -# ft2_max_chunk=60 - -# ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 
--use-amp --var-batch-size" -# ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -# ft2_nnet_name=${ft_nnet_name}.ft_eaffine_rege_w${ft2_reg_weigth_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v2 -# ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -# ft2_nnet=$ft2_nnet_dir/model_ep0010.pth - - -# xvector full nnet finetuning -ft2_batch_size_1gpu=6 -ft2_eff_batch_size=128 # effective batch size -ft2_ipe=1 -ft2_lr=0.01 -ft2_nnet_num_epochs=15 -ft2_margin=0.5 -ft2_margin_warmup=3 -ft2_reg_weight_embed=0.1 -ft2_reg_weight_enc=0.1 -ft2_min_chunk=10 -ft2_max_chunk=10 - -ft2_opt_opt="--optim.opt-type sgd --optim.lr $ft2_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft2_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft2_nnet_name=${ft_nnet_name}.ft_reg_wenc${ft2_reg_weight_enc}_we${ft2_reg_weight_embed}_${ft2_min_chunk}_${ft2_max_chunk}_sgdcos_lr${ft2_lr}_b${ft2_eff_batch_size}_amp.v1 -ft2_nnet_dir=exp/xvector_nnets/$ft2_nnet_name -ft2_nnet=$ft2_nnet_dir/model_ep0012.pth - - # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 96475c53..a57f16d9 100644 --- a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,21 +9,15 @@ vad_config=conf/vad_16k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=24 eff_batch_size=512 # effective batch size -ipe=1 min_chunk=4 max_chunk=4 lr=0.02 -nnet_type=tseres2net50 +nnet_type=resnet dropout=0 embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 se_r=256 s=30 @@ -31,13 +25,8 @@ margin_warmup=20 margin=0.3 attstats_inner=128 -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml 
+nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_dir=exp/xvector_nnets/$nnet_name nnet=$nnet_dir/model_ep0075.pth nnet=$nnet_dir/swa_model_ep0076.pth @@ -49,12 +38,9 @@ ft_min_chunk=10 ft_max_chunk=15 ft_ipe=1 ft_lr=0.01 -ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -69,7 +55,4 @@ else plda_data=voxceleb2cat_train_augx${plda_num_augs} fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git a/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh new file mode 100644 index 00000000..b5863308 --- /dev/null +++ b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.voxcelebcat.sh @@ -0,0 +1,49 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxcelebcat + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_args="--model.pool_net.pool-type mean+stddev" +nnet_name=${feat_type}_res2net50w26s8_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +#nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh 
b/egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh similarity index 100% rename from egs/sre21-av-a/v1.16k/global_conf/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh rename to egs/sre21-av-a/v1.16k/global_conf/deprecated/config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.02_amp.v1.sh diff --git a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh index 7f405952..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.16k/run_011_train_xvector.sh @@ -19,7 +19,6 @@ num_workers="" list_dir=data/${nnet_data}_proc_audio_no_sil -args="" if [ -n "$num_workers" ];then extra_args="--data.train.data_loader.num-workers $num_workers" fi diff --git a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.16k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh index 6251de97..35d2c0bc 100755 --- a/egs/sre21-av-a/v1.16k/run_014_train_lid.sh +++ b/egs/sre21-av-a/v1.16k/run_014_train_lid.sh @@ -10,19 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 -lid_ipe=1 +num_workers="" + . parse_options.sh || exit 1; . $config_file . 
datapath.sh list_dir=data/train_lid_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -33,22 +31,20 @@ lid_nnet_dir=exp/lid_nnets/lresnet34_lid_v1 # Network Training if [ $stage -le 1 ]; then - train_exec=torch-train-resnet-xvec-from-wav.py mkdir -p $lid_nnet_dir/log $cuda_cmd \ --gpu $ngpu $lid_nnet_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --cfg conf/lresnet34_lid_v1.yaml \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_train_lid/train.scp \ - --val-list $list_dir/lists_train_lid/val.scp \ - --class-file $list_dir/lists_train_lid/class2int \ - --iters-per-epoch $lid_ipe \ - --num-workers $num_workers \ - --num-gpus $ngpu \ - --exp-path $lid_nnet_dir $args - + train_xvector_from_wav.py resnet \ + --cfg conf/train_lresnet34_lid_v1.yaml \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_train_lid/train.scp \ + --data.train.dataset.class-file $list_dir/lists_train_lid/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_train_lid/val.scp \ + --trainer.exp-path $lid_nnet_dir $extra_args \ + --num-gpus $ngpu fi -exit diff --git a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh index 0941951f..73cb9a3d 100755 --- a/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.16k/run_041_eval_be_v2.sh @@ -195,7 +195,7 @@ if [ $stage -le 5 ]; then #SRE superset and 16 echo "SRE Superset Dev" steps_be/eval_be_plda_snorm_v2_cts.sh \ - --cmd "$train_cmd --mem 8G" \ + --cmd "$train_cmd --mem 12G" \ --plda_type $plda_type --ncoh $ncoh --num-parts 100 \ data/sre_cts_superset_16k_dev/trials \ data/sre_cts_superset_16k_dev/utt2enroll \ diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..bc311234 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,104 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + 
res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 8192 + dropout_rate: 0.0 + hid_act: relu6 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 30000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..031e9ca3 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 15.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..416926d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: res2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + 
in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 35000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..16203033 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,65 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..2d74799c --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml @@ -0,0 +1,83 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 1.625 + res2net_scale: 4 
+ se_r: 256 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 256 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 20.0 + dropout_rate: 0.0 + hid_act: relu6 +trainer: + optim: + opt_type: adam + lr: 0.02 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 1.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 10000 + hold_steps: 40000 + min_lr: 1.0e-05 + warmup_steps: 1000 + update_lr_on_opt_step: true + grad_clip: 250 + swa_start: 65 + swa_anneal_epochs: 5 + swa_lr: 1e-3 + use_amp: true + log_interval: 1000 + epochs: 75 + eff_batch_size: 512 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..f34b4896 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -0,0 +1,66 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 8 + max_chunk_length: 10.0 + min_chunk_length: 10.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: uniform + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +feats: fbank80_stmn_16k.yaml +model: + cos_scale: 30.0 + margin: 0.5 + margin_warmup_epochs: 3 + intertop_margin: 0.0 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 1e-5 + lrsched: + lrsch_type: cos_lr + t: 2500 + t_mul: 2 + warm_restarts: true + gamma: 0.75 + min_lr: 1e-4 + warmup_steps: 100 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 7 + eff_batch_size: 128 + diff --git a/egs/sre21-av-a/v1.8k/default_config.sh b/egs/sre21-av-a/v1.8k/default_config.sh index 91a20745..74b76b0a 120000 --- a/egs/sre21-av-a/v1.8k/default_config.sh +++ b/egs/sre21-av-a/v1.8k/default_config.sh @@ -1 +1 @@ -global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh \ No newline at end of file +global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh \ No newline at end of file diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh index 69ad025b..65c2c924 100644 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -9,53 +9,19 @@ vad_config=conf/vad_8k.yaml # x-vector training nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" -batch_size_1gpu=16 eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 lr=0.02 nnet_type=resnet1d -block_type=seres2bn # squeeze-excitation res2net 
bottleneck -channels=2048 -ep_channels=8192 -width_factor=1 -scale=8 -se_r=16 dropout=0 -attstats_inner=128 embed_dim=256 s=30 margin_warmup=20 margin=0.3 -nnet_opt="--resnet_enc.in-feats 64 \ - --resnet_enc.in-conv-channels $channels \ - --resnet_enc.in-kernel-size 5 \ - --resnet_enc.in-stride 1 \ - --resnet_enc.resb-type $block_type \ - --resnet_enc.resb-repeats 1 1 1 1 \ - --resnet_enc.resb-channels $channels \ - --resnet_enc.resb-kernel-sizes 3 \ - --resnet_enc.resb-dilations 2 3 4 5 \ - --resnet_enc.resb-strides 1 \ - --resnet_enc.res2net-width-factor $width_factor \ - --resnet_enc.res2net-scale $scale \ - --resnet_enc.se-r $se_r \ - --resnet_enc.multilayer \ - --resnet_enc.multilayer-concat \ - --resnet_enc.endpoint-channels $ep_channels \ - --pool_net.pool-type ch-wise-att-mean+stddev \ - --pool_net.inner-feats $attstats_inner \ - --embed-dim $embed_dim" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 30000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - +nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml nnet_name=${feat_type}_ecapatdnn2048x4_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 nnet_num_epochs=75 nnet_dir=exp/xvector_nnets/$nnet_name @@ -63,18 +29,14 @@ nnet=$nnet_dir/model_ep0070.pth nnet=$nnet_dir/swa_model_ep0076.pth # xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 ft_eff_batch_size=128 # effective batch size ft_min_chunk=10 ft_max_chunk=15 -ft_ipe=1 ft_lr=0.01 ft_nnet_num_epochs=15 ft_margin=0.5 -ft_margin_warmup=3 -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" +ft_nnet_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name ft_nnet=$ft_nnet_dir/model_ep0007.pth @@ -82,8 +44,10 @@ ft_nnet=$ft_nnet_dir/model_ep0007.pth # back-end plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..824361d0 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,48 @@ +# Res2Net50 w26s8 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + 
+nnet_type=resnet +dropout=0 +embed_dim=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_res2net50w26s8_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0070.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +plda_type=splda diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index e1a923d7..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,68 +0,0 @@ -# LResNet34 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank64_stmn_8k.yaml -feat_type=fbank64_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=16 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=res2net50 -dropout=0 -embed_dim=256 -width_factor=3.25 -scale=8 -ws_tag=w26s8 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 64 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 35000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0070.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=10 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=15 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" 
-ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0007.pth - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh new file mode 100644 index 00000000..58010842 --- /dev/null +++ b/egs/sre21-av-a/v1.8k/global_conf/config_fbank64_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh @@ -0,0 +1,58 @@ +# Time SE Res2Net50 w26s4 x-vector with mixed precision training + +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxcelebcat_sre_alllangs_mixfs + +eff_batch_size=512 # effective batch size +min_chunk=4 +max_chunk=4 +lr=0.02 + +nnet_type=resnet +dropout=0 +embed_dim=256 +se_r=256 + +s=30 +margin_warmup=20 +margin=0.3 +attstats_inner=128 + +nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage1_v1.0.yaml +nnet_name=${feat_type}_tseres2net50w26s4_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 +nnet_dir=exp/xvector_nnets/$nnet_name +nnet=$nnet_dir/model_ep0075.pth +nnet=$nnet_dir/swa_model_ep0076.pth + +# xvector full net finetuning with out-of-domain +ft_batch_size_1gpu=8 +ft_eff_batch_size=128 # effective batch size +ft_min_chunk=10 +ft_max_chunk=10 +ft_ipe=1 +ft_lr=0.01 +ft_margin=0.5 + +ft_nnet_base_cfg=conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml +ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 +ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name +ft_nnet=$ft_nnet_dir/model_ep0007.pth + + +# back-end +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda + diff --git a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh b/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh deleted file mode 100644 index 9f5c8e70..00000000 --- a/egs/sre21-av-a/v1.8k/global_conf/config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.05_amp.v1.sh +++ /dev/null @@ -1,76 +0,0 @@ -# Time SE Res2Net50 w26s4 x-vector with mixed precision training - -# acoustic features -feat_config=conf/fbank80_stmn_8k.yaml -feat_type=fbank80_stmn - -#vad -vad_config=conf/vad_8k.yaml - -# x-vector training -nnet_data=voxcelebcat_sre_alllangs_mixfs -aug_opt="--train-aug-cfg conf/reverb_noise_aug.yaml --val-aug-cfg conf/reverb_noise_aug.yaml" - -batch_size_1gpu=24 -eff_batch_size=512 # effective batch size -ipe=1 -min_chunk=4 -max_chunk=4 -lr=0.02 - -nnet_type=tseres2net50 -dropout=0 -embed_dim=256 -width_factor=1.625 -scale=4 -ws_tag=w26s4 -se_r=256 - -s=30 -margin_warmup=20 -margin=0.3 -attstats_inner=128 - -nnet_opt="--resnet-type $nnet_type --in-feats 80 --in-channels 1 --in-kernel-size 3 --in-stride 1 --no-maxpool --res2net-width-factor $width_factor --res2net-scale $scale --se-r $se_r 
--pool_net.pool-type ch-wise-att-mean+stddev --pool_net.inner-feats $attstats_inner" - -opt_opt="--optim.opt-type adam --optim.lr $lr --optim.beta1 0.9 --optim.beta2 0.95 --optim.weight-decay 1e-5 --optim.amsgrad --use-amp --swa-start 65 --swa-lr 1e-3 --swa-anneal-epochs 5" -lrs_opt="--lrsched.lrsch-type exp_lr --lrsched.decay-rate 0.5 --lrsched.decay-steps 10000 --lrsched.hold-steps 40000 --lrsched.min-lr 1e-5 --lrsched.warmup-steps 1000 --lrsched.update-lr-on-opt-step" - -nnet_name=${feat_type}_${nnet_type}${ws_tag}_r${se_r}_chattstatsi128_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v1 -nnet_num_epochs=75 -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0075.pth -nnet=$nnet_dir/swa_model_ep0076.pth - -# xvector full net finetuning with out-of-domain -ft_batch_size_1gpu=8 -ft_eff_batch_size=128 # effective batch size -ft_min_chunk=10 -ft_max_chunk=15 -ft_ipe=1 -ft_lr=0.01 -ft_nnet_num_epochs=21 -ft_nnet_num_epochs=45 -ft_margin=0.5 -ft_margin_warmup=3 - -ft_opt_opt="--optim.opt-type sgd --optim.lr $ft_lr --optim.momentum 0.9 --optim.weight-decay 1e-5 --use-amp --var-batch-size" -ft_lrs_opt="--lrsched.lrsch-type cos_lr --lrsched.t 2500 --lrsched.t-mul 2 --lrsched.warm-restarts --lrsched.gamma 0.75 --lrsched.min-lr 1e-4 --lrsched.warmup-steps 100 --lrsched.update-lr-on-opt-step" -ft_nnet_name=${nnet_name}.ft_${ft_min_chunk}_${ft_max_chunk}_arcm${ft_margin}_sgdcos_lr${ft_lr}_b${ft_eff_batch_size}_amp.v1 -ft_nnet_dir=exp/xvector_nnets/$ft_nnet_name -ft_nnet=$ft_nnet_dir/model_ep0014.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -# lda_dim=200 -# plda_y_dim=150 -# plda_z_dim=200 - diff --git a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh index 9891e812..d7ea8ed0 100755 --- a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh @@ -10,22 +10,17 @@ set -e stage=1 ngpu=4 config_file=default_config.sh -resume=false interactive=false -num_workers=8 +num_workers="" . parse_options.sh || exit 1; . $config_file . 
datapath.sh -batch_size=$(($batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $eff_batch_size | awk '{ print int($2/$1+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) list_dir=data/${nnet_data}_proc_audio_no_sil -args="" -if [ "$resume" == "true" ];then - args="--resume" +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" fi if [ "$interactive" == "true" ];then @@ -35,6 +30,49 @@ fi # Network Training if [ $stage -le 1 ]; then + mkdir -p $nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type \ + --cfg $nnet_base_cfg $nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --trainer.exp-path $nnet_dir \ + --num-gpus $ngpu \ + +fi + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + mkdir -p $ft_nnet_dir/log + $cuda_cmd \ + --gpu $ngpu $ft_nnet_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type \ + --cfg $ft_nnet_base_cfg $ft_nnet_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.time-durs-file $list_dir/utt2dur \ + --data.train.dataset.segments-file $list_dir/lists_xvec/train.scp \ + --data.train.dataset.class-files $list_dir/lists_xvec/class2int \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.time-durs-file $list_dir/utt2dur \ + --data.val.dataset.segments-file $list_dir/lists_xvec/val.scp \ + --in-model-file $nnet \ + --trainer.exp-path $ft_nnet_dir \ + --num-gpus $ngpu \ + +fi +exit + +# Network Training +if [ $stage -le 1 ]; then + if [[ ${nnet_type} =~ resnet1d ]]; then train_exec=torch-train-resnet1d-xvec-from-wav.py elif [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]] || [[ ${nnet_type} =~ res2net ]] || [[ ${nnet_type} =~ res2next ]]; then diff --git a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh b/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh deleted file mode 100755 index 58a3fdc9..00000000 --- a/egs/sre21-av-a/v1.8k/run_012_finetune_xvector.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright -# 2019 Johns Hopkins University (Author: Jesus Villalba) -# Apache 2.0. -# -. ./cmd.sh -. ./path.sh -set -e - -stage=1 -ngpu=4 -config_file=default_config.sh -resume=false -interactive=false -num_workers=3 - -. parse_options.sh || exit 1; -. $config_file -. 
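Note: the rewritten training stages load a YAML base config with --cfg and override individual nested values with dotted flags, which is how the optional num-workers override above reaches the data loader. A minimal jsonargparse sketch of the mechanism (argument names are illustrative, not the trainer's full schema):

    from jsonargparse import ActionConfigFile, ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--cfg", action=ActionConfigFile)  # YAML base config
    parser.add_argument("--data.train.data_loader.num-workers", type=int, default=5)

    # command-line flags win over values loaded from the YAML file
    args = parser.parse_args(["--data.train.data_loader.num-workers", "8"])
    print(args.data.train.data_loader.num_workers)  # -> 8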
datapath.sh - -batch_size=$(($ft_batch_size_1gpu*$ngpu)) -grad_acc_steps=$(echo $batch_size $ft_eff_batch_size $ft_min_chunk $ft_max_chunk | awk '{ print int($2/($1*$4*2/($3+$4))+0.5)}') -log_interval=$(echo 100*$grad_acc_steps | bc) -list_dir=data/${nnet_data}_proc_audio_no_sil - -args="" -if [ "$resume" == "true" ];then - args="--resume" -fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - -# Network Training -if [ $stage -le 1 ]; then - mkdir -p $ft_nnet_dir/log - $cuda_cmd --gpu $ngpu $ft_nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - torch-finetune-xvec-from-wav.py --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $ft_min_chunk --max-chunk-length $ft_max_chunk \ - --iters-per-epoch $ft_ipe \ - --batch-size $batch_size \ - --num-workers $num_workers $ft_opt_opt $ft_lrs_opt \ - --grad-acc-steps $grad_acc_steps \ - --epochs $ft_nnet_num_epochs \ - --cos-scale $s --margin $ft_margin --margin-warmup-epochs $ft_margin_warmup \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --in-model-path $nnet \ - --train-mode ft-full \ - --exp-path $ft_nnet_dir $args - -fi - - diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index 7a2a9be5..4e82a87a 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -41,7 +41,6 @@ if [ $stage -le 1 ]; then fi fi -#Train datasets if [ $stage -le 2 ];then if [ "$do_voxsrc22" == "true" ];then extra_data="voxsrc22_dev" diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index 831eb1bc..f956bc8c 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -20,17 +20,17 @@ if [ $stage -le 1 ];then prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ --cat-videos --use-kaldi-ids \ --output-dir data/voxceleb2cat_train - #local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train fi -exit + if [ $stage -le 2 ];then # prepare voxceleb1 for test - # This script is for the old version of the dataset - # local/make_voxceleb1_oeh.pl $voxceleb1_root data - # Use this for the newer version of voxceleb1: - local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data + #hyp_utils/conda_env.sh \ + prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test + #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi - +exit if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then local/prepare_voxsrc22_dev.py \ --vox1-corpus-dir $voxceleb1_root \ diff --git a/hyp_utils/create_data_link.pl b/hyp_utils/create_data_link.pl new file mode 100755 index 00000000..850f29f0 --- /dev/null +++ b/hyp_utils/create_data_link.pl @@ -0,0 +1,132 @@ +#!/usr/bin/env perl + +# Copyright 2013 Guoguo Chen +# 2014 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0. +# +# This script distributes data onto different file systems by making symbolic +# links. It is supposed to use together with utils/create_split_dir.pl, which +# creates a "storage" directory that links to different file systems. +# +# If a sub-directory egs/storage does not exist, it does nothing. 
If it exists,
+# then it selects pseudo-randomly a number from those available in egs/storage/*
+# creates a link such as
+#
+# egs/egs.3.4.ark -> storage/4/egs.3.4.ark
+#
+use strict;
+use warnings;
+use File::Basename;
+use File::Spec;
+use Getopt::Long;
+
+sub GetGCD {
+  my ($a, $b) = @_;
+  while ($a != $b) {
+    if ($a > $b) {
+      $a = $a - $b;
+    } else {
+      $b = $b - $a;
+    }
+  }
+  return $a;
+}
+
+my $Usage = <<EOU;
+create_data_link.pl:
+This script distributes data onto different file systems by making symbolic
+links. It is supposed to use together with utils/create_split_dir.pl, which
+creates a "storage" directory that links to different file systems.
+
+If a sub-directory egs/storage does not exist, it does nothing. If it exists,
+then it selects pseudo-randomly a number from those available in egs/storage/*
+creates a link such as
+
+  egs/egs.3.4.ark -> storage/4/egs.3.4.ark
+
+Usage: utils/create_data_link.pl <data-archive1> [<data-archive2> ... ]
+ e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark
+ (note: the dirname, e.g. foo/bar/, must be the same in all cases).
+
+See also utils/remove_data_links.sh
+EOU
+
+GetOptions();
+
+if (@ARGV == 0) {
+  die $Usage;
+}
+
+my $example_fullpath = $ARGV[0];
+
+# Check if the storage has been created. If so, do nothing.
+my $dirname = dirname($example_fullpath);
+if (! -d "$dirname/storage") {
+  exit(0);
+}
+
+# Storage exists, create symbolic links in the next few steps.
+
+# First, get a list of the available storage directories, and check if they are
+# properly created.
+opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n";
+my @storage_dirs = grep(/^[0-9]*$/, readdir($dh));
+closedir($dh);
+my $num_storage = scalar(@storage_dirs);
+for (my $x = 1; $x <= $num_storage; $x++) {
+  (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n";
+}
+
+# Second, get the coprime list.
+my @coprimes;
+for (my $n = 1; $n <= $num_storage; $n++) {
+  if (GetGCD($n, $num_storage) == 1) {
+    push(@coprimes, $n);
+  }
+}
+
+my $ret = 0;
+
+foreach my $fullpath (@ARGV) {
+  if ($dirname ne dirname($fullpath)) {
+    die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath";
+  }
+
+  # Finally, work out the directory index where we should put the data to.
+  my $basename = basename($fullpath);
+  my $filename_numbers = $basename;
+  $filename_numbers =~ s/[^0-9]+/ /g;
+  my @filename_numbers = split(" ", $filename_numbers);
+  my $total = 0;
+  my $index = 0;
+  foreach my $x (@filename_numbers) {
+    if ($index >= scalar(@coprimes)) {
+      $index = 0;
+    }
+    $total += $x * $coprimes[$index];
+    $index++;
+  }
+  my $dir_index = $total % $num_storage + 1;
+
+  # Make the symbolic link.
+  if (-e $fullpath) {
+    unlink($fullpath);
+  }
+  if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure
+    $ret = 1; # will exit with error status.
+  }
+}
+
+exit($ret);
+
+## testing:
+# rm -rf foo bar
+# mkdir -p bar/{1,2,3,4}
+# mkdir -p foo/storage
+# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done
+# utils/create_data_link.pl foo/1.3.ark foo/2.3.ark
+# ls -l foo
+# total 0
+# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 1.3.ark -> storage/3/1.3.ark
+# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 2.3.ark -> storage/4/2.3.ark
+# drwxr-xr-x 2 dpovey fax 38 Sep 2 17:40 storage
diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh
new file mode 100755
index 00000000..877b9e3f
--- /dev/null
+++ b/hyp_utils/create_data_split_dirs.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright
+#           2023 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+storage_name=$(date +'%m_%d_%H_%M')
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
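Note: the storage index above is a deterministic hash of the digits in the file name, weighted by the coprimes of num_storage so that consecutive archive numbers spread across the available disks. An illustrative Python transcription of the Perl logic:

    import math
    import re

    def storage_index(basename, num_storage):
        # coprimes of num_storage, as computed with GetGCD above
        coprimes = [n for n in range(1, num_storage + 1)
                    if math.gcd(n, num_storage) == 1]
        numbers = [int(x) for x in re.sub(r"[^0-9]+", " ", basename).split()]
        total = sum(x * coprimes[i % len(coprimes)] for i, x in enumerate(numbers))
        return total % num_storage + 1  # 1-based storage sub-directory

    print(storage_index("egs.3.4.ark", 4))  # -> 4, i.e. egs.3.4.ark -> storage/4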
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <output-dir> <storage-dir> <nodes>"
+  echo "$0 exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0"
+fi
+output_dir=$1
+storage_dir=$2
+nodes=$3
+
+link_dir=$output_dir/storage
+
+if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $link_dir ]; then
+  echo "Preparing to distribute data over multiple nodes ($nodes)"
+  dir_name=$storage_dir/$storage_name/storage
+  if [ "$nodes" == "b0" ];then
+    hyp_utils/create_split_dir.pl \
+	/export/b{04,05,06,07}/$dir_name $link_dir
+  elif [ "$nodes" == "b1" ];then
+    hyp_utils/create_split_dir.pl \
+	/export/b{14,15,16,17}/$dir_name $link_dir
+  elif [ "$nodes" == "c0" ];then
+    hyp_utils/create_split_dir.pl \
+	/export/c{06,07,08,09}/$dir_name $link_dir
+  elif [ "$nodes" == "fs01" ];then
+    hyp_utils/create_split_dir.pl \
+	/export/fs01/$dir_name $link_dir
+  else
+    echo "we don't distribute data between multiple machines"
+  fi
+fi
+
+
diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh
new file mode 100755
index 00000000..fb5b8ca0
--- /dev/null
+++ b/hyp_utils/create_data_split_links.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright
+#           2023 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+storage_name=$(date +'%m_%d_%H_%M')
+
+echo "$0 $@"  # Print the command line for logging
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <output-file-pattern> <num-jobs>"
+  echo "$0 exp/vad_dir/vad.JOB.ark 40"
+fi
+output_file_pattern=$1
+nj=$2
+
+for n in $(seq $nj); do
+  # the next command does nothing unless output_dir/storage exists, see
+  # hyp_utils/create_data_link.pl for more info.
+  output_file=$(echo $output_file_pattern | sed 's@\.JOB\.@.'$n'.@')
+  hyp_utils/create_data_link.pl $output_file
+done
+
diff --git a/hyp_utils/create_split_dir.pl b/hyp_utils/create_split_dir.pl
new file mode 100755
index 00000000..ab952357
--- /dev/null
+++ b/hyp_utils/create_split_dir.pl
@@ -0,0 +1,92 @@
+#!/usr/bin/env perl
+
+# Copyright 2013  Guoguo Chen
+# Apache 2.0.
+#
+# This script creates storage directories on different file systems, and creates
+# symbolic links to those directories. For example, a command
+#
+# utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
+#
+# will mkdir -p all of those directories, and will create links
+#
+# egs/storage/1 -> /export/gpu-03/egs/storage
+# egs/storage/2 -> /export/gpu-04/egs/storage
+# ...
+#
+use strict;
+use warnings;
+use File::Spec;
+use Getopt::Long;
+
+my $Usage = <<EOU;
+create_split_dir.pl:
+This script creates storage directories on different file systems, and creates
+symbolic links to those directories.
+
+Usage: utils/create_split_dir.pl <actual-storage-dirs> <pseudo-storage-dir>
+ e.g.: utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage
+
+Allowed options:
+  --suffix <suffix> : Common suffix to <actual-storage-dirs> (string, default = "")
+
+See also create_data_link.pl, which is intended to work with the resulting
+directory structure, and remove_data_links.sh
+EOU
+
+my $suffix="";
+GetOptions('suffix=s' => \$suffix);
+
+if (@ARGV < 2) {
+  die $Usage;
+}
+
+my $ans = 1;
+
+my $dir = pop(@ARGV);
+system("mkdir -p $dir 2>/dev/null");
+
+my @all_actual_storage = ();
+foreach my $file (@ARGV) {
+  push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix);
+}
+
+my $index = 1;
+foreach my $actual_storage (@all_actual_storage) {
+  my $pseudo_storage = "$dir/$index";
+
+  # If the symbolic link already exists, delete it.
+  if (-l $pseudo_storage) {
+    print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n";
+    $index++;
+    next;
+  }
+
+  # Create the destination directory and make the link.
+  system("mkdir -p $actual_storage 2>/dev/null");
+  if ($?
!= 0) { + print STDERR "$0: error creating directory $actual_storage\n"; + exit(1); + } + { # create a README file for easier deletion. + open(R, ">$actual_storage/README.txt"); + my $storage_dir = File::Spec->rel2abs($dir); + print R "# This directory is linked from $storage_dir, as part of Kaldi striped data\n"; + print R "# The full list of directories where this data resides is:\n"; + foreach my $d (@all_actual_storage) { + print R "$d\n"; + } + close(R); + } + my $ret = symlink($actual_storage, $pseudo_storage); + + # Process the returned values + $ans = $ans && $ret; + if (! $ret) { + print STDERR "Error linking $actual_storage to $pseudo_storage\n"; + } + + $index++; +} + +exit($ans == 1 ? 0 : 1); diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 10ea491c..50fd5088 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -49,11 +49,11 @@ def __init__( self.sigma = sigma def forward(self, s_t): - # print('sigma0=', self.sigma) + if self.sigma > 0: s_t = s_t + self.sigma * torch.randn_like(s_t) - # print('sigma1=', self.sigma) - f_t = self.feat_extractor(s_t) + + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -320,7 +320,7 @@ def eval_cosine_scoring( ) s.save_txt(score_file) - logging.info("saving stats to %s" % (stats_file)) + logging.info("saving stats to %s", stats_file) attack_stats.to_csv(stats_file) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index a6f535b3..5697404d 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -84,7 +84,7 @@ def forward(self, s_t): s_t = self.wav_scale * s_t # End of pre-processing defense - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] @@ -289,13 +289,11 @@ def eval_cosine_scoring_wavegan( vad = torch.tensor(vad, dtype=torch.bool).to(device) model.vad_t = vad logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key.seg_set[j], - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, ) t2 = time.time() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 5ba42477..0ca1f740 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -113,7 +113,7 @@ def forward(self, s_t): s_t = s_t[0, 0] f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index c3732bd3..49a762af 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -125,7 +125,7 @@ def eval_cosine_scoring( audio_reader = AR(test_wav_file, **audio_args) if vad_spec is not None: - logging.info("opening VAD 
stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((ndx.num_models, ndx.num_tests), dtype="float32") @@ -144,7 +144,7 @@ def eval_cosine_scoring( t2 = time.time() s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) - x_t = feat_extractor(s) + x_t, _ = feat_extractor(s) t4 = time.time() tot_frames = x_t.shape[1] if vad_spec is not None: diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index c00cf286..b2c111ca 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -45,7 +45,7 @@ def __init__( def forward(self, s_t): f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 4f2b82ab..8b6c8dae 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -59,8 +59,7 @@ def __init__( self.threshold = threshold def forward(self, s_t): - f_t = s_t - f_t = self.feat_extractor(s_t) + f_t, _ = self.feat_extractor(s_t) if self.vad_t is not None: n_vad_frames = len(self.vad_t) n_feat_frames = f_t.shape[1] diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 2f5cf3da..98ba76b5 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -137,7 +137,7 @@ def eval_xvec( with AR(input_spec, **ar_args) as reader: if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): @@ -160,7 +160,7 @@ def eval_xvec( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -169,13 +169,11 @@ def eval_xvec( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) if random_utt_length: @@ -200,27 +198,23 @@ def eval_xvec( read_time = t2 - t1 tot_time = read_time + t8 - t3 logging.info( - ( - "utt %s total-time=%.3f read-time=%.3f " - "aug-time=%.3f feat-time=%.3f " - "vad-time=%.3f embed-time=%.3f write-time=%.3f " - "rt-factor=%.2f" - ) - % ( - key, - tot_time, - read_time, - t4 - t3, - t5 - t4, - t6 - t5, - t7 - t6, - t8 - t7, - x0.shape[0] / fs[0] / tot_time, - ) + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f", + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, ) if write_num_frames_spec is not None: - logging.info("writing num-frames to %s" % (write_num_frames_spec)) + logging.info("writing num-frames to %s", write_num_frames_spec) u2nf = Utt2Info.create(keys, info) 
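Note: these logging changes all make the same fix: pass arguments to logging instead of %-formatting the message eagerly. The difference, in two lines of standard-library behavior:

    import logging

    x = 3.14159
    logging.info("value=%.3f" % x)  # eager: string is built even if INFO is disabled
    logging.info("value=%.3f", x)   # lazy: built only if the record is emitted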
u2nf.save(write_num_frames_spec) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 1da1ac05..f49a5fb0 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -163,7 +163,7 @@ def extract_xvectors( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index a31bd614..9dc0aa2c 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -155,7 +155,7 @@ def extract_xvectors( x[None, :], dtype=torch.get_default_dtype() ).to(device) - x = feat_extractor(x) + x, _ = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: @@ -164,13 +164,11 @@ def extract_xvectors( x = x[:, vad] logging.info( - "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - x.shape[1], - tot_frames, - x.shape[1] / tot_frames * 100, - ) + "utt %s detected %d/%d (%.2f %%) speech frames", + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, ) t6 = time.time() diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 227892ea..7d602709 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -10,8 +10,12 @@ import time from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) import torch from hyperion.hyp_defs import config_logger, set_float_cpu @@ -239,72 +243,3 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) - - -# if __name__ == "__main__": - -# parser = ArgumentParser(description="Fine-tune x-vector model from audio files") -# parser.add_argument("--cfg", action=ActionConfigFile) - -# train_parser = ArgumentParser(prog="") -# AD.add_class_args(train_parser, prefix="dataset", skip={}) -# Sampler.add_class_args(train_parser, prefix="sampler") -# train_parser.add_argument( -# "--data_loader.num-workers", -# type=int, -# default=5, -# help="num_workers of data loader", -# ) - -# val_parser = ArgumentParser(prog="") -# AD.add_class_args(val_parser, prefix="dataset", skip={}) -# Sampler.add_class_args(val_parser, prefix="sampler") -# val_parser.add_argument( -# "--data_loader.num-workers", -# type=int, -# default=5, -# help="num_workers of data loader", -# ) -# data_parser = ArgumentParser(prog="") -# data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) -# data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) -# parser.add_argument("--data", action=ActionParser(parser=data_parser)) -# parser.link_arguments( -# "data.train.dataset.class_file", "data.val.dataset.class_file" -# ) -# parser.link_arguments( -# "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" -# ) -# parser.link_arguments( -# "data.train.sampler.batch_size", "data.val.sampler.batch_size" -# ) - -# AF.add_class_args(parser, prefix="feats") -# parser.add_argument("--in-model-path", required=True) - -# XVec.add_finetune_args(parser, prefix="model") -# Trainer.add_class_args( -# parser, 
prefix="trainer", train_modes=XVec.valid_train_modes()
-# )
-# ddp.add_ddp_args(parser)
-
-# parser.add_argument("--seed", type=int, default=1123581321, help="random seed")
-# parser.add_argument(
-#     "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
-# )
-# parser.add_argument("--local_rank", default=0, type=int)
-
-# args = parser.parse_args()
-# gpu_id = args.local_rank
-# del args.local_rank
-
-# if gpu_id == 0:
-#     try:
-#         config_file = Path(args.exp_path) / "config.yaml"
-#         parser.save(args, str(config_file), format="yaml", overwrite=True)
-#     except:
-#         pass
-
-# # torch docs recommend using forkserver
-# multiprocessing.set_start_method("forkserver")
-# train_xvec(gpu_id, args)
diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py
index 7caae8c4..9ae59246 100644
--- a/hyperion/data_prep/__init__.py
+++ b/hyperion/data_prep/__init__.py
@@ -3,6 +3,6 @@
   Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
-# from .data_prep import data_prep_registry
 from .data_prep import DataPrep
 from .voxceleb2 import VoxCeleb2DataPrep
+from .voxceleb1 import VoxCeleb1DataPrep
diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py
new file mode 100644
index 00000000..00b2e380
--- /dev/null
+++ b/hyperion/data_prep/voxceleb1.py
@@ -0,0 +1,338 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+import re
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+import glob
+
+import numpy as np
+import pandas as pd
+from jsonargparse import ActionYesNo
+from tqdm import tqdm
+
+from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet
+from ..utils.misc import PathLike, urlretrieve_progress
+from .data_prep import DataPrep
+
+
+class VoxCeleb1DataPrep(DataPrep):
+    """Class for preparing the VoxCeleb1 database into tables.
+    It prepares the full VoxCeleb1 either for training, or for testing
+    with the Original/Entire/Hard trial lists.
+    We don't consider preparing dev for training and test for the
+    Original-test protocol separately.
+
+    Attributes:
+      corpus_dir: input data directory
+      task: train/test
+      cat_videos: concatenate utterances from the same video.
+      output_dir: output data directory
+      use_kaldi_ids: puts speaker-id in front of segment id like Kaldi
+      target_sample_freq: target sampling frequency to convert the audios to.
+    """
+
+    def __init__(
+        self,
+        corpus_dir: PathLike,
+        task: str,
+        cat_videos: bool,
+        output_dir: PathLike,
+        use_kaldi_ids: bool,
+        target_sample_freq: int,
+        num_threads: int = 10,
+    ):
+        use_kaldi_ids = True
+        super().__init__(
+            corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads
+        )
+
+        self.task = task
+        assert (
+            cat_videos == False or task == "train"
+        ), "cat-videos is only available for train task"
+
+        self.cat_videos = cat_videos
+
+    @staticmethod
+    def dataset_name():
+        return "voxceleb1"
+
+    @staticmethod
+    def add_class_args(parser):
+        DataPrep.add_class_args(parser)
+        parser.add_argument(
+            "--task",
+            default="test",
+            choices=["test", "train"],
+            help="""whether we prepare the data for [test, train]""",
+        )
+        parser.add_argument(
+            "--cat-videos",
+            default=False,
+            action=ActionYesNo,
+            help="""concatenate utterances from the same video.""",
+        )
+
+    def _get_metadata(self):
+        file_name = "vox1_meta.csv"
+        file_path = self.corpus_dir / file_name
+        if not file_path.exists():
+            file_path = self.output_dir / file_name
+            if not file_path.exists():
+                url = "https://www.openslr.org/resources/49/vox1_meta.csv"
+                file_path, _ = urlretrieve_progress(url, file_path, desc=file_name)
+
+        df_meta = pd.read_csv(file_path, sep="\t")
+        df_meta.rename(columns=str.strip, inplace=True)
+        df_meta = df_meta.applymap(lambda x: str.strip(x) if isinstance(x, str) else x)
+        df_meta.set_index("VoxCeleb1 ID", inplace=True)
+        return df_meta
+
+    def _get_langs_est(self):
+        file_name = "lang_vox1_final.csv"
+        file_path = self.corpus_dir / file_name
+        if not file_path.exists():
+            file_path = self.output_dir / file_name
+            if not file_path.exists():
+                url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"
+                file_path, _ = urlretrieve_progress(url, file_path, desc=file_name)
+
+        df_lang = pd.read_csv(file_path, sep=",")
+
+        if self.cat_videos:
+
+            def get_video(x):
+                x = re.sub("/[^/]*.wav$", "", x)
+                return re.sub("/", "-", x)
+
+        elif self.use_kaldi_ids:
+
+            def get_video(x):
+                x = re.sub(".wav$", "", x)
+                return re.sub("/", "-", x)
+
+        else:
+
+            def get_video(x):
+                x = re.sub(".wav$", "", x)
+                x = re.sub("^[^/]*/", "", x)
+                return re.sub("/", "-", x)
+
+        df_lang["id"] = df_lang["filename"].apply(get_video)
+        df_lang.drop(["filename"], axis=1, inplace=True)
+        df_lang.drop_duplicates(inplace=True)
+        df_lang.set_index("id", inplace=True)
+        df_lang["lang"] = df_lang["lang"].apply(str.lower)
+        return df_lang
+
+    @staticmethod
+    def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i):
+        list_file = lists_cat_dir / f"{rec_id}.txt"
+        with open(list_file, "w") as fw:
+            rec_idx = (video_idx == i).nonzero()[0]
+            recs_i = [f"file {rec_files[j]}" for j in rec_idx]
+            recs_i.sort()
+            recs_i = "\n".join(recs_i)
+            fw.write(f"{recs_i}\n")
+
+        file_path = (
+            f"ffmpeg -v 8 -f concat -safe 0 -i {list_file} -f wav -acodec pcm_s16le -|"
+        )
+        return file_path
+
+    def make_trials(self):
+        url_base = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta"
+        trials_file_names = [
+            "veri_test2.txt",
+            "list_test_hard2.txt",
+            "list_test_all2.txt",
+        ]
+        trials_names = ["trials_o", "trials_h", "trials_e"]
+
+        trials = {}
+        dfs = []
+        logging.info("making trials")
+        for trial_name, file_name in zip(trials_names, trials_file_names):
+            file_path = self.corpus_dir / file_name
+            if not file_path.exists():
+                file_path = self.output_dir / file_name
+                if not file_path.exists():
+                    url = f"{url_base}/{file_name}"
+                    file_path, _ = urlretrieve_progress(url, file_path,
desc=file_name) + + df_in = pd.read_csv( + file_path, + header=None, + sep=" ", + names=["key", "enroll_file", "test_file"], + ) + key = ["target" if k == 1 else "nontarget" for k in df_in["key"]] + + def get_modelid(s): + s = re.sub(r"\.wav", "", s) + return re.sub(r"/", "-", s) + + if self.use_kaldi_ids: + get_segmentid = get_modelid + else: + + def get_segmentid(s): + s = get_modelid(s) + return re.sub(r"[^-]*-", "", s) + + modelid = [get_modelid(f) for f in df_in["enroll_file"]] + segmentid = [get_segmentid(f) for f in df_in["test_file"]] + df_out = pd.DataFrame( + {"modelid": modelid, "segmentid": segmentid, "targettype": key} + ) + df_out.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / f"{trial_name}.csv" + df_out.to_csv(file_path, index=False) + dfs.append(df_out) + trials[trial_name] = file_path + + df_out = pd.concat(dfs, ignore_index=True) + df_out.sort_values(by=["modelid", "segmentid"], inplace=True) + file_path = self.output_dir / "trials.csv" + df_out.to_csv(file_path, index=False) + trials["trials"] = file_path + + logging.info("making enrollment map") + modelid = df_out["modelid"].sort_values().unique() + if self.use_kaldi_ids: + segmentid = modelid + else: + segmentid = [re.sub(r"[^-]*-", "", s) for s in modelid] + + df_out = pd.DataFrame({"modelid": modelid, "segmentid": segmentid}) + file_path = self.output_dir / "enrollment.csv" + df_out.to_csv(file_path, index=False) + enrollments = {"enrollment": file_path} + + return enrollments, trials + + def prepare(self): + + logging.info("getting audio meta-data") + df_meta = self._get_metadata() + logging.info("getting language estimations") + df_lang = self._get_langs_est() + rec_dir = self.corpus_dir + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? 
try glob
+            rec_files = [
+                Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True)
+            ]
+
+        speakers = [f.parents[1].name for f in rec_files]
+        video_ids = [f.parent.name for f in rec_files]
+        if self.cat_videos:
+            lists_cat_dir = self.output_dir / "lists_cat"
+            lists_cat_dir.mkdir(exist_ok=True, parents=True)
+            uniq_video_ids, uniq_video_idx, video_idx = np.unique(
+                video_ids, return_index=True, return_inverse=True
+            )
+            rec_ids = uniq_video_ids
+            speakers = [speakers[i] for i in uniq_video_idx]
+            rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)]
+
+            file_paths = []
+            futures = []
+            logging.info("making video cat lists")
+            logging.info("submitting threads...")
+            with ThreadPoolExecutor(max_workers=self.num_threads) as pool:
+                for i, rec_id in tqdm(enumerate(rec_ids)):
+                    future = pool.submit(
+                        VoxCeleb1DataPrep.make_cat_list,
+                        lists_cat_dir,
+                        rec_id,
+                        rec_files,
+                        video_idx,
+                        i,
+                    )
+                    futures.append(future)
+
+            logging.info("waiting for threads...")
+            file_paths = [f.result() for f in tqdm(futures)]
+            video_ids = uniq_video_ids
+
+        else:
+            file_names = [f.with_suffix("").name for f in rec_files]
+            if self.use_kaldi_ids:
+                rec_ids = [
+                    f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names)
+                ]
+            else:
+                rec_ids = [f"{v}-{f}" for v, f in zip(video_ids, file_names)]
+
+            file_paths = [str(r) for r in rec_files]
+
+        logging.info("making RecordingSet")
+        recs = pd.DataFrame({"id": rec_ids, "storage_path": file_paths})
+        recs = RecordingSet(recs)
+        recs.sort()
+
+        logging.info("getting recording durations")
+        self.get_recording_duration(recs)
+        if self.target_sample_freq:
+            recs["target_sample_freq"] = self.target_sample_freq
+
+        logging.info("making SegmentsSet")
+        segments = pd.DataFrame(
+            {
+                "id": rec_ids,
+                "video_ids": video_ids,
+                "speaker": speakers,
+                "gender": df_meta.loc[speakers, "Gender"],
+                "nationality": df_meta.loc[speakers, "Nationality"],
+                "language_est": [
+                    df_lang.loc[r, "lang"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "language_est_conf": [
+                    df_lang.loc[r, "confidence"] if r in df_lang.index else "N/A"
+                    for r in rec_ids
+                ],
+                "duration": recs.loc[rec_ids, "duration"].values,
+            }
+        )
+        segments = SegmentSet(segments)
+        segments.sort()
+
+        logging.info("making speaker info file")
+        uniq_speakers = np.unique(speakers)
+        speakers = pd.DataFrame(
+            {
+                "id": uniq_speakers,
+                "vgg_id": df_meta.loc[uniq_speakers, "VGGFace1 ID"],
+                "gender": df_meta.loc[uniq_speakers, "Gender"],
+                "nationality": df_meta.loc[uniq_speakers, "Nationality"],
+            }
+        )
+        speakers = ClassInfo(speakers)
+
+        logging.info("making language info file")
+        languages = np.unique(df_lang["lang"])
+        languages = ClassInfo(pd.DataFrame({"id": languages}))
+
+        if self.task == "test":
+            enrollments, trials = self.make_trials()
+
+        logging.info("making dataset")
+        dataset = Dataset(
+            segments,
+            classes={"speaker": speakers, "languages": languages},
+            recordings={"recordings": recs},
+            enrollments=enrollments,
+            trials=trials,
+            sparse_trials=False,
+        )
+        logging.info("saving dataset at %s", self.output_dir)
+        dataset.save(self.output_dir)
+        logging.info(
+            "dataset contains %d segments, %d speakers", len(segments), len(speakers)
+        )
diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py
index a1a9f0c3..1a32420f 100644
--- a/hyperion/data_prep/voxceleb2.py
+++ b/hyperion/data_prep/voxceleb2.py
@@ -3,6 +3,7 @@
   Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
 import logging
+import glob
 import re
 from concurrent.futures
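Note: the id schemes used by the preparation code can be summarized with a short sketch; the speaker/video/file names are hypothetical and the regexes are simplified variants of get_modelid/get_segmentid above:

    import re

    spk, video, utt = "id10001", "1zcIwhmdeo4", "00001"
    kaldi_id = f"{spk}-{video}-{utt}"  # use_kaldi_ids=True
    plain_id = f"{video}-{utt}"        # use_kaldi_ids=False
    cat_id = f"{spk}-{video}"          # cat_videos=True: one id per video

    # trial lists store paths like "id10001/1zcIwhmdeo4/00001.wav"
    path = f"{spk}/{video}/{utt}.wav"
    modelid = re.sub(r"/", "-", re.sub(r"\.wav$", "", path))
    assert modelid == kaldi_id
    segmentid = re.sub(r"[^-]*-", "", modelid, count=1)  # drop the speaker prefix
    assert segmentid == plain_id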
import ThreadPoolExecutor from pathlib import Path @@ -39,8 +40,7 @@ def __init__( target_sample_freq: int, num_threads: int = 10, ): - if cat_videos: - use_kaldi_ids = True + use_kaldi_ids = True super().__init__( corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads ) @@ -143,6 +143,12 @@ def prepare(self): rec_dir = self.corpus_dir / self.subset logging.info("searching audio files in %s", str(rec_dir)) rec_files = list(rec_dir.glob("**/*.m4a")) + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: @@ -176,7 +182,7 @@ def prepare(self): video_ids = uniq_video_ids else: - file_names = [f.name for f in rec_files] + file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: rec_ids = [ f"{s}-{v}-{f}" for s, v, f in zip(speakers, video_ids, file_names) diff --git a/hyperion/np/classifiers/__init__.py b/hyperion/np/classifiers/__init__.py index d9d02ed0..60582016 100644 --- a/hyperion/np/classifiers/__init__.py +++ b/hyperion/np/classifiers/__init__.py @@ -10,4 +10,4 @@ from .linear_svmc import LinearSVMC from .logistic_regression import LogisticRegression from .q_scoring_homo_gbe import QScoringHomoGBE -from .svmc import GaussianSVMC +from .svmc import SVMC diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 5e38494f..8fe67792 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -42,8 +42,9 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): multiplied by the input data. """ if weights is None: + time_dim = self.dim if self.dim >= 0 else x.dim() + self.dim return seq_lengths_to_mask( - x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=self.dim + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=time_dim ) if weights.dim() == x.dim(): @@ -599,7 +600,7 @@ def _standardize_weights(self, x, x_lengths=None, weights=None): """standardizes the weights to have shape (batch, max_length).""" if weights is None: return seq_lengths_to_mask( - x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=1 + x_lengths, x.size(self.dim), dtype=x.dtype, time_dim=2 ) if weights.dim() == x.dim(): @@ -797,7 +798,7 @@ def forward(self, x, x_lengths=None, weights=None): if attn.dtype == torch.half: min_value = -65504 else: - min_value = -1e200 + min_value = -1e20 mask = weights.eq(0) attn = attn.masked_fill(mask, min_value) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 8556104a..d67785d2 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -355,7 +355,7 @@ def forward_hid_feats( max_in_length = x.size(-1) x = self._pre_enc(x) h_enc, x = self.encoder_net.forward_hid_feats( - x, return_enc_layers, return_logits=True + x, return_enc_layers, return_output=True ) output = {"h_enc": h_enc} if not return_logits and return_classif_layers is None: @@ -363,7 +363,7 @@ def forward_hid_feats( x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) - h_classif, y_pred = self.classif_net.forward_hid_feats( + h_classif = self.classif_net.forward_hid_feats( p, y, return_classif_layers, return_logits=return_logits ) if return_logits: @@ -750,7 +750,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", 
default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 160ee61b..a9ad224e 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -32,7 +32,12 @@ def __init__( if mvn is not None: mvn = MVN.filter_args(**mvn) self.mvn_cfg = mvn - if mvn["norm_mean"] or mvn["norm_var"]: + if ( + ("norm_mean" in mvn) + and mvn["norm_mean"] + or ("norm_var" in mvn) + and mvn["norm_var"] + ): self.mvn = MVN(**mvn) self.spec_augment = None @@ -79,7 +84,7 @@ def forward(self, x, x_lengths=None): if self.trans: f = f.transpose(1, 2).contiguous() - return f + return f, f_lengths def get_config(self): config = { diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index 9f9b280b..e5d90f4f 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -402,7 +402,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index f5ab74d5..172a3d70 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -31,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -389,7 +389,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc1d_encoder.py b/hyperion/torch/narchs/dc1d_encoder.py index 0c331a5e..6cf7f4ca 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -28,7 +28,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -362,7 +362,7 @@ def add_class_args(parser, prefix=None, head_channels=False, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index 4106cbfd..68679e0b 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -31,7 +31,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, use_norm=True, @@ -410,7 +410,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/dc2d_encoder.py b/hyperion/torch/narchs/dc2d_encoder.py index ce7b9677..bc7e4b33 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -29,7 +29,7 @@ def __init__( conv_strides=2, conv_dilations=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, 
use_norm=True, @@ -367,7 +367,7 @@ def add_class_args(parser, prefix=None, head_channels=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/fcnet.py b/hyperion/torch/narchs/fcnet.py index cdbf1940..a47f304e 100644 --- a/hyperion/torch/narchs/fcnet.py +++ b/hyperion/torch/narchs/fcnet.py @@ -125,7 +125,7 @@ def __init__( in_units, hid_units, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, dropout_rate=0, norm_layer=None, diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index 858cf4ea..5d3b9793 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -10,10 +10,16 @@ import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import (Res2NetBasicBlock, Res2NetBNBlock, - ResNetBasicBlock, ResNetBNBlock, - ResNetEndpointBlock, ResNetInputBlock, - SEResNetBasicBlock, SEResNetBNBlock) +from ..layer_blocks import ( + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetEndpointBlock, + ResNetInputBlock, + SEResNetBasicBlock, + SEResNetBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import scale_seq_lengths, seq_lengths_to_mask @@ -69,7 +75,7 @@ def __init__( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index 0c577174..9332724f 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -9,9 +9,13 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC1dDecBlock, ResNet1dBasicDecBlock, - ResNet1dBNDecBlock, SEResNet1dBasicDecBlock, - SEResNet1dBNDecBlock) +from ..layer_blocks import ( + DC1dDecBlock, + ResNet1dBasicDecBlock, + ResNet1dBNDecBlock, + SEResNet1dBasicDecBlock, + SEResNet1dBNDecBlock, +) from ..layers import ActivationFactory as AF from ..layers import ICNR1d from ..layers import NormLayer1dFactory as NLF @@ -34,7 +38,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -450,7 +454,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 5bdad186..97b244f3 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -12,10 +12,16 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC1dEncBlock, Res2Net1dBasicBlock, - Res2Net1dBNBlock, ResNet1dBasicBlock, - ResNet1dBNBlock, ResNet1dEndpoint, - SEResNet1dBasicBlock, SEResNet1dBNBlock) +from ..layer_blocks import ( + DC1dEncBlock, + Res2Net1dBasicBlock, + Res2Net1dBNBlock, + ResNet1dBasicBlock, + ResNet1dBNBlock, + ResNet1dEndpoint, + SEResNet1dBasicBlock, + SEResNet1dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer1dFactory as NLF from ..utils import seq_lengths_to_mask 
@@ -37,7 +43,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, drop_connect_rate=0, @@ -472,7 +478,7 @@ def forward_hid_feats(self, x, x_lengths=None, layers=None, return_output=False) if self.head_channels > 0: x = self.head_block(x) - return x + return h, x def get_config(self): @@ -675,7 +681,7 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 426b37f5..0afa1acc 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -10,9 +10,13 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC2dDecBlock, ResNet2dBasicDecBlock, - ResNet2dBNDecBlock, SEResNet2dBasicDecBlock, - SEResNet2dBNDecBlock) +from ..layer_blocks import ( + DC2dDecBlock, + ResNet2dBasicDecBlock, + ResNet2dBNDecBlock, + SEResNet2dBasicDecBlock, + SEResNet2dBNDecBlock, +) from ..layers import ActivationFactory as AF from ..layers import ICNR2d from ..layers import NormLayer2dFactory as NLF @@ -35,7 +39,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -457,7 +461,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet2d_encoder.py b/hyperion/torch/narchs/resnet2d_encoder.py index 84e6599e..a7fd047e 100644 --- a/hyperion/torch/narchs/resnet2d_encoder.py +++ b/hyperion/torch/narchs/resnet2d_encoder.py @@ -11,10 +11,15 @@ import torch import torch.nn as nn -from ..layer_blocks import (DC2dEncBlock, Res2Net2dBasicBlock, - Res2Net2dBNBlock, ResNet2dBasicBlock, - ResNet2dBNBlock, SEResNet2dBasicBlock, - SEResNet2dBNBlock) +from ..layer_blocks import ( + DC2dEncBlock, + Res2Net2dBasicBlock, + Res2Net2dBNBlock, + ResNet2dBasicBlock, + ResNet2dBNBlock, + SEResNet2dBasicBlock, + SEResNet2dBNBlock, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from ..utils import seq_lengths_to_mask @@ -38,7 +43,7 @@ class ResNet2dEncoder(NetArch): resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -65,7 +70,7 @@ def __init__( resb_dilations=1, resb_groups=1, head_channels=0, - hid_act="relu6", + hid_act="relu", head_act=None, dropout_rate=0, se_r=16, @@ -511,7 +516,7 @@ def add_class_args(parser, prefix=None, skip=set()): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/resnet_factory.py b/hyperion/torch/narchs/resnet_factory.py index 2d17a6d7..35ed9af0 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -146,7 +146,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -341,7 +341,7 @@ def add_class_args(parser, prefix=None): ) try: 
- parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/spinenet.py b/hyperion/torch/narchs/spinenet.py index 117c0733..4349dbe1 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -11,9 +11,17 @@ import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear -from ..layer_blocks import (BlockSpec, Res2NetBasicBlock, Res2NetBNBlock, - ResNetBasicBlock, ResNetBNBlock, ResNetInputBlock, - SpineConv, SpineEndpoints, SpineResample) +from ..layer_blocks import ( + BlockSpec, + Res2NetBasicBlock, + Res2NetBNBlock, + ResNetBasicBlock, + ResNetBNBlock, + ResNetInputBlock, + SpineConv, + SpineEndpoints, + SpineResample, +) from ..layers import ActivationFactory as AF from ..layers import NormLayer2dFactory as NLF from .net_arch import NetArch @@ -111,7 +119,7 @@ def __init__( do_endpoint_conv=True, concat_ax=3, upsampling_type="nearest", - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, diff --git a/hyperion/torch/narchs/spinenet_factory.py b/hyperion/torch/narchs/spinenet_factory.py index 092cbd0e..871b37e9 100644 --- a/hyperion/torch/narchs/spinenet_factory.py +++ b/hyperion/torch/narchs/spinenet_factory.py @@ -44,7 +44,7 @@ def create( conv_channels=64, base_channels=64, out_units=0, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_act=None, in_kernel_size=7, in_stride=2, @@ -243,7 +243,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/tdnn_factory.py b/hyperion/torch/narchs/tdnn_factory.py index 901cc9d0..77f69b9c 100644 --- a/hyperion/torch/narchs/tdnn_factory.py +++ b/hyperion/torch/narchs/tdnn_factory.py @@ -21,7 +21,7 @@ def create( kernel_size=3, dilation=1, dilation_factor=1, - hid_act={"name": "relu6", "inplace": True}, + hid_act={"name": "relu", "inplace": True}, out_units=0, out_act=None, dropout_rate=0, @@ -194,7 +194,7 @@ def add_class_args(parser, prefix=None): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index 4468185e..f8b50491 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -64,7 +64,7 @@ def __init__( in_layer_type="conv2d-sub", rel_pos_enc=False, causal_pos_enc=False, - hid_act="relu6", + hid_act="relu", norm_before=True, concat_after=False, padding_idx=-1, @@ -408,7 +408,7 @@ def add_class_args(parser, prefix=None, in_feats=False): ) try: - parser.add_argument("--hid-act", default="relu6", help="hidden activation") + parser.add_argument("--hid-act", default="relu", help="hidden activation") except: pass diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 9541d7b0..52474baa 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -109,10 +109,10 @@ def train_epoch(self, data_loader): input_data, target = 
tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) with torch.no_grad(): - feats = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(input_data) with amp.autocast(enabled=self.use_amp): - output = self.model(feats, y=target) + output = self.model(feats, feats_lengths, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: @@ -162,9 +162,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - feats = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(input_data) with amp.autocast(enabled=self.use_amp): - output = self.model(feats) + output = self.model(feats, feats_lengths) loss = self.loss(output, target) batch_metrics["loss"] = loss.mean().item() diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py index fb93b439..934b4b90 100644 --- a/hyperion/torch/utils/masking.py +++ b/hyperion/torch/utils/masking.py @@ -17,9 +17,7 @@ def scale_seq_lengths(lengths, max_out_length, max_in_length=None): if max_in_length == max_out_length: return lengths - return torch.div(lengths * max_out_length, - max_in_length, - rounding_mode="floor") + return torch.div(lengths * max_out_length, max_in_length, rounding_mode="floor") def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): @@ -29,7 +27,7 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): lengths: sequence lengths with shape=(batch,). If None, it returns None max_length: maximum length of the sequence. dtype: dtype for the mask. - time_dim: dimension corresponding to time in the mask. This will + time_dim: dimension > 0 corresponding to time in the mask. This will return a view of the mask which will adapt to the shape of the tensor where we want to apply the mask. This has to be a positive integer. 
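The hunks above switch the trainers to consuming (feats, feats_lengths) pairs and tighten the masking helpers they rely on. A minimal usage sketch of the two helpers (illustrative only; it assumes just the signatures and docstring shown in this diff, and the exact mask shape is implementation-defined):

```python
import torch

from hyperion.torch.utils.masking import scale_seq_lengths, seq_lengths_to_mask

x = torch.randn(4, 100, 80)                # (batch, time, feat_dim)
lengths = torch.tensor([100, 80, 60, 30])  # valid frames per sequence

# Mask with time on dimension 1; per the docstring it is returned as a view
# that broadcasts against x, so padded frames can be zeroed directly.
mask = seq_lengths_to_mask(lengths, max_length=x.size(1), dtype=x.dtype, time_dim=1)
x_masked = x * mask

# If a network downsamples time 4x (100 -> 25 frames), rescale lengths to match:
# floor(lengths * 25 / 100) -> tensor([25, 20, 15, 7])
out_lengths = scale_seq_lengths(lengths, max_out_length=25, max_in_length=100)
```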
@@ -40,6 +38,7 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): if lengths is None: return None + assert time_dim > 0 assert lengths.dim() == 1 if max_length is None: diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index db035987..51b476aa 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -5,6 +5,7 @@ from .class_info import ClassInfo from .dataset import Dataset +from .enrollment_map import EnrollmentMap from .feature_set import FeatureSet from .hyp_dataclass import HypDataClass from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix @@ -12,6 +13,7 @@ from .recording_set import RecordingSet from .rttm import RTTM from .scp_list import SCPList + # from .ext_segment_list import ExtSegmentList from .segment_list import SegmentList from .segment_set import SegmentSet diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index efb7c114..e6c9e861 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -4,7 +4,7 @@ """ from pathlib import Path -from typing import Dict, Optional +from typing import Dict, Optional, Union import yaml @@ -13,41 +13,184 @@ from .misc import PathLike from .recording_set import RecordingSet from .segment_set import SegmentSet +from .enrollment_map import EnrollmentMap +from .trial_key import TrialKey +from .trial_ndx import TrialNdx +from .sparse_trial_key import SparseTrialKey class Dataset: """ Class that contains all objects (segments, recordings, features, class_infos) that form a dataset + + Attributes: + segments: SegmentSet object or path to it. + classes: Dictionary of ClassInfo objects or paths to them + recordings: Dictionary of RecordingSet objects or paths to them + features: Dictionary of FeatureSet objects or paths to them + enrollments: Dictionary of EnrollmentMap objects or paths to them + trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects + or paths to them + sparse_trials: load trial keys using the SparseTrialKey class instead + of the TrialKey class.
+ table_sep: Column separator when reading/writing tables + """ def __init__( self, - segments: SegmentSet, - classes: Optional[Dict[str, ClassInfo]] = None, - recordings: Optional[Dict[str, RecordingSet]] = None, - features: Optional[Dict[str, FeatureSet]] = None, + segments: Union[SegmentSet, PathLike], + classes: Optional[Dict[str, Union[ClassInfo, PathLike]]] = None, + recordings: Optional[Dict[str, Union[RecordingSet, PathLike]]] = None, + features: Optional[Dict[str, Union[FeatureSet, PathLike]]] = None, + enrollments: Optional[Dict[str, Union[EnrollmentMap, PathLike]]] = None, + trials: Optional[ + Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + ] = None, + sparse_trials: bool = False, + table_sep: Optional[str] = None, ): - self._segments = segments - self._classes = classes - self._recordings = recordings - self._features = features - @property - def segments(self): + if isinstance(segments, SegmentSet): + self._segments = segments + self._segments_path = None + else: + assert isinstance(segments, (str, Path)) + self._segments = None + self._segments_path = Path(segments) + + self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) + + self._recordings, self._recordings_paths = self._parse_dict_args( + recordings, RecordingSet + ) + + self._features, self._features_paths = self._parse_dict_args( + features, FeatureSet + ) + self._enrollments, self._enrollments_paths = self._parse_dict_args( + enrollments, EnrollmentMap, + ) + self._trials, self._trials_paths = self._parse_dict_args( + trials, (TrialKey, TrialNdx, SparseTrialKey), + ) + + self.sparse_trials = sparse_trials + self.table_sep = table_sep + + def _parse_dict_args(self, data, types): + if data is None: + return None, None + + assert isinstance(data, dict) + objects = {k: (v if isinstance(v, types) else None) for k, v in data.items()} + paths = { + k: (v if isinstance(v, (str, Path)) else None) for k, v in data.items() + } + + return objects, paths + + def segments(self, keep_loaded: bool = True): + if self._segments is None: + assert self._segments_path is not None + segments = SegmentSet.load(self._segments_path, sep=self.table_sep) + if keep_loaded: + self._segments = segments + return segments + return self._segments - @property - def recordings(self): - return self._recordings + def recordings_value(self, key: str, keep_loaded: bool = True): + if self._recordings[key] is None: + assert self._recordings_paths[key] is not None + recordings = RecordingSet.load( + self._recordings_paths[key], sep=self.table_sep + ) + if keep_loaded: + self._recordings[key] = recordings + + return self._recordings[key] - @property - def features(self): - return self._features + def features_value(self, key: str, keep_loaded: bool = True): + if self._features[key] is None: + assert self._features_paths[key] is not None + features = FeatureSet.load(self._features_paths[key], sep=self.table_sep) + if keep_loaded: + self._features[key] = features + + return self._features[key] + + def classes_value(self, key: str, keep_loaded: bool = True): + if self._classes[key] is None: + assert self._classes_paths[key] is not None + classes = ClassInfo.load(self._classes_paths[key], sep=self.table_sep) + if keep_loaded: + self._classes[key] = classes + + return self._classes[key] + + def enrollments_value(self, key: str, keep_loaded: bool = True): + if self._enrollments[key] is None: + assert self._enrollments_paths[key] is not None + enrollments = EnrollmentMap.load( + self._enrollments_paths[key], sep=self.table_sep
) + if keep_loaded: + self._enrollments[key] = enrollments + + return self._enrollments[key] + + def trials_value(self, key: str, keep_loaded: bool = True): + if self._trials[key] is None: + assert self._trials_paths[key] is not None + try: + if self.sparse_trials: + trials = SparseTrialKey.load(self._trials_paths[key]) + else: + trials = TrialKey.load(self._trials_paths[key]) + except: + trials = TrialNdx.load(self._trials_paths[key]) + + if keep_loaded: + self._trials[key] = trials + + return self._trials[key] + + def recordings(self, keep_loaded: bool = True): + if self._recordings is None: + yield from () + else: + for key in self._recordings.keys(): + yield key, self.recordings_value(key, keep_loaded) + + def features(self, keep_loaded: bool = True): + if self._features is None: + yield from () + else: + for key in self._features.keys(): + yield key, self.features_value(key, keep_loaded) + + def classes(self, keep_loaded: bool = True): + if self._classes is None: + yield from () + else: + for key in self._classes.keys(): + yield key, self.classes_value(key, keep_loaded) + + def enrollments(self, keep_loaded: bool = True): + if self._enrollments is None: + yield from () + else: + for key in self._enrollments.keys(): + yield key, self.enrollments_value(key, keep_loaded) - @property - def classes(self): - return self._classes + def trials(self, keep_loaded: bool = True): + if self._trials is None: + yield from () + else: + for key in self._trials.keys(): + yield key, self.trials_value(key, keep_loaded) @staticmethod def resolve_dataset_path(dataset_path): @@ -69,64 +212,128 @@ def resolve_file_path(dataset_dir, file_path): return dataset_dir / file_path - def save(self, dataset_path: PathLike): + def save( + self, + dataset_path: PathLike, + update_paths: bool = True, + table_sep: Optional[str] = None, + ): """Saves all the dataset objects. Args: - dataset_path: str/Path indicating directory - to save the dataset or .yaml file to save - the dataset info. + dataset_path: str/Path indicating directory + to save the dataset or .yaml file to save + the dataset info. 
+ update_paths: whether to update the file_paths in the + data structures in the Dataset object """ + table_sep = self.table_sep if table_sep is None else table_sep + if update_paths: + self.table_sep = table_sep + + table_ext = ".tsv" if table_sep == "\t" else ".csv" dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) dataset = {} - if self.segments is not None: - file_name = "segments.csv" - dataset["segments"] = file_name - file_path = dataset_dir / file_name - self.segments.save(file_path) + file_name = f"segments{table_ext}" + dataset["segments"] = file_name + file_path = dataset_dir / file_name + self.segments().save(file_path, sep=table_sep) + if update_paths: + self._segments_path = file_path - if self.recordings is not None: - file_names = {} - for k, v in self.recordings.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.recordings(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._recordings_paths[k] = file_path + if file_names: dataset["recordings"] = file_names - if self.features is not None: - file_names = {} - for k, v in self.features.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.features(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + if file_names: dataset["features"] = file_names - if self.classes is not None: - file_names = {} - for k, v in self.classes.items(): - file_name = k + ".csv" - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path) + file_names = {} + for k, v in self.classes(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + if file_names: dataset["classes"] = file_names + file_names = {} + for k, v in self.enrollments(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + file_names = {} + for k, v in self.trials(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names + with open(dataset_file, "w") as f: yaml.dump(dataset, f) + def update_from_disk(self): + self.segments() + for k, v in self.recordings(): + pass + + for k, v in self.features(): + pass + + for k, v in self.classes(): + pass + + for k, v in self.enrollments(): + pass + + for k, v in self.trials(): + pass + @classmethod - def load(cls, dataset_path: PathLike): + def load( + cls, dataset_path: PathLike, lazy: bool = True, sparse_trials: bool = False + ): """Loads all the dataset objects. Args: dataset_path: str/Path indicating directory to save the dataset or .yaml file to save the dataset info. + lazy: load data structures lazily when they are needed.
+ sparse_trials: load trial keys using the SparseTrialKey class instead of the TrialKey class """ dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) @@ -134,27 +341,79 @@ def load(cls, dataset_path: PathLike): dataset = yaml.safe_load(f) assert "segments" in dataset - segments = SegmentSet.load( - Dataset.resolve_file_path(dataset_dir, dataset["segments"]) - ) + segments = Dataset.resolve_file_path(dataset_dir, dataset["segments"]) classes = None recordings = None features = None + enrollments = None + trials = None if "classes" in dataset: classes = {} for k, v in dataset["classes"].items(): - classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) + classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: recordings = {} for k, v in dataset["recordings"].items(): - recordings[k] = RecordingSet.load( - Dataset.resolve_file_path(dataset_dir, v) - ) + recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} for k, v in dataset["features"].items(): - features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) + features[k] = Dataset.resolve_file_path(dataset_dir, v) + + if "enrollments" in dataset: + enrollments = {} + for k, v in dataset["enrollments"].items(): + enrollments[k] = Dataset.resolve_file_path(dataset_dir, v) + + if "trials" in dataset: + trials = {} + for k, v in dataset["trials"].items(): + trials[k] = Dataset.resolve_file_path(dataset_dir, v) + + dataset = cls( + segments, + classes, + recordings, + features, + enrollments, + trials, + sparse_trials=sparse_trials, + ) + if not lazy: + dataset.update_from_disk() + + return dataset + + # dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + # with open(dataset_file, "w") as f: + # dataset = yaml.safe_load(f) + + # assert "segments" in dataset + # segments = SegmentSet.load( + # Dataset.resolve_file_path(dataset_dir, dataset["segments"]) + # ) + # classes = None + # recordings = None + # features = None + # if "classes" in dataset: + # classes = {} + # for k, v in dataset["classes"]: + # classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v)) + + # if "recordings" in dataset: + # recordings = {} + # for k, v in dataset["recordings"]: + # recordings[k] = RecordingSet.load( + # Dataset.resolve_file_path(dataset_dir, v) + # ) + + # if "features" in dataset: + # features = {} + # for k, v in dataset["features"]: + # features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v)) - return cls(segments, classes, recordings, features) + # dataset = cls(segments, classes, recordings, features) + # if not lazy: + # dataset.update_from_disk() diff --git a/hyperion/utils/enrollment_map.py b/hyperion/utils/enrollment_map.py new file mode 100644 index 00000000..024e5b74 --- /dev/null +++ b/hyperion/utils/enrollment_map.py @@ -0,0 +1,86 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import re +from collections import OrderedDict +from copy import deepcopy +from pathlib import Path + +import numpy as np +import pandas as pd + +from .list_utils import split_list, split_list_group_by_key +from .info_table import InfoTable + + +class EnrollmentMap(InfoTable): + """Class to store the mapping between enrollment ids + and segment ids + """ + + def __init__(self, df): + if "modelid" in df: + df.rename(columns={"modelid": "id"}, inplace=True) + super().__init__(df) + + def split(self, idx, num_parts): + """Splits the mapping into
num_parts and returns part idx. + + Args: + idx: Part to return from 1 to num_parts. + num_parts: Number of parts to split the list. + Lines with the same value in the "id" column + go to the same part. + + Returns: + EnrollmentMap object + """ + _, idx1 = split_list_group_by_key(self.df["id"], idx, num_parts) + + df = self.df.iloc[idx1] + return EnrollmentMap(df) + + def save(self, file_path, sep=None, nist_compatible=True): + if nist_compatible: + # For compatibility with NIST SRE files the index column "id" + # is saved as modelid + self.df.rename(columns={"id": "modelid"}, inplace=True) + + super().save(file_path, sep) + if nist_compatible: + self.df.rename(columns={"modelid": "id"}, inplace=True) + + @classmethod + def load(cls, file_path, sep=None): + """Loads EnrollmentMap from file. + + Args: + file_path: File to read the list. + sep: Column separator. If None, it is inferred + from the file extension. + Returns: + EnrollmentMap object + """ + file_path = Path(file_path) + ext = file_path.suffix + if ext in ["", ".scp"]: + # if no extension we load as kaldi utt2spk file + df = pd.read_csv( + file_path, + sep=" ", + header=None, + names=["segmentid", "modelid"], + dtype={"segmentid": str, "modelid": str}, + ) + df = df[["modelid", "segmentid"]] + else: + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + + return cls(df) diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 5a4f27d2..6bcd4aca 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -119,7 +119,7 @@ def from_dict(cls, df_dict): @classmethod def load(cls, file_path, sep=None, name="class_id"): - """Loads utt2info list from text file. + """Loads table from file. Args: file_path: File to read the list. @@ -127,7 +127,7 @@ def load(cls, file_path, sep=None, name="class_id"): dtype: Dictionary with the dtypes of each column. name: name for the data to be loaded Returns: - Utt2Info object + InfoTable object """ file_path = Path(file_path) ext = file_path.suffix @@ -156,7 +156,7 @@ def sort(self, column="id", ascending=True): self.df.sort_values(by=column, inplace=True, ascending=ascending) def split(self, idx, num_parts, group_by=None): - """Splits SCPList into num_parts and return part idx. + """Splits the table into num_parts and returns part idx. Args: idx: Part to return from 1 to num_parts. @@ -177,13 +177,13 @@ def split(self, idx, num_parts, group_by=None): @classmethod def merge(cls, tables): - """Merges several Utt2Info tables. + """Merges several tables. Args: - info_lists: List of Utt2Info + info_lists: List of InfoTables Returns: - Utt2Info object concatenation the info_lists. + InfoTable object concatenating the info_lists. """ df_list = [table.df for table in tables] df = pd.concat(df_list) diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index d51edc34..1852d25d 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -7,6 +7,10 @@ class SegmentSet(InfoTable): + """Class to store information about speech segments. + Internally, it uses a pandas table.
+ """ + def __init__(self, df): super().__init__(df) if "start" in df and "recording_id" not in df: diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index 5afc72a0..1bc321a7 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -5,8 +5,10 @@ import copy import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from .list_utils import * @@ -79,6 +81,28 @@ def save_txt(self, file_path): for r, c in zip(non.row, non.col): f.write("%s %s nontarget\n" % (self.model_set[r], self.seg_set[c])) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + self.tar.eliminate_zeros() + self.non.eliminate_zeros() + tar = self.tar.tocoo() + for r, c in zip(tar.row, tar.col): + f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}target\n") + non = self.non.tocoo() + for r, c in zip(non.row, non.col): + f.write(f"{self.model_set[r]}{sep}{self.seg_set[c]}{sep}nontarget\n") + @classmethod def load_h5(cls, file_path): raise NotImplementedError() @@ -113,6 +137,40 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from txt file + + Args: + file_path: File to read the list. + + Returns: + TrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + for item in zip(model_idx, seg_idx, is_tar): + if item[2]: + tar[item[0], item[1]] = True + else: + non[item[0], item[1]] = True + return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) + @classmethod def merge(cls, key_list): raise NotImplementedError() diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index 9552d7c0..4a99461b 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -5,9 +5,11 @@ import copy import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd from .list_utils import * from .trial_ndx import TrialNdx @@ -82,18 +84,20 @@ def sort(self): if self.trial_cond is not None: self.trial_cond = self.trial_cond[:, ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + ext = file_path.suffix + if ext in (".h5", ".hdf5"): self.save_h5(file_path) - else: + elif ext in ("", ".txt"): self.save_txt(file_path) + else: + self.save_table(file_path, sep) def save_h5(self, file_path): """Saves object to h5 file. 
@@ -132,20 +136,40 @@ def save_txt(self, file_path): file_path: File to write the list. """ with open(file_path, "w") as f: - idx = (self.tar.T == True).nonzero() + idx = (self.tar.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s target\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) - idx = (self.non.T == True).nonzero() + idx = (self.non.T).nonzero() for item in zip(idx[0], idx[1]): f.write( "%s %s nontarget\n" % (self.model_set[item[1]], self.seg_set[item[0]]) ) + def save_table(self, file_path, sep=None): + """Saves object to txt file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}targettype\n") + I, J = np.logical_or(self.tar, self.non).nonzero() + for i, j in zip(I, J): + target_type = "target" if self.tar[i, j] else "nontarget" + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{target_type}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -154,11 +178,13 @@ def load(cls, file_path): Returns: TrialKey object. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + _, file_ext = path.splitext(file_path) + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -240,6 +266,40 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar, non) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from txt file + + Args: + file_path: File to read the list. + + Returns: + TrialKey object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + is_tar = (df["targettype"] == "target").values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + tar = np.zeros((len(model_set), len(seg_set)), dtype="bool") + non = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True + else: + non[i, j] = True + return cls(model_set, seg_set, tar, non) + @classmethod def merge(cls, key_list): """Merges several key objects. 
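The trial-key changes above add a csv/tsv table format (columns modelid, segmentid, targettype) next to the legacy txt and h5 formats, with the separator inferred from the file extension. A small round-trip sketch of the dispatch logic shown in this diff (file name and contents are made up; illustrative only):

```python
from hyperion.utils.trial_key import TrialKey

# Write a tiny trial list in the new table format.
with open("trials.csv", "w") as f:
    f.write("modelid,segmentid,targettype\n")
    f.write("spk1,utt1,target\n")
    f.write("spk1,utt2,nontarget\n")

# .csv is neither .h5/.hdf5 nor empty/.txt, so load() falls through to
# load_table(), which infers sep="," from the suffix.
key = TrialKey.load("trials.csv")
print(key.model_set, key.seg_set)  # ['spk1'] ['utt1' 'utt2']

# A .tsv suffix dispatches save() to save_table() with a tab separator.
key.save("trials.tsv")
```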
From 27878914b1bc20b2dbeb5c1139b6d23f2857cd07 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 25 May 2023 09:19:23 -0400 Subject: [PATCH 102/154] sre21 8k adapted to persephone branch --- egs/sre21-av-a/v1.16k/README.md | 22 +- .../v1.16k/local/score_sre21_official.sh | 2 +- egs/sre21-av-a/v1.8k/README.md | 53 ++- egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh | 2 +- egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh | 4 +- egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh | 2 +- egs/voxceleb/v1.1/local | 1 - .../{v1 => v1.1}/local/attack_analysis.py | 0 .../{v1 => v1.1}/local/attack_analysis.sh | 0 .../local/calibrate_voxceleb1_o_clean.sh | 0 egs/voxceleb/{v1 => v1.1}/local/make_musan.py | 0 egs/voxceleb/{v1 => v1.1}/local/make_musan.sh | 0 .../{v1 => v1.1}/local/make_rirs_data.sh | 0 .../{v1 => v1.1}/local/make_some_figs.py | 0 .../make_train_lists_sup_embed_with_augm.sh | 0 .../{v1 => v1.1}/local/make_trials_subset.py | 0 .../{v1 => v1.1}/local/make_vox2_trials.py | 0 .../{v1 => v1.1}/local/make_voxceleb1_o.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_oeh.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_old.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_orig.pl | 0 .../local/make_voxceleb1_orig_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1_v2_o.pl | 0 .../local/make_voxceleb1_v2_oeh.pl | 0 .../{v1 => v1.1}/local/make_voxceleb1cat.pl | 0 .../local/make_voxceleb1cat_v2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb2.pl | 0 .../{v1 => v1.1}/local/make_voxceleb2cat.pl | 0 .../local/prepare_voxsrc22_dev.py | 0 .../local/prepare_voxsrc22_test.py | 0 egs/voxceleb/{v1 => v1.1}/local/score_dcf.py | 0 .../{v1 => v1.1}/local/score_voxceleb1.sh | 0 .../local/score_voxceleb1_o_clean.sh | 0 .../local/score_voxceleb1_single_cond.sh | 0 .../{v1 => v1.1}/local/score_voxsrc22_dev.sh | 0 egs/voxceleb/v1.1/run_002_compute_evad.sh | 1 - egs/voxceleb/v1.2/hyp_utils | 1 + ...aseplus_ecapatdnn512x3_phase1_default.yaml | 6 - ...aseplus_ecapatdnn512x3_phase2_default.yaml | 12 - ...aseplus_ecapatdnn512x3_phase3_default.yaml | 11 - ...lmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml | 24 -- ...nn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh | 55 ---- egs/voxceleb/v2/local | 2 +- egs/voxceleb/v2/run_001_prepare_data.sh | 20 +- egs/voxceleb/v2/run_002_compute_evad.sh | 63 ++-- .../v2/run_003_prepare_noises_rirs.sh | 67 ++++ hyp_utils/conda_env.sh | 2 +- hyp_utils/create_data_split_dirs.sh | 3 +- hyperion/bin/hyperion_dataset.py | 93 ++++++ hyperion/bin/hyperion_tables.py | 129 ++++++++ hyperion/bin/train_xvector_from_wav.py | 10 +- hyperion/data_prep/data_prep.py | 1 - hyperion/data_prep/voxceleb1.py | 7 +- hyperion/data_prep/voxceleb2.py | 11 +- hyperion/data_prep/voxsrc22.py | 212 ++++++++++++ hyperion/torch/trainers/torch_trainer.py | 178 +++++----- hyperion/torch/trainers/xvector_trainer.py | 8 +- .../trainers/xvector_trainer_from_wav.py | 12 +- hyperion/utils/class_info.py | 27 +- hyperion/utils/dataset.py | 306 ++++++++++++++---- hyperion/utils/enrollment_map.py | 17 +- hyperion/utils/info_table.py | 7 +- 63 files changed, 1024 insertions(+), 347 deletions(-) delete mode 120000 egs/voxceleb/v1.1/local rename egs/voxceleb/{v1 => v1.1}/local/attack_analysis.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/attack_analysis.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/calibrate_voxceleb1_o_clean.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_musan.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_musan.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_rirs_data.sh (100%)
rename egs/voxceleb/{v1 => v1.1}/local/make_some_figs.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_train_lists_sup_embed_with_augm.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_trials_subset.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_vox2_trials.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_o.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_oeh.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_old.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_orig.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_orig_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2_o.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1_v2_oeh.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1cat.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb1cat_v2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb2.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/make_voxceleb2cat.pl (100%) rename egs/voxceleb/{v1 => v1.1}/local/prepare_voxsrc22_dev.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/prepare_voxsrc22_test.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_dcf.py (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1_o_clean.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxceleb1_single_cond.sh (100%) rename egs/voxceleb/{v1 => v1.1}/local/score_voxsrc22_dev.sh (100%) create mode 120000 egs/voxceleb/v1.2/hyp_utils delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml delete mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml delete mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh create mode 100755 egs/voxceleb/v2/run_003_prepare_noises_rirs.sh create mode 100644 hyperion/bin/hyperion_dataset.py create mode 100755 hyperion/bin/hyperion_tables.py create mode 100644 hyperion/data_prep/voxsrc22.py diff --git a/egs/sre21-av-a/v1.16k/README.md b/egs/sre21-av-a/v1.16k/README.md index 0f5d09ad..d90dc0a4 100644 --- a/egs/sre21-av-a/v1.16k/README.md +++ b/egs/sre21-av-a/v1.16k/README.md @@ -7,6 +7,20 @@ The systems runs at 16 kHz, telephone data is upsampled to 16k using SoX This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. 
Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -139,14 +153,6 @@ The back-end used for these results is: | config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | | config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | -## SRE-CTS Superset dev set - -| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | -| ------ | ---------- | ------------- | ------ | ------------- | ------------- | -| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.37 | 0.076 | 0.106 | -| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.19 | 0.64 | 0.089 | -| config_fbank80_stmn_tseres2net50w26s4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | TSE-Res2Net50 w26xs4 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.15 | 0.61 | 0.102 | - ## SRE21 Audio Dev (official scoring tool) | Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | diff --git a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh index a5bc03eb..e56906f6 100755 --- a/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh +++ b/egs/sre21-av-a/v1.16k/local/score_sre21_official.sh @@ -18,7 +18,7 @@ echo "Score SRE21 ${track} ${subset} for $score_dir" soft_dir=./sre21/scoring_software -if [ ! -f $s_dir/sre_scorer.py ];then +if [ ! -f $soft_dir/sre_scorer.py ];then echo "downloading scoring tool" local/download_sre21_scoring_tool.sh fi diff --git a/egs/sre21-av-a/v1.8k/README.md b/egs/sre21-av-a/v1.8k/README.md index a105128c..b55f9bf0 100644 --- a/egs/sre21-av-a/v1.8k/README.md +++ b/egs/sre21-av-a/v1.8k/README.md @@ -10,6 +10,20 @@ copy the utt2est_lang files from the 16k data dirs to the VoxCeleb and SRE21 dat This recipe is based on these works ``` +@inproceedings{Villalba2022, +author = {Jes\'us Villalba and Bengt J Borgstrom and Saurabh Kataria and Magdalena Rybicka and Carlos D Castillo and Jaejin Cho and L. Paola García-Perera and Pedro A. Torres-Carrasquillo and Najim Dehak}, +city = {ISCA}, +doi = {10.21437/Odyssey.2022-30}, +issue = {July}, +journal = {The Speaker and Language Recognition Workshop (Odyssey 2022)}, +month = {6}, +pages = {213-220}, +publisher = {ISCA}, +title = {Advances in Cross-Lingual and Cross-Source Audio-Visual Speaker Recognition: The JHU-MIT System for NIST SRE21}, +url = {https://www.isca-speech.org/archive/odyssey_2022/villalba22b_odyssey.html}, +year = {2022}, +} + @inproceedings{Villalba2020, address = {Tokyo, Japan}, author = {Villalba, Jes{\'{u}}s and Garcia-Romero, Daniel and Chen, Nanxin and Sell, Gregory and Borgstrom, Jonas and McCree, Alan and {Garcia Perera}, Leibny Paola and Kataria, Saurabh and Nidadavolu, Phani Sankar and Torres-Carrasquiilo, Pedro and Dehak, Najim}, @@ -91,8 +105,6 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs - `run_011_train_xvector.sh` - Trains the x-vector network on 4sec chunks - - - `run_012_finetune_xvector.sh` - Fine-tune x-vector network on 10-15 secs utts - `run_030_extract_xvectors.sh` @@ -111,4 +123,39 @@ run_0xx_....sh --config-file global_conf/config_fbank80_stmn_res2net50w26s8_arcs ## Results -TODO +The back-end used for these results is: +- back-end V2 (run_041_eval_be_v2.sh) +- Without S-Norm +- Scores are calibrated as indicated in the paper. + +## SRE16 Eval40% YUE + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.922 | 0.154 | 0.200 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.168 | 0.127 | 0.134 | + + +## SRE-CTS Superset dev set + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 1.39 | 0.072 | 0.095 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 1.175 | 0.057 | 0.069 | + + +## SRE21 Audio Dev (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 6.65 | 0.418 | 0.436 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 3.73 | 0.319 | 0.325 | + + +## SRE21 Audio Eval (official scoring tool) + +| Config | Model Type | Model Details | EER(%) | Min. Cprimary | Act. Cprimary | +| ------ | ---------- | ------------- | ------ | ------------- | ------------- | +| config_fbank80_stmn_ecapatdnn2048x4_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | ECAPA-TDNN 2048x4 | fine-tuned 10-15secs
AAM-Softmax margin=0.5 | 5.44 | 0.388 | 0.390 | +| config_fbank80_stmn_res2net50w26s8_chattstatsi128_arcs30m0.3_adam_lr0.02_amp.v1.sh | Res2Net50 w26xs8 | fine-tuned 10 secs
AAM-Softmax margin=0.5 | 4.21 | 0.356 | 0.377 | + diff --git a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh index a55761ae..92cbd887 100755 --- a/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh +++ b/egs/sre21-av-a/v1.8k/run_040_eval_be_v1.sh @@ -153,7 +153,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh index f8eae0a1..6890eba9 100755 --- a/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh +++ b/egs/sre21-av-a/v1.8k/run_041_eval_be_v2.sh @@ -187,7 +187,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 @@ -311,7 +311,7 @@ fi if [ $stage -le 7 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh index 263d7bbe..35afbb27 100755 --- a/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh +++ b/egs/sre21-av-a/v1.8k/run_042_eval_be_v3.sh @@ -185,7 +185,7 @@ fi if [ $stage -le 4 ];then local/calibrate_sre21av_v1.sh --cmd "$train_cmd" $score_plda_dir local/score_sre16.sh data/sre16_eval40_yue_test eval40_yue ${score_plda_dir}_cal_v1 - local/score_sre_cts_superset.sh data/sre_cts_superset_16k_dev ${score_plda_dir}_cal_v1 + local/score_sre_cts_superset.sh data/sre_cts_superset_8k_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_dev_test audio_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio-visual_dev_test audio-visual_dev ${score_plda_dir}_cal_v1 local/score_sre21.sh data/sre21_audio_eval_test audio_eval ${score_plda_dir}_cal_v1 diff --git a/egs/voxceleb/v1.1/local b/egs/voxceleb/v1.1/local deleted file mode 120000 index 740b697d..00000000 --- a/egs/voxceleb/v1.1/local +++ /dev/null @@ -1 +0,0 @@ -../v1/local/ \ No newline at end of file diff --git a/egs/voxceleb/v1/local/attack_analysis.py 
b/egs/voxceleb/v1.1/local/attack_analysis.py similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.py rename to egs/voxceleb/v1.1/local/attack_analysis.py diff --git a/egs/voxceleb/v1/local/attack_analysis.sh b/egs/voxceleb/v1.1/local/attack_analysis.sh similarity index 100% rename from egs/voxceleb/v1/local/attack_analysis.sh rename to egs/voxceleb/v1.1/local/attack_analysis.sh diff --git a/egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/calibrate_voxceleb1_o_clean.sh rename to egs/voxceleb/v1.1/local/calibrate_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/voxceleb/v1.1/local/make_musan.py similarity index 100% rename from egs/voxceleb/v1/local/make_musan.py rename to egs/voxceleb/v1.1/local/make_musan.py diff --git a/egs/voxceleb/v1/local/make_musan.sh b/egs/voxceleb/v1.1/local/make_musan.sh similarity index 100% rename from egs/voxceleb/v1/local/make_musan.sh rename to egs/voxceleb/v1.1/local/make_musan.sh diff --git a/egs/voxceleb/v1/local/make_rirs_data.sh b/egs/voxceleb/v1.1/local/make_rirs_data.sh similarity index 100% rename from egs/voxceleb/v1/local/make_rirs_data.sh rename to egs/voxceleb/v1.1/local/make_rirs_data.sh diff --git a/egs/voxceleb/v1/local/make_some_figs.py b/egs/voxceleb/v1.1/local/make_some_figs.py similarity index 100% rename from egs/voxceleb/v1/local/make_some_figs.py rename to egs/voxceleb/v1.1/local/make_some_figs.py diff --git a/egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh b/egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh similarity index 100% rename from egs/voxceleb/v1/local/make_train_lists_sup_embed_with_augm.sh rename to egs/voxceleb/v1.1/local/make_train_lists_sup_embed_with_augm.sh diff --git a/egs/voxceleb/v1/local/make_trials_subset.py b/egs/voxceleb/v1.1/local/make_trials_subset.py similarity index 100% rename from egs/voxceleb/v1/local/make_trials_subset.py rename to egs/voxceleb/v1.1/local/make_trials_subset.py diff --git a/egs/voxceleb/v1/local/make_vox2_trials.py b/egs/voxceleb/v1.1/local/make_vox2_trials.py similarity index 100% rename from egs/voxceleb/v1/local/make_vox2_trials.py rename to egs/voxceleb/v1.1/local/make_vox2_trials.py diff --git a/egs/voxceleb/v1/local/make_voxceleb1_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_old.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_old.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_old.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_old.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_orig_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_orig_v2.pl diff --git 
a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_o.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl b/egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1_v2_oeh.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl b/egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb1cat_v2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2.pl b/egs/voxceleb/v1.1/local/make_voxceleb2.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2.pl diff --git a/egs/voxceleb/v1/local/make_voxceleb2cat.pl b/egs/voxceleb/v1.1/local/make_voxceleb2cat.pl similarity index 100% rename from egs/voxceleb/v1/local/make_voxceleb2cat.pl rename to egs/voxceleb/v1.1/local/make_voxceleb2cat.pl diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_dev.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py similarity index 100% rename from egs/voxceleb/v1/local/prepare_voxsrc22_dev.py rename to egs/voxceleb/v1.1/local/prepare_voxsrc22_dev.py diff --git a/egs/voxceleb/v1/local/prepare_voxsrc22_test.py b/egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py similarity index 100% rename from egs/voxceleb/v1/local/prepare_voxsrc22_test.py rename to egs/voxceleb/v1.1/local/prepare_voxsrc22_test.py diff --git a/egs/voxceleb/v1/local/score_dcf.py b/egs/voxceleb/v1.1/local/score_dcf.py similarity index 100% rename from egs/voxceleb/v1/local/score_dcf.py rename to egs/voxceleb/v1.1/local/score_dcf.py diff --git a/egs/voxceleb/v1/local/score_voxceleb1.sh b/egs/voxceleb/v1.1/local/score_voxceleb1.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_o_clean.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1_o_clean.sh diff --git a/egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh b/egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxceleb1_single_cond.sh rename to egs/voxceleb/v1.1/local/score_voxceleb1_single_cond.sh diff --git a/egs/voxceleb/v1/local/score_voxsrc22_dev.sh b/egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh similarity index 100% rename from egs/voxceleb/v1/local/score_voxsrc22_dev.sh rename to egs/voxceleb/v1.1/local/score_voxsrc22_dev.sh diff --git a/egs/voxceleb/v1.1/run_002_compute_evad.sh b/egs/voxceleb/v1.1/run_002_compute_evad.sh index 4e82a87a..27260be3 100755 --- a/egs/voxceleb/v1.1/run_002_compute_evad.sh +++ 
b/egs/voxceleb/v1.1/run_002_compute_evad.sh @@ -24,7 +24,6 @@ if [ $stage -le 1 ]; then dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage if [ "$nodes" == "b0" ];then utils/create_split_dir.pl \ - utils/create_split_dir.pl \ /export/b{04,05,06,07}/$dir_name $vaddir/storage elif [ "$nodes" == "b1" ];then utils/create_split_dir.pl \ diff --git a/egs/voxceleb/v1.2/hyp_utils b/egs/voxceleb/v1.2/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/v1.2/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml deleted file mode 100644 index 8574a1cf..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase1_default.yaml +++ /dev/null @@ -1,6 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: wavlmbaseplus_ecapatdnn512x3.yaml -trainer: trainer_phase1_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml deleted file mode 100644 index 87b01a1f..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase2_default.yaml +++ /dev/null @@ -1,12 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: - xvector: - cos_scale: 32.0 - margin: 0.2 - margin_warmup_epochs: 0 - intertop_k: 5 - intertop_margin: 0.1 -trainer: trainer_phase2_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml deleted file mode 100644 index d13931e0..00000000 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_phase3_default.yaml +++ /dev/null @@ -1,11 +0,0 @@ -data: - train: train_data_default.yaml - val: val_data_default.yaml -model: - xvector: - cos_scale: 32.0 - margin: 0.4 - margin_warmup_epochs: 0 - intertop_margin: 0. 
-trainer: trainer_phase3_sgd_default.yaml - \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml index 34c6e8dc..d4db70a7 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v1.0.yaml @@ -41,29 +41,6 @@ data: num_hard_prototypes: 0 data_loader: num_workers: 8 - -train: - dataset: - max_chunk_length: 3.0 - min_chunk_length: 3.0 - aug_cfg: conf/reverb_noise_aug.yaml - wav_scale: 1 - sampler: - batch_size: 32 - iters_per_epoch: 6 - data_loader: - num_workers: 8 - val: - dataset: - max_chunk_length: 4.0 - min_chunk_length: 4.0 - aug_cfg: conf/reverb_noise_aug.yaml - wav_scale: 1 - sampler: - batch_size: 32 - iters_per_epoch: 6 - data_loader: - num_workers: 8 model: wavlmbaseplus_ecapatdnn512x3.yaml trainer: optim: @@ -84,5 +61,4 @@ trainer: epochs: 60 eff_batch_size: 1024 train_mode: hf-feats-frozen-nograd - \ No newline at end of file diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh deleted file mode 100644 index 942fb336..00000000 --- a/egs/voxceleb/v2/global_conf/config_wav2vec2base_ecapatdnn512x2_arcs30m0.3_adam_lr0.001_amp.v12.sh +++ /dev/null @@ -1,55 +0,0 @@ -# Wav2vec2 base trained on 960h LibriSpeech + ECAPA-TDNN 512x2 - -# hugging face model -hf_model_name=wav2vec2base - -#vad -vad_config=conf/vad_16k.yaml - -# x-vector training -nnet_data=voxceleb2cat_train - -# x-vector cfg - -nnet_type=hf_wav2vec2resnet1d - -batch_size_1gpu=32 -eff_batch_size=512 # effective batch size -dropout=0 -embed_dim=256 -lr=0.05 -s=30 -margin_warmup=20 -margin=0.3 -nnet_num_epochs=70 - - -lr=0.001 -#lr=0.005 -xvec_train_base_cfg=conf/train_wav2vec2base_ecapatdnn512x2_default.yaml -xvec_train_args="--data.train.sampler.batch-size $batch_size_1gpu --trainer.optim.lr $lr --trainer.lrsched.warmup-steps 20000 --trainer.lrsched.hold-steps 20000 --trainer.lrsched.min-lr 1e-6 --trainer.epochs 75 --model conf/wav2vec2base_specaug5_ecapatdnn512x2.yaml --data.train.dataset.max-chunk-length 2 --data.train.dataset.min-chunk-length 2" - -nnet_name=${hf_model_name}_ecapatdnn512x2_e${embed_dim}_arcs${s}m${margin}_do${dropout}_adam_lr${lr}_b${eff_batch_size}_amp.v12 #v1 - -nnet_dir=exp/xvector_nnets/$nnet_name -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/swa_model_ep0076.pth -nnet=$nnet_dir/model_ep0060.pth -nnet=$nnet_dir/model_ep0030.pth -nnet=$nnet_dir/model_ep0040.pth -nnet=$nnet_dir/model_ep0020.pth - - -# back-end -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=6 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 - diff --git a/egs/voxceleb/v2/local b/egs/voxceleb/v2/local index 740b697d..2ac14857 120000 --- a/egs/voxceleb/v2/local +++ b/egs/voxceleb/v2/local @@ -1 +1 @@ -../v1/local/ \ No newline at end of file +../v1.1/local \ No newline at end of file diff --git a/egs/voxceleb/v2/run_001_prepare_data.sh b/egs/voxceleb/v2/run_001_prepare_data.sh index 7bf15448..44385610 100755 --- a/egs/voxceleb/v2/run_001_prepare_data.sh +++ b/egs/voxceleb/v2/run_001_prepare_data.sh @@ -12,7 +12,7 @@ config_file=default_config.sh . parse_options.sh || exit 1; . datapath.sh - +. 
$config_file if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. @@ -26,3 +26,21 @@ if [ $stage -le 2 ];then # Use this for the newer version of voxceleb1: local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + local/prepare_voxsrc22_dev.py \ + --vox1-corpus-dir $voxceleb1_root \ + --voxsrc22-corpus-dir $voxsrc22_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then +# local/prepare_voxsrc22_test.py \ +# --corpus-dir $voxsrc22_root \ +# --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # # split vox2 into 2 parts, for cohort and qmf training + local/make_vox2_trials.py --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2/run_002_compute_evad.sh b/egs/voxceleb/v2/run_002_compute_evad.sh index eeae00ac..1248ad39 100755 --- a/egs/voxceleb/v2/run_002_compute_evad.sh +++ b/egs/voxceleb/v2/run_002_compute_evad.sh @@ -19,39 +19,40 @@ config_file=default_config.sh if [ $stage -le 1 ]; then - # Prepare to distribute data over multiple machines - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then - dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage - if [ "$nodes" == "b0" ];then - utils/create_split_dir.pl \ - utils/create_split_dir.pl \ - /export/b{04,05,06,07}/$dir_name $vaddir/storage - elif [ "$nodes" == "b1" ];then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$dir_name $vaddir/storage - elif [ "$nodes" == "c0" ];then - utils/create_split_dir.pl \ - /export/c{06,07,08,09}/$dir_name $vaddir/storage - elif [ "$nodes" == "fs01" ];then - utils/create_split_dir.pl \ - /export/fs01/$dir_name $vaddir/storage - else - echo "we don't distribute data between multiple machines" - fi + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then + dir_name=$USER/hyp-data/voxceleb/v1/$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" fi + fi fi -#Train datasets -if [ $stage -le 2 ];then - for name in voxceleb2cat_train voxceleb1_test - do - num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') - nj=$(($num_spk < 40 ? $num_spk:40)) - hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ - --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ - data/${name} exp/make_vad/$name $vaddir - utils/fix_data_dir.sh data/${name} - done +if [ $stage -le 2 ];then + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? 
$num_spk:40)) + hyp_utils/feats/make_evad.sh \ + --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done fi - diff --git a/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..a448af9a --- /dev/null +++ b/egs/voxceleb/v2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation + +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name voxceleb-v1.1-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../../sre19-cmn2/v1/RIRS_NOISES ];then + ln -s ../../sre19-cmn2/v1/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index ceee4e93..8d5c67c1 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -79,7 +79,7 @@ if [ $num_gpus -gt 0 ];then #export TORCH_DISTRIBUTED_DEBUG=DETAIL #variable to find unused parameters if [ $num_gpus -gt 1 ];then - [[ $(type -P "$torchrun") ]] && command="torchrun" \ + [[ $(type -P "torchrun") ]] && command="torchrun" \ || command="python -m torch.distributed.run" command="$command --nproc_per_node=$num_gpus --standalone --nnodes=1" fi diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh index 877b9e3f..06c30779 100755 --- a/hyp_utils/create_data_split_dirs.sh +++ b/hyp_utils/create_data_split_dirs.sh @@ -25,8 +25,7 @@ if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $linkdir ]; then
     echo "Prepare to distribute data over multiple $nodes nodes"
     dir_name=$storage_dir/$storage_name/storage
     if [ "$nodes" == "b0" ];then
-	utils/create_split_dir.pl \
-	    hyp_utils/create_split_dir.pl \
+	hyp_utils/create_split_dir.pl \
 	    /export/b{04,05,06,07}/$dir_name $link_dir
     elif [ "$nodes" == "b1" ];then
 	hyp_utils/create_split_dir.pl \
diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py
new file mode 100644
index 00000000..9e7bac5c
--- /dev/null
+++ b/hyperion/bin/hyperion_dataset.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Optional, Union, List
+from pathlib import Path
+
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import (
+    PathLike,
+    Dataset,
+    InfoTable,
+    RecordingSet,
+    FeatureSet,
+    ClassInfo,
+    EnrollmentMap,
+    SegmentSet,
+)
+
+subcommands = ["add_features"]
+# table_dict = {
+#     "segments": SegmentSet,
+#     "recordings": RecordingSet,
+#     "features": FeatureSet,
+#     "classes": ClassInfo,
+#     "enrollments": EnrollmentMap,
+#     "generic": InfoTable,
+# }
+
+
+def add_common_args(parser):
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        default=1,
+        choices=[0, 1, 2, 3],
+        type=int,
+    )
+
+
+def make_add_features_parser():
+    parser = ArgumentParser()
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument(
+        "--dataset", required=True, help="""dataset dir or .yaml file"""
+    )
+    parser.add_argument(
+        "--features-name", required=True, help="""name of the feature"""
+    )
+    parser.add_argument("--features-file", required=True, help="""feature set file""")
+
+    add_common_args(parser)
+    return parser
+
+
+def add_features(
+    dataset: PathLike,
+    features_name: str,
+    features_file: PathLike,
+):
+    dataset_path = dataset
+    dataset = Dataset.load(dataset_path, lazy=True)
+    dataset.add_features(features_name, features_file)
+    dataset.save(dataset_path)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Tool to manipulate the Hyperion dataset")
+    parser.add_argument("--cfg", action=ActionConfigFile)
+
+    subcommand_action = parser.add_subcommands()
+    for subcommand in subcommands:
+        parser_func = f"make_{subcommand}_parser"
+        subparser = globals()[parser_func]()
+        subcommand_action.add_subcommand(subcommand, subparser)
+
+    args = parser.parse_args()
+    subcommand = args.subcommand
+    kwargs = namespace_to_dict(args)[args.subcommand]
+    config_logger(kwargs["verbose"])
+    del kwargs["verbose"]
+
+    globals()[subcommand](**kwargs)
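Both new CLI tools (hyperion_dataset.py above and hyperion_tables.py below) use the same jsonargparse subcommand-dispatch pattern: each subcommand name is looked up in globals() to find both its parser factory and its handler function. A minimal self-contained sketch of the pattern follows; the "greet" subcommand and its argument are made up for illustration.

# Minimal sketch of the subcommand-dispatch pattern used by the new
# CLI tools; the "greet" subcommand is hypothetical.
from jsonargparse import ArgumentParser, namespace_to_dict

subcommand_list = ["greet"]


def make_greet_parser():
    parser = ArgumentParser()
    parser.add_argument("--name", required=True, help="who to greet")
    return parser


def greet(name: str):
    print(f"hello {name}")


if __name__ == "__main__":
    parser = ArgumentParser(description="subcommand dispatch sketch")
    subcommand_action = parser.add_subcommands()
    for subcommand in subcommand_list:
        # look up make_<subcommand>_parser in the module globals
        subparser = globals()[f"make_{subcommand}_parser"]()
        subcommand_action.add_subcommand(subcommand, subparser)

    args = parser.parse_args()
    # dispatch to the function named after the chosen subcommand
    kwargs = namespace_to_dict(args)[args.subcommand]
    globals()[args.subcommand](**kwargs)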
diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py
new file mode 100755
index 00000000..a79a1dca
--- /dev/null
+++ b/hyperion/bin/hyperion_tables.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+from typing import Optional, Union, List
+from pathlib import Path
+
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import (
+    PathLike,
+    InfoTable,
+    RecordingSet,
+    FeatureSet,
+    ClassInfo,
+    EnrollmentMap,
+    SegmentSet,
+)
+
+subcommands = ["cat"]
+table_dict = {
+    "segments": SegmentSet,
+    "recordings": RecordingSet,
+    "features": FeatureSet,
+    "classes": ClassInfo,
+    "enrollments": EnrollmentMap,
+    "generic": InfoTable,
+}
+
+
+def add_common_args(parser):
+    parser.add_argument(
+        "--table-type",
+        default="generic",
+        choices=list(table_dict.keys()),
+        help=f"Type of table in {list(table_dict.keys())}",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        default=1,
+        choices=[0, 1, 2, 3],
+        type=int,
+    )
+
+
+def make_cat_parser():
+    parser = ArgumentParser()
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    parser.add_argument(
+        "--input-files", default=None, nargs="+", help="optional list of input files"
+    )
+    parser.add_argument(
+        "--output-file",
+        required=True,
+        help="""output file, if input-files is None, input file names are derived from it""",
+    )
+    parser.add_argument(
+        "--num-tables",
+        default=0,
+        type=int,
+        help="""number of jobs we used to create the individual tables""",
+    )
+    parser.add_argument(
+        "--base-idx",
+        default=1,
+        type=int,
+        help="""index of the first job, typically 0 or 1""",
+    )
+
+    add_common_args(parser)
+    return parser
+
+
+def cat(
+    table_type: str,
+    input_files: Union[List[PathLike], None],
+    output_file: PathLike,
+    num_tables: int,
+    base_idx: int = 1,
+):
+
+    assert input_files is not None or num_tables != 0
+    output_file = Path(output_file)
+    if input_files is None:
+        ext = output_file.suffix
+        input_file_base = output_file.with_suffix("")
+        input_files = []
+        for i in range(num_tables):
+            idx = base_idx + i
+            input_file_i = input_file_base.with_suffix(f".{idx}{ext}")
+            input_files.append(input_file_i)
+
+    table_class = table_dict[table_type]
+    tables = []
+    for file_path in input_files:
+        tables.append(table_class.load(file_path))
+
+    output_table = table_class.cat(tables)
+    output_table.save(output_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Tool to manipulate the Hyperion data tables")
+    parser.add_argument("--cfg", action=ActionConfigFile)
+
+    subcommand_action = parser.add_subcommands()
+    for subcommand in subcommands:
+        parser_func = f"make_{subcommand}_parser"
+        subparser = globals()[parser_func]()
+        subcommand_action.add_subcommand(subcommand, subparser)
+
+    args = parser.parse_args()
+    subcommand = args.subcommand
+    kwargs = namespace_to_dict(args)[args.subcommand]
+    config_logger(kwargs["verbose"])
+    del kwargs["verbose"]
+
+    globals()[subcommand](**kwargs)
diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py
index 5c999dd1..a210d429 100755
--- a/hyperion/bin/train_xvector_from_wav.py
+++ b/hyperion/bin/train_xvector_from_wav.py
@@ -6,12 +6,14 @@
 import logging
 import multiprocessing
 import os
-import sys
-import time
 from pathlib import Path
 
-from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
-                          namespace_to_dict)
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
 import torch
 
 from hyperion.hyp_defs import config_logger, set_float_cpu
diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py
index 19420761..d9828674 100644
--- a/hyperion/data_prep/data_prep.py
+++ b/hyperion/data_prep/data_prep.py
@@ -69,7 +69,6 @@ def get_recording_duration(self, recording_set):
         import itertools
         from ..utils import SCPList
 
-        # scp = SCPList(recording_set["id"].values, recording_set["storage_path"].values)
         futures = []
         logging.info("submitting threads...")
         with ThreadPoolExecutor(max_workers=self.num_threads) as pool:
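The get_recording_duration hunk above keeps the thread-pool fan-out that probes the duration of every recording. A stripped-down sketch of that submit-then-collect pattern, with a dummy probe_duration standing in for the real audio-header reader, might look like this:

# Sketch of the thread-pool fan-out used by get_recording_duration;
# probe_duration is a placeholder for the real audio-metadata reader.
from concurrent.futures import ThreadPoolExecutor


def probe_duration(path: str) -> float:
    # placeholder: a real implementation would read the file header
    return 1.0


def get_durations(paths, num_threads: int = 10):
    futures = []
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        for path in paths:
            futures.append(pool.submit(probe_duration, path))
    # futures keep submission order, so results align with paths
    return [f.result() for f in futures]


if __name__ == "__main__":
    print(get_durations(["a.wav", "b.wav"]))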
diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py
index 00b2e380..c23b64ff 100644
--- a/hyperion/data_prep/voxceleb1.py
+++ b/hyperion/data_prep/voxceleb1.py
@@ -214,7 +214,12 @@ def get_segmentid(s):
         return enrollments, trials
 
     def prepare(self):
-
+        logging.info(
+            "Preparing VoxCeleb1 for %s corpus_dir:%s -> data_dir:%s",
+            self.task,
+            self.corpus_dir,
+            self.output_dir,
+        )
         logging.info("getting audio meta-data")
         df_meta = self._get_metadata()
         logging.info("getting language estimations")
diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py
index 1a32420f..bef34ec9 100644
--- a/hyperion/data_prep/voxceleb2.py
+++ b/hyperion/data_prep/voxceleb2.py
@@ -136,6 +136,12 @@ def make_cat_list(lists_cat_dir, rec_id, rec_files, video_idx, i):
         return file_path
 
     def prepare(self):
+        logging.info(
+            "Preparing VoxCeleb2 %s corpus_dir:%s -> data_dir:%s",
+            self.subset,
+            self.corpus_dir,
+            self.output_dir,
+        )
         logging.info("getting audio meta-data")
         df_meta = self._get_metadata()
         logging.info("getting language estimations")
@@ -224,11 +230,6 @@ def prepare(self):
                 "duration": recs.loc[rec_ids, "duration"].values,
             }
         )
-        # print(
-        #     recs.loc[rec_ids, "duration"],
-        #     len(segments),
-        #     len(recs.loc[rec_ids, "duration"]),
-        # )
 
         segments = SegmentSet(segments)
         segments.sort()
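The new VoxSRC22 preparer added below builds its trials and enrollment CSVs from the official voxsrc2022_dev.txt pairs file. A condensed pandas sketch of just that conversion (file and column names taken from the code that follows, paths simplified) is:

# Condensed sketch of the trials-file conversion done in
# prepare_track12_dev below; file names follow that code.
import pandas as pd

df_in = pd.read_csv(
    "voxsrc2022_dev.txt",
    header=None,
    sep=" ",
    names=["key", "enroll_file", "test_file"],
)
# key 1 -> target trial, key 0 -> nontarget trial
df_trials = pd.DataFrame(
    {
        "modelid": df_in["enroll_file"],
        "segmentid": df_in["test_file"],
        "targettype": ["target" if k == 1 else "nontarget" for k in df_in["key"]],
    }
)
df_trials.sort_values(by=["modelid", "segmentid"], inplace=True)
df_trials.to_csv("trials.csv", index=False)

# each enrollment model is a single segment, so the map is the identity
modelid = df_trials["modelid"].sort_values().unique()
pd.DataFrame({"modelid": modelid, "segmentid": modelid}).to_csv(
    "enrollment.csv", index=False
)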
diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py
new file mode 100644
index 00000000..1999262a
--- /dev/null
+++ b/hyperion/data_prep/voxsrc22.py
@@ -0,0 +1,212 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+import glob
+import re
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from jsonargparse import ActionYesNo
+from tqdm import tqdm
+
+from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet
+from ..utils.misc import PathLike, urlretrieve_progress
+from .data_prep import DataPrep
+
+
+class VoxSRC22DataPrep(DataPrep):
+    """Class to prepare VoxSRC22 dev/test data
+    Attributes:
+      corpus_dir: input data directory
+      vox1_corpus_dir: input data directory for VoxCeleb1
+      subset: subset of the data, dev or test
+      output_dir: output data directory
+      target_sample_freq: target sampling frequency to convert the audios to.
+    """
+
+    def __init__(
+        self,
+        corpus_dir: PathLike,
+        vox1_corpus_dir: PathLike,
+        subset: str,
+        output_dir: PathLike,
+        use_kaldi_ids: bool,
+        target_sample_freq: int,
+        num_threads: int = 10,
+    ):
+        use_kaldi_ids = False
+        super().__init__(
+            corpus_dir, output_dir, use_kaldi_ids, target_sample_freq, num_threads
+        )
+
+        assert (
+            vox1_corpus_dir is not None or subset == "test"
+        ), "dev set needs the VoxCeleb1 corpus dir"
+        self.subset = subset
+        self.vox1_corpus_dir = (
+            None if vox1_corpus_dir is None else Path(vox1_corpus_dir)
+        )
+
+    @staticmethod
+    def dataset_name():
+        return "voxsrc22"
+
+    @staticmethod
+    def add_class_args(parser):
+        DataPrep.add_class_args(parser)
+        parser.add_argument(
+            "--subset",
+            default="dev",
+            choices=["dev", "test"],
+            help="""voxsrc22 subset in [dev, test]""",
+        )
+        parser.add_argument(
+            "--vox1-corpus-dir",
+            default=None,
+            help="""corpus directory of voxceleb 1.""",
+        )
+
+    def prepare_track12_dev(self):
+        logging.info(
+            "Preparing VoxSRC22 %s corpus:%s + %s -> %s",
+            self.subset,
+            self.corpus_dir,
+            self.vox1_corpus_dir,
+            self.output_dir,
+        )
+        logging.info("making trials")
+        trials_file = self.corpus_dir / "voxsrc2022_dev.txt"
+        df_in = pd.read_csv(
+            trials_file,
+            header=None,
+            sep=" ",
+            names=["key", "enroll_file", "test_file"],
+        )
+        key = ["target" if k == 1 else "nontarget" for k in df_in["key"]]
+
+        modelid = df_in["enroll_file"]
+        segmentid = df_in["test_file"]
+        df_trials = pd.DataFrame(
+            {"modelid": modelid, "segmentid": segmentid, "targettype": key}
+        )
+        df_trials.sort_values(by=["modelid", "segmentid"], inplace=True)
+        file_path = self.output_dir / "trials.csv"
+        df_trials.to_csv(file_path, index=False)
+        trials = {"trials": file_path}
+        modelid = df_trials["modelid"].sort_values().unique()
+        uniq_segmentid = df_trials["segmentid"].sort_values().unique()
+        uniq_segmentid = np.unique(np.concatenate((uniq_segmentid, modelid), axis=0))
+
+        logging.info("making enrollment map")
+        df_enroll = pd.DataFrame({"modelid": modelid, "segmentid": modelid})
+        file_path = self.output_dir / "enrollment.csv"
+        df_enroll.to_csv(file_path, index=False)
+        enrollments = {"enrollment": file_path}
+
+        logging.info("making RecordingSet")
+        vox1_segmentid = []
+        vox22_segmentid = []
+        for s in uniq_segmentid:
+            if "VoxSRC2022_dev" in s:
+                vox22_segmentid.append(s)
+            else:
+                vox1_segmentid.append(s)
+
+        vox1_rec_files = [
+            glob.glob(f"{self.vox1_corpus_dir}/**/{s}", recursive=True)[0]
+            for s in vox1_segmentid
+        ]
+        vox22_rec_files = [
+            glob.glob(f"{self.corpus_dir}/**/{s}", recursive=True)[0]
+            for s in vox22_segmentid
+        ]
+        rec_ids = vox22_segmentid + vox1_segmentid
+        rec_files = vox22_rec_files + vox1_rec_files
+
+        recs = pd.DataFrame({"id": rec_ids, "storage_path": rec_files})
+        recs = RecordingSet(recs)
+        recs.sort()
+
+        logging.info("getting recording durations")
+        self.get_recording_duration(recs)
+        if self.target_sample_freq:
+            recs["target_sample_freq"] = self.target_sample_freq
+
+        logging.info("making SegmentsSet")
+        segments = pd.DataFrame({"id": rec_ids,})
+        segments = SegmentSet(segments)
+        segments.sort()
+
+        logging.info("making dataset")
+        dataset = Dataset(
+            segments,
+            recordings={"recordings": recs},
+            enrollments=enrollments,
+            trials=trials,
+            sparse_trials=False,
+        )
+        logging.info("saving dataset at %s", self.output_dir)
+        dataset.save(self.output_dir)
+        logging.info(
+            "dataset contains %d segments", len(segments),
+        )
+
+        # wav_file = voxsrc22_corpus_dir / file_id
+        # wav_file = vox1_corpus_dir / "wav" / file_id
+        # logging.info("searching audio files in %s", 
self.vox1_corpus_dir) + # vox1_rec_files = list(self.vox1_corpus_dir.glob("**/*.wav")) + # if not vox1_rec_files: + # # symlinks? try glob + # vox1_rec_files = [ + # Path(f) for f in glob.iglob(f"{self.vox1_corpus_dir}/**/*.wav", recursive=True) + # ] + + # vox1_rec_ids = [ f.parent.parent.name / f.parent.name / f.name for f in vox1_rec_files] + # rec_files = + + # rec_files = list(self.corpus_dir.glob("**/*.wav")) + # if not rec_files: + # # symlinks? try glob + # rec_files = [ + # Path(f) for f in glob.iglob(f"{self.corpus_dir}/**/*.wav", recursive=True) + # ] + + # u2s_file = output_dir / "utt2spk" + # logging.info("creating utt2spk file %s", u2s_file) + # file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) + # with open(u2s_file, "w") as f: + # for file_id in file_ids: + # f.write("%s %s\n" % (file_id, file_id)) + + # s2u_file = output_dir / "spk2utt" + # logging.info("creating spk2utt file %s", s2u_file) + # with open(s2u_file, "w") as f: + # for file_id in file_ids: + # f.write("%s %s\n" % (file_id, file_id)) + + # wav_file = output_dir / "wav.scp" + # logging.info("creating wav.scp file %s", wav_file) + # with open(wav_file, "w") as f: + # for file_id in file_ids: + # if "VoxSRC2022_dev" in file_id: + # wav_file = voxsrc22_corpus_dir / file_id + # else: + # wav_file = vox1_corpus_dir / "wav" / file_id + + # f.write("%s %s\n" % (file_id, wav_file)) + + def prepare_track12_test(self): + logging.info( + "Preparing VoxSRC22 %s corpus:%s -> %s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + + def prepare(self): + if self.subset == "dev": + self.prepare_track12_dev() + else: + self.prepare_track12_test() diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index a6f20a8e..c8565d1d 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -21,13 +21,17 @@ from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args -from ..loggers import (CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, - WAndBLogger) +from ..loggers import CSVLogger, LoggerList, ProgLogger, TensorBoardLogger, WAndBLogger from ..lr_schedulers import LRScheduler as LRS from ..lr_schedulers import LRSchedulerFactory as LRSF from ..optim import OptimizerFactory as OF -from ..utils import (FairFullyShardedDDP, FairShardedDDP, MetricAcc, TorchDDP, - tensors_subset) +from ..utils import ( + FairFullyShardedDDP, + FairShardedDDP, + MetricAcc, + TorchDDP, + tensors_subset, +) class DDPType(str, Enum): @@ -72,6 +76,7 @@ class TorchTrainer(object): input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
""" + def __init__( self, model, @@ -113,8 +118,9 @@ def __init__( self.exp_path = Path(exp_path) if loggers is None: - self.loggers = self._default_loggers(log_interval, use_tensorboard, - use_wandb, wandb) + self.loggers = self._default_loggers( + log_interval, use_tensorboard, use_wandb, wandb + ) elif isinstance(loggers, list): self.loggers = LoggerList(loggers) else: @@ -149,29 +155,23 @@ def __init__( self.rank = dist.get_rank() self.world_size = dist.get_world_size() if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: logging.info( "training in multiple gpus with distributed-data-parallel" ) oss = False if ddp_type == DDPType.DDP else True - self.optimizer = self._make_optimizer(optim, - self.model, - oss=oss) + self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( self.model, device_ids=[device], output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: logging.info( "training in multiple gpus with fair sharded-distributed-data-parallel" ) - self.optimizer = self._make_optimizer(optim, - self.model, - oss=True) + self.optimizer = self._make_optimizer(optim, self.model, oss=True) self.model = FairShardedDDP(self.model, self.optimizer) else: if self.rank == 0: @@ -184,9 +184,7 @@ def __init__( mixed_precision=self.use_amp, move_params_to_cpu=cpu_offload, ) - self.optimizer = self._make_optimizer(optim, - self.model, - oss=False) + self.optimizer = self._make_optimizer(optim, self.model, oss=False) else: self.optimizer = self._make_optimizer(optim, self.model) @@ -216,9 +214,9 @@ def __init__( if self.rank == 0: logging.info("init SWA model") self.swa_model = AveragedModel(self.model) - self.swa_scheduler = SWALR(self.optimizer, - swa_lr=self.swa_lr, - anneal_epochs=self.swa_anneal_epochs) + self.swa_scheduler = SWALR( + self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs + ) def set_epoch(self, data_loader): try: @@ -252,8 +250,7 @@ def fit(self, train_data, val_data=None): if self.lr_scheduler is not None: # this is needed by cosine scheduler epoch_updates = int(len(train_data) / self.grad_acc_steps) - self.lr_scheduler.on_epoch_begin(epoch, - epoch_updates=epoch_updates) + self.lr_scheduler.on_epoch_begin(epoch, epoch_updates=epoch_updates) logs = self.train_epoch(train_data) if val_data is not None: @@ -275,8 +272,7 @@ def fit(self, train_data, val_data=None): self.save_checkpoint(logs) if self.in_swa: - self.loggers.on_epoch_begin(self.cur_epoch, - batches=len(train_data)) + self.loggers.on_epoch_begin(self.cur_epoch, batches=len(train_data)) self.model = self.swa_model.module logs = self.bn_update_epoch(train_data) @@ -351,16 +347,16 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with torch.no_grad(): if swa_update_bn: log_tag = "train_" - self.train() + self.model.train() else: log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): - output = self.model(input_data) + output = self.model(x) loss = self.loss(output, target) 
batch_metrics["loss"] = loss.mean().item()
@@ -381,9 +377,9 @@ def bn_update_epoch(self, data_loader):
 
     def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm):
         if self.ddp:
             if self.ddp_type == DDPType.DDP:
-                nn.utils.clip_grad_norm_(model.parameters(),
-                                         grad_clip,
-                                         norm_type=grad_clip_norm)
+                nn.utils.clip_grad_norm_(
+                    model.parameters(), grad_clip, norm_type=grad_clip_norm
+                )
                 return
             if self.ddp_type == DDPType.FULLY_SHARDED_DDP:
                 # we have to use the member function in FullyShardedDDP class
@@ -395,24 +391,26 @@ def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm):
                 optim.clip_grad_norm(grad_clip, norm_type=grad_clip_norm)
 
         # if no DDP clip normally
-        nn.utils.clip_grad_norm_(model.parameters(),
-                                 grad_clip,
-                                 norm_type=grad_clip_norm)
+        nn.utils.clip_grad_norm_(
+            model.parameters(), grad_clip, norm_type=grad_clip_norm
+        )
 
     def update_model(self):
         """Updates the model and does gradient clipping."""
         if self.use_amp:
             if self.grad_clip > 0:
                 self.grad_scaler.unscale_(self.optimizer)
-                self._clip_grad_norm(self.model, self.optimizer,
-                                     self.grad_clip, self.grad_clip_norm)
+                self._clip_grad_norm(
+                    self.model, self.optimizer, self.grad_clip, self.grad_clip_norm
+                )
             self.grad_scaler.step(self.optimizer)
             self.grad_scaler.update()
         else:
             if self.grad_clip > 0:
-                self._clip_grad_norm(self.model, self.optimizer,
-                                     self.grad_clip, self.grad_clip_norm)
+                self._clip_grad_norm(
+                    self.model, self.optimizer, self.grad_clip, self.grad_clip_norm
+                )
 
             self.optimizer.step()
@@ -441,20 +439,21 @@ def _make_lr_sched(self, lr_sched, optim):
             lr_sched = LRSF.create(optim, **args)
         return lr_sched
 
-    def _default_loggers(self, log_interval, use_tensorboard, use_wandb,
-                         wandb):
+    def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb):
         """Creates the default loggers"""
         prog_log = ProgLogger(interval=log_interval)
         csv_log = CSVLogger(self.exp_path / "train.log", append=True)
         loggers = [prog_log, csv_log]
         if use_tensorboard:
             loggers.append(
-                TensorBoardLogger(self.exp_path / "tb", interval=log_interval))
+                TensorBoardLogger(self.exp_path / "tb", interval=log_interval)
+            )
         if use_wandb:
             loggers.append(
-                WAndBLogger(**wandb,
-                            path=self.exp_path / "wandb",
-                            interval=log_interval))
+                WAndBLogger(
+                    **wandb, path=self.exp_path / "wandb", interval=log_interval
+                )
+            )
         return LoggerList(loggers)
 
     def _get_lr(self):
@@ -478,7 +477,8 @@ def _compute_grad_acc_steps(self, data_loader):
                 return
 
         self.grad_acc_steps = int(
-            math.ceil(self.eff_batch_size / batch_size / self.world_size))
+            math.ceil(self.eff_batch_size / batch_size / self.world_size)
+        )
         logging.info(
             "Setting grad_acc_steps=%d for "
             "eff_batch_size=%d, avg_batch_size=%d, world_size=%d",
@@ -502,30 +502,24 @@ def checkpoint(self, logs=None):
           logs: logs containing the current value of the metrics.
""" checkpoint = { - "epoch": - self.cur_epoch, - "rng_state": - torch.get_rng_state(), - "model_cfg": - self.model.get_config(), - "model_state_dict": - self.model.state_dict(), - "optimizer_state_dict": - self.optimizer.state_dict(), - "loss_state_dict": - self.loss.state_dict() if self.loss is not None else None, + "epoch": self.cur_epoch, + "rng_state": torch.get_rng_state(), + "model_cfg": self.model.get_config(), + "model_state_dict": self.model.state_dict(), + "optimizer_state_dict": self.optimizer.state_dict(), + "loss_state_dict": self.loss.state_dict() + if self.loss is not None + else None, } if self.lr_scheduler is not None: - checkpoint[ - "lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() + checkpoint["lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() if logs is not None: checkpoint["logs"] = logs if self.in_swa: checkpoint["swa_model_state_dict"] = self.swa_model.state_dict() - checkpoint[ - "swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() + checkpoint["swa_scheduler_state_dict"] = self.swa_scheduler.state_dict() return checkpoint @@ -535,8 +529,9 @@ def save_checkpoint(self, logs=None): Args: logs: logs containing the current value of the metrics. """ - if self.ddp and (self.ddp_type == DDPType.OSS_DDP - or self.ddp_type == DDPType.OSS_SHARDED_DDP): + if self.ddp and ( + self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP + ): # Not sure what this does, just copying from the example in # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py # Check the checkpointing in the case of the OSS optimizer @@ -591,17 +586,16 @@ def load_checkpoint(self, file_path): if self.loss is not None: self.loss.load_state_dict(checkpoint["loss_state_dict"]) if self.lr_scheduler is not None: - self.lr_scheduler.load_state_dict( - checkpoint["lr_scheduler_state_dict"]) + self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"]) # if self.use_amp: # amp.load_state_dict(checkpoint['amp']) if self.do_swa: if "swa_model_state_dict" in checkpoint: - self.swa_model.load_state_dict( - checkpoint["swa_model_state_dict"]) + self.swa_model.load_state_dict(checkpoint["swa_model_state_dict"]) self.swa_scheduler.load_state_dict( - checkpoint["swa_scheduler_state_dict"]) + checkpoint["swa_scheduler_state_dict"] + ) else: self.swa_scheduler = SWALR( self.optimizer, @@ -681,13 +675,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): "--eff-batch-size", type=int, default=None, - help= - "effective total batch size, if given, it overrides grad_acc_steps", + help="effective total batch size, if given, it overrides grad_acc_steps", ) - parser.add_argument("--epochs", - type=int, - default=200, - help="number of epochs") + parser.add_argument("--epochs", type=int, default=200, help="number of epochs") if train_modes is not None: parser.add_argument( "--train-mode", @@ -707,19 +697,12 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="use tensorboard logger", ) - parser.add_argument("--use-wandb", - action="store_true", - default=False, - help="use wandb logger") - parser.add_argument("--wandb.project", - default=None, - help="wandb project name") - parser.add_argument("--wandb.group", - default=None, - help="wandb group name") - parser.add_argument("--wandb.name", - default=None, - help="wandb display name") + parser.add_argument( + "--use-wandb", action="store_true", default=False, help="use wandb logger" + ) + parser.add_argument("--wandb.project", 
default=None, help="wandb project name") + parser.add_argument("--wandb.group", default=None, help="wandb group name") + parser.add_argument("--wandb.name", default=None, help="wandb display name") # parser.add_argument( # '--wandb.path', default=None, # help='wandb directory') @@ -748,10 +731,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="CPU offload of gradients when using fully_sharded_ddp", ) - parser.add_argument("--grad-clip", - type=float, - default=0, - help="gradient clipping norm value") + parser.add_argument( + "--grad-clip", type=float, default=0, help="gradient clipping norm value" + ) parser.add_argument( "--grad-clip-norm", default=2, @@ -764,10 +746,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=0, help="start epoch for SWA, if 0 it does not use SWA", ) - parser.add_argument("--swa-lr", - type=float, - default=1e-3, - help="learning rate for SWA phase") + parser.add_argument( + "--swa-lr", type=float, default=1e-3, help="learning rate for SWA phase" + ) parser.add_argument( "--swa-anneal-epochs", type=int, @@ -786,7 +767,6 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index eddf47a7..a59cbe14 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -101,16 +101,16 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - for batch, (data, target) in enumerate(data_loader): + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): - output = self.model(input_data, y=target) + output = self.model(x, y=target) loss = self.loss(output, target).mean() / self.grad_acc_steps if self.use_amp: diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 52474baa..0f6ccd9b 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -106,10 +106,10 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) with torch.no_grad(): - feats, feats_lengths = self.feat_extractor(input_data) + feats, feats_lengths = self.feat_extractor(audio) with amp.autocast(enabled=self.use_amp): output = self.model(feats, feats_lengths, y=target) @@ -159,10 +159,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - input_data, target = tensors_subset(data, batch_keys, self.device) - batch_size = input_data.size(0) + audio, target = tensors_subset(data, batch_keys, self.device) + batch_size = audio.size(0) - 
feats, feats_lengths = self.feat_extractor(input_data)
+            feats, feats_lengths = self.feat_extractor(audio)
             with amp.autocast(enabled=self.use_amp):
                 output = self.model(feats, feats_lengths)
                 loss = self.loss(output, target)
diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py
index 70ee82c8..fe72339f 100644
--- a/hyperion/utils/class_info.py
+++ b/hyperion/utils/class_info.py
@@ -70,8 +70,33 @@ def load(cls, file_path, sep=None):
             if ext == "":
                 # if no extension we load as kaldi utt2spk file
                 df = pd.read_csv(
-                    file_path, sep=" ", header=None, names=["id"], dtype={"id": np.str},
+                    file_path,
+                    sep=" ",
+                    header=None,
+                    names=["id"],
+                    dtype={"id": np.str},
                 )
                 return cls(df)
 
         return super().load(file_path, sep)
+
+    @classmethod
+    def cat(cls, tables):
+        """Concatenates several tables.
+
+        Args:
+          info_lists: List of InfoTables
+
+        Returns:
+          InfoTable object concatenating the info_lists.
+        """
+        df_list = [table.df for table in tables]
+        df = pd.concat(df_list)
+        assert df["id"].is_unique, """there are duplicated ids in original tables"""
+        if not df["class_idx"].is_unique:
+            logging.warning(
+                """class_idx in concat tables are not unique,
+                we will assign new class_idx"""
+            )
+            df.drop(columns=["class_idx"], inplace=True)
+        return cls(df)
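The new ClassInfo.cat above enforces unique ids and drops the per-table class_idx column when the indices collide, so that a consistent numbering can be assigned later. A tiny illustration of that policy with plain DataFrames (not the actual ClassInfo API):

# Illustration of the ClassInfo.cat concatenation policy with plain
# pandas DataFrames; the data values are made up.
import logging

import pandas as pd

t1 = pd.DataFrame({"id": ["spk1", "spk2"], "class_idx": [0, 1]})
t2 = pd.DataFrame({"id": ["spk3", "spk4"], "class_idx": [0, 1]})

df = pd.concat([t1, t2])
assert df["id"].is_unique, "there are duplicated ids in the original tables"
if not df["class_idx"].is_unique:
    # per-table indices collide, so drop them; the real class can then
    # re-assign a fresh, consistent numbering over the merged table
    logging.warning("class_idx in concatenated tables are not unique")
    df = df.drop(columns=["class_idx"])

print(df)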
diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py
index e6c9e861..0ef81ab6 100644
--- a/hyperion/utils/dataset.py
+++ b/hyperion/utils/dataset.py
@@ -20,21 +20,21 @@
 
 class Dataset:
-    """ Class that contains all objects
-    (segments, recordings, features, class_infos) that
-    conform a dataset
-
-    Attributes:
-      segments: SegmentSet object or path to it.
-      classes: Dictionary of ClassInfo objects or paths to then
-      recordings: Dictionary of RecordingSet objects or paths to then
-      features: Dictionary of FeatureSet objects or paths to then
-      enrollments: Dictionary of EnrollmentMap objects or paths to then
-      trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects
-        or paths to then
-      sparse_trials: load trial keys using the SparseTrialKey class instead
-        of TrialKey class.
-      table_sep: Column separator when reading/writting tables
+    """Class that contains all objects
+    (segments, recordings, features, class_infos) that
+    form a dataset
+
+    Attributes:
+      segments: SegmentSet object or path to it.
+      classes: Dictionary of ClassInfo objects or paths to them
+      recordings: Dictionary of RecordingSet objects or paths to them
+      features: Dictionary of FeatureSet objects or paths to them
+      enrollments: Dictionary of EnrollmentMap objects or paths to them
+      trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects
+        or paths to them
+      sparse_trials: load trial keys using the SparseTrialKey class instead
+        of TrialKey class.
+      table_sep: Column separator when reading/writing tables
     """
 
@@ -70,10 +70,12 @@ def __init__(
             features, FeatureSet
         )
         self._enrollments, self._enrollments_paths = self._parse_dict_args(
-            enrollments, EnrollmentMap,
+            enrollments,
+            EnrollmentMap,
         )
         self._trials, self._trials_paths = self._parse_dict_args(
-            trials, (TrialKey, TrialNdx, SparseTrialKey),
+            trials,
+            (TrialKey, TrialNdx, SparseTrialKey),
         )
         self.sparse_trials = sparse_trials
 
@@ -217,16 +219,41 @@ def save(
         dataset_path: PathLike,
         update_paths: bool = True,
         table_sep: Optional[str] = None,
+        force_save_all: bool = False,
     ):
-        """Saves all the dataset objects.
+        """Saves the dataset to disk.
 
         Args:
-          dataset_path: str/Path indicating directory
-          to save the dataset or .yaml file to save the dataset info.
-          update_paths: whether to update the file_paths in the
-          data structures in the DateSet object
+          dataset_path: str/Path indicating directory
+            to save the dataset or .yaml file to save
+            the dataset info.
+          update_paths: whether to update the file_paths in the
+            data structures in the DataSet object
+          force_save_all: forces saving all tables even if they haven't changed,
+            otherwise, it only saves tables loaded in memory
+            and those that are not in the data directory
+        """
+        if force_save_all:
+            self.save_all(dataset_path, update_paths, table_sep)
+        else:
+            self.save_changed(dataset_path, update_paths, table_sep)
 
+    def save_changed(
+        self,
+        dataset_path: PathLike,
+        update_paths: bool = True,
+        table_sep: Optional[str] = None,
+        force_save_all: bool = False,
+    ):
+        """Saves the tables that changed on disk or tables
+        that are not in the output directory.
+
+        Args:
+          dataset_path: str/Path indicating directory
+            to save the dataset or .yaml file to save
+            the dataset info.
+          update_paths: whether to update the file_paths in the
+            data structures in the DataSet object
         """
         table_sep = self.table_sep if table_sep is None else table_sep
         if update_paths:
@@ -238,12 +265,139 @@ def save(
         file_name = f"segments{table_ext}"
         dataset["segments"] = file_name
         file_path = dataset_dir / file_name
-        self.segments().save(file_path, sep=table_sep)
+        if (
+            self._segments is not None
+            or file_path != self._segments_path
+            or not file_path.exists()
+        ):
+            self.segments(keep_loaded=False).save(file_path, sep=table_sep)
+            if update_paths:
+                self._segments_path = file_path
+
+        file_names = {}
+        for k in self._recordings.keys():
+            file_name = k + table_ext
+            file_names[k] = file_name
+            file_path = dataset_dir / file_name
+            if (
+                self._recordings[k] is not None
+                or file_path != self._recordings_paths[k]
+                or not file_path.exists()
+            ):
+                v = self.recordings_value(k, keep_loaded=False)
+                v.save(file_path, sep=table_sep)
+                if update_paths:
+                    self._recordings_paths[k] = file_path
+
+        if file_names:
+            dataset["recordings"] = file_names
+
+        file_names = {}
+        for k in self._features.keys():
+            file_name = k + table_ext
+            file_names[k] = file_name
+            file_path = dataset_dir / file_name
+            if (
+                self._features[k] is not None
+                or file_path != self._features_paths[k]
+                or not file_path.exists()
+            ):
+                v = self.features_value(k, keep_loaded=False)
+                v.save(file_path, sep=table_sep)
+                if update_paths:
+                    self._features_paths[k] = file_path
+
+        if file_names:
+            dataset["features"] = file_names
+
+        file_names = {}
+        for k in self._classes.keys():
+            file_name = k + table_ext
+            file_names[k] = file_name
+            file_path = dataset_dir / file_name
+            if (
+                self._classes[k] is not None
+                or file_path != self._classes_paths[k]
+                or not file_path.exists()
+            ):
+                v = self.classes_value(k, keep_loaded=False)
+                v.save(file_path, sep=table_sep)
+                if update_paths:
+                    self._classes_paths[k] = file_path
+
+        if file_names:
+            dataset["classes"] = file_names
+
+        file_names = {}
+        for k in self._enrollments.keys():
+            file_name = k + table_ext
+            file_names[k] = file_name
+            file_path = dataset_dir / file_name
+            if (
+                self._enrollments[k] is not None
+                or file_path != self._enrollments_paths[k]
+                or not file_path.exists()
+            ):
+                v = self.enrollments_value(k, keep_loaded=False)
+                v.save(file_path, sep=table_sep)
+                if update_paths:
+                    self._enrollments_paths[k] = file_path
+
+        if file_names:
+            dataset["enrollments"] = file_names
+
+        file_names = {}
+        for k in self._trials.keys():
+            file_name = k + table_ext
+            file_names[k] = file_name
+            file_path = dataset_dir / file_name
+            if (
+                self._trials[k] is not None
+                or file_path != self._trials_paths[k]
+                or not file_path.exists()
+            ):
+                v = self.trials_value(k, keep_loaded=False)
+                v.save(file_path)
+                if update_paths:
+                    self._trials_paths[k] = file_path
+
+        if file_names:
+            dataset["trials"] = file_names
+
+        with open(dataset_file, "w") as f:
+            yaml.dump(dataset, f)
+
+    def save_all(
+        self,
+        dataset_path: PathLike,
+        update_paths: bool = True,
+        table_sep: Optional[str] = None,
+    ):
+        """Saves all the dataset objects.
+
+        Args:
+          dataset_path: str/Path indicating directory
+            to save the dataset or .yaml file to save
+            the dataset info.
+          update_paths: whether to update the file_paths in the
+            data structures in the DataSet object
+        """
+        table_sep = self.table_sep if table_sep is None else table_sep
+        if update_paths:
+            self.table_sep = table_sep
+
+        table_ext = ".tsv" if table_sep == "\t" else ".csv"
+        dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path)
+        dataset = {}
+        file_name = f"segments{table_ext}"
+        dataset["segments"] = file_name
+        file_path = dataset_dir / file_name
+        self.segments(keep_loaded=False).save(file_path, sep=table_sep)
         if update_paths:
             self._segments_path = file_path
 
         file_names = {}
-        for k, v in self.recordings():
+        for k, v in self.recordings(keep_loaded=False):
             file_name = k + table_ext
             file_names[k] = file_name
             file_path = dataset_dir / file_name
@@ -255,7 +409,7 @@ def save(
             dataset["recordings"] = file_names
 
         file_names = {}
-        for k, v in self.features():
+        for k, v in self.features(keep_loaded=False):
             file_name = k + table_ext
             file_names[k] = file_name
             file_path = dataset_dir / file_name
@@ -267,7 +421,7 @@ def save(
             dataset["features"] = file_names
 
         file_names = {}
-        for k, v in self.classes():
+        for k, v in self.classes(keep_loaded=False):
             file_name = k + table_ext
             file_names[k] = file_name
             file_path = dataset_dir / file_name
@@ -279,7 +433,7 @@ def save(
             dataset["classes"] = file_names
 
         file_names = {}
-        for k, v in self.enrollments():
+        for k, v in self.enrollments(keep_loaded=False):
             file_name = k + table_ext
             file_names[k] = file_name
             file_path = dataset_dir / file_name
@@ -291,7 +445,7 @@ def save(
             dataset["enrollments"] = file_names
 
         file_names = {}
-        for k, v in self.trials():
+        for k, v in self.trials(keep_loaded=False):
             file_name = k + table_ext
             file_names[k] = file_name
             file_path = dataset_dir / file_name
@@ -329,8 +483,8 @@ def load(
         """Loads all the dataset objects.
 
         Args:
-          dataset_path: str/Path indicating directory
-          to save the dataset or .yaml file to save
+          dataset_path: str/Path indicating directory
+            to load the dataset or .yaml file with
           the dataset info.
           lazy: load data structures lazily when they are needed.
sparse_trials: load trial keys using the SparseTrialKey class instead
           of TrialKey class
@@ -386,34 +540,64 @@ def load(
 
         return dataset
 
-    # dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path)
-    # with open(dataset_file, "w") as f:
-    #     dataset = yaml.safe_load(f)
-
-    # assert "segments" in dataset
-    # segments = SegmentSet.load(
-    #     Dataset.resolve_file_path(dataset_dir, dataset["segments"])
-    # )
-    # classes = None
-    # recordings = None
-    # features = None
-    # if "classes" in dataset:
-    #     classes = {}
-    #     for k, v in dataset["classes"]:
-    #         classes[k] = ClassInfo.load(Dataset.resolve_file_path(dataset_dir, v))
-
-    # if "recordings" in dataset:
-    #     recordings = {}
-    #     for k, v in dataset["recordings"]:
-    #         recordings[k] = RecordingSet.load(
-    #             Dataset.resolve_file_path(dataset_dir, v)
-    #         )
-
-    # if "features" in dataset:
-    #     features = {}
-    #     for k, v in dataset["features"]:
-    #         features[k] = FeatureSet.load(Dataset.resolve_file_path(dataset_dir, v))
-
-    # dataset = cls(segments, classes, recordings, features)
-    # if not lazy:
-    #     dataset.update_from_disk()
+    def add_features(self, features_name: str, features: Union[PathLike, FeatureSet]):
+        if isinstance(features, (str, Path)):
+            self._features[features_name] = None
+            self._features_paths[features_name] = features
+        elif isinstance(features, FeatureSet):
+            self._features[features_name] = features
+            self._features_paths[features_name] = None
+        else:
+            raise ValueError()
+
+    def add_recordings(
+        self,
+        recordings_name: str,
+        recordings: Union[PathLike, RecordingSet],
+    ):
+        if isinstance(recordings, (str, Path)):
+            self._recordings[recordings_name] = None
+            self._recordings_paths[recordings_name] = recordings
+        elif isinstance(recordings, RecordingSet):
+            self._recordings[recordings_name] = recordings
+            self._recordings_paths[recordings_name] = None
+        else:
+            raise ValueError()
+
+    def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]):
+        if isinstance(classes, (str, Path)):
+            self._classes[classes_name] = None
+            self._classes_paths[classes_name] = classes
+        elif isinstance(classes, ClassInfo):
+            self._classes[classes_name] = classes
+            self._classes_paths[classes_name] = None
+        else:
+            raise ValueError()
+
+    def add_enrollments(
+        self,
+        enrollments_name: str,
+        enrollments: Union[PathLike, EnrollmentMap],
+    ):
+        if isinstance(enrollments, (str, Path)):
+            self._enrollments[enrollments_name] = None
+            self._enrollments_paths[enrollments_name] = enrollments
+        elif isinstance(enrollments, EnrollmentMap):
+            self._enrollments[enrollments_name] = enrollments
+            self._enrollments_paths[enrollments_name] = None
+        else:
+            raise ValueError()
+
+    def add_trials(
+        self,
+        trials_name: str,
+        trials: Union[PathLike, TrialKey, TrialNdx, SparseTrialKey],
+    ):
+        if isinstance(trials, (str, Path)):
+            self._trials[trials_name] = None
+            self._trials_paths[trials_name] = trials
+        elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)):
+            self._trials[trials_name] = trials
+            self._trials_paths[trials_name] = None
+        else:
+            raise ValueError()
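All of the add_* methods above follow one registration pattern: each table name maps either to an in-memory object (path left as None) or to a path that is loaded lazily on first use. A stripped-down sketch of that pattern, with a generic Table placeholder standing in for RecordingSet/FeatureSet/ClassInfo/etc., is:

# Stripped-down sketch of the Dataset.add_* registration pattern;
# Table is a placeholder for the real Hyperion table classes.
from pathlib import Path
from typing import Dict, Optional, Union


class Table:
    pass


class MiniDataset:
    def __init__(self):
        self._features: Dict[str, Optional[Table]] = {}
        self._features_paths: Dict[str, Optional[Path]] = {}

    def add_features(self, name: str, features: Union[str, Path, Table]):
        if isinstance(features, (str, Path)):
            # lazy: remember the path, load the table on first use
            self._features[name] = None
            self._features_paths[name] = Path(features)
        elif isinstance(features, Table):
            # eager: keep the object, there is no backing file yet
            self._features[name] = features
            self._features_paths[name] = None
        else:
            raise ValueError(f"unsupported type {type(features)}")


ds = MiniDataset()
ds.add_features("mfcc", Table())
ds.add_features("vad", "data/vad.csv")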
diff --git a/hyperion/utils/enrollment_map.py b/hyperion/utils/enrollment_map.py
index 024e5b74..4af69144 100644
--- a/hyperion/utils/enrollment_map.py
+++ b/hyperion/utils/enrollment_map.py
@@ -18,12 +18,13 @@
 
 class EnrollmentMap(InfoTable):
     """Class to store the mapping between enrollment id
-    and segmentids
+    and segmentids
     """
 
     def __init__(self, df):
         if "modelid" in df:
             df.rename(columns={"modelid": "id"}, inplace=True)
+        assert "segmentid" in df
         super().__init__(df)
 
     def split(self, idx, num_parts):
@@ -84,3 +85,17 @@ def load(cls, file_path, sep=None):
             df = pd.read_csv(file_path, sep=sep)
 
         return cls(df)
+
+    @classmethod
+    def cat(cls, tables):
+        """Concatenates several tables.
+
+        Args:
+          info_lists: List of InfoTables
+
+        Returns:
+          InfoTable object concatenating the info_lists.
+        """
+        df_list = [table.df for table in tables]
+        df = pd.concat(df_list)
+        return cls(df)
diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py
index 6bcd4aca..45eab05f 100644
--- a/hyperion/utils/info_table.py
+++ b/hyperion/utils/info_table.py
@@ -176,8 +176,8 @@ def split(self, idx, num_parts, group_by=None):
         return self.__class__(df)
 
     @classmethod
-    def merge(cls, tables):
-        """Merges several tables.
+    def cat(cls, tables):
+        """Concatenates several tables.
 
         Args:
           info_lists: List of InfoTables
@@ -187,6 +187,9 @@
         """
         df_list = [table.df for table in tables]
         df = pd.concat(df_list)
+        assert df[
+            "id"
+        ].is_unique, """there are duplicated ids in the tables we are concatenating"""
         return cls(df)
 
     def filter(self, items=None, iindex=None, columns=None, by="id", keep=True):

From 63a2bd994c961b6c438bda454cc66a8695d1b797 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Fri, 30 Jun 2023 09:38:03 -0400
Subject: [PATCH 103/154] added config 2.0 to vox v2

---
 ...un_031_attack_type_verif_and_noveltydet.sh |   2 +-
 egs/voxceleb/v1.1/README.md                   |  18 +-
 ...rain_idrnd_resnet100_xvec_stage1_v3.0.yaml |   2 +-
 ...train_res2net50w26s8_xvec_stage1_v3.0.yaml |   2 +-
 ...train_res2net50w26s8_xvec_stage2_v3.0.yaml |   3 +-
 egs/voxceleb/v1.1/run_030_extract_xvectors.sh |   4 +-
 egs/voxceleb/v1.1/run_040_eval_be.sh          |   2 +-
 egs/voxceleb/v1.2/run_001_prepare_data.sh     |  34 +-
 egs/voxceleb/v2/README.md                     | 149 +-----
 ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml |  59 +++
 ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml |  63 +++
 ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml |  73 +++
 .../wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml  |  45 ++
 .../wavlmbaseplus_ecapatdnn512x3_v2.0.yaml    |  44 ++
 ...onfig_wavlmbaseplus_ecapatdnn512x3_v2.0.sh |  54 ++
 egs/voxceleb/v2/run_030_extract_xvectors.sh   |  16 +-
 egs/voxceleb/v2/run_040_eval_be.sh            | 294 +++++++++-
 hyperion/bin/adv_finetune_xvector_from_wav.py |   5 +-
 hyperion/bin/apply_mvn_select_frames.py       |   9 +-
 hyperion/bin/audio_to_duration.py             |   5 +-
 hyperion/bin/compute_energy_vad.py            |   9 +-
 hyperion/bin/compute_mfcc_feats.py            |   9 +-
 hyperion/bin/copy_feats.py                    |   1 -
 hyperion/bin/decode_wav2transducer.py         |  12 +-
 hyperion/bin/decode_wav2vec2rnn_transducer.py |   5 +-
 ...l_xvec_cosine_scoring_from_adv_test_wav.py |   9 +-
 ...osine_scoring_from_adv_test_wav_wavegan.py |  10 +-
 ...l_xvec_cosine_scoring_from_art_test_wav.py |  18 +-
 .../eval_xvec_cosine_scoring_from_test_wav.py |   9 +-
 ...sine_scoring_from_transfer_adv_test_wav.py |   5 +-
 ...sine_scoring_from_transfer_art_test_wav.py |  18 +-
 hyperion/bin/eval_xvec_logits_from_wav.py     |   9 +-
 hyperion/bin/extract_wav2vec2xvectors.py      |  38 +-
 hyperion/bin/extract_xvectors_from_feats.py   |   5 +-
 hyperion/bin/extract_xvectors_from_wav.py     |   9 +-
 .../extract_xvectors_slidwin_from_feats.py    |  11 +-
 .../bin/extract_xvectors_slidwin_from_wav.py  |  11 +-
 hyperion/bin/finetune_wav2vec2transducer.py   |   5 +-
 hyperion/bin/finetune_wav2vec2xvector.py      |  18 +-
 .../bin/finetune_xvector_dfr_from_feats.py    |   5 +-
 hyperion/bin/finetune_xvector_dfr_from_wav.py |   5 +-
 hyperion/bin/finetune_xvector_from_feats.py   |   5 +-
 hyperion/bin/finetune_xvector_from_wav.py     |   9 +-
 .../generate_adv_attacks_xvector_classif.py   |  11 +-
.../bin/generate_adv_attacks_xvector_verif.py | 11 +- hyperion/bin/hyperion_dataset.py | 23 +- hyperion/bin/hyperion_tables.py | 22 +- hyperion/bin/make_babble_noise_audio_files.py | 7 +- hyperion/bin/pack_wav_rirs.py | 9 +- hyperion/bin/plot_embedding_tsne.py | 5 +- hyperion/bin/plot_embedding_tsne_per_class.py | 5 +- hyperion/bin/prepare_data.py | 5 +- hyperion/bin/preprocess_audio_files.py | 7 +- .../split_dataset_into_trials_and_cohort.py | 68 +++ hyperion/bin/train_wav2rnn_transducer.py | 5 +- hyperion/bin/train_wav2vec2rnn_transducer.py | 5 +- hyperion/bin/train_wav2vec2transducer.py | 5 +- hyperion/bin/train_wav2vec2xvector.py | 5 +- hyperion/bin/train_xvector_from_feats.py | 5 +- hyperion/bin/train_xvector_from_wav.py | 9 +- hyperion/data_prep/__init__.py | 1 + hyperion/data_prep/voxceleb1.py | 2 +- hyperion/data_prep/voxceleb2.py | 2 +- hyperion/data_prep/voxsrc22.py | 21 +- .../data/class_weighted_seg_chunk_sampler.py | 2 +- .../models/wav2xvectors/hf_wav2xvector.py | 100 ++-- hyperion/torch/torch_model.py | 34 +- hyperion/torch/tpm/hf/hf_hubert.py | 32 ++ hyperion/torch/tpm/hf/hf_wav2vec2.py | 6 + hyperion/torch/tpm/hf/hf_wav2vec_base.py | 84 ++- hyperion/torch/tpm/hf/hf_wavlm.py | 32 ++ hyperion/torch/trainers/torch_trainer.py | 12 +- hyperion/utils/dataset.py | 500 ++++++++++++++---- hyperion/utils/segment_set.py | 10 +- 74 files changed, 1535 insertions(+), 628 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh create mode 100755 hyperion/bin/split_dataset_into_trials_and_cohort.py diff --git a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh index 4ce703ba..3b93fabd 100755 --- a/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh +++ b/egs/voxceleb/adv.v2/run_031_attack_type_verif_and_noveltydet.sh @@ -293,7 +293,7 @@ if [ $stage -le 13 ]; then awk '!/benign/' $list_someknown_dir/train/utt2spk > $list_someknown_dir/train_nobenign/utt2spk steps_backend/train_be_v1.sh --cmd "$train_cmd" \ --plda-type splda \ - --y-dim 6 \ + --y-dim 5 \ $sign_dir/train/xvector.scp \ $list_someknown_dir/train_nobenign \ $be_dir diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 73b9bb4e..3b9eeaa9 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -111,8 +111,11 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | | | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | -| | | | Cosine + AS-Norm | 0.52 | 0.33 | 0.045 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | ### VoxCeleb 1 Entire-Clean trial list @@ -143,8 +146,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | - - +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | ### VoxCeleb 1 Hard-Clean trial list @@ -174,7 +178,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | - +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | ### VoxSRC2022 dev @@ -205,6 +211,10 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 |
 | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 |
 | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 |
+| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 |
+| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0.239 |
+| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 |
+
 
 ## Results before 2023
 
diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml
index 9e302200..1016087d 100644
--- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml
+++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml
@@ -68,5 +68,5 @@ trainer:
   grad_clip: 250
   use_amp: true
   log_interval: 1000
-  epochs: 35
+  epochs: 30
   eff_batch_size: 256
diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml
index 40fb362e..e98d6c13 100644
--- a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml
+++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage1_v3.0.yaml
@@ -68,5 +68,5 @@ trainer:
   grad_clip: 250
   use_amp: true
   log_interval: 1000
-  epochs: 35
+  epochs: 30
   eff_batch_size: 256
diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml
index 469e166b..5c9af011 100644
--- a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml
+++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml
@@ -44,7 +44,8 @@ model:
     margin_warmup_epochs: 0
     intertop_margin: 0.1
   override_dropouts: true
-  dropout_rate: 0.0
+  # dropout_rate: 0.0
+  dropout_rate: 0.2
 trainer:
   optim:
     opt_type: sgd
diff --git a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh
index 8c0949f4..f933a7b2 100755
--- a/egs/voxceleb/v1.1/run_030_extract_xvectors.sh
+++ b/egs/voxceleb/v1.1/run_030_extract_xvectors.sh
@@ -8,7 +8,7 @@ set -e
 
 stage=1
-nnet_stage=1
+nnet_stage=2
 config_file=default_config.sh
 use_gpu=false
 xvec_chunk_length=12800
@@ -85,4 +85,4 @@ if [ $stage -le 2 ]; then
     done
 fi
 
-exit
+
diff --git a/egs/voxceleb/v1.1/run_040_eval_be.sh b/egs/voxceleb/v1.1/run_040_eval_be.sh
index 0780584c..6bdbdf92 100755
--- a/egs/voxceleb/v1.1/run_040_eval_be.sh
+++ b/egs/voxceleb/v1.1/run_040_eval_be.sh
@@ -8,7 +8,7 @@ set -e
 
 stage=1
-nnet_stage=1
+nnet_stage=2
 config_file=default_config.sh
 
diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh
index f956bc8c..c151e270 100755
--- a/egs/voxceleb/v1.2/run_001_prepare_data.sh
+++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh
@@ -16,26 +16,31 @@ config_file=default_config.sh
 
 if [ $stage -le 1 ];then
   # Prepare the VoxCeleb2 dataset for training.
- hyp_utils/conda_env.sh \ - prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ - --cat-videos --use-kaldi-ids \ - --output-dir data/voxceleb2cat_train + prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - #hyp_utils/conda_env.sh \ - prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ - --use-kaldi-ids \ - --output-dir data/voxceleb1_test + # hyp_utils/conda_env.sh + prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi -exit + if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then - local/prepare_voxsrc22_dev.py \ - --vox1-corpus-dir $voxceleb1_root \ - --voxsrc22-corpus-dir $voxsrc22_root \ - --output-dir data/voxsrc22_dev + prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev + # local/prepare_voxsrc22_dev.py \ + # --vox1-corpus-dir $voxceleb1_root \ + # --voxsrc22-corpus-dir $voxsrc22_root \ + # --output-dir data/voxsrc22_dev + prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_test fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then @@ -46,5 +51,6 @@ fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then # # split vox2 into 2 parts, for cohort and qmf training - local/make_vox2_trials.py --data-dir data/voxceleb2cat_train + split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train + #local/make_vox2_trials.py --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index 5b5b93e5..c64a4b41 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -1,24 +1,9 @@ -# VoxCeleb V1.1 +# VoxCeleb V2 -Recipe for the VoxCeleb Speaker Verification Task +Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Hubert models from HuggingFace as feature extractors ## Differences w.r.t VoxCeleb V1 recipe -In recipe version V1: - - We compute speech augmentations and acoustic features offline and dump them to disk. - - Augmentation is performed using Kaldi scripts and wav-reverbate tool - - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. - -In this recipe: - - We compute speech augmentations and acoustic features are computed always on-the-fly, - we don't dump any features to disk. - - Augmentation is performed using Hyperin SpeechAugment class. - - The behavior of this class is controlled - by the the configuration file `conf/reverb_noise_aug.yml`, - which mimics the proportions of noise and RIR types, and SNRs used in the V1 or the recipe. - - Babble noise is created offline by mixing 3-10 single speaker files. 
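Stage 5 of `run_001_prepare_data.sh` above now calls `split_dataset_into_trials_and_cohort.py`, so that VoxCeleb2 can serve both as QMF training trials and as cohort data. A minimal sketch of a speaker-disjoint split of this kind, assuming a Kaldi-style `utt2spk` file; the function name, the 50/50 fraction, and the return layout are illustrative assumptions, not the script's actual interface:

```python
import numpy as np
import pandas as pd

def split_trials_and_cohort(utt2spk_file, cohort_fraction=0.5, seed=1234):
    """Split a Kaldi-style utt2spk into speaker-disjoint trial and cohort
    subsets, so cohort speakers never appear in the QMF training trials."""
    df = pd.read_csv(utt2spk_file, sep=" ", names=["utt", "spk"])
    spks = df["spk"].unique()
    rng = np.random.default_rng(seed)
    rng.shuffle(spks)
    cohort_spks = set(spks[: int(cohort_fraction * len(spks))])
    is_cohort = df["spk"].isin(cohort_spks)
    # trial utterances (for QMF training trials) and cohort utterances (for normalization)
    return df[~is_cohort], df[is_cohort]
```

Keeping the two speaker sets disjoint matters: cohort speakers that also appear in the trials would bias both the normalization statistics and the QMF calibration.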
- - ## Citing ## Training Data @@ -41,15 +26,14 @@ In this recipe: ## Usage - Run the run_0*.sh scripts in sequence - - By default it will use Light ResNet (16 base channels) - - For better performance use full ResNet (64 base channels) using `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as + - By default it will use the configuration in `default_config.sh` + - To use a different configuration, pass its file to each script as ```bash run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh ``` - - To train with mixed precision training use config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` ## Recipe Steps: @@ -73,7 +57,9 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - Creates training and validation lists for x-vector training - `run_011_train_xvector.sh` - - Trains the x-vector network + - Trains the x-vector model on frozen wav2vec features + - Finetunes wav2vec+x-vector model + - Large margin finetuning of wav2vec+x-vector model - `run_030_extract_xvectors.sh` - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training @@ -89,117 +75,30 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | -| | | | Cosine | 2.04 | 0.138 | 0.210 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | -| | | | Cosine | 1.22 | 0.082 | 0.129 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | -| | | | Cosine | 1.44 | 0.100 | 0.173 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | -| | | | Cosine | 1.17 | 0.081 | 0.110 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.37 | 0.104 | 0.179 | -| | | | Cosine | 1.31 | 0.080 | 0.139 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.29 | 0.088 | 0.129 | -| | | | Cosine | 1.23 | 0.083 | 0.136 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | -| | | | Cosine | 1.29 | 0.089 | 0.146 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | -| | | | Cosine | 1.18 | 0.078 | 0.115 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | -| | | | Cosine | 1.12 | 0.073 | 0.131 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | -| | | | Cosine | 1.31 | 0.084 | 0.132 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | -| | | | Cosine | 1.12 | 0.071 | 0.103 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | -| | | | Cosine | 0.96 | 0.065 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | -| | | | Cosine | 0.93 | 0.067 | 0.108 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | -| | | | Cosine | 0.85 | 0.060 | 0.094 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | -| | | | Cosine | 1.29 | 0.084 | 0.140 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | - +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 | +| | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 | +| | | | Cosine + QMF | 0.75 | 0.054 | 0.086 | ### VoxCeleb 1 Entire-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | -| | | | Cosine | 1.93 | 0.122 | 0.201 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | -| | | | Cosine | 1.24 | 0.080 | 0.136 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | -| | | | Cosine | 1.30 | 0.082 | 0.150 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | -| | | | Cosine | 1.09 | 0.071 | 0.124 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.45 | 0.097 | 0.165 | -| | | | Cosine | 1.15 | 0.076 | 0.132 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.47 | 0.094 | 0.165 | -| | | | Cosine | 1.27 | 0.082 | 0.148 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | -| | | | Cosine | 1.22 | 0.079 | 0.134 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | -| | | | Cosine | 1.16 | 0.074 | 0.130 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | -| | | | Cosine | 1.11 | 0.071 | 0.125 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | -| | | | Cosine | 1.24 | 0.080 | 0.140 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | -| | | | Cosine | 1.05 | 0.067 | 0.117 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | -| | | | Cosine | 1.05 | 0.069 | 0.121 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | -| | | | Cosine | 0.98 | 0.063 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | -| | | | Cosine | 0.94 | 0.061 | 0.107 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | -| | | | Cosine | 1.27 | 0.079 | 0.142 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | - +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 | +| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 | +| | | | Cosine + QMF | 0.75 | 0.046 | 0.076 | ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | -| | | | Cosine | 3.27 | 0.188 | 0.303 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | -| | | | Cosine | 2.32 | 0.139 | 0.232 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | -| | | | Cosine | 2.33 | 0.142 | 0.235 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | -| | | | Cosine | 2.14 | 0.126 | 0.203 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 2.57 | 0.153 | 0.255 | -| | | | Cosine | 2.11 | 0.127 | 0.205 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 2.64 | 0.157 | 0.244 | -| | | | Cosine | 2.33 | 0.141 | 0.232 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | -| | | | Cosine | 2.26 | 0.133 | 0.224 -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | -| | | | Cosine | 2.17 | 0.128 | 0.215 -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | -| | | | Cosine | 2.11 | 0.124 | 0.204 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | -| | | | Cosine | 2.45 | 0.141 | 0.225 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | -| | | | Cosine | 1.95 | 0.113 | 0.181 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | -| | | | Cosine | 1.99 | 0.119 | 0.196 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | -| | | | Cosine | 1.89 | 0.112 | 0.184 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | -| | | | Cosine | 1.84 | 0.110 | 0.186 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | -| | | | Cosine | 2.26 | 0.134 | 0.214 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | +| | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | +| | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | + +### VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | +| | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | +| | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..bd3e7f86 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 
100644 index 00000000..eed0ad1f --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..d66d6877 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + 
resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/run_030_extract_xvectors.sh b/egs/voxceleb/v2/run_030_extract_xvectors.sh index 67122f85..16f29841 100755 --- a/egs/voxceleb/v2/run_030_extract_xvectors.sh +++ b/egs/voxceleb/v2/run_030_extract_xvectors.sh @@ -7,10 +7,10 @@ . 
./path.sh set -e -stage=2 +stage=1 +nnet_stage=3 config_file=default_config.sh use_gpu=false -nnet_stage=3 hf_chunk_length=120 #seconds xvec_chunk_length=120 #seconds . parse_options.sh || exit 1; @@ -36,20 +36,20 @@ fi xvector_dir=exp/xvectors/$nnet_name -if [ $stage -le 1 ]; then +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then # Extract xvectors for training LDA/PLDA for name in voxceleb2cat_train do if [ $plda_num_augs -eq 0 ]; then steps_xvec/extract_wav2vec2xvectors.sh \ --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ $nnet data/${name} \ $xvector_dir/${name} else steps_xvec/extract_wav2vec2xvectors.sh \ --cmd "$xvec_cmd" --nj 300 ${xvec_args} \ - --random-utt-length true --min-utt-length 4 --max-utt-length 140 \ + --random-utt-length true --min-utt-length 2 --max-utt-length 30 \ --aug-config $plda_aug_config --num-augs $plda_num_augs \ $nnet data/${name} \ $xvector_dir/${name}_augx${plda_num_augs} \ @@ -60,7 +60,10 @@ fi if [ $stage -le 2 ]; then # Extracts x-vectors for evaluation - for name in voxceleb1_test + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data do num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') nj=$(($num_spk < 100 ? $num_spk:100)) @@ -71,4 +74,3 @@ if [ $stage -le 2 ]; then done fi -exit diff --git a/egs/voxceleb/v2/run_040_eval_be.sh b/egs/voxceleb/v2/run_040_eval_be.sh index ac561344..0982abeb 100755 --- a/egs/voxceleb/v2/run_040_eval_be.sh +++ b/egs/voxceleb/v2/run_040_eval_be.sh @@ -7,10 +7,10 @@ . ./path.sh set -e -# By default we evaluate the nnet after finetuning stage 3 and only with cosine scoring -stage=3 -config_file=default_config.sh +stage=1 nnet_stage=3 +config_file=default_config.sh + . parse_options.sh || exit 1; . 
$config_file @@ -25,6 +25,15 @@ elif [ $nnet_stage -eq 2 ];then elif [ $nnet_stage -eq 3 ];then nnet=$nnet_s3 nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name fi plda_label=${plda_type}y${plda_y_dim}_v1 @@ -35,8 +44,12 @@ be_dir=exp/be/$nnet_name/$be_name score_dir=exp/scores/$nnet_name/${be_name} score_plda_dir=$score_dir/plda score_cosine_dir=exp/scores/$nnet_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/cosine_qmf -if [ $stage -le 1 ]; then + +if [ "$do_plda" == "true" ];then + if [ $stage -le 1 ]; then echo "Train PLDA on Voxceleb2" steps_be/train_be_v1.sh \ --cmd "$train_cmd" \ @@ -45,14 +58,12 @@ if [ $stage -le 1 ]; then --y_dim $plda_y_dim --z_dim $plda_z_dim \ $xvector_dir/$plda_data/xvector.scp \ data/$plda_data \ - $be_dir & - - wait -fi - - -if [ $stage -le 2 ];then - + $be_dir + + fi + + + if [ $stage -le 2 ];then echo "Eval Voxceleb 1 with LDA+CentWhiten+LNorm+PLDA" steps_be/eval_be_v1.sh \ --cmd "$train_cmd" --plda_type $plda_type \ @@ -62,7 +73,7 @@ if [ $stage -le 2 ];then $be_dir/lda_lnorm.h5 \ $be_dir/plda.h5 \ $score_plda_dir/voxceleb1_scores - + $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ local/score_voxceleb1.sh data/voxceleb1_test $score_plda_dir @@ -72,32 +83,267 @@ if [ $stage -le 2 ];then cat $f echo "" done - + fi fi -score_plda_dir=$score_cosine_dir + if [ $stage -le 3 ];then - echo "Eval Voxceleb 1 with Cosine scoring" + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + + echo "Eval voxsrc22 with Cosine scoring" steps_be/eval_be_cos.sh --cmd "$train_cmd" \ - data/voxceleb1_test/trials \ - data/voxceleb1_test/utt2model \ - $xvector_dir/voxceleb1_test/xvector.scp \ - $score_plda_dir/voxceleb1_scores + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $score_cosine_dir/voxsrc22_dev_scores & - $train_cmd --mem 10G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + # steps_be/eval_be_cos.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $score_cosine_dir/voxsrc22_test_scores - for f in $(ls $score_plda_dir/*_results); + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_dir + + for f in $(ls $score_cosine_dir/voxsrc22_dev_results); do echo $f cat $f echo "" done + +fi + + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 22G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + 
$xvector_dir/voxceleb1_test/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/*_results); + do + echo $f + cat $f + echo "" + done + fi + + if [ $stage -le 6 ];then + echo "Eval voxsrc22 with Cosine scoring + Adaptive SNorm" + steps_be/eval_be_cos_snorm.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_snorm_dir/voxsrc22_dev_scores & + + # steps_be/eval_be_cos_snorm.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_snorm_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_snorm_dir + + for f in $(ls $score_cosine_snorm_dir/voxsrc22_dev_results); + do + echo $f + cat $f + echo "" + done + fi +fi + + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb2cat_train/utt2speech_dur \ + > $xvector_dir/voxceleb2cat_train/utt2num_frames + + echo "Train QMF on Vox2" + steps_be/train_be_cos_qmf.sh \ + --cmd "$train_cmd" --coh-nbest 1000 \ + data/voxceleb2cat_train/trials \ + data/voxceleb2cat_train/utt2model \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $xvector_dir/voxceleb2cat_train/utt2num_frames \ + data/voxceleb2cat_train/snorm_utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/voxceleb2_qmf_scores + + fi + + if [ $stage -le 8 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxceleb1_test/utt2speech_dur \ + > $xvector_dir/voxceleb1_test/utt2num_frames + + echo "Eval Voxceleb 1 with Cosine scoring + QMF" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $xvector_dir/voxceleb1_test/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxceleb1_scores + + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_snorm.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1_qmf.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxceleb1{,_snorm,_qmf}_[oeh]_clean_results); + do + echo $f + cat $f + echo "" + done + + fi + + if [ $stage -le 9 ];then + awk '{ print $1, $2*100}' \ + $xvector_dir/voxsrc22_dev/utt2speech_dur \ + > $xvector_dir/voxsrc22_dev/utt2num_frames + + echo "Eval voxsrc22 with Cosine scoring + QMF" + steps_be/eval_be_cos_qmf.sh \ + --cmd "$train_cmd --mem 20G" --coh-nbest 1000 \ + data/voxsrc22_dev/trials \ + 
data/voxsrc22_dev/utt2model \ + $xvector_dir/voxsrc22_dev/xvector.scp \ + $xvector_dir/voxsrc22_dev/utt2num_frames \ + data/voxceleb2cat_train/utt2spk \ + $xvector_dir/voxceleb2cat_train/xvector.scp \ + $score_cosine_qmf_dir/qmf.h5 \ + $score_cosine_qmf_dir/voxsrc22_dev_scores & + + # awk '{ print $1, $2*100}' \ + # $xvector_dir/voxsrc22_test/utt2speech_dur \ + # > $xvector_dir/voxsrc22_test/utt2num_frames + # steps_be/eval_be_cos_qmf.sh --cmd "$train_cmd" \ + # data/voxsrc22_test/trials \ + # data/voxsrc22_test/utt2model \ + # $xvector_dir/voxsrc22_test/xvector.scp \ + # $xvector_dir/voxsrc22_test/utt2num_frames \ + # data/voxceleb2cat_train/utt2spk \ + # $xvector_dir/voxceleb2cat_train/xvector.scp \ + # $score_cosine_qmf_dir/qmf.h5 \ + # $score_cosine_qmf_dir/voxsrc22_test_scores + + wait + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_snorm.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _snorm + $train_cmd --mem 10G --num-threads 1 $score_cosine_qmf_dir/log/score_voxsrc22_dev_qmf.log \ + local/score_voxsrc22_dev.sh data/voxsrc22_dev $score_cosine_qmf_dir _qmf + + for f in $(ls $score_cosine_qmf_dir/voxsrc22_dev{,_snorm,_qmf}_results); do echo $f cat $f echo "" done + fi + +fi + +if [ "$do_pca" != "true" ];then + exit 0 +fi + + +be_name=pca_r${pca_var_r} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name/${be_name} +score_cosine_dir=exp/scores/$nnet_name/$be_name/cosine +score_cosine_snorm_dir=exp/scores/$nnet_name/$be_name/cosine_snorm +score_cosine_qmf_dir=exp/scores/$nnet_name/$be_name/cosine_qmf + +be_dir=exp/be/$nnet_name/ +score_be_dir=$score_dir/pca_r${pca_var_r} + +if [ $stage -le 10 ]; then + echo "Train projection on Voxceleb2" + $train_cmd $be_dir/log/train_be.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_proj_v1.py \ + --v-file scp:$xvector_dir/$plda_data/xvector.scp \ + --train-list data/$plda_data/utt2spk \ + --output-path $be_dir \ + --pca.pca-var-r $pca_var_r fi -exit +if [ $stage -le 11 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + steps_be/eval_be_cos.sh \ + --cmd "$train_cmd" \ + --preproc-file $be_dir/preproc.h5 \ + data/voxceleb1_test/trials \ + data/voxceleb1_test/utt2model \ + $xvector_dir/voxceleb1_test/xvector.scp \ + $score_cosine_dir/voxceleb1_scores + $train_cmd --mem 10G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + local/score_voxceleb1.sh data/voxceleb1_test $score_cosine_dir + + for f in $(ls $score_cosine_dir/*_results); + do + echo $f + cat $f + echo "" + done + +fi diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py index 7be882e0..f45b84a0 100755 --- a/hyperion/bin/adv_finetune_xvector_from_wav.py +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -32,6 +29,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { 
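    # registry mapping x-vector architecture names (e.g., "resnet") to their model classes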
"resnet": RXVec, diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index f5a3ce15..bdf53786 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,13 +10,6 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import RandomAccessDataReaderFactory as RDRF @@ -25,6 +18,8 @@ from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.utils import Utt2Info from hyperion.utils.kaldi_matrix import compression_methods +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def process_feats( diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index 38e8dff2..ac8852a4 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -9,12 +9,11 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR from hyperion.utils import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def audio_to_duration(audio_file, output_file, **kwargs): diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 058f982a..e9773fff 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,17 +9,12 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def compute_vad(input_path, output_path, write_num_frames, **kwargs): diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index ca6e26f7..442e4141 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,19 +9,14 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import compression_methods from hyperion.np.feats import MFCC +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def compute_mfcc_feats( diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py index 0385cc55..4549caec 100755 --- a/hyperion/bin/copy_feats.py +++ b/hyperion/bin/copy_feats.py @@ -12,7 +12,6 @@ import time import numpy as np - from hyperion.hyp_defs import config_logger from hyperion.io import CopyFeats as CF diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index c7de38f1..972b247c 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -13,13 +13,6 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import ( 
- ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -27,10 +20,13 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search +from hyperion.torch.models.wav2transducer.beam_search import (beam_search, + greedy_search) from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py index 8ef8d414..4fdc3140 100755 --- a/hyperion/bin/decode_wav2vec2rnn_transducer.py +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -13,9 +13,6 @@ import numpy as np import pandas as pd import sentencepiece as spm -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -29,6 +26,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 50fd5088..7c9d4104 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -10,13 +10,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -33,6 +26,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index 5697404d..fb0d402c 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,19 +7,11 @@ import os import sys import time - # [Added Sonal May21] from pathlib import Path import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -37,6 +29,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) torch.backends.cudnn.enabled = False diff --git 
a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 0ca1f740..2d5baa17 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -11,17 +11,10 @@ import numpy as np import pandas as pd -from art.classifiers import PyTorchClassifier -from art.estimators.classification import PyTorchClassifier -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +from art.classifiers import PyTorchClassifier +from art.estimators.classification import PyTorchClassifier from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,15 +22,16 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import ( - ARTAttackFactory as AttackFactory, -) +from hyperion.torch.adv_attacks.art_attack_factory import \ + ARTAttackFactory as AttackFactory from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 49a762af..76af5d75 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -10,13 +10,6 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -31,6 +24,8 @@ from hyperion.torch.utils.misc import l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index b2c111ca..f33402a1 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -10,9 +10,6 @@ import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu @@ -29,6 +26,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git 
a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index 8b6c8dae..f94dc497 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -11,17 +11,10 @@ import numpy as np import pandas as pd -from art.classifiers import PyTorchClassifier -from art.estimators.classification import PyTorchClassifier -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +from art.classifiers import PyTorchClassifier +from art.estimators.classification import PyTorchClassifier from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,15 +22,16 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import ( - ARTAttackFactory as AttackFactory, -) +from hyperion.torch.adv_attacks.art_attack_factory import \ + ARTAttackFactory as AttackFactory from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 98ba76b5..9efbd6dd 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -11,13 +11,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -28,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index c4c4676f..6f7d269e 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -11,15 +11,8 @@ import numpy as np import pandas as pd -import torchaudio.transforms as tat -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import torchaudio.transforms as tat from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -28,26 +21,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info - -resamplers = {} - - -def get_resampler(source_fs, target_fs): - if source_fs in resamplers: - return resamplers[source_fs] - - resampler = tat.Resample( - int(source_fs), - int(target_fs), - 
lowpass_filter_width=64, - rolloff=0.9475937167399596, - resampling_method="kaiser_window", - beta=14.769656459379492, - ) - resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() - resamplers[source_fs] = resampler_f - return resampler_f - +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) resamplers = {} @@ -168,7 +143,10 @@ def extract_xvectors( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -240,7 +218,7 @@ def extract_xvectors( writer.write([key], [y]) if write_speech_dur is not None: keys.append(key) - info.append(str(x.shape[1] * fs)) + info.append(str(x.shape[1] / fs)) t8 = time.time() read_time = t2 - t1 diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index 926e0bcc..13ad4277 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -10,9 +10,6 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -22,6 +19,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index f49a5fb0..577bbae7 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -11,13 +11,6 @@ import numpy as np import pandas as pd -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF @@ -28,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index eaf0a5cc..a54c4d64 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -10,15 +10,8 @@ import time import numpy as np -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -27,6 +20,8 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 9dc0aa2c..8939ba91 100755 --- 
a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -11,15 +11,8 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -29,6 +22,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_device(use_gpu): diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py index df267e72..6f17f800 100755 --- a/hyperion/bin/finetune_wav2vec2transducer.py +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index b3edd9b5..fc3c7084 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -26,6 +23,8 @@ HFWavLM2ResNet1dXVector) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -79,7 +78,12 @@ def init_model(num_classes, in_model_file, rank, **kwargs): def init_hard_prototype_mining(model, train_loader, val_loader, rank): - if not train_loader.batch_sampler.hard_prototype_mining: + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: return if rank == 0: @@ -118,7 +122,11 @@ def train_model(gpu_id, args): logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py index 2ac01025..17cafb85 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_feats.py +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -12,9 +12,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import 
torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py index ff97d3ca..f7832a47 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_wav.py +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -10,9 +10,6 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,6 +21,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data( diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py index 7a1fb5a9..ac9c2d0b 100755 --- a/hyperion/bin/finetune_xvector_from_feats.py +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML @@ -23,6 +20,8 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp, open_device +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 7d602709..1c7cbe58 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -10,13 +10,6 @@ import time from pathlib import Path -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML @@ -32,6 +25,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 8c6f38a6..209915c5 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -11,16 +11,9 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as 
AW from hyperion.io import RandomAccessAudioReader as AR @@ -31,6 +24,8 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def read_utt_list(list_file, class2int_file, part_idx, num_parts): diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index fbd3a5fb..363e3afc 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -11,16 +11,9 @@ import numpy as np import pandas as pd -import yaml -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch import torch.nn as nn +import yaml from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -35,6 +28,8 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) class MyModel(nn.Module): diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 9e7bac5c..c5a3f6b9 100644 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -4,27 +4,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from typing import Optional, Union, List from pathlib import Path - -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) +from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import ( - PathLike, - Dataset, - InfoTable, - RecordingSet, - FeatureSet, - ClassInfo, - EnrollmentMap, - SegmentSet, -) +from hyperion.utils import (ClassInfo, Dataset, EnrollmentMap, FeatureSet, + InfoTable, PathLike, RecordingSet, SegmentSet) +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) subcommands = ["add_features"] # table_dict = { diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index a79a1dca..5a5f0b4f 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -4,26 +4,14 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from typing import Optional, Union, List from pathlib import Path - -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) +from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import ( - PathLike, - InfoTable, - RecordingSet, - FeatureSet, - ClassInfo, - EnrollmentMap, - SegmentSet, -) +from hyperion.utils import (ClassInfo, EnrollmentMap, FeatureSet, InfoTable, + PathLike, RecordingSet, SegmentSet) +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) subcommands = ["cat"] table_dict = { diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 972ff01f..4a356037 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -10,15 +10,14 @@ import time import numpy as 
np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal - from hyperion.hyp_defs import config_logger from hyperion.io import AudioWriter as Writer from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from scipy import ndimage, signal def make_noise(xs): diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index 4aafa075..78ac59c1 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,16 +10,11 @@ import time import numpy as np -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def pack_wav_rirs(input_path, output_spec, **kwargs): diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py index e011dfe8..e2157e3e 100755 --- a/hyperion/bin/plot_embedding_tsne.py +++ b/hyperion/bin/plot_embedding_tsne.py @@ -13,13 +13,12 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet +from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, + ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 6f35f074..6af0202c 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -13,15 +13,14 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) - from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.clustering import AHC from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet from hyperion.utils.math import cosine_scoring +from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, + ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index 4105f482..e90ad0f7 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -6,11 +6,10 @@ import logging from pathlib import Path -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - from hyperion.data_prep import DataPrep from hyperion.hyp_defs import config_logger +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) def make_parser(data_prep_class): diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index 2f4e5cbc..e8adfd16 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ 
b/hyperion/bin/preprocess_audio_files.py @@ -10,15 +10,14 @@ import time import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal - from hyperion.hyp_defs import config_logger from hyperion.io import AudioWriter as Writer from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) +from scipy import ndimage, signal def process_vad(vad, length, fs, dilation, erosion): diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py new file mode 100755 index 00000000..24ec10bf --- /dev/null +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from hyperion.hyp_defs import config_logger +from hyperion.utils import Dataset +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +if __name__ == "__main__": + + parser = ArgumentParser( + description=( + """Split speakers in dataset into test speaker to create ASV trials and + cohort speakers for S-Norm""" + ) + ) + + parser.add_argument("--data-dir", required=True, help="Path to dataset") + parser.add_argument( + "--num-1k-tar-trials", type=int, default=30, help="thousands of target trials" + ) + parser.add_argument( + "--num-trial-speakers", + type=int, + default=1000, + help="number of speakers to create trials", + ) + parser.add_argument( + "--intra-gender", + default=True, + action=ActionYesNo, + help="Whether we create intra gender trials or not", + ) + parser.add_argument("--seed", type=int, default=1123, help="random seed") + parser.add_argument( + "--trials-dir", default=None, help="Path to output trials dataset" + ) + parser.add_argument( + "--cohort-dir", default=None, help="Path to output cohort dataset" + ) + + args = parser.parse_args() + config_logger(1) + data_dir = args.data_dir + cohort_dir = args.cohort_dir + cohort_dir = f"{data_dir}_cohort" if cohort_dir is None else cohort_dir + trials_dir = args.trials_dir + trials_dir = f"{data_dir}_trials" if trials_dir is None else trials_dir + + del args.data_dir + del args.cohort_dir + del args.trials_dir + args = namespace_to_dict(args) + + dataset = Dataset.load(data_dir) + trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) + trials_dataset.save(trials_dir) + cohort_dataset.save(cohort_dir) diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 26fcf72c..8930b299 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -23,6 +20,8 @@ from hyperion.torch.models import Wav2RNNRNNTransducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence 
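The hunks above and below only reorder imports: isort now places jsonargparse inside the alphabetized third-party block instead of in its own group. For reference, a minimal sketch of the parser pattern these bin scripts share; the option names here are illustrative and not taken from any one script.

from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser,
                          namespace_to_dict)

# top-level parser; --cfg lets a YAML/JSON config file fill in any option
parser = ArgumentParser()
parser.add_argument("--cfg", action=ActionConfigFile)

# a nested parser mounted under a prefix, the same mechanism the
# add_class_args methods use via ActionParser
optim_parser = ArgumentParser()
optim_parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--optim", action=ActionParser(parser=optim_parser))

args = parser.parse_args(["--optim.lr", "0.01"])
kwargs = namespace_to_dict(args)  # plain nested dict, ready to pass as **kwargs
print(kwargs["optim"]["lr"])  # 0.01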
model_dict = { diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 5daffb6d..7018c406 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ HFWav2Vec2RNNTransducer) from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index ce53be86..55f3b996 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -12,9 +12,6 @@ import k2 import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -24,6 +21,8 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) from torch.nn.utils.rnn import pad_sequence model_dict = { diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 5e7ecafa..8e1653b1 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -25,6 +22,8 @@ HFWavLM2ResNet1dXVector) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index 7f4ab0fa..71bba080 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -11,9 +11,6 @@ from pathlib import Path import numpy as np -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - import torch import torch.nn as nn from hyperion.hyp_defs import config_logger, set_float_cpu @@ -28,6 +25,8 @@ from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index a210d429..b2e36cac 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -8,13 +8,6 @@ import os from pathlib import Path -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) - import torch from hyperion.hyp_defs 
import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD @@ -29,6 +22,8 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp +from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, + namespace_to_dict) xvec_dict = { "resnet": RXVec, diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index 9ae59246..e978e219 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -6,3 +6,4 @@ from .data_prep import DataPrep from .voxceleb2 import VoxCeleb2DataPrep from .voxceleb1 import VoxCeleb1DataPrep +from .voxsrc22 import VoxSRC22DataPrep diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py index c23b64ff..b3958605 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -330,7 +330,7 @@ def prepare(self): logging.info("making dataset") dataset = Dataset( segments, - classes={"speaker": speakers, "languages": languages}, + classes={"speaker": speakers, "language_est": languages}, recordings={"recordings": recs}, enrollments=enrollments, trials=trials, diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index bef34ec9..29ad3e44 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -251,7 +251,7 @@ def prepare(self): logging.info("making dataset") dataset = Dataset( segments, - {"speaker": speakers, "languages": languages}, + {"speaker": speakers, "language_est": languages}, {"recordings": recs}, ) logging.info("saving dataset at %s", self.output_dir) diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py index 1999262a..79369149 100644 --- a/hyperion/data_prep/voxsrc22.py +++ b/hyperion/data_prep/voxsrc22.py @@ -53,7 +53,7 @@ def __init__( @staticmethod def dataset_name(): - return "voxceleb2" + return "voxsrc22" @staticmethod def add_class_args(parser): @@ -117,11 +117,13 @@ def prepare_track12_dev(self): vox1_segmentid.append(s) vox1_rec_files = [ - glob.glob(f"{self.vox1_corpus_dir}/**/{s}") for s in vox1_segmentid - ] - vox22_rec_files = [ - glob.glob(f"{self.corpus_dir}/**/{s}") for s in vox22_segmentid + glob.glob(f"{self.vox1_corpus_dir}/**/{s}")[0] for s in vox1_segmentid ] + # vox22_rec_files = [ + # glob.glob(f"{self.corpus_dir}/**/{s}")[0] for s in vox22_segmentid + # ] + vox22_rec_files = [f"{self.corpus_dir}/{s}" for s in vox22_segmentid] + rec_ids = vox22_segmentid + vox1_segmentid rec_files = vox22_rec_files + vox1_rec_files @@ -135,7 +137,11 @@ recs["target_sample_freq"] = self.target_sample_freq logging.info("making SegmentsSet") - segments = pd.DataFrame({"id": rec_ids,}) + segments = pd.DataFrame( + { + "id": rec_ids, + } + ) segments = SegmentSet(segments) segments.sort() @@ -150,7 +156,8 @@ logging.info("saving dataset at %s", self.output_dir) dataset.save(self.output_dir) logging.info( - "datasets containts %d segments", len(segments), + "dataset contains %d segments", + len(segments), ) # wav_file = voxsrc22_corpus_dir / file_id diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 7fbfbd71..6ee00307 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -205,7 +205,7 @@ def _set_class_weights(self):
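The one-line fix in the hunk below removes a stray self that was being passed to set_weights, so the sampler object itself ended up as the weights argument. For context, in the "data-prior" mode shown here the class weights are simply each class's total speech duration, optionally flattened with an exponent; a standalone numpy sketch of that weighting, with made-up durations:

import numpy as np

durations = np.array([3600.0, 900.0, 100.0])  # total_duration per class, seconds
weight_exponent = 0.5  # exponent < 1 pushes the prior toward uniform

weights = durations ** weight_exponent
priors = weights / weights.sum()  # per-class sampling probabilities
print(priors)  # [0.6 0.3 0.1]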
self.class_info.set_uniform_weights() elif self.weight_mode == "data-prior": weights = self.class_info["total_duration"].values - self.class_info.set_weights(self, weights) + self.class_info.set_weights(weights) if self.weight_exponent != 1.0: self.class_info.exp_weights(self.weight_exponent) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 5599fa1e..c2bcdf99 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -26,11 +26,9 @@ class HFWav2XVector(TorchModel): than one layer is used. """ - def __init__(self, - hf_feats, - xvector, - feat_fusion_start=0, - feat_fusion_method="weighted-avg"): + def __init__( + self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + ): super().__init__() self.hf_feats = hf_feats @@ -51,12 +49,9 @@ def _make_fuser(self): self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, - num_layers) / num_layers + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, - layer_dim, - bias=False) + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. @@ -71,7 +66,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start:] + hid_feats = hid_feats[self.feat_fusion_start :] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) @@ -125,14 +120,14 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def forward_feats(self, - x, - x_lengths, - return_feat_layers=None, - chunk_length=0, - detach_chunks=False): - return_hid_states = (False if return_feat_layers is None - and self.feat_fusion_method == "last" else True) + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) with self._hf_context: hf_output = self.hf_feats( x, @@ -154,7 +149,8 @@ def forward_feats(self, # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. 
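# Illustration, not part of the patch: the "weighted-avg" branch of
# _fuse_hid_feats above reduces to a softmax-weighted sum over the stacked
# layer outputs. A self-contained sketch with made-up sizes:
import torch
import torch.nn as nn

num_layers = 13
feats = [torch.randn(2, 50, 768) for _ in range(num_layers)]  # (batch, time, dim) per layer
feat_fuser = nn.Parameter(torch.zeros(num_layers))  # zeros -> uniform weights after softmax
stacked = torch.stack(feats, dim=-1)                # (batch, time, dim, layers)
w = nn.functional.softmax(feat_fuser, dim=-1)
fused = torch.sum(stacked * w, dim=-1)              # back to (batch, time, dim)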
hid_feats = [ - f.transpose(1, 2) for i, f in enumerate(hid_feats) + f.transpose(1, 2) + for i, f in enumerate(hid_feats) if i in return_feat_layers ] else: @@ -194,7 +190,8 @@ def forward( "h_classif" (list hidden classification head layers), "h_feats" (wav2vec features) """ feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, return_feat_layers + ) output = self.xvector( feats, feat_lengths, @@ -230,16 +227,17 @@ def extract_embed( x, x_lengths = remove_silence(x, x_lengths) feats, _, feat_lengths = self.forward_feats( - x, - x_lengths, - chunk_length=hf_chunk_length, - detach_chunks=detach_chunks) - xvec_chunk_length = int(xvec_chunk_length * - self.hf_feats.sample_frequency * - feats.size(-1) // x.size(-1)) - return self.xvector.extract_embed(feats, feat_lengths, - xvec_chunk_length, embed_layer, - detach_chunks) + x, x_lengths, chunk_length=hf_chunk_length, detach_chunks=detach_chunks + ) + xvec_chunk_length = int( + xvec_chunk_length + * self.hf_feats.sample_frequency + * feats.size(-1) + // x.size(-1) + ) + return self.xvector.extract_embed( + feats, feat_lengths, xvec_chunk_length, embed_layer, detach_chunks + ) def freeze_feat_fuser(self): if self.feat_fuser is None: @@ -258,6 +256,23 @@ def freeze_hf_feats(self): def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() + def has_param_groups(self): + return self.hf_feats.has_param_groups() + + def trainable_param_groups(self): + if not self.has_param_groups(): + return self.trainable_parameters() + + param_groups = self.hf_feats.trainable_param_groups() + if self.feat_fusion_method == "weighted-avg": + if self.feat_fuser.requires_grad: + param_groups.append({"params": self.feat_fuser}) + else: + param_groups.append({"params": self.feat_fuser.parameters()}) + + param_groups.append({"params": self.xvector.trainable_parameters()}) + return param_groups + def set_train_mode(self, mode): if mode == self._train_mode: return @@ -302,11 +317,11 @@ def _train(self, train_mode: str): self.hf_feats.train() self.xvector._train("ft-embed_affine") elif train_mode in [ - "ft-xvector", - "hf-feats-frozen", - "ft-xvector-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", + "ft-xvector", + "hf-feats-frozen", + "ft-xvector-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", ]: self.hf_feats.train() self.xvector._train("full") @@ -369,16 +384,19 @@ def add_class_args(parser, prefix=None, skip=set()): "--feat-fusion-start", default=0, type=int, - help= - ("the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" - "the wav2vec num_layers"), + help=( + "the input to x-vector model will fuse the wav2vec layers from feat_fusion_start to" + "the wav2vec num_layers" + ), ) parser.add_argument( "--feat-fusion-method", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], - help=("method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, cat]"), + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, cat]" + ), ) if prefix is not None: diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 65e5884d..0cb887ca 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -13,8 +13,8 @@ class TorchModel(nn.Module): - """Base class for all Pytorch Models and NNet architectures - """ + """Base class for all Pytorch Models and NNet architectures""" + registry = {} def __init_subclass__(cls, **kwargs): @@ -45,6 
+45,12 @@ def non_trainable_parameters(self, recurse: bool = True): if not param.requires_grad: yield param + def has_param_groups(self): + return False + + def trainable_param_groups(self): + return self.trainable_parameters() + def freeze(self): for param in self.parameters(): param.requires_grad = False @@ -109,10 +115,9 @@ def save(self, file_path): os.makedirs(file_dir, exist_ok=True) config = self.get_config() - torch.save({ - "model_cfg": self.get_config(), - "model_state_dict": self.state_dict() - }) + torch.save( + {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}, file_path + ) @staticmethod def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): @@ -132,8 +137,7 @@ @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = TorchModel._load_cfg_state_dict( - file_path, cfg, state_dict) + cfg, state_dict = TorchModel._load_cfg_state_dict(file_path, cfg, state_dict) model = cls(**cfg) if state_dict is not None: @@ -148,14 +152,15 @@ def get_loss(self): @property def device(self): - devices = {param.device - for param in self.parameters() - } | {buf.device - for buf in self.buffers()} + devices = {param.device for param in self.parameters()} | { + buf.device for buf in self.buffers() + } if len(devices) != 1: raise RuntimeError( "Cannot determine device: {} different devices found".format( - len(devices))) + len(devices) + ) + ) return next(iter(devices)) @@ -217,5 +222,4 @@ def auto_load(file_path, extra_objs={}, map_location=None): # if it failed the 3 trials raise exception raise err # remove module prefix when is trained with dataparallel - state_dict = ODict( - (p.sub("", k), v) for k, v in state_dict.items()) + state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items()) diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index b2198924..2957e433 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -135,6 +135,8 @@ class HFHubert(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one.
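        A sketch of how these two rates end up as optimizer param groups; the
        modules below are stand-ins, not the real HF submodules:

            import torch
            import torch.nn as nn

            feature_extractor = nn.Conv1d(1, 8, 3)  # stand-in for the conv front-end
            encoder = nn.Linear(8, 8)               # stand-in for the transformer encoder
            optimizer = torch.optim.AdamW(
                [
                    {"params": feature_extractor.parameters(), "lr": 1e-5},
                    {"params": encoder.parameters(), "lr": 5e-5},
                ],
                lr=1e-4,  # global lr, used by any group that does not set its own
            )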
""" def __init__( @@ -182,6 +184,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -199,6 +203,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -287,6 +293,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.hubert.modeling_hubert as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.HubertAttention): + module.dropout = activation_dropout + if isinstance(module, t.HubertFeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index e1f21153..26da7beb 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -148,6 +148,8 @@ class HFWav2Vec2(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -200,6 +202,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -217,6 +221,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index b0a815c7..a9c4ddef 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -53,6 +53,8 @@ class HFWav2VecBase(TorchModel): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. 
+ encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -71,6 +73,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -84,6 +88,8 @@ def __init__( self.override_spec_augment = override_spec_augment self.right_encoder_context = right_encoder_context self.left_encoder_context = left_encoder_context + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -215,7 +221,14 @@ def out_shape(self, in_shape): C = self.hf_model.config.hidden_size return (in_shape[0], out_length, C) - def change_config(self, override_dropouts, override_spec_augment, **kwargs): + def change_config( + self, + override_dropouts: bool, + override_spec_augment: bool, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, + **kwargs, + ): if override_spec_augment: logging.info("overriding speech augment") self.change_spec_augment(**kwargs) @@ -224,6 +237,9 @@ def change_config(self, override_dropouts, override_spec_augment, **kwargs): logging.info("overriding hf model dropouts") self.change_dropouts(**kwargs) + self.feat_extract_lr = feat_extract_lr + self.encoder_lr = encoder_lr + def change_spec_augment( self, apply_spec_augment: bool = True, @@ -249,6 +265,35 @@ def change_dropouts(self, **kwargs): def freeze_feature_encoder(self): self.hf_model.freeze_feature_encoder() + def has_param_groups(self): + return self.feat_extract_lr is not None or self.encoder_lr is not None + + def trainable_param_groups(self): + if not self.has_param_groups(): + return self.trainable_parameters() + + if self.feat_extract_lr == self.encoder_lr: + return [{"params": self.trainable_parameters(), "lr": self.encoder_lr}] + + param_groups = [ + {"params": self.hf_model.feature_extractor.parameters()}, + {"params": self.hf_model.feature_projection.parameters()}, + {"params": self.hf_model.encoder.parameters()}, + ] + if self.hf_model.adapter is not None: + param_groups.append({"params": self.hf_model.adapter.parameters()}) + + if self.feat_extract_lr is not None: + param_groups[0]["lr"] = self.feat_extract_lr + param_groups[1]["lr"] = self.feat_extract_lr + + if self.encoder_lr is not None: + param_groups[2]["lr"] = self.encoder_lr + if len(param_groups) == 4: + param_groups[3]["lr"] = self.encoder_lr + + return param_groups + @property def hf_config(self): return self.hf_model.config @@ -570,7 +615,6 @@ def add_class_args(parser, prefix=None, skip=set()): help=("file path or HuggingFace Hub path to pre-trained model"), ) - parser.add_argument( "--normalize-input", default=True, @@ -659,6 +703,24 @@ def add_class_args(parser, prefix=None, skip=set()): "when the signal is evaluated chunk by chunk." ), ) + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." 
+ ), + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @@ -696,6 +758,24 @@ def add_finetune_args(parser, prefix=None, skip=set()): "arguments instead of the defaults in the pretrained model." ), ) + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." + ), + ) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 0d5c5ad3..e1b67d81 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -148,6 +148,8 @@ class HFWavLM(HFWav2VecBase): chunk by chunk, if it is too long to fit in GPU. right_encoder_context: (`int`): future context frames used by the transformer encoder. sample_frequency: (`int`) waveform sample frequency used to train the model. + feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. + encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. """ def __init__( @@ -200,6 +202,8 @@ def __init__( left_encoder_context: int = 16, right_encoder_context: int = 16, sample_frequency: int = 16000, + feat_extract_lr: Optional[float] = None, + encoder_lr: Optional[float] = None, ): super().__init__( @@ -217,6 +221,8 @@ def __init__( left_encoder_context=left_encoder_context, right_encoder_context=right_encoder_context, sample_frequency=sample_frequency, + feat_extract_lr=feat_extract_lr, + encoder_lr=encoder_lr, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -310,6 +316,32 @@ def num_encoder_layers(self): def hidden_size(self): return self.hf_config.hidden_size + def change_dropouts( + self, + hidden_dropout: float = 0.1, + activation_dropout: float = 0.1, + attention_dropout: float = 0.1, + feat_proj_dropout: float = 0.1, + **kwargs, + ): + import transformers.models.wavlm.modeling_wavlm as t + + self.hf_model.config.hidden_dropout = hidden_dropout + self.hf_model.config.activation_dropout = activation_dropout + self.hf_model.config.attention_dropout = attention_dropout + self.hf_model.config.feat_proj_dropout = feat_proj_dropout + + self.hf_model.feature_projection.dropout.p = feat_proj_dropout + for module in self.hf_model.encoder.modules(): + if isinstance(module, nn.Dropout): + module.p = hidden_dropout + + for module in self.hf_model.encoder.modules(): + if isinstance(module, t.WavLMAttention): + module.dropout = activation_dropout + if isinstance(module, t.WavLMFeatureProjection): + module.intermediate_dropout.p = activation_dropout + def drop_upper_layers(self, max_layers: int): if max_layers >= self.hf_config.num_hidden_layers: return diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index c8565d1d..5e41747c 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -163,7 +163,9 @@ def __init__( oss = False if ddp_type == DDPType.DDP else True self.optimizer = self._make_optimizer(optim, self.model, oss=oss) self.model = TorchDDP( - self.model, device_ids=[device], output_device=device, + self.model, + device_ids=[device], + output_device=device, ) elif 
ddp_type == DDPType.OSS_SHARDED_DDP: self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) @@ -424,7 +426,9 @@ def _make_optimizer(self, optim, model, oss=False): opt_args["oss"] = oss if self.rank == 0: logging.info("optimizer args={}".format(opt_args)) - optimizer = OF.create(model.parameters(), **opt_args) + + # optimizer = OF.create(model.parameters(), **opt_args) + optimizer = OF.create(model.trainable_param_groups(), **opt_args) return optimizer def _make_lr_sched(self, lr_sched, optim): @@ -458,8 +462,8 @@ def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): def _get_lr(self): """Returns the current learning rate to show in the loggers""" - for param_group in self.optimizer.param_groups: - return param_group["lr"] + lrs = [param_group["lr"] for param_group in self.optimizer.param_groups] + return max(lrs) def _compute_grad_acc_steps(self, data_loader): if self.eff_batch_size is None: diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index 0ef81ab6..d1d969fb 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -2,10 +2,13 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import logging from pathlib import Path from typing import Dict, Optional, Union - +from copy import deepcopy +import math +import numpy as np +import pandas as pd import yaml from .class_info import ClassInfo @@ -93,10 +96,13 @@ def _parse_dict_args(self, data, types): return objects, paths + def clone(self): + return deepcopy(self) + def segments(self, keep_loaded: bool = True): if self._segments is None: assert self._segments_path is not None - segments = SegmentSet.load(self.segments_path, sep=self.table_sep) + segments = SegmentSet.load(self._segments_path, sep=self.table_sep) if keep_loaded: self._segments = segments return segments @@ -111,6 +117,7 @@ def recordings_value(self, key: str, keep_loaded: bool = True): ) if keep_loaded: self._recordings[key] = recordings + return recordings return self._recordings[key] @@ -120,6 +127,7 @@ def features_value(self, key: str, keep_loaded: bool = True): features = FeatureSet.load(self._features_paths[key], sep=self.table_sep) if keep_loaded: self._features[key] = features + return features return self._features[key] @@ -129,6 +137,7 @@ def classes_value(self, key: str, keep_loaded: bool = True): classes = ClassInfo.load(self._classes_paths[key], self.table_sep) if keep_loaded: self._classes[key] = classes + return classes return self._classes[key] @@ -140,6 +149,7 @@ def enrollments_value(self, key: str, keep_loaded: bool = True): ) if keep_loaded: self._enrollments[key] = enrollments + return enrollments return self._enrollments[key] @@ -156,6 +166,7 @@ def trials_value(self, key: str, keep_loaded: bool = True): if keep_loaded: self._trials[key] = trials + return trials return self._trials[key] @@ -194,6 +205,49 @@ def trials(self, keep_loaded: bool = True): for key in self._trials.keys(): yield key, self.trials_value(key, keep_loaded) + # def add_recordings(self, recordings: Dict[str, Union[RecordingSet, PathLike]]): + # recordings, recordings_paths = self._parse_dict_args(recordings, RecordingSet) + # if self._recordings is None: + # self._recordings = self._recordings_paths = {} + # self._recordings.update(recordings) + # self._recordings_paths.update(recordings_paths) + + # def add_features(self, features: Dict[str, Union[FeatureSet, PathLike]]): + # features, features_paths = self._parse_dict_args(features, 
FeatureSet) + # if self._features is None: + # self._features = self._features_paths = {} + # self._features.update(features) + # self._features_paths.update(features_paths) + + # def add_classes(self, classes: Dict[str, Union[ClassInfo, PathLike]]): + # classes, classes_paths = self._parse_dict_args(classes, ClassInfo) + # if self._classes is None: + # self._classes = self._classes_paths = {} + # self._classes.update(classes) + # self._classes_paths.update(classes_paths) + + # def add_enrollments(self, enrollments: Dict[str, Union[EnrollmentMap, PathLike]]): + # enrollments, enrollments_paths = self._parse_dict_args( + # enrollments, + # EnrollmentMap, + # ) + # if self._enrollments is None: + # self._enrollments = self._enrollments_paths = {} + # self._enrollments.update(enrollments) + # self._enrollments_paths.update(enrollments_paths) + + # def add_trials( + # self, trials: Dict[str, Union[TrialKey, TrialNdx, SparseTrialKey, PathLike]] + # ): + # trials, trials_paths = self._parse_dict_args( + # trials, + # (TrialKey, TrialNdx, SparseTrialKey), + # ) + # if self._trials is None: + # self._trials = self._trials_paths = {} + # self._trials.update(trials) + # self._trials_paths.update(trials_paths) + @staticmethod def resolve_dataset_path(dataset_path): dataset_path = Path(dataset_path) @@ -209,6 +263,8 @@ def resolve_dataset_path(dataset_path): @staticmethod def resolve_file_path(dataset_dir, file_path): + dataset_dir = Path(dataset_dir) + file_path = Path(file_path) if file_path.is_file(): return file_path @@ -274,95 +330,100 @@ def save_changed( if update_paths: self._segments_path = file_path - file_names = {} - for k in self._recordings.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._recordings is not None - or file_path != self._recordings_paths[k] - or not file_path.exists() - ): - v = self.recordings_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path - - if file_names: - dataset["recordings"] = file_names - - file_names = {} - for k in self._features.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._features is not None - or file_path != self._features_paths[k] - or not file_path.exists() - ): - v = self.features_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._features_paths[k] = file_path - - if file_names: - dataset["features"] = file_names - - file_names = {} - for k, v in self._classes.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._classes is not None - or file_path != self._classes_paths[k] - or not file_path.exists() - ): - v = self.classes_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._classes_paths[k] = file_path - - if file_names: - dataset["classes"] = file_names - - file_names = {} - for k, v in self._enrollments.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._enrollments is not None - or file_path != self._enrollments_paths[k] - or not file_path.exists() - ): - v = self.enrollments_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._enrollments_paths[k] = file_path - - if file_names: - dataset["enrollments"] = file_names - - file_names = {} - for k, v in self._trials.keys(): - file_name = k + table_ext - 
file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._trials is not None - or file_path != self._trials_paths[k] - or not file_path.exists() - ): - v = self.trials_value(k, keep_loaded=False) - v.save(file_path) - if update_paths: - self._trials_paths[k] = file_path - - if file_names: - dataset["trials"] = file_names + if self._recordings is not None: + file_names = {} + for k in self._recordings.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings[k] is not None + or file_path != self._recordings_paths[k] + or not file_path.exists() + ): + v = self.recordings_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._recordings_paths[k] = file_path + + if file_names: + dataset["recordings"] = file_names + + if self._features is not None: + file_names = {} + for k in self._features.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._features[k] is not None + or file_path != self._features_paths[k] + or not file_path.exists() + ): + v = self.features_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._features_paths[k] = file_path + + if file_names: + dataset["features"] = file_names + + if self._classes is not None: + file_names = {} + for k in self._classes.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._classes[k] is not None + or file_path != self._classes_paths[k] + or not file_path.exists() + ): + v = self.classes_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._classes_paths[k] = file_path + + if file_names: + dataset["classes"] = file_names + + if self._enrollments is not None: + file_names = {} + for k in self._enrollments.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._enrollments[k] is not None + or file_path != self._enrollments_paths[k] + or not file_path.exists() + ): + v = self.enrollments_value(k, keep_loaded=False) + v.save(file_path, sep=table_sep) + if update_paths: + self._enrollments_paths[k] = file_path + + if file_names: + dataset["enrollments"] = file_names + + if self._trials is not None: + file_names = {} + for k in self._trials.keys(): + file_name = k + table_ext + file_names[k] = file_name + file_path = dataset_dir / file_name + if ( + self._trials[k] is not None + or file_path != self._trials_paths[k] + or not file_path.exists() + ): + v = self.trials_value(k, keep_loaded=False) + v.save(file_path) + if update_paths: + self._trials_paths[k] = file_path + + if file_names: + dataset["trials"] = file_names with open(dataset_file, "w") as f: yaml.dump(dataset, f) @@ -491,7 +552,7 @@ def load( """ dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) - with open(dataset_file, "w") as f: + with open(dataset_file, "r") as f: dataset = yaml.safe_load(f) assert "segments" in dataset @@ -503,27 +564,27 @@ def load( trials = None if "classes" in dataset: classes = {} - for k, v in dataset["classes"]: + for k, v in dataset["classes"].items(): classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: recordings = {} - for k, v in dataset["recordings"]: + for k, v in dataset["recordings"].items(): recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} - for k, v in dataset["features"]: + 
for k, v in dataset["features"].items(): features[k] = Dataset.resolve_file_path(dataset_dir, v) if "enrollments" in dataset: enrollments = {} - for k, v in dataset["enrollments"]: + for k, v in dataset["enrollments"].items(): enrollments[k] = Dataset.resolve_file_path(dataset_dir, v) if "trials" in dataset: trials = {} - for k, v in dataset["trials"]: + for k, v in dataset["trials"].items(): trials[k] = Dataset.resolve_file_path(dataset_dir, v) dataset = cls( @@ -541,6 +602,10 @@ return dataset def add_features(self, features_name: str, features: Union[PathLike, FeatureSet]): + if self._features is None: + self._features = {} + self._features_paths = {} + if isinstance(features, (str, Path)): self._features[features_name] = None self._features_paths[features_name] = features @@ -555,6 +620,10 @@ def add_recordings( recordings_name: str, recordings: Union[PathLike, RecordingSet], ): + if self._recordings is None: + self._recordings = {} + self._recordings_paths = {} + if isinstance(features, (str, Path)): self._recordings[features_name] = None self._recordings_paths[recordings_name] = recordings @@ -565,6 +634,10 @@ raise ValueError() def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): + if self._classes is None: + self._classes = {} + self._classes_paths = {} + if isinstance(classes, (str, Path)): self._classes[features_name] = None self._classes_paths[classes_name] = classes @@ -579,8 +652,12 @@ def add_enrollments( enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap], ): - if isinstance(features, (str, Path)): - self._enrollments[features_name] = None + if self._enrollments is None: + self._enrollments = {} + self._enrollments_paths = {} + + if isinstance(enrollments, (str, Path)): + self._enrollments[enrollments_name] = None self._enrollments_paths[enrollments_name] = enrollments elif isinstance(enrollments, EnrollmentMap): self._enrollments[enrollments_name] = enrollments @@ -593,7 +670,11 @@ def add_trials( trials_name: str, trials: Union[PathLike, TrialKey, TrialNdx, SparseTrialKey], ): - if isinstance(features, (str, Path)): + if self._trials is None: + self._trials = {} + self._trials_paths = {} + + if isinstance(trials, (str, Path)): self._trials[features_name] = None self._trials_paths[trials_name] = trials elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)): @@ -601,3 +682,220 @@ self._trials[trials_name] = trials + self._trials_paths[trials_name] = None + else: + raise ValueError() + + def remove_features(self, features_name: str): + if self._features_paths[features_name] is not None: + file_path = Path(self._features_paths[features_name]) + if file_path.is_file(): + file_path.unlink() + + del self._features[features_name] + del self._features_paths[features_name] + + def remove_recordings( + self, + recordings_name: str, + ): + if self._recordings_paths[recordings_name] is not None: + file_path = Path(self._recordings_paths[recordings_name]) + if file_path.is_file(): + file_path.unlink() + + del self._recordings[recordings_name] + del self._recordings_paths[recordings_name] + + def remove_classes(self, classes_name: str): + if self._classes_paths[classes_name] is not None: + file_path = Path(self._classes_paths[classes_name]) + if file_path.is_file(): + file_path.unlink() + + del self._classes[classes_name] + del self._classes_paths[classes_name] + + def remove_enrollments( + self, + enrollments_name: str, + ): + if self._enrollments_paths[enrollments_name] is not None: + file_path =
Path(self._enrollments_paths[enrollments_name]) + if file_path.is_file(): + file_path.unlink() + + del self._enrollments[enrollments_name] + del self._enrollments_paths[enrollments_name] + + def remove_trials( + self, + trials_name: str, + ): + if self._trials_paths[trials_name] is not None: + file_path = Path(self._trials_paths[trials_name]) + if file_path.is_file(): + file_path.unlink() + + del self._trials[trials_name] + del self._trials_paths[trials_name] + + def set_segments(self, segments: Union[PathLike, SegmentSet]): + if isinstance(segments, SegmentSet): + self._segments = segments + else: + self._segments_path = segments + + def clean(self): + rec_ids = self.segments().recording_ids() + for k, table in self.recordings(): + table = table.loc[table["id"].isin(rec_ids)].copy() + self._recordings[k] = RecordingSet(table) + + ids = self.segments()["id"].values + for k, table in self.features(): + table = table.loc[table["id"].isin(ids)].copy() + self._features[k] = FeatureSet(table) + + for k, table in self.classes(): + class_ids = self.segments()[k].unique() + table = table[table["id"].isin(class_ids)].copy() + self._classes[k] = ClassInfo(table) + + remove_keys = [] + for k, table in self.enrollments(): + table = table.loc[table["segmentid"].isin(ids)].copy() + if len(table) > 0: + self._enrollments[k] = EnrollmentMap(table) + else: + remove_keys.append(k) + + for k in remove_keys: + self.remove_enrollments(k) + + remove_keys = [] + for k, key in self.trials(): + keep_ids = [cur_id for cur_id in key.seg_set if cur_id in ids] + if keep_ids: + key = key.filter(key.model_set, keep_ids, keep=True) + self._trials[k] = key + else: + remove_keys.append(k) + + for k in remove_keys: + self.remove_trials(k) + + def _split_into_trials_and_cohort( + self, + segments: SegmentSet, + num_tar_trials: int, + num_trial_speakers: int, + seed: int, + ): + # select test speakers + rng = np.random.RandomState(seed=seed) + + spks = segments["speaker"].unique() + trial_spks = rng.choice(spks, size=(num_trial_speakers,), replace=False) + snorm_segments = SegmentSet(segments[~segments["speaker"].isin(trial_spks)]) + + trial_segments = segments[segments["speaker"].isin(trial_spks)] + # solution of 2nd degree eq. + # num_spks * n (n-1) /2 = num_trials + num_segs_per_spk = int( + math.ceil((1 + math.sqrt(1 + 8 * num_tar_trials // num_trial_speakers)) / 2) + ) + + n = num_trial_speakers * num_segs_per_spk + seg_ids = rng.choice(trial_segments["id"], size=(n,), replace=False) + trial_segments = SegmentSet(segments[segments["id"].isin(seg_ids)]) + seg_ids = trial_segments["id"].values + class_ids = trial_segments["speaker"].values + tar = np.zeros((n - 1, n), dtype=bool) + non = np.zeros((n - 1, n), dtype=bool) + + ntar = 0 + nnon = 0 + for i in range(n - 1): + for j in range(i + 1, n): + if class_ids[i] == class_ids[j]: + tar[i, j] = True + else: + non[i, j] = True + + logging.info("Got ntar=%d and nnon=%d", tar.sum(), non.sum()) + trials = TrialKey(seg_ids[:-1], seg_ids, tar, non) + df_enr = pd.DataFrame({"id": seg_ids[:-1], "segmentid": seg_ids[:-1]}) + enrollments = EnrollmentMap(df_enr) + return trials, enrollments, snorm_segments + + def split_into_trials_and_cohort( + self, + num_1k_tar_trials: int, + num_trial_speakers: int, + intra_gender: bool = True, + trials_name="trials_qmf", + seed=1123, + ): + """When training quality measure fusion in, e.g., VoxCeleb recipe. 
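        A worked check of the segment-count formula in
        _split_into_trials_and_cohort above, with made-up numbers:

            import math

            num_tar_trials, num_trial_speakers = 15000, 500
            # each speaker with n segments yields n*(n-1)/2 target pairs, so we
            # need the smallest n with num_spks * n*(n-1)/2 >= num_tar_trials
            n = int(math.ceil((1 + math.sqrt(1 + 8 * num_tar_trials // num_trial_speakers)) / 2))
            print(n)  # 9
            assert num_trial_speakers * n * (n - 1) // 2 >= num_tar_trials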
+ + def split_into_trials_and_cohort( + self, + num_1k_tar_trials: int, + num_trial_speakers: int, + intra_gender: bool = True, + trials_name="trials_qmf", + seed=1123, + ): + """Splits the dataset for training quality-measure fusion (QMF), e.g., in the VoxCeleb recipe. + + We split the data into 2 parts: + 1) used to calculate SV scores to train the fusion + 2) cohort used to calculate the S-Norm parameters used in the QMF. + + The trial list is stored in the current dataset. + A new dataset is created with only the cohort speakers. + + Args: + num_1k_tar_trials: number of target trials, in thousands. + num_trial_speakers: number of speakers used to create trials. + intra_gender: if True, no cross-gender trials are made. + trials_name: name under which the trial list is stored (default: trials_qmf). + seed: random seed for speaker and segment selection. + + Returns: + Dataset used for trials with trial list. + Dataset used for cohort. + """ + num_tar_trials = num_1k_tar_trials * 1000 + if intra_gender: + num_tar_trials = num_tar_trials // 2 + num_trial_speakers = num_trial_speakers // 2 + segments = self.segments() + segments_male = SegmentSet(segments[segments["gender"] == "m"]) + segments_female = SegmentSet(segments[segments["gender"] == "f"]) + trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( + segments_male, + num_tar_trials, + num_trial_speakers, + seed, + ) + ( + trials_female, + enroll_female, + cohort_female, + ) = self._split_into_trials_and_cohort( + segments_female, + num_tar_trials, + num_trial_speakers, + seed, + ) + trials = TrialKey.merge([trials_male, trials_female]) + enroll = EnrollmentMap.cat([enroll_male, enroll_female]) + cohort = SegmentSet.cat([cohort_male, cohort_female]) + else: + segments = self.segments() + trials, enroll, cohort = self._split_into_trials_and_cohort( + segments, + num_tar_trials, + num_trial_speakers, + seed, + ) + + dataset_trials = self.clone() + segments = self.segments() + trials_segments = SegmentSet(segments.loc[segments["id"].isin(trials.seg_set)]) + dataset_trials.set_segments(trials_segments) + dataset_trials.add_trials("trials", trials) + dataset_trials.add_enrollments("enrollments", enroll) + dataset_trials.clean() + + dataset_cohort = self.clone() + dataset_cohort.set_segments(cohort) + dataset_cohort.clean() + + return dataset_trials, dataset_cohort diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index 1852d25d..6aef5bb2 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -8,7 +8,7 @@ class SegmentSet(InfoTable): """Class to store information about a speech segment - Internally, it uses a pandas table. + Internally, it uses a pandas table. 
""" def __init__(self, df): @@ -29,7 +29,13 @@ def has_time_marks(self): def has_recording_ids(self): return "recording_id" in self.df - def recording_ids(self, ids): + def recording_ids(self, ids=None): + if ids is None: + if "recording_id" in self.df: + return self.df["recording_id"] + else: + return self.df["id"] + if "recording_id" in self.df: return self.df.loc[ids, "recording_id"] From c1d193abd8161a35017d316382b6025ef2c22db0 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 19 Jul 2023 16:36:06 -0400 Subject: [PATCH 104/154] finished experiments of models 2.0 in voxceleb/v2 --- egs/voxceleb/v1.2/run_001_prepare_data.sh | 18 +---- egs/voxceleb/v2/README.md | 60 +++++++++++++++ ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...baseplus_ecapatdnn512x3_stage1_v2.0_0.yaml | 59 +++++++++++++++ ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 2 +- ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 2 +- ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++++++++++++++ ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 73 +++++++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 59 +++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++++++++++++++ ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 73 +++++++++++++++++++ ...v2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml | 45 ++++++++++++ .../wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml | 44 +++++++++++ .../wavlmlarge12l_ecapatdnn512x3_v2.0.yaml | 45 ++++++++++++ .../conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml | 44 +++++++++++ ...wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...ig_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...fig_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ ...onfig_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ .../config_wavlmlarge_ecapatdnn512x3_v2.0.sh | 54 ++++++++++++++ hyp_utils/create_data_split_dirs.sh | 4 +- hyp_utils/create_data_split_links.sh | 6 +- hyp_utils/feats/make_evad.sh | 2 +- hyperion/bin/compute_energy_vad.py | 37 ++++++++-- hyperion/io/ark_data_writer.py | 20 +++-- hyperion/io/audio_reader.py | 12 ++- hyperion/io/data_rw_factory.py | 8 +- hyperion/io/data_writer.py | 36 ++++++++- hyperion/io/h5_data_writer.py | 19 +++-- hyperion/io/rw_specifiers.py | 47 ++++++++---- hyperion/utils/__init__.py | 1 + 34 files changed, 1281 insertions(+), 67 deletions(-) create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 
egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index c151e270..aef70e96 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -23,34 +23,24 @@ fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - # hyp_utils/conda_env.sh prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ --use-kaldi-ids \ --output-dir data/voxceleb1_test - #local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ --vox1-corpus-dir $voxceleb1_root \ --output-dir data/voxsrc22_dev - # local/prepare_voxsrc22_dev.py \ - # --vox1-corpus-dir $voxceleb1_root \ - # --voxsrc22-corpus-dir $voxsrc22_root \ - # --output-dir data/voxsrc22_dev - prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ - --vox1-corpus-dir $voxceleb1_root \ - --output-dir data/voxsrc22_test fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then -# local/prepare_voxsrc22_test.py \ -# --corpus-dir $voxsrc22_root \ -# --output-dir data/voxsrc22_test +# prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ +# --vox1-corpus-dir $voxceleb1_root \ +# --output-dir data/voxsrc22_test # fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then - # # split vox2 into 2 parts, for cohort and qmf training + # split vox2 into 2 parts, for cohort and qmf training split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train - #local/make_vox2_trials.py --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index c64a4b41..a005b6e8 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -78,6 +78,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 | | | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 | | | | | Cosine + QMF | 0.75 | 0.054 | 0.086 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.069 | 0.108 | +| | | | Cosine + AS-Norm | 0.86 | 0.067 | 0.108 | +| | | | Cosine + QMF | 0.77 | 0.066 | 0.105 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.057 | 0.085 | +| | | | Cosine + AS-Norm | 0.73 | 0.055 | 0.093 | +| | | | Cosine + QMF | 0.66 | 0.051 | 0.094 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.053 | 
0.080 | +| | | | Cosine + AS-Norm | 0.71 | 0.050 | 0.087 | +| | | | Cosine + QMF | 0.64 | 0.045 | 0.087 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.063 | 0.111 | +| | | | Cosine + AS-Norm | 0.68 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.63 | 0.048 | 0.071 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.14 | 0.074 | 0.107 | +| | | | Cosine + AS-Norm | 0.94 | 0.060 | 0.089 | +| | | | Cosine + QMF | 0.89 | 0.054 | 0.076 | ### VoxCeleb 1 Entire-Clean trial list @@ -86,6 +101,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 | | | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 | | | | | Cosine + QMF | 0.75 | 0.046 | 0.076 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.056 | 0.099 | +| | | | Cosine + AS-Norm | 0.86 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.82 | 0.050 | 0.085 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.049 | 0.088 | +| | | | Cosine + AS-Norm | 0.76 | 0.045 | 0.080 | +| | | | Cosine + QMF | 0.73 | 0.043 | 0.078 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.91 | 0.056 | 0.094 | +| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.086 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.050 | 0.086 | +| | | | Cosine + AS-Norm | 0.73 | 0.045 | 0.074 | +| | | | Cosine + QMF | 0.69 | 0.042 | 0.069 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.99 | 0.058 | 0.103 | +| | | | Cosine + AS-Norm | 0.87 | 0.052 | 0.090 | +| | | | Cosine + QMF | 0.83 | 0.050 | 0.085 | ### VoxCeleb 1 Hard-Clean trial list @@ -94,6 +124,21 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 | | | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 | | | | | Cosine + QMF | 1.56 | 0.096 | 0.155 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.88 | 0.122 | 0.200 | +| | | | Cosine + AS-Norm | 1.77 | 0.110 | 0.175 | +| | | | Cosine + QMF | 1.66 | 0.104 | 0.168 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.67 | 0.103 | 0.165 | +| | | | Cosine + AS-Norm | 1.54 | 0.093 | 0.152 | +| | | | Cosine + QMF | 1.45 | 0.089 | 0.145 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.78 | 0.106 | 0.174 | +| | | | Cosine + AS-Norm | 1.70 | 0.099 | 0.162 | +| | | | Cosine + QMF | 1.61 | 0.094 | 0.153 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.49 | 0.087 | 0.137 | +| | | | Cosine + AS-Norm | 1.29 | 0.074 | 0.117 | +| | | | Cosine + QMF | 1.22 | 0.069 | 0.111 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.84 | 0.107 | 0.172 | +| | | | Cosine + AS-Norm | 1.47 | 0.083 | 0.128 | +| | | | Cosine + QMF | 1.39 | 0.079 | 0.123 |
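The "Cosine + AS-Norm" rows in these tables apply adaptive score normalization against a cohort (do_snorm=true in the global configs below). A minimal numpy sketch of top-k adaptive S-Norm on precomputed cosine scores; this illustrates the technique and is not hyperion's exact implementation:

    import numpy as np

    def adapt_snorm(scores, enr_vs_coh, coh_vs_test, k=400):
        # scores: (n_enr, n_test), enr_vs_coh: (n_enr, n_coh), coh_vs_test: (n_coh, n_test)
        enr_top = np.sort(enr_vs_coh, axis=1)[:, -k:]    # k best cohort scores per enrollment
        mu_e = enr_top.mean(axis=1, keepdims=True)
        s_e = enr_top.std(axis=1, keepdims=True)
        test_top = np.sort(coh_vs_test, axis=0)[-k:, :]  # k best cohort scores per test segment
        mu_t = test_top.mean(axis=0, keepdims=True)
        s_t = test_top.std(axis=0, keepdims=True)
        return 0.5 * ((scores - mu_e) / s_e + (scores - mu_t) / s_t)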
### VoxSRC2022 dev @@ -102,3 +147,18 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | | | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | | | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.82 | 0.183 | 0.286 | +| | | | Cosine + AS-Norm | 2.69 | 0.168 | 0.265 | +| | | | Cosine + QMF | 2.52 | 0.158 | 0.252 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.65 | 0.176 | 0.289 | +| | | | Cosine + AS-Norm | 2.55 | 0.171 | 0.292 | +| | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | +| | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | +| | | | Cosine + QMF | 2.42 | 0.144 | 0.231 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | +| | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | +| | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.83 | 0.175 | 0.276 | +| | | | Cosine + AS-Norm | 2.31 | 0.149 | 0.244 | +| | | | Cosine + QMF | 2.22 | 0.137 | 0.229 | diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ad991124 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file
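The trainer block in this and the following stage-1 configs uses an exp_lr schedule: warm up for warmup_steps, hold the base lr for hold_steps, then decay by decay_rate every decay_steps optimizer steps with a min_lr floor. A rough sketch of that reading; the warmup shape and the continuous exponent are assumptions, not hyperion's exact scheduler:

    def exp_lr(step, lr=0.4, decay_rate=0.5, decay_steps=7500,
               hold_steps=2600, warmup_steps=2600, min_lr=1e-6):
        # linear warmup, then hold, then exponential decay with a floor
        if step < warmup_steps:
            return lr * step / warmup_steps
        t = step - warmup_steps
        if t < hold_steps:
            return lr
        return max(min_lr, lr * decay_rate ** ((t - hold_steps) / decay_steps))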
diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..254ff796 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..52be6db5 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml new file mode 100644 index 00000000..ebeedde6 --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0_0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + 
data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 4850 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-4 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml index eed0ad1f..69a8322b 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -58,6 +58,6 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 30 + epochs: 8 eff_batch_size: 512 train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml index d66d6877..3443591a 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -68,6 +68,6 @@ trainer: update_lr_on_opt_step: true use_amp: true log_interval: 1000 - epochs: 8 + epochs: 4 eff_batch_size: 256 train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..abe5da6e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..7287188c --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ 
-0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..2addaa1e --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + 
val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + 
margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ b/egs/voxceleb/v2/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + 
hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + 
+nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 
+nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..1b276bcd --- /dev/null +++ b/egs/voxceleb/v2/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmlarge + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/hyp_utils/create_data_split_dirs.sh b/hyp_utils/create_data_split_dirs.sh index 06c30779..b8aad6c8 100755 --- a/hyp_utils/create_data_split_dirs.sh +++ b/hyp_utils/create_data_split_dirs.sh @@ -6,7 +6,7 @@ storage_name=$(date +'%m_%d_%H_%M') -echo "$0 $@" # Print the command line for logging + if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; @@ -15,6 +15,7 @@ if [ $# -ne 3 ]; then echo "Usage: $0 <output_dir> <storage_dir> <nodes>" echo "$0 exp/vad_dir $USER/hyp-data/voxceleb/v1/vad/storage b0" fi + output_dir=$1 storage_dir=$2 nodes=$3 @@ -22,6 +23,7 @@ nodes=$3 link_dir=$output_dir/storage if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $link_dir ]; then + echo "$0 $@" # Print the command line for logging echo "Prepare to distribute data over multiple $nodes nodes" dir_name=$storage_dir/$storage_name/storage if [ "$nodes" == "b0" ];then diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh index fb5b8ca0..8416742e 100755 --- a/hyp_utils/create_data_split_links.sh +++ b/hyp_utils/create_data_split_links.sh @@ -6,11 +6,11 @@ storage_name=$(date +'%m_%d_%H_%M') -echo "$0 $@" # Print the command line for logging -if [ $# -ne 3 ]; then - echo "Usage: $0 < " +if [ $# -ne 2 ]; then + echo "Usage: $0 <output_file_pattern> <nj>" echo "$0 exp/vad_dir/vad.JOB.ark 40" fi +echo "$0 $@" # Print the command line for logging output_file_pattern=$1 nj=$2 diff --git a/hyp_utils/feats/make_evad.sh b/hyp_utils/feats/make_evad.sh index 373fc4a6..16ddbf74 100755 --- a/hyp_utils/feats/make_evad.sh +++ b/hyp_utils/feats/make_evad.sh @@ -87,7 +87,7 @@ fi $cmd JOB=1:$nj $logdir/make_vad_${name}.JOB.log \ hyp_utils/conda_env.sh \ compute_energy_vad.py --cfg $vad_config $opt_args \ - --input $scp --output ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ + --recordings-file $scp --output-spec ark,scp:$vaddir/vad_$name.JOB.ark,$vaddir/vad_$name.JOB.scp \ --part-idx JOB --num-parts $nj || exit 1 # concatenate the .scp files together. diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index e9773fff..9d50388c 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -13,19 +13,31 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.feats import EnergyVAD -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -def compute_vad(input_path, output_path, write_num_frames, **kwargs): +def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): vad_args = EnergyVAD.filter_args(**kwargs) vad = EnergyVAD(**vad_args) input_args = AR.filter_args(**kwargs) - reader = AR(input_path, **input_args) + reader = AR(recordings_file, **input_args) - writer = DWF.create(output_path) + metadata_columns = [ + "frame_shift", + "frame_length", + "num_frames", + "num_speech_frames", + "prob_speech", + ] + + writer = DWF.create(output_spec, metadata_columns=metadata_columns) if write_num_frames is not None: f_num_frames = open(write_num_frames, "w") @@ -39,6 +51,7 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): rtf = vad.frame_shift * y.shape[0] / dt num_speech_frames = np.sum(y) prob_speech = num_speech_frames / y.shape[0] * 100 + logging.info( "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. 
real-time-factor=%.2f", key, @@ -48,7 +61,14 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): dt, rtf, ) - writer.write([key], [y]) + metadata = { + "frame_shift": vad.frame_shift, + "frame_length": vad.frame_length, + "num_frames": y.shape[0], + "num_speech_frames": num_speech_frames, + "prob_speech": prob_speech, + } + writer.write([key], [y], metadata) if write_num_frames is not None: f_num_frames.write("%s %d\n" % (key, y.shape[0])) @@ -63,9 +83,10 @@ def compute_vad(input_path, output_path, write_num_frames, **kwargs): parser = ArgumentParser(description="Compute Kaldi Energy VAD") parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) - parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument("--write-num-frames", default=None) + parser.add_argument("--write-stats", default=None) AR.add_class_args(parser) EnergyVAD.add_class_args(parser) diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 6adf78b2..26f77112 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -3,10 +3,10 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict import numpy as np - +import pandas as pd from ..hyp_defs import float_save from ..utils.kaldi_io_funcs import init_kaldi_output_stream, is_token, write_token from ..utils.kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix @@ -46,7 +46,10 @@ def __init__( self.f = open(archive_path, "w") if script_path is not None and not self.script_is_scp: - row = self.script_sep.join(["id", "storage_path", "storage_byte"]) + columns = ["id", "storage_path", "storage_byte"] + if self.metadata_columns is not None: + columns += self.metadata_columns + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") def __exit__(self, exc_type, exc_value, traceback): @@ -97,6 +100,7 @@ def write( self, keys: Union[str, List[str], np.array], data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, ): """Writes data to file. @@ -107,9 +111,7 @@ def write( it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. 
""" - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -125,7 +127,11 @@ def write( if self.script_is_scp: self.f_script.write(f"{key_i} {self.archive_path}:{pos}\n") else: - row = self.script_sep.join([key_i, self.archive_path, str(pos)]) + columns = [key_i, str(self.archive_path), str(pos)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") if self._flush: diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 1052ce8c..6c152cc5 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -346,7 +346,9 @@ def read(self, num_records: int = 0, time_offset: float = 0, time_durs: float = key = segment["id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: - key, file_path = self.recordings.iloc[self.cur_item] + segment = self.recordings.iloc[self.cur_item] + key = segment["id"] + file_path = segment["storage_path"] x_i, fs_i = self.read_wavspecifier( file_path, self.wav_scale, offset_i, dur_i ) @@ -397,7 +399,8 @@ def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args @@ -423,7 +426,7 @@ def read( Args: keys: List of recording/segment_ids names. time_offset: float or float list with time-offsets - time_durs: float or float list with durations + time_durs: float or float list with durations Returns: data: List of waveforms @@ -527,7 +530,8 @@ def add_class_args(parser, prefix: Optional[str] = None): ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index b56e8c27..092f5549 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -30,7 +30,10 @@ class DataWriterFactory(object): @staticmethod def create( - wspecifier: PathLike, compress: bool = False, compression_method: str = "auto" + wspecifier: PathLike, + compress: bool = False, + compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, ): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) @@ -47,6 +50,7 @@ def create( flush=wspecifier.flush, compress=compress, compression_method=compression_method, + metadata_columns=metadata_columns, ) else: return ADW( @@ -56,6 +60,7 @@ def create( flush=wspecifier.flush, compress=compress, compression_method=compression_method, + metadata_columns=metadata_columns, ) @staticmethod @@ -76,7 +81,6 @@ def add_class_args(parser, prefix: Optional[PathLike] = None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='data writer options') class SequentialDataReaderFactory(object): diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py index 8adbf87a..ff35ef2a 100644 --- a/hyperion/io/data_writer.py +++ b/hyperion/io/data_writer.py @@ -5,9 +5,10 @@ import os from abc import ABCMeta, abstractmethod -from typing import Union, Optional, List +from typing import Union, Optional, List, Dict from pathlib import Path import 
numpy as np +import pandas as pd from ..utils import PathLike @@ -34,12 +35,14 @@ def __init__( flush: bool = False, compress: bool = False, compression_method: str = "auto", + metadata_columns: Optional[List[str]] = None, ): self.archive_path = Path(archive_path) self.script_path = Path(script_path) if script_path is not None else None self._flush = flush self.compress = compress self.compression_method = compression_method + self.metadata_columns = metadata_columns archive_dir = self.archive_path.parent archive_dir.mkdir(exist_ok=True, parents=True) @@ -56,9 +59,7 @@ def __init__( self.f_script = open(self.script_path, "w") else: self.script_sep = "," if script_ext == ".csv" else "\t" - self.f_script = open(self.script_path, "w", "utf-8") - row = self.script_sep.join(["id", "storage_path"]) - self.f_script.write(f"{row}\n") + self.f_script = open(self.script_path, "w", encoding="utf-8") def __enter__(self): """Function required when entering constructions of type @@ -87,11 +88,37 @@ def flush(self): """Flushes the file""" pass + def standardize_write_args( + self, + keys: Union[str, List[str], np.array], + data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, + ): + if isinstance(keys, str): + keys = [keys] + data = [data] + + if metadata is not None: + if isinstance(metadata, pd.DataFrame): + metadata = metadata.to_dict(orient="list") + + metadata_list = [] + for c in self.metadata_columns: + m_c = metadata[c] + if not isinstance(m_c, (list, np.ndarray)): + m_c = [m_c] + metadata_list.append(m_c) + + metadata = metadata_list + + return keys, data, metadata + @abstractmethod def write( self, keys: Union[str, List[str], np.array], data: Union[np.array, List[np.array]], + metadata: Optional[Union[pd.DataFrame, Dict]] = None, ): """Writes data to file. @@ -101,5 +128,6 @@ def write( If all the matrices have the same dimension it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. + metadata: dictionary/DataFrame with metadata """ pass
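With metadata_columns set, each write() call appends the per-key metadata values as extra columns after the storage fields of the .csv script file. A sketch of the intended behavior for the Ark writer; the values, paths, and byte offset below are made up:

    # writer = DWF.create("ark,csv:vad.1.ark,vad.1.csv",
    #                     metadata_columns=["num_frames", "prob_speech"])
    writer.write(["utt1"], [y], {"num_frames": 998, "prob_speech": 63.2})
    # vad.1.csv header: id,storage_path,storage_byte,num_frames,prob_speech
    # appended row:     utt1,vad.1.ark,12345,998,63.2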
""" - if isinstance(keys, str): - keys = [keys] - data = [data] + keys, data, metadata = self.standardize_write_args(keys, data, metadata) for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -115,7 +118,11 @@ def write( if self.script_is_scp: self.f_script.write(f"{key_i} {self.archive_path}\n") else: - row = self.script_sep.join([key_i, self.archive_path]) + columns = [key_i, str(self.archive_path)] + if metadata is not None: + metadata_i = [str(m[i]) for m in metadata] + columns += metadata_i + row = self.script_sep.join(columns) self.f_script.write(f"{row}\n") if self._flush: diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 37f579b4..93123247 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -7,6 +7,8 @@ import re from enum import Enum +from pathlib import Path +import pandas as pd class ArchiveType(Enum): @@ -174,6 +176,11 @@ def create(cls, wspecifier): archive_type = ArchiveType.AUDIO archive = archives[cur_archive] cur_archive += 1 + elif option == "csv": + assert script is None, "Repeated csv in wspecifier %s" % script + assert len(archives) > cur_archive + script = archives[cur_archive] + cur_archive += 1 elif option == "scp": assert script is None, "Repeated scp in wspecifier %s" % script assert len(archives) > cur_archive @@ -332,7 +339,7 @@ def create(cls, rspecifier): assert len(archives) == 1 spec_type = None - archive = archives[0] + archive = Path(archives[0]) archive_type = None once = False is_sorted = False @@ -361,6 +368,9 @@ def create(cls, rspecifier): assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.RTTM + elif option == "csv": + assert spec_type is None + spec_type = RSpecType.SCRIPT elif option == "scp": assert spec_type is None spec_type = RSpecType.SCRIPT @@ -374,24 +384,31 @@ def create(cls, rspecifier): assert spec_type is not None, "Wrong wspecifier options %s" % fields[0] if spec_type == RSpecType.SCRIPT: - with open(archive, "r") as f: - scp_f2 = f.readline().strip().split(" ")[1] - if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + if archive.suffix == ".csv": + df = pd.read_csv(archive, nrows=2) + storage_path = df["storage_path"].values[0] + if re.match(r".*\.h5$", scp_f2) is not None: archive_type = ArchiveType.H5 - elif re.match(r".*\.ark:.*$", scp_f2) is not None: + elif re.match(r".*\.ark$", scp_f2) is not None: archive_type = ArchiveType.ARK - elif ( - re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) is not None - ): + elif re.match(r".*[cvg]$", scp_f2) is not None: archive_type = ArchiveType.AUDIO else: - archive_type = ArchiveType.ARK - - # .split('[')[0].split(':') - # if len(scp) == 1: - # archive_type = ArchiveType.H5 - # else: - # archive_type = ArchiveType.ARK + raise ValueError(f"Unknown format for {storage_path}") + else: + with open(archive, "r") as f: + scp_f2 = f.readline().strip().split(" ")[1] + if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: + archive_type = ArchiveType.H5 + elif re.match(r".*\.ark:.*$", scp_f2) is not None: + archive_type = ArchiveType.ARK + elif ( + re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) + is not None + ): + archive_type = ArchiveType.AUDIO + else: + archive_type = ArchiveType.ARK if archive_type == ArchiveType.ARK: for option in options: diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index 51b476aa..e8ad5056 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,6 +3,7 @@ Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) """ +from .info_table import InfoTable from .class_info import ClassInfo from .dataset import Dataset from .enrollment_map import EnrollmentMap From 89efce43a3c25b1fc3284afb84823af803d92add Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 4 Sep 2023 18:59:26 -0400 Subject: [PATCH 105/154] voxceleb v1.2 works up to snorm backend --- egs/sre19-av-v/v0.1/steps_be/face_be_utils.py | 9 +- .../v1/steps_be/eval-tel-be-snorm-v2.py | 2 +- egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py | 2 +- .../v1/steps_be/train-tel-be-knn-v1.py | 2 +- .../v1/steps_be/train-tel-be-knn-v3.py | 2 +- .../v1/steps_be/train-tel-be-knn-v4.py | 2 +- .../adv.v2/steps_backend/eval-be-cos-Nvs1.py | 2 +- .../adv.v2/steps_backend/eval-be-cos.py | 2 +- egs/voxceleb/v1.1/conf/vad_16k.yaml | 1 + ...rain_ecapatdnn2048x4_xvec_stage1_v3.0.yaml | 84 +-- ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 16 +- ...train_ecapatdnn512x3_xvec_stage1_v3.0.yaml | 89 +-- ...train_ecapatdnn512x3_xvec_stage2_v3.0.yaml | 30 +- egs/voxceleb/v1.2/conf/vad_16k.yaml | 3 +- egs/voxceleb/v1.2/run_002_compute_evad.sh | 66 +++ .../v1.2/run_003_prepare_noises_rirs.sh | 102 ++++ .../v1.2/run_004_prepare_xvec_train_data.sh | 76 +++ egs/voxceleb/v1.2/run_005_train_xvector.sh | 78 +++ egs/voxceleb/v1.2/run_006_extract_xvectors.sh | 103 ++++ egs/voxceleb/v1/steps_be/eval_be_cos.py | 2 +- egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py | 2 +- egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py | 2 +- hyp_utils/create_audios_split_links.sh | 27 + hyp_utils/create_data_split_links.sh | 2 - .../xvectors/extract_wav2vec2xvectors.sh | 8 +- .../xvectors/extract_xvectors_from_wav.sh | 10 +- .../make_babble_noise_for_nnet_train.sh | 22 +- .../xvectors/pack_rirs_for_nnet_train.sh | 9 - .../preprocess_audios_for_nnet_train.sh | 8 +- hyperion/bin/eval_cosine_scoring_backend.py | 200 +++++++ .../eval_cosine_scoring_backend_with_qmf.py | 472 +++++++++++++++ hyperion/bin/eval_verification_metrics.py | 96 +++ hyperion/bin/eval_xvec_logits_from_wav.py | 20 +- hyperion/bin/extract_wav2vec2xvectors.py | 41 +- hyperion/bin/extract_wav2xvectors.py | 333 +++++++++++ hyperion/bin/extract_xvectors_from_feats.py | 20 +- hyperion/bin/extract_xvectors_from_wav.py | 26 +- .../extract_xvectors_slidwin_from_feats.py | 10 +- .../bin/extract_xvectors_slidwin_from_wav.py | 10 +- hyperion/bin/finetune_wav2xvector.py | 228 ++++++++ .../generate_adv_attacks_xvector_classif.py | 8 +- hyperion/bin/hyperion_dataset.py | 406 ++++++++++++- hyperion/bin/hyperion_tables.py | 33 +- hyperion/bin/make_babble_noise_audio_files.py | 102 ++-- hyperion/bin/make_wav2xvector.py | 91 +++ hyperion/bin/merge_scores.py | 99 ++++ hyperion/bin/pack_wav_rirs.py | 17 +- hyperion/bin/plot_embedding_tsne_per_class.py | 11 +- hyperion/bin/prepare_data.py | 9 +- hyperion/bin/preprocess_audio_files.py | 163 +++--- hyperion/bin/train_wav2vec2xvector.py | 19 +- hyperion/bin/train_wav2xvector.py | 196 +++++++ hyperion/data_prep/__init__.py | 2 + hyperion/data_prep/data_prep.py | 3 +- hyperion/data_prep/musan.py | 107 ++++ hyperion/data_prep/rirs.py | 103 ++++ hyperion/data_prep/voxceleb1.py | 18 +- hyperion/data_prep/voxceleb2.py | 23 +- hyperion/data_prep/voxsrc22.py | 49 +- hyperion/helpers/trial_data_reader.py | 2 +- hyperion/helpers/vector_class_reader.py | 2 +- hyperion/io/ark_data_reader.py | 6 +- hyperion/io/audio_reader.py | 18 +- hyperion/io/audio_writer.py | 71 ++- hyperion/io/hyp_data_reader.py | 5 +- hyperion/io/packed_audio_reader.py | 6 +- hyperion/io/rw_specifiers.py | 6 
+- hyperion/np/augment/noise_augment.py | 26 +- hyperion/np/augment/reverb_augment.py | 15 +- hyperion/np/augment/speech_augment.py | 2 +- hyperion/np/augment/speed_augment.py | 22 +- .../classifiers/binary_logistic_regression.py | 2 +- hyperion/np/classifiers/greedy_fusion.py | 4 +- hyperion/np/classifiers/linear_gbe.py | 8 +- hyperion/np/classifiers/linear_gbe_up.py | 9 +- hyperion/np/classifiers/linear_svmc.py | 8 +- .../np/classifiers/logistic_regression.py | 6 +- hyperion/np/classifiers/q_scoring_homo_gbe.py | 2 +- hyperion/np/classifiers/svmc.py | 4 +- hyperion/np/feats/energy_vad.py | 40 +- hyperion/np/feats/mfcc.py | 57 +- hyperion/np/metrics/__init__.py | 7 +- hyperion/np/metrics/cllr.py | 2 +- hyperion/np/metrics/utils.py | 2 +- hyperion/np/metrics/verification_evaluator.py | 78 ++- hyperion/np/pdfs/core/normal.py | 21 +- hyperion/np/pdfs/core/normal_diag_cov.py | 11 +- hyperion/np/pdfs/hmm/hmm.py | 4 +- hyperion/np/pdfs/jfa/jfa_total.py | 9 +- .../np/pdfs/mixtures/exp_family_mixture.py | 2 +- hyperion/np/pdfs/mixtures/gmm.py | 24 +- hyperion/np/pdfs/mixtures/gmm_diag_cov.py | 13 +- .../np/pdfs/mixtures/gmm_tied_diag_cov.py | 13 +- hyperion/np/pdfs/plda/frplda.py | 4 +- hyperion/np/pdfs/plda/plda.py | 4 +- hyperion/np/pdfs/plda/splda.py | 4 +- hyperion/np/transforms/skl_tsne.py | 4 +- hyperion/torch/data/audio_dataset.py | 12 + hyperion/torch/layers/audio_feats_factory.py | 2 +- hyperion/torch/models/__init__.py | 18 +- hyperion/torch/models/plda/splda.py | 2 +- .../models/wav2xvectors/hf_wav2xvector.py | 4 +- .../wav2xvectors/wav2resnet1d_xvector.py | 18 + .../models/wav2xvectors/wav2resnet_xvector.py | 18 + .../torch/models/wav2xvectors/wav2xvector.py | 113 +++- hyperion/torch/narchs/audio_feats_mvn.py | 4 + hyperion/torch/torch_model.py | 19 +- hyperion/utils/class_info.py | 16 + hyperion/utils/dataset.py | 552 +++++++++++++----- hyperion/utils/fold_list.py | 2 +- hyperion/utils/info_table.py | 72 ++- hyperion/utils/{math.py => math_funcs.py} | 22 +- hyperion/utils/plotting.py | 3 +- hyperion/utils/scp_list.py | 2 +- hyperion/utils/segment_set.py | 42 +- hyperion/utils/sparse_trial_key.py | 18 +- hyperion/utils/sparse_trial_scores.py | 124 +++- hyperion/utils/train_val_eval_list.py | 2 +- hyperion/utils/trial_key.py | 16 +- hyperion/utils/trial_ndx.py | 84 ++- hyperion/utils/trial_scores.py | 86 ++- hyperion/utils/utt2info.py | 2 +- 122 files changed, 4509 insertions(+), 945 deletions(-) create mode 100755 egs/voxceleb/v1.2/run_002_compute_evad.sh create mode 100755 egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh create mode 100755 egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh create mode 100755 egs/voxceleb/v1.2/run_005_train_xvector.sh create mode 100755 egs/voxceleb/v1.2/run_006_extract_xvectors.sh create mode 100755 hyp_utils/create_audios_split_links.sh create mode 100755 hyperion/bin/eval_cosine_scoring_backend.py create mode 100755 hyperion/bin/eval_cosine_scoring_backend_with_qmf.py create mode 100755 hyperion/bin/eval_verification_metrics.py create mode 100755 hyperion/bin/extract_wav2xvectors.py create mode 100755 hyperion/bin/finetune_wav2xvector.py mode change 100644 => 100755 hyperion/bin/hyperion_dataset.py create mode 100755 hyperion/bin/make_wav2xvector.py create mode 100755 hyperion/bin/merge_scores.py create mode 100755 hyperion/bin/train_wav2xvector.py create mode 100644 hyperion/data_prep/musan.py create mode 100644 hyperion/data_prep/rirs.py rename hyperion/utils/{math.py => math_funcs.py} (93%) diff --git 
a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py index 14e3fc20..b6252df7 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py @@ -2,15 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - import logging import numpy as np from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import softmax +from hyperion.utils.math_funcs import softmax from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import LNorm from hyperion.np.clustering import AHC @@ -23,9 +19,6 @@ def lnorm(x): def cosine_scr(x1, x2): - # t = LNorm() - # x1 = t.predict(x1) - # x2 = t.predict(x2) x1 = lnorm(x1) x2 = lnorm(x2) return np.dot(x1, x2.T) diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py index 907509fd..c9657a66 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py @@ -16,7 +16,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores from hyperion.helpers import TrialDataReader as TDR -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm as SNorm diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py index b661cbde..24ef731b 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py index 8e7715e0..bdef3fc3 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py index 12f1725b..51795676 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py index 234f966c..79c1cd6f 100755 --- 
a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py @@ -17,7 +17,7 @@ from hyperion.np.transforms import TransformList, PCA, LDA, LNorm from hyperion.helpers import PLDAFactory as F from hyperion.utils.utt2info import Utt2Info -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from numpy.linalg import matrix_rank, svd diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py index 85e82149..48094d0f 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py @@ -15,7 +15,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.np.pdfs import PLDA from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py index d5cd6a55..49720cb5 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import TrialNdx, TrialScores -from hyperion.utils.math import cosine_scoring +from hyperion.utils.math_funcs import cosine_scoring from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.np.transforms import TransformList diff --git a/egs/voxceleb/v1.1/conf/vad_16k.yaml b/egs/voxceleb/v1.1/conf/vad_16k.yaml index 5fb0111c..a8d7b4d4 100644 --- a/egs/voxceleb/v1.1/conf/vad_16k.yaml +++ b/egs/voxceleb/v1.1/conf/vad_16k.yaml @@ -6,3 +6,4 @@ vad_energy_threshold: 5.5 vad_energy_mean_scale: 0.5 vad_proportion_threshold: 0.12 vad_frames_context: 2 +wav_scale: 32767 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml index 1633f4a2..2cf31713 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -29,48 +29,50 @@ data: min_chunk_length: 2.0 data_loader: num_workers: 8 -feats: fbank80_specaug1_stmn_16k.yaml -model: - resnet_enc: - in_feats: 80 - in_conv_channels: 2048 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - - 1 - resb_channels: - - 2048 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - - 5 - resb_strides: - - 1 - res2net_width_factor: 1 - res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 4096 - norm_before: false + +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 
dropout_rate: 0.2 - hid_act: swish - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 30.0 - margin: 0.2 - margin_warmup_epochs: 5.0 - dropout_rate: 0.2 - norm_before: false + norm_before: false trainer: optim: opt_type: adam diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml index 877736b3..21f0db8b 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -37,15 +37,15 @@ data: num_hard_prototypes: 8 data_loader: num_workers: 8 -feats: fbank80_stmn_16k.yaml model: - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 0 - intertop_margin: 0.1 - resnet_enc: - override_dropouts: true - dropout_rate: 0.25 + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 trainer: optim: opt_type: sgd diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml index f15d453d..03a7f736 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -29,47 +29,48 @@ data: min_chunk_length: 2.0 data_loader: num_workers: 8 -feats: fbank80_specaug1_stmn_16k.yaml -model: - resnet_enc: - in_feats: 80 - in_conv_channels: 512 - in_kernel_size: 5 - in_stride: 1 - resb_type: seres2bn - resb_repeats: - - 1 - - 1 - - 1 - resb_channels: - - 512 - resb_kernel_sizes: - - 3 - resb_dilations: - - 2 - - 3 - - 4 - resb_strides: - - 1 - res2net_width_factor: 1 - res2net_scale: 8 - se_r: 4 - multilayer: true - multilayer_concat: true - endpoint_channels: 1536 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 norm_before: false - dropout_rate: 0.002 hid_act: swish - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - cos_scale: 30.0 - margin: 0.2 - margin_warmup_epochs: 5.0 - dropout_rate: 0.0 - norm_before: false - hid_act: swish trainer: optim: opt_type: adam @@ -91,3 +92,5 @@ trainer: log_interval: 1000 epochs: 40 eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml index 45e55d97..9788bb7c 
100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,31 +21,31 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: num_workers: 8 -feats: fbank80_stmn_16k.yaml model: - cos_scale: 30.0 - margin: 0.3 - margin_warmup_epochs: 0 - intertop_margin: 0.1 - resnet_enc: - override_dropouts: true - dropout_rate: 0. + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. trainer: optim: opt_type: sgd @@ -67,3 +67,5 @@ trainer: swa_start: 31 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/vad_16k.yaml b/egs/voxceleb/v1.2/conf/vad_16k.yaml index 5fb0111c..e5a6bb82 100644 --- a/egs/voxceleb/v1.2/conf/vad_16k.yaml +++ b/egs/voxceleb/v1.2/conf/vad_16k.yaml @@ -2,7 +2,8 @@ sample_frequency: 16000 frame_shift: 10 frame_length: 25 snip_edges: false -vad_energy_threshold: 5.5 +vad_energy_threshold: -4.89 vad_energy_mean_scale: 0.5 vad_proportion_threshold: 0.12 vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/voxceleb/v1.2/run_002_compute_evad.sh b/egs/voxceleb/v1.2/run_002_compute_evad.sh new file mode 100755 index 00000000..e7593df2 --- /dev/null +++ b/egs/voxceleb/v1.2/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + compute_energy_vad.py --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion_tables.py cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion_dataset.py add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..aed1dae4 --- /dev/null +++ b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh
+
+# We prepare the noise files and RIR for online speech augmentation
+if [ $stage -le 1 ]; then
+  for name in noise music speech
+  do
+    prepare_data.py musan \
+      --corpus-dir $musan_root \
+      --subset $name \
+      --output-dir data/musan_$name
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # # Prepare to distribute data over multiple machines
+  # # This only does something at CLSP grid
+  # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes
+
+  for name in musan_noise musan_music
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_proc_audio
+    output_dir=exp/proc_audio/$name
+    $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \
+      hyp_utils/conda_env.sh \
+      preprocess_audio_files.py \
+      --audio-format flac \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_dir/recordings.JOB.csv
+
+    hyperion_tables.py cat \
+      --table-type recordings \
+      --output-file $output_dir/recordings.csv --num-tables $nj
+    hyperion_dataset.py set_recordings \
+      --dataset $input_data_dir \
+      --recordings-file $output_dir/recordings.csv \
+      --output-dataset $output_data_dir
+
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  # Create Babble noise from MUSAN speech files
+  for name in musan_speech
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_babble
+    output_dir=exp/proc_audio/${name}_babble
+    $train_cmd $output_dir/log/make_babble_noise_${name}.log \
+      hyp_utils/conda_env.sh \
+      make_babble_noise_audio_files.py \
+      --audio-format flac \
+      --min-spks 3 --max-spks 10 --num-reuses 5 \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_data_dir/recordings.csv
+    hyperion_dataset.py make_from_recordings \
+      --dataset $output_data_dir \
+      --recordings-file $output_data_dir/recordings.csv
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  if [ ! -d "RIRS_NOISES" ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+  prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom
+  prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom
+  prepare_data.py rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real
+  for rirs in rirs_smallroom rirs_mediumroom rirs_real
+  do
+    output_dir=exp/rirs/$rirs
+    data_dir=data/$rirs
+    $train_cmd $output_dir/log/pack_rirs_${rirs}.log \
+      hyp_utils/conda_env.sh \
+      pack_wav_rirs.py --input $data_dir/recordings.csv \
+      --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1;
+    hyperion_dataset.py add_features --dataset $data_dir \
+      --features-name rirs --features-file $output_dir/rirs.csv
+
+  done
+fi
+
diff --git a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh
new file mode 100755
index 00000000..7649ff22
--- /dev/null
+++ b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright
+#              2020   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+nodes=b1
+nj=40
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + preprocess_audio_files.py \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion_tables.py cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion_dataset.py set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion_dataset.py remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion_dataset.py remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion_dataset.py split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v1.2/run_005_train_xvector.sh b/egs/voxceleb/v1.2/run_005_train_xvector.sh new file mode 100755 index 00000000..d2f31ea1 --- /dev/null +++ b/egs/voxceleb/v1.2/run_005_train_xvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_wav2xvector.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2xvector.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh new file mode 100755 index 00000000..09b8c8e9 --- /dev/null +++ b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion_tables.py cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? 
$num_segs:100))
+    if [ -n "$vad_config" ];then
+      vad_args="--vad csv:data/$name/vad.csv"
+    fi
+    output_dir=$xvector_dir/$name
+    echo "Extracting x-vectors for $name"
+    $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \
+      hyp_utils/conda_env.sh --num-gpus $num_gpus \
+      extract_wav2xvectors.py ${xvec_args} ${vad_args} \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file data/$name/recordings.csv \
+      --model-path $nnet \
+      --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv
+    hyperion_tables.py cat \
+      --table-type features \
+      --output-file $output_dir/xvector.csv --num-tables $nj
+
+  done
+fi
+
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos.py b/egs/voxceleb/v1/steps_be/eval_be_cos.py
index 1f9978ee..a9bc03d1 100755
--- a/egs/voxceleb/v1/steps_be/eval_be_cos.py
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos.py
@@ -20,7 +20,7 @@
 from hyperion.hyp_defs import float_cpu, config_logger
 from hyperion.utils.list_utils import ismember
 from hyperion.utils import TrialNdx, TrialScores
-from hyperion.utils.math import cosine_scoring
+from hyperion.utils.math_funcs import cosine_scoring
 from hyperion.helpers import TrialDataReader as TDR
 from hyperion.helpers import PLDAFactory as F
 from hyperion.np.transforms import TransformList
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py
index 7034126a..bf66d72b 100755
--- a/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos_qmf.py
@@ -19,7 +19,7 @@
 from hyperion.hyp_defs import float_cpu, config_logger
 from hyperion.utils import TrialNdx, TrialScores, Utt2Info
-from hyperion.utils.math import cosine_scoring
+from hyperion.utils.math_funcs import cosine_scoring
 from hyperion.np.pdfs import PLDA
 from hyperion.utils.list_utils import ismember
 from hyperion.helpers import TrialDataReader as TDR
diff --git a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py
index dad89ced..0eca769d 100755
--- a/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py
+++ b/egs/voxceleb/v1/steps_be/eval_be_cos_snorm.py
@@ -20,7 +20,7 @@
 from hyperion.hyp_defs import float_cpu, config_logger
 from hyperion.utils.list_utils import ismember
 from hyperion.utils import TrialNdx, TrialScores
-from hyperion.utils.math import cosine_scoring
+from hyperion.utils.math_funcs import cosine_scoring
 from hyperion.helpers import TrialDataReader as TDR
 from hyperion.helpers import PLDAFactory as F
 from hyperion.np.transforms import TransformList
diff --git a/hyp_utils/create_audios_split_links.sh b/hyp_utils/create_audios_split_links.sh
new file mode 100755
index 00000000..7125a2c4
--- /dev/null
+++ b/hyp_utils/create_audios_split_links.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright
+#              2023   Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+# Creates links to distribute data into multiple nodes in clsp grid
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <output-dir> <recordings-file> <file-format>"
+  echo "$0 exp/xvector_audios/voxceleb data/voxceleb/recordings.csv flac"
+  exit 1
+fi
+echo "$0 $@"  # Print the command line for logging
+output_dir=$1
+rec_file=$2
+file_format=$3
+
+if [[ $(hostname -f) != *.clsp.jhu.edu ]]; then
+  exit 0
+fi
+
+for f in $(awk -F "," '$1!="id" { print $1}' $rec_file); do
+  # the next command does nothing unless $output_dir/storage/ exists, see
+  # utils/create_data_link.pl for more info. 
+ hyp_utils/create_data_link.pl $output_dir/$f.$file_format +done + + + diff --git a/hyp_utils/create_data_split_links.sh b/hyp_utils/create_data_split_links.sh index 8416742e..c7cfa3eb 100755 --- a/hyp_utils/create_data_split_links.sh +++ b/hyp_utils/create_data_split_links.sh @@ -4,8 +4,6 @@ # Apache 2.0. # Creates links to distrubute data into multiple nodes in clsp grid -storage_name=$(date +'%m_%d_%H_%M') - if [ $# -ne 2 ]; then echo "Usage: $0 " echo "$0 exp/vad_dir/vad.JOB.ark 40" diff --git a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh index 6c6f0fdf..d8ae2e55 100755 --- a/hyp_utils/xvectors/extract_wav2vec2xvectors.sh +++ b/hyp_utils/xvectors/extract_wav2vec2xvectors.sh @@ -87,9 +87,9 @@ if [ $stage -le 0 ];then extract_wav2vec2xvectors.py \ ${args} $write_speech_dur_opt \ --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -109,9 +109,9 @@ if [ $stage -le 1 ];then extract_wav2vec2xvectors.py \ ${args} $write_speech_dur_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait diff --git a/hyp_utils/xvectors/extract_xvectors_from_wav.sh b/hyp_utils/xvectors/extract_xvectors_from_wav.sh index 0b5227cc..b763a25c 100755 --- a/hyp_utils/xvectors/extract_xvectors_from_wav.sh +++ b/hyp_utils/xvectors/extract_xvectors_from_wav.sh @@ -87,10 +87,10 @@ if [ $stage -le 0 ];then hyp_utils/conda_env.sh --num-gpus $num_gpus \ extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ - --part-idx JOB --num-parts $nj \ - --input $data_dir/wav.scp \ + --part-idx JOB --num-parts $nj \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp + --output-spec ark,scp:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.scp set -e fi @@ -110,9 +110,9 @@ if [ $stage -le 1 ];then extract_xvectors_from_wav.py \ --feats $feat_config ${args} $write_num_frames_opt \ --part-idx $i --num-parts $nj \ - --input $data_dir/wav.scp \ + --recordings-file $data_dir/wav.scp \ --model-path $nnet_file --chunk-length $chunk_length \ - --output ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & + --output-spec ark,scp:$output_dir/xvector.$i.ark,$output_dir/xvector.$i.scp & fi done wait diff --git a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh index 27c77454..4530ad3b 100755 --- a/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh +++ b/hyp_utils/xvectors/make_babble_noise_for_nnet_train.sh @@ -8,9 +8,7 @@ nj=1 cmd="run.pl" stage=0 file_format=flac -nodes=b1 storage_name=$(date +'%m_%d_%H_%M') -#proc_opts="--remove-dc-offset" min_spks=3 max_spks=10 num_reuses=5 @@ -23,10 +21,8 @@ if [ $# != 3 ]; then echo "Usage: $0 " echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" echo "Options: " - #echo " --nj # number 
of parallel jobs" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --file-format # Output file_format supported by soundfile (flac,ogg,wav,...)" - #echo " --proc-opts # Extra arguments for proc-audio-files.py" echo " --min-spks # max number of spks per utterance" echo " --max-spks # max number of spks per utterance" echo " --num-reuses # number of times a signal is reused to create babble" @@ -51,22 +47,12 @@ output_dir=$(utils/make_absolute.sh $dir) args="" $cmd $dir/log/make_babble_noise_${name}.log \ hyp_utils/conda_env.sh \ - make_babble_noise_audio_files.py ${args} \ - --output-audio-format $file_format $args $proc_opts \ + make_babble_noise_audio_files.py \ + --audio-format $file_format $args $proc_opts \ --min-spks $min_spks --max-spks $max_spks --num-reuses $num_reuses \ --write-time-durs $data_out/utt2dur \ - --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $data_out/wav.scp - - - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 + --output-recordings-file $data_out/wav.scp echo "$0: Succeeded making babble noise for $name" diff --git a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh index c6634135..437cd208 100755 --- a/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh +++ b/hyp_utils/xvectors/pack_rirs_for_nnet_train.sh @@ -66,13 +66,4 @@ $cmd $dir/log/pack_rirs_${name}.log \ pack_wav_rirs.py ${args} --input $data_in/wav.scp \ --output ${file_format},scp:$output_dir/rirs_${name}.${file_format},$data_out/rirs.scp || exit 1; - -# for n in $(seq $nj); do -# cat $output_dir/wav.${name}.$n.scp || exit 1; -# done > ${data_out}/wav.scp || exit 1 - -# for n in $(seq $nj); do -# cat $output_dir/utt2dur.${name}.$n || exit 1; -# done > ${data_out}/utt2dur || exit 1 - echo "$0: Succeeded packing RIRs for $name" diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index 8321169f..aed40672 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -92,12 +92,14 @@ fi $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py ${args} --output-audio-format $file_format $args $proc_opts \ + preprocess_audio_files.py ${args} --audio-format $file_format $args $proc_opts \ --write-time-durs $output_dir/utt2dur.${name}.JOB \ --part-idx JOB --num-parts $nj \ - --input $data_in/wav.scp \ + # --input $data_in/wav.scp \ + --recordings-file $data_in/wav.scp \ --output-path $output_dir \ - --output-script $output_dir/wav.${name}.JOB.scp + --output-recordings-file $output_dir/wav.${name}.JOB.scp + #--output-script $output_dir/wav.${name}.JOB.scp for n in $(seq $nj); do cat $output_dir/wav.${name}.$n.scp || exit 1; diff --git a/hyperion/bin/eval_cosine_scoring_backend.py b/hyperion/bin/eval_cosine_scoring_backend.py new file mode 100755 index 00000000..1a740024 --- /dev/null +++ b/hyperion/bin/eval_cosine_scoring_backend.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +""" +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) 
+import time +import logging +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import TrialNdx, TrialKey, TrialScores, EnrollmentMap, SegmentSet +from hyperion.utils.math_funcs import cosine_scoring +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList +from hyperion.np.score_norm import AdaptSNorm + + +def load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, +): + test_feats_reader = DRF.create(feats_file) + if enroll_feats_file is not None and enroll_feats_file != feats_file: + enroll_feats_reader = DRF.create(enroll_feats_file) + else: + enroll_feats_reader = test_feats_reader + + enroll_map = EnrollmentMap.load(enroll_map_file) + try: + ndx = TrialNdx.load(ndx_file) + except: + ndx = TrialKey.load(ndx_file).to_ndx() + + if num_enroll_parts > 1 or num_test_parts > 1: + ndx = ndx.split( + enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts + ) + + enroll_map = enroll_map.filter(items=ndx.model_set) + x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True) + x_t = test_feats_reader.read(ndx.seg_set, squeeze=True) + return enroll_map, ndx, x_e, x_t + + +def load_cohort_data(segments_file, feats_file): + + segments = SegmentSet.load(segments_file) + feats_reader = DRF.create(feats_file) + x = feats_reader.read(segments["id"], squeeze=True) + return segments, x + + +def eval_backend( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + preproc_file, + score_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + cohort_segments_file, + cohort_feats_file, + cohort_nbest, + avg_cohort_by, +): + + logging.info("loading data") + enroll_map, ndx, x_e, x_t = load_trial_data( + enroll_map_file, + ndx_file, + enroll_feats_file, + feats_file, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) + + t1 = time.time() + logging.info("computing score") + if preproc_file is not None: + preprocessor = TransformList.load(preproc_file) + x_e = preprocessor(x_e) + x_t = preprocessor(x_t) + + scores = cosine_scoring(x_e, x_t, ids1=enroll_ids) + dt = time.time() - t1 + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.", + dt, + dt / num_trials * 1000, + ) + + if cohort_segments_file is not None: + t1 = time.time() + cohort_segments, x_coh = load_cohort_data( + cohort_segments_file, cohort_feats_file + ) + if preproc_file is not None: + x_coh = preprocessor(x_coh) + + if avg_cohort_by is not None: + cohort_class = cohort_segments[avg_cohort_by] + _, cohort_ids = np.unique(cohort_class, return_inverse=True) + else: + cohort_ids = None + + logging.info("computing enroll vs cohort") + scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids) + logging.info("computing cohort vs test") + scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids) + snorm = AdaptSNorm(cohort_nbest) + scores = snorm(scores, scores_coh_test, scores_enr_coh) + dt = time.time() - t1 + logging.info( + "s-norm elapsed time: %.2f s. 
elapsed time per trial: %.2f ms.",
+        dt,
+        dt / num_trials * 1000,
+    )
+
+    if num_enroll_parts > 1 or num_test_parts > 1:
+        score_file = Path(score_file)
+        new_suffix = f".{enroll_part_idx}.{test_part_idx}{score_file.suffix}"
+        score_file = score_file.with_suffix(new_suffix)
+
+    logging.info("saving scores to %s", score_file)
+    # sort scores rows to match the ndx model_set order
+    sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set]
+    scores = scores[sort_idx]
+    scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask)
+    scores.save(score_file)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(description="Eval cosine-scoring with optional AS-Norm")
+
+    parser.add_argument("--enroll-feats-file", default=None)
+    parser.add_argument("--feats-file", required=True)
+    parser.add_argument("--ndx-file", required=True)
+    parser.add_argument("--enroll-map-file", required=True)
+    parser.add_argument("--preproc-file", default=None)
+    parser.add_argument("--cohort-segments-file", default=None)
+    parser.add_argument("--cohort-feats-file", default=None)
+    parser.add_argument("--cohort-nbest", type=int, default=1000)
+    parser.add_argument(
+        "--avg-cohort-by",
+        default=None,
+        help="segments file column to average vectors from the same class",
+    )
+    parser.add_argument("--score-file", required=True)
+    parser.add_argument(
+        "--enroll-part-idx", default=1, type=int, help="enroll part index"
+    )
+    parser.add_argument(
+        "--num-enroll-parts",
+        default=1,
+        type=int,
+        help="""number of parts in which we divide the enroll
+                list to run evaluation in parallel""",
+    )
+    parser.add_argument("--test-part-idx", default=1, type=int, help="test part index")
+    parser.add_argument(
+        "--num-test-parts",
+        default=1,
+        type=int,
+        help="""number of parts in which we divide the test list
+                to run evaluation in parallel""",
+    )
+
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    eval_backend(**namespace_to_dict(args))
diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py
new file mode 100755
index 00000000..f567dd81
--- /dev/null
+++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py
@@ -0,0 +1,472 @@
+#!/usr/bin/env python
+"""
+  Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+)
+import time
+import logging
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import (
+    TrialNdx,
+    TrialKey,
+    TrialScores,
+    EnrollmentMap,
+    SegmentSet,
+    InfoTable,
+)
+from hyperion.utils.math_funcs import cosine_scoring, average_vectors
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.np.transforms import TransformList
+from hyperion.np.score_norm import AdaptSNorm
+
+
+def get_precomp_qm_names(quality_measures):
+    # snorm qm will be calculated later
+    return [q for q in quality_measures if q not in ["snorm-mu", "snorm-mu/s"]]
+
+
+def normalize_duration(q, min_dur, max_dur, frame_rate):
+    # convert to seconds once, clip to [min_dur, max_dur], and map the
+    # log-duration linearly to [0, 1]
+    q = q / frame_rate
+    q = np.log(np.clip(q, a_min=min_dur, a_max=max_dur))
+    log_min_dur = np.log(min_dur)
+    log_max_dur = np.log(max_dur)
+    q = (q - log_min_dur) / (log_max_dur - log_min_dur)
+    return q
+
+
+def load_trial_data(
+    enroll_map_file,
+    ndx_file,
+    enroll_feats_file,
+    feats_file,
+    enroll_segments_file,
+    segments_file,
+    quality_measures,
+    min_dur,
+    max_dur,
+    frame_rate,
+    enroll_part_idx,
+    num_enroll_parts,
+    test_part_idx,
+    num_test_parts,
+):
+    test_feats_reader = DRF.create(feats_file)
+    if enroll_feats_file is not None and enroll_feats_file != feats_file:
+        enroll_feats_reader = DRF.create(enroll_feats_file)
+    else:
+        enroll_feats_reader = test_feats_reader
+
+    enroll_map = EnrollmentMap.load(enroll_map_file)
+    try:
+        ndx = TrialNdx.load(ndx_file)
+    except:
+        ndx = TrialKey.load(ndx_file).to_ndx()
+
+    if num_enroll_parts > 1 or num_test_parts > 1:
+        ndx = ndx.split(
+            enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts
+        )
+
+    enroll_map = enroll_map.filter(items=ndx.model_set)
+    x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True)
+    x_t = test_feats_reader.read(ndx.seg_set, squeeze=True)
+
+    # quality measures may be in the segments file and/or the feature_set file,
+    # so we combine both if both are given
+    if segments_file is not None:
+        test_segments = SegmentSet.load(segments_file)
+        if enroll_segments_file is not None and segments_file != enroll_segments_file:
+            enroll_segments = SegmentSet.load(enroll_segments_file)
+        else:
+            enroll_segments = test_segments
+
+    test_feats_set = test_feats_reader.feature_set
+    enroll_feats_set = enroll_feats_reader.feature_set
+    if segments_file:
+        test_segments.add_columns(test_feats_set)
+        if enroll_feats_set != test_feats_set or enroll_segments != test_segments:
+            enroll_segments.add_columns(enroll_feats_set)
+
+    # now we retrieve the quality measures
+    # snorm qm will be calculated later
+    retrieve_qm = get_precomp_qm_names(quality_measures)
+    q_e = enroll_segments.loc[enroll_map["segmentid"], retrieve_qm]
+    q_t = test_segments.loc[ndx.seg_set, retrieve_qm]
+
+    # normalize durations
+    if "speech_duration" in retrieve_qm:
+        q_e["speech_duration"] = normalize_duration(
+            q_e["speech_duration"], min_dur, max_dur, 1
+        )
+        q_t["speech_duration"] = normalize_duration(
+            q_t["speech_duration"], min_dur, max_dur, 1
+        )
+
+    if "num_speech_frames" in retrieve_qm:
+        q_e["num_speech_frames"] = normalize_duration(
+            q_e["num_speech_frames"], min_dur, max_dur, frame_rate
+        )
+        q_t["num_speech_frames"] = normalize_duration(
+            q_t["num_speech_frames"], min_dur, max_dur, frame_rate
+        )
+
+    return enroll_map, ndx, x_e, x_t, q_e, q_t
+
+
+def load_cohort_data(segments_file, feats_file):
+
+    segments = SegmentSet.load(segments_file)
+    feats_reader = DRF.create(feats_file)
+    x = feats_reader.read(segments["id"], squeeze=True)
+    return segments, x
+
+
+def average_qm(q, model_set, ids):
+    # average the per-segment quality measures of each enrollment model
+    q_avg = average_vectors(q.values, ids)
+    q_avg = pd.DataFrame(q_avg, columns=q.columns)
+    q_avg["id"] = model_set
+    q_avg.set_index("id", drop=False, inplace=True)
+    return q_avg
+
+
+def get_score_filepath(
+    score_file,
+    score_name,
+    enroll_part_idx,
+    num_enroll_parts,
+    test_part_idx,
+    num_test_parts,
+):
+
+    score_file = Path(score_file)
+    new_suffix = ""
+    if score_name is not None:
+        new_suffix = f".{score_name}"
+
+    if num_enroll_parts > 1 or num_test_parts > 1:
+        new_suffix = f"{new_suffix}.{enroll_part_idx}.{test_part_idx}"
+
+    if new_suffix:
+        new_suffix = f"{new_suffix}{score_file.suffix}"
+        score_file = score_file.with_suffix(new_suffix)
+
+    return score_file
+
+
+def save_scores(ndx, scores, score_file, score_name, enroll_part_idx,
+                num_enroll_parts, test_part_idx, num_test_parts):
+    # assumed completion of an empty stub, mirroring save_empty_scores below
+    score_file = get_score_filepath(score_file, score_name, enroll_part_idx,
+                                    num_enroll_parts, test_part_idx,
+                                    num_test_parts)
+    scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask)
+    scores.save(score_file)
+
+
+def save_empty_scores(ndx, score_file, score_name, enroll_part_idx,
+                      num_enroll_parts, test_part_idx, num_test_parts):
+    scores = np.zeros(ndx.trial_mask.shape, dtype="float32")
+    score_file = get_score_filepath(score_file, score_name, enroll_part_idx,
+                                    num_enroll_parts, test_part_idx,
+                                    num_test_parts)
+    scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask)
+    scores.save(score_file)
+
+
+def segment_to_trial_qm(q_e, q_t):
+    q_trial = {}
+    for q_name in ["speech_duration", "num_speech_frames"]:
+        if q_name in q_e:
+            q_trial_name = f"max_{q_name}"
+            q_trial[q_trial_name] = np.maximum(
+                q_e[q_name].values[:, None], q_t[q_name].values[None, :]
+            )
+            q_trial_name = f"min_{q_name}"
+            q_trial[q_trial_name] = np.minimum(
+                q_e[q_name].values[:, None], q_t[q_name].values[None, :]
+            )
+
+    return q_trial
+
+
+def align_scores_to_ndx(enroll_set, ndx, scores, scores_norm, q_trial):
+    # sort scores rows to match the ndx model_set order
+    sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set]
+    scores = scores[sort_idx]
+    if scores_norm is not None:
+        scores_norm = scores_norm[sort_idx]
+    for qm in q_trial:
+        q_trial[qm] = q_trial[qm][sort_idx]
+
+    return scores, scores_norm, q_trial
+
+
+def make_qm_table(ndx, scores, scores_norm, q_trial):
+    if scores_norm is None:
+        scores = scores[ndx.trial_mask]
+    else:
+        scores = scores_norm[ndx.trial_mask]
+
+    for qm in q_trial:
+        q_trial[qm] = q_trial[qm][ndx.trial_mask]
+
+    I, J = np.nonzero(ndx.trial_mask)
+    modelid = ndx.model_set[I]
+    segmentid = ndx.seg_set[J]
+    unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)]
+
+    q_dict = {
+        "id": unique_id,
+        "modelid": modelid,
+        "segmentid": segmentid,
+        "scores": scores,
+    }
+    q_dict.update(q_trial)
+    df = pd.DataFrame(q_dict)
+    return InfoTable(df)
+
+
+def eval_backend(
+    enroll_map_file,
+    ndx_file,
+    enroll_feats_file,
+    feats_file,
+    enroll_segments_file,
+    segments_file,
+    preproc_file,
+    qmf_file,
+    quality_measures,
+    min_dur,
+    max_dur,
+    frame_rate,
+    cohort_segments_file,
+    cohort_feats_file,
+    cohort_nbest,
+    avg_cohort_by,
+    score_file,
+    enroll_part_idx,
+    num_enroll_parts,
+    test_part_idx,
+    num_test_parts,
+):
+
+    logging.info("loading data")
+    enroll_map, ndx, x_e, x_t, q_e, q_t = load_trial_data(
+        enroll_map_file,
+        ndx_file,
+        enroll_feats_file,
+        feats_file,
+        enroll_segments_file,
+        segments_file,
+        quality_measures,
+        min_dur,
+        max_dur,
+        frame_rate,
+        enroll_part_idx,
+        num_enroll_parts,
+        test_part_idx,
+        num_test_parts,
+    )
+
+    if not np.any(ndx.trial_mask):
+        # this part doesn't have any trials, save empty files
+        # (assumed completion of an empty branch)
+        save_empty_scores(ndx, score_file, None, enroll_part_idx,
+                          num_enroll_parts, test_part_idx, num_test_parts)
+        if cohort_segments_file is not None:
+            save_empty_scores(ndx, score_file, "snorm", enroll_part_idx,
+                              num_enroll_parts, test_part_idx, num_test_parts)
+        return
+
+    enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True)
+    q_e = average_qm(q_e, enroll_set, enroll_ids)
+
+    t1 = time.time()
+    logging.info("computing score")
+    if preproc_file is not None:
+        preprocessor = TransformList.load(preproc_file)
+        x_e = preprocessor(x_e)
+        x_t = preprocessor(x_t)
+
+    scores = cosine_scoring(x_e, x_t, ids1=enroll_ids)
+    dt = time.time() - t1
+    num_trials = scores.shape[0] * scores.shape[1]
+    logging.info(
+        "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.",
+        dt,
+        dt / num_trials * 1000,
+    )
+
+    q_trial = segment_to_trial_qm(q_e, q_t)
+    scores_norm = None
+    if cohort_segments_file is not None:
+        t1 = time.time()
+        cohort_segments, x_coh = load_cohort_data(
+            cohort_segments_file, cohort_feats_file
+        )
+        if preproc_file is not None:
+            x_coh = preprocessor(x_coh)
+
+        if avg_cohort_by is not None:
+            cohort_class = cohort_segments[avg_cohort_by]
+            _, cohort_ids = np.unique(cohort_class, return_inverse=True)
+        else:
+            cohort_ids = None
+
+        logging.info("computing enroll vs cohort")
+        scores_enr_coh = cosine_scoring(x_e, x_coh, ids2=cohort_ids)
+        logging.info("computing cohort vs test")
+        scores_coh_test = cosine_scoring(x_coh, x_t, ids1=cohort_ids)
+        snorm = AdaptSNorm(cohort_nbest)
+        scores_norm, mu_z, s_z, mu_t, s_t = snorm(
+            scores, scores_coh_test, scores_enr_coh, return_stats=True
+        )
+        if "snorm-mu" in quality_measures:
+            q_trial["max_snorm-mu"] = np.maximum(mu_z, mu_t)
+            q_trial["min_snorm-mu"] = np.minimum(mu_z, mu_t)
+        if "snorm-mu/s" in quality_measures:
+            mu_z = mu_z / s_z
+            mu_t = mu_t / s_t
+            q_trial["max_snorm-mu/s"] = np.maximum(mu_z, mu_t)
+            q_trial["min_snorm-mu/s"] = np.minimum(mu_z, mu_t)
+
+        dt = time.time() - t1
+        logging.info(
+            "s-norm elapsed time: %.2f s. elapsed time per trial: %.2f ms.",
+            dt,
+            dt / num_trials * 1000,
+        )
+
+    scores, scores_norm, q_trial = align_scores_to_ndx(
+        enroll_set, ndx, scores, scores_norm, q_trial
+    )
+    if qmf_file is None:
+        qm_table = make_qm_table(ndx, scores, scores_norm, q_trial)
+        qm_file = get_score_filepath(
+            score_file,
+            "qm",
+            enroll_part_idx,
+            num_enroll_parts,
+            test_part_idx,
+            num_test_parts,
+        )
+        qm_table.save(qm_file)
+        return
+
+    score_file_nonorm = get_score_filepath(
+        score_file,
+        None,
+        enroll_part_idx,
+        num_enroll_parts,
+        test_part_idx,
+        num_test_parts,
+    )
+    logging.info("saving scores to %s", score_file_nonorm)
+    scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask)
+    scores.save(score_file_nonorm)
+
+    if scores_norm is not None:
+        score_file_snorm = get_score_filepath(
+            score_file,
+            "snorm",
+            enroll_part_idx,
+            num_enroll_parts,
+            test_part_idx,
+            num_test_parts,
+        )
+        logging.info("saving scores with AS-Norm to %s", score_file_snorm)
+        scores.scores = scores_norm
+        scores.save(score_file_snorm)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(
+        description="Eval cosine-scoring with optional AS-Norm and QMF"
+    )
+
+    parser.add_argument("--enroll-feats-file", default=None)
+    parser.add_argument("--feats-file", required=True)
+    parser.add_argument("--ndx-file", required=True)
+    parser.add_argument("--enroll-map-file", required=True)
+    parser.add_argument("--enroll-segments-file", default=None)
+    parser.add_argument("--segments-file", default=None)
+    parser.add_argument("--preproc-file", default=None)
+    parser.add_argument("--qmf-file", default=None)
+    parser.add_argument(
+        "--quality-measures",
+        default=["snorm-mu/s", "speech_duration"],
+        nargs="+",
+        choices=["snorm-mu/s", "snorm-mu", "speech_duration", "num_speech_frames"],
+    )
+    parser.add_argument(
+        "--min-dur", default=0.1, type=float, help="lower bound to clip durations"
+    )
+    parser.add_argument(
+        "--max-dur", default=30.0, type=float, help="upper bound to clip durations"
+    )
+    parser.add_argument(
+        "--frame-rate",
+        default=100,
+        type=float,
+        help="frames/sec when durations are expressed in frames",
+    )
+    parser.add_argument("--cohort-segments-file", default=None)
+    parser.add_argument("--cohort-feats-file", default=None)
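+    # cohort options: when --cohort-segments-file is given, eval_backend above
+    # runs adaptive S-Norm (AdaptSNorm), normalizing each trial against its
+    # nbest most competitive cohort scores per side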
parser.add_argument("--cohort-nbest", type=int, default=1000) + parser.add_argument( + "--avg-cohort-by", + default=None, + help="segments file column to average vectors from same class class", + ) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "--enroll-part-idx", default=1, type=int, help="enroll part index" + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts in which we divide the enroll + list to run evaluation in parallel""", + ) + parser.add_argument("--test-part-idx", default=1, type=int, help="test part index") + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts in which we divide the test list + to run evaluation in parallel""", + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + eval_backend(**namespace_to_dict(args)) diff --git a/hyperion/bin/eval_verification_metrics.py b/hyperion/bin/eval_verification_metrics.py new file mode 100755 index 00000000..83227558 --- /dev/null +++ b/hyperion/bin/eval_verification_metrics.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.np.metrics import VerificationEvaluator as VE + +from jsonargparse import ( + ActionConfigFile, + ActionYesNo, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def eval_verification_metrics( + key_files, + score_files, + key_names, + score_names, + p_tar, + c_miss, + c_fa, + sparse, + output_file, +): + + assert len(key_files) == len(key_names) + assert len(score_files) == len(score_names) + dfs = [] + for score_file, score_name in zip(score_files, score_names): + for key_file, key_name in zip(key_files, key_names): + logging.info("Evaluating %s - %s", score_name, key_name) + evaluator = VE( + key_file, + score_file, + p_tar, + c_miss, + c_fa, + key_name, + score_name, + sparse=sparse, + ) + df_ij = evaluator.compute_dcf_eer() + dfs.append(df_ij) + + df = pd.concat(dfs) + logging.info("saving results to %s", output_file) + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + sep = "\t" if output_file.suffix == ".tsv" else "," + df.to_csv(output_file, sep=sep, index=False, float_format="{:,.4f}".format) + + pd.options.display.float_format = "{:.4}".format + print(df.to_string(), flush=True) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Evaluate speaker verification metrics") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--key-files", required=True, nargs="+") + parser.add_argument("--score-files", required=True, nargs="+") + parser.add_argument("--key-names", required=True, nargs="+") + parser.add_argument("--score-names", required=True, nargs="+") + parser.add_argument( + "--p-tar", + default=[0.05, 0.01, 0.005, 0.001], + nargs="+", + type=float, + help="target priors", + ) + parser.add_argument( + "--c-miss", default=None, nargs="+", type=float, help="cost of miss" + ) + parser.add_argument( + "--c-fa", default=None, nargs="+", type=float, help="cost of false alarm" + ) + parser.add_argument("--sparse", default=False, action=ActionYesNo) + 
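[Editor's note] For reference, the quantities this tool tabulates, the EER and the normalized minimum DCF for each target prior p_tar, can be computed from raw target/non-target score arrays roughly as below. This is a brute-force sketch (O(n^2) threshold sweep, EER taken at the closest crossing), not the VerificationEvaluator implementation.

import numpy as np

def eer_and_min_dcf(tar, non, p_tar=0.05, c_miss=1.0, c_fa=1.0):
    # sweep every observed score as a decision threshold
    thr = np.sort(np.concatenate([tar, non]))
    p_miss = np.array([np.mean(tar < t) for t in thr])
    p_fa = np.array([np.mean(non >= t) for t in thr])
    # EER: where miss and false-alarm rates cross
    eer = p_miss[np.argmin(np.abs(p_miss - p_fa))]
    # detection cost, normalized by the best trivial system
    dcf = c_miss * p_tar * p_miss + c_fa * (1 - p_tar) * p_fa
    min_dcf = dcf.min() / min(c_miss * p_tar, c_fa * (1 - p_tar))
    return eer, min_dcf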
parser.add_argument("--output-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + eval_verification_metrics(**kwargs) diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index 9efbd6dd..f60c7508 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -21,8 +21,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -76,13 +80,15 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -105,7 +111,7 @@ def eval_xvec( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 6f7d269e..5eba1b99 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -21,8 +21,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) resamplers = {} @@ -84,9 +88,11 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=fs * min_utt_length, high=fs * max_utt_length + 1) + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( "extract-random-utt %s of length=%d first-frame=%d", @@ -98,7 +104,7 @@ def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): def extract_xvectors( - input_spec, + recordings_file, output_spec, vad_spec, write_speech_dur, @@ -117,7 +123,7 @@ def extract_xvectors( **kwargs, ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model 
= load_model(model_path, device) @@ -138,15 +144,12 @@ def extract_xvectors( logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info(f"opening input stream: {input_spec} with args={ar_args}") - with AR(input_spec, **ar_args) as reader: + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create( - vad_spec, - path_prefix=vad_path_prefix, - ) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) while not reader.eof(): t1 = time.time() @@ -160,9 +163,7 @@ def extract_xvectors( t2 = time.time() if fs != model.sample_frequency: resampler = get_resampler(fs, model.sample_frequency) - print(f"x01 {x0.shape} {np.max(x0)}") x0 = resampler(x0) - print(f"x01 {x0.shape} {np.max(x0)}") logging.info("processing utt %s", key0) for aug_id in range(num_augs): @@ -260,7 +261,7 @@ def extract_xvectors( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument("--write-speech-dur", default=None) parser.add_argument( @@ -278,7 +279,7 @@ def extract_xvectors( parser.add_argument("--model-path", required=True) parser.add_argument( "--hf-chunk-length", - type=int, + type=float, default=0, help=( "max. chunk length used in each forward pass " @@ -288,7 +289,7 @@ def extract_xvectors( ) parser.add_argument( "--xvec-chunk-length", - type=int, + type=float, default=0, help=( "max. chunk length used in each forward pass " @@ -314,18 +315,18 @@ def extract_xvectors( ) parser.add_argument( "--min-utt-length", - type=int, + type=float, default=5, help=("minimum utterance length in secs when using random utt length"), ) parser.add_argument( "--max-utt-length", - type=int, + type=float, default=120, help=("maximum utterance length in secs when using random utt length"), ) - parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument( "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" ) diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py new file mode 100755 index 00000000..7b04fcc8 --- /dev/null +++ b/hyperion/bin/extract_wav2xvectors.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python +""" + Copyright 2019 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +import torchaudio.transforms as tat +from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.np.augment import SpeechAugment +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.utils import open_device +from hyperion.utils import Utt2Info +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +resamplers = {} + + +def get_resampler(source_fs, target_fs): + if source_fs in resamplers: + return resamplers[source_fs] + + resampler = tat.Resample( + int(source_fs), + int(target_fs), + 
lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492, + ) + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + resamplers[source_fs] = resampler_f + return resampler_f + + +def init_device(use_gpu): + set_float_cpu("float32") + num_gpus = 1 if use_gpu else 0 + logging.info("initializing devices num_gpus=%d", num_gpus) + device = open_device(num_gpus=num_gpus) + return device + + +def load_model(model_path, device): + logging.info("loading model %s", model_path) + model = TML.load(model_path) + logging.info(f"xvector-model={model}") + model.to(device) + model.eval() + return model + + +def augment(key0, x0, augmenter, aug_df, aug_id): + if augmenter is None: + x = x0 + key = key0 + else: + x, aug_info = augmenter(x0) + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) + + return key, x + + +def select_random_chunk(key, x, fs, min_utt_length, max_utt_length, rng): + utt_length = rng.integers( + low=int(fs * min_utt_length), high=int(fs * max_utt_length + 1) + ) + if utt_length < x.shape[1]: + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] + logging.info( + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, + ) + return x + + +def extract_xvectors( + recordings_file, + output_spec, + vad_spec, + write_speech_dur, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs, +): + + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) + device = init_device(use_gpu) + model = load_model(model_path, device) + + if write_speech_dur is not None: + keys = [] + info = [] + + if aug_cfg is not None: + augmenter = SpeechAugment.create(aug_cfg, rng=rng) + aug_df = [] + else: + augmenter = None + aug_df = None + num_augs = 1 + + metadata_columns = ["speech_duration"] + + ar_args = AR.filter_args(**kwargs) + logging.info("opening output stream: %s with args=%s", output_spec, str(ar_args)) + with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: + + logging.info(f"opening input stream: {recordings_file} with args={ar_args}") + with AR(recordings_file, **ar_args) as reader: + + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + + while not reader.eof(): + t1 = time.time() + key, x0, fs = reader.read(1) + if len(key) == 0: + break + + x0 = x0[0] + key0 = key[0] + fs = fs[0] + t2 = time.time() + if fs != model.sample_frequency: + resampler = get_resampler(fs, model.sample_frequency) + x0 = resampler(x0) + + logging.info("processing utt %s", key0) + for aug_id in range(num_augs): + metadata = {} + t3 = time.time() + key, x = augment(key0, x0, augmenter, aug_df, aug_id) + t4 = time.time() + with torch.no_grad(): + x = torch.tensor( + x[None, :], dtype=torch.get_default_dtype() + ).to(device) + t5 = time.time() + tot_samples = x.shape[1] + if vad_spec is not None: + vad = v_reader.read(key0)[0] + vad = torch.tensor( + vad[None, None, :], dtype=torch.float + ).to(device) + vad = 
torch.nn.functional.interpolate( + vad, size=x.size(-1), mode="nearest" + ).bool()[0, 0] + x = x[:, vad] + + logging.info( + "utt %s detected %d/%d (%.2f %%) speech samples", + key, + x.shape[1], + tot_samples, + x.shape[1] / tot_samples * 100, + ) + + if random_utt_length: + x = select_random_chunk( + key, x, fs, min_utt_length, max_utt_length, rng + ) + + metadata["speech_duration"] = ( + x.shape[1] / model.sample_frequency + ) + + t6 = time.time() + if x.shape[1] == 0: + y = np.zeros((model.embed_dim,), dtype=float_cpu()) + else: + y = ( + model.extract_embed( + x, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + + t7 = time.time() + writer.write([key], [y], metadata=metadata) + if write_speech_dur is not None: + keys.append(key) + info.append(str(x.shape[1] / fs)) + + t8 = time.time() + read_time = t2 - t1 + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ), + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x.shape[1] / fs / tot_time, + ) + + if write_speech_dur is not None: + logging.info("writing speech duration in secs to %s", write_speech_dur) + u2sd = Utt2Info.create(keys, info) + u2sd.save(write_speech_dur) + + if aug_info_path is not None: + aug_df = pd.concat(aug_df, ignore_index=True) + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="""Extracts x-vectors from waveform computing acoustic features on the fly""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--recordings-file", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument("--write-speech-dur", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + AR.add_class_args(parser) + + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=float, + default=0, + help=( + "max. 
chunk length used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=float, + default=5, + help=("minimum utterance length in secs when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=float, + default=120, + help=("maximum utterance length in secs when using random utt length"), + ) + + parser.add_argument("--output-spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + extract_xvectors(**namespace_to_dict(args)) diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index 13ad4277..b02db70c 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -19,8 +19,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -50,13 +54,15 @@ def load_model(model_path, device): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( - "extract-random-utt %s of length=%d first-frame=%d" - % (key, x.shape[1], first_frame) + "extract-random-utt %s of length=%d first-frame=%d", + key, + x.shape[1], + first_frame, ) return x @@ -78,7 +84,7 @@ def extract_xvectors( ): logging.info("initializing") - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 577bbae7..6a8130d3 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -21,8 +21,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -76,9 +80,9 @@ def augment(key0, x0, augmenter, aug_df, aug_id): def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = 
rng.randint(low=min_utt_length, high=max_utt_length + 1) + utt_length = rng.integers(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + first_frame = rng.integers(low=0, high=x.shape[1] - utt_length) x = x[:, first_frame : first_frame + utt_length] logging.info( "extract-random-utt %s of length=%d first-frame=%d", @@ -90,7 +94,7 @@ def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): def extract_xvectors( - input_spec, + recordings_file, output_spec, vad_spec, write_num_frames_spec, @@ -108,7 +112,7 @@ def extract_xvectors( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -130,9 +134,9 @@ def extract_xvectors( with DWF.create(output_spec) as writer: logging.info( - "opening input stream: {} with args={}".format(input_spec, ar_args) + "opening input stream: {} with args={}".format(recordings_file, ar_args) ) - with AR(input_spec, **ar_args) as reader: + with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) @@ -235,12 +239,12 @@ def extract_xvectors( parser = ArgumentParser( description=( - "Extracts x-vectors from waveform computing " "acoustic features on the fly" + "Extracts x-vectors from waveform computing acoustic features on the fly" ) ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( "--write-num-frames", dest="write_num_frames_spec", default=None @@ -299,7 +303,7 @@ def extract_xvectors( help=("maximum utterance length when using random utt length"), ) - parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--output-spec", required=True) parser.add_argument( "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" ) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index a54c4d64..bcec5133 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -20,8 +20,12 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -71,7 +75,7 @@ def extract_xvectors( ): logging.info("initializing") - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index 8939ba91..f1a64e1b 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -22,8 +22,12 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from 
hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def init_device(use_gpu): @@ -96,7 +100,7 @@ def extract_xvectors( **kwargs ): - rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) + rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) diff --git a/hyperion/bin/finetune_wav2xvector.py b/hyperion/bin/finetune_wav2xvector.py new file mode 100755 index 00000000..b100b544 --- /dev/null +++ b/hyperion/bin/finetune_wav2xvector.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +import sys +import time +from pathlib import Path + +import torch +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch import TorchModelLoader as TML +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.narchs import AudioFeatsMVN as AF +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, in_model_file, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_finetune_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = TML.load(in_model_file) + model.change_config(**xvec_args) + if rank == 0: + 
logging.info("x-vector-model={}".format(model)) + return model + + +def init_hard_prototype_mining(model, train_loader, val_loader, rank): + try: + hard_prototype_mining = train_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + if rank == 0: + logging.info("setting hard prototypes") + + affinity_matrix = model.compute_prototype_affinity() + train_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + try: + hard_prototype_mining = val_loader.batch_sampler.hard_prototype_mining + except: + hard_prototype_mining = False + + if not hard_prototype_mining: + return + + val_loader.batch_sampler.set_hard_prototypes(affinity_matrix) + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + init_hard_prototype_mining(model, train_loader, val_loader, rank) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_finetune_args(parser, prefix="model") + parser.add_argument("--in-model-file", required=True) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + 
parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 209915c5..4336b7b9 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -24,8 +24,12 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def read_utt_list(list_file, class2int_file, part_idx, num_parts): diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py old mode 100644 new mode 100755 index c5a3f6b9..2e3a35ec --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -8,30 +8,40 @@ from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import (ClassInfo, Dataset, EnrollmentMap, FeatureSet, - InfoTable, PathLike, RecordingSet, SegmentSet) -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - -subcommands = ["add_features"] -# table_dict = { -# "segments": SegmentSet, -# "recordings": RecordingSet, -# "features": FeatureSet, -# "classes": ClassInfo, -# "enrollments": EnrollmentMap, -# "generic": InfoTable, -# } +from hyperion.utils import ( + ClassInfo, + Dataset, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, + ActionYesNo, +) + +subcommand_list = [ + "add_features", + "set_recordings", + "make_from_recordings", + "remove_short_segments", + "rebuild_class_idx", + "remove_classes_few_segments", + "split_train_val", + "copy", + "add_cols_to_segments", +] def add_common_args(parser): parser.add_argument( - "-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int, + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, ) @@ -45,6 +55,11 @@ def make_add_features_parser(): "--features-name", required=True, help="""name of the feature""" ) parser.add_argument("--features-file", required=True, help="""feature set file""") + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) add_common_args(parser) return parser @@ -54,10 +69,353 @@ def add_features( dataset: PathLike, features_name: str, features_file: PathLike, + output_dataset: PathLike, ): + if output_dataset is None: + output_dataset = dataset + dataset = Dataset.load(dataset, lazy=True) dataset.add_features(features_name, features_file) - dataset.save(dataset) + dataset.save(output_dataset) + + +def make_set_recordings_parser(): + parser = 
ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--recordings-file", required=True, help="""recordings set file""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + parser.add_argument( + "--remove-features", + default=None, + nargs="+", + help="""removes feature files from the dataset, + since they maybe obsolote after modifiying the recordings""", + ) + parser.add_argument( + "--update-seg-durs", + default=False, + action=ActionYesNo, + help="""updates the durations in the segment table""", + ) + + add_common_args(parser) + return parser + + +def set_recordings( + dataset: PathLike, + recordings_file: PathLike, + output_dataset: PathLike, + remove_features: List[str], + update_seg_durs: bool, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.set_recordings(recordings_file, update_seg_durs) + if remove_features is not None: + for features_name in remove_features: + dataset.remove_features(features_name) + + dataset.save(output_dataset) + + +def make_make_from_recordings_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--recordings-file", required=True, help="""recordings set file""" + ) + + add_common_args(parser) + return parser + + +def make_from_recordings( + dataset: PathLike, recordings_file: PathLike, +): + output_dataset = dataset + import pandas as pd + + rec_df = pd.read_csv(recordings_file) + seg_df = rec_df[["id"]] + segments = SegmentSet(seg_df) + dataset = Dataset(segments, recordings=recordings_file) + dataset.save(output_dataset) + + +def make_remove_short_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--min-length", + required=True, + type=float, + help="""minimum required length of the segment""", + ) + + parser.add_argument( + "--length-name", + default="duration", + help="""name of the column indicating the length of the segment""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_short_segments( + dataset: PathLike, min_length: float, length_name: str, output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_short_segments(min_length, length_name) + dataset.save(output_dataset) + + +def make_rebuild_class_idx_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def rebuild_class_idx( + dataset: PathLike, class_name: str, output_dataset: PathLike, +): + if output_dataset is None: + 
output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.rebuild_class_idx(class_name) + dataset.save(output_dataset) + + +def make_remove_classes_few_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--min-segs", default=1, type=int, help="""min. num. of segments/class""" + ) + parser.add_argument( + "--rebuild-idx", + default=False, + action=ActionYesNo, + help="""regenerate class indexes from 0 to new_num_classes-1""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_classes_few_segments( + dataset: PathLike, + class_name: str, + min_segs: int, + rebuild_idx: bool, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_classes_few_segments(class_name, min_segs, rebuild_idx) + dataset.save(output_dataset) + + +def make_split_train_val_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""input dataset dir or .yaml file""" + ) + parser.add_argument( + "--val-prob", + default=0.05, + type=float, + help="""proportion of segments used for val""", + ) + parser.add_argument( + "--min-train-samples", + default=1, + type=int, + help="""min. number of training samples / class""", + ) + + parser.add_argument( + "--joint-classes", + default=None, + nargs="+", + help="""types of classes that need to have same classes in train and val""", + ) + parser.add_argument( + "--disjoint-classes", + default=None, + nargs="+", + help="""types of classes that need to have different classes in train and val""", + ) + parser.add_argument( + "--seed", default=11235813, type=int, help="""random seed""", + ) + + parser.add_argument( + "--train-dataset", required=True, help="""output train dataset dir""", + ) + parser.add_argument( + "--val-dataset", required=True, help="""output val dataset dir""", + ) + + add_common_args(parser) + return parser + + +def split_train_val( + dataset: PathLike, + val_prob: float, + joint_classes: List[str], + disjoint_classes: List[str], + min_train_samples: int, + seed: int, + train_dataset: PathLike, + val_dataset: PathLike, +): + dataset = Dataset.load(dataset, lazy=True) + train_ds, val_ds = dataset.split_train_val( + val_prob, joint_classes, disjoint_classes, min_train_samples, seed + ) + train_ds.save(train_dataset) + val_ds.save(val_dataset) + + num_total = len(dataset) + num_train = len(train_ds) + num_val = len(val_ds) + logging.info( + "train: %d (%.2f%%) segments, val: %d (%.2f%%) segments", + num_train, + num_train / num_total * 100, + num_val, + num_val / num_total * 100, + ) + + +def make_copy_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--output-dataset", + required=True, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def copy( + dataset: PathLike, output_dataset: PathLike, +): + dataset = 
Dataset.load(dataset, lazy=True) + dataset.save(output_dataset) + + +def make_add_cols_to_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--right-table", required=True, help="table where the new data is" + ) + parser.add_argument( + "--columns", + required=True, + nargs="+", + help="""columns to copy to segments table""", + ) + parser.add_argument( + "--on", default=["id"], nargs="+", help="""columns to match both tables rows""", + ) + parser.add_argument( + "--right-on", + default=None, + nargs="+", + help="""columns to match both tables rows""", + ) + + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def add_cols_to_segments( + dataset: PathLike, + right_table: PathLike, + column_names: List[str], + on: List[str], + right_on: List[str], + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.add_cols_to_segments(right_table, column_names, on, right_on) + dataset.save(output_dataset) if __name__ == "__main__": @@ -66,15 +424,15 @@ def add_features( parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - for subcommand in subcommands: + for subcommand in subcommand_list: parser_func = f"make_{subcommand}_parser" subparser = globals()[parser_func]() - subcommands.add_subcommand(k, subparser) + subcommands.add_subcommand(subcommand, subparser) args = parser.parse_args() subcommand = args.subcommand kwargs = namespace_to_dict(args)[args.subcommand] config_logger(kwargs["verbose"]) del kwargs["verbose"] - + del kwargs["cfg"] globals()[subcommand](**kwargs) diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index 5a5f0b4f..7f61b35a 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -8,12 +8,23 @@ from typing import List, Optional, Union from hyperion.hyp_defs import config_logger -from hyperion.utils import (ClassInfo, EnrollmentMap, FeatureSet, InfoTable, - PathLike, RecordingSet, SegmentSet) -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) - -subcommands = ["cat"] +from hyperion.utils import ( + ClassInfo, + EnrollmentMap, + FeatureSet, + InfoTable, + PathLike, + RecordingSet, + SegmentSet, +) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +subcommand_list = ["cat"] table_dict = { "segments": SegmentSet, "recordings": RecordingSet, @@ -73,11 +84,11 @@ def cat( table_type: str, input_files: Union[List[PathLike], None], output_file: PathLike, - num_table: int, + num_tables: int, base_idx: int = 1, ): - assert input_files is not None or num_jobs != 0 + assert input_files is not None or num_tables != 0 output_file = Path(output_file) if input_files is None: ext = output_file.suffix @@ -103,15 +114,15 @@ def cat( parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() - for subcommand in subcommands: + for subcommand in subcommand_list: parser_func = f"make_{subcommand}_parser" subparser = globals()[parser_func]() - subcommands.add_subcommand(k, subparser) + subcommands.add_subcommand(subcommand, subparser) args = parser.parse_args() subcommand = args.subcommand 
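[Editor's note] hyperion_dataset.py and hyperion_tables.py now share the subcommand pattern fixed above: iterate an explicit subcommand_list, build each parser through globals() lookup of make_<name>_parser, and dispatch to the function of the same name. Reduced to its essentials, assuming a single hypothetical "cat" verb:

from jsonargparse import ArgumentParser, namespace_to_dict

def make_cat_parser():
    p = ArgumentParser()
    p.add_argument("--output-file", required=True)
    return p

def cat(output_file):
    print("cat ->", output_file)

if __name__ == "__main__":
    parser = ArgumentParser()
    subcommands = parser.add_subcommands()
    for name in ["cat"]:  # one entry per tool verb
        subcommands.add_subcommand(name, globals()[f"make_{name}_parser"]())
    args = parser.parse_args()
    kwargs = namespace_to_dict(args)[args.subcommand]
    globals()[args.subcommand](**kwargs)  # route to the same-named function

This is why the old loop "for subcommand in subcommands" was a bug: it iterated the jsonargparse action object instead of the list of verb names.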
kwargs = namespace_to_dict(args)[args.subcommand] config_logger(kwargs["verbose"]) del kwargs["verbose"] - + del kwargs["cfg"] globals()[subcommand](**kwargs) diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 4a356037..68e5b22b 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -15,12 +15,15 @@ from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from scipy import ndimage, signal +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) -def make_noise(xs): +def make_noise(xs, max_value): lens = np.array([x.shape[0] for x in xs]) max_len = np.max(lens) @@ -28,73 +31,78 @@ def make_noise(xs): for i in range(len(xs)): xs[i] = np.tile(xs[i], int(num_tiles[i]))[:max_len] + xs[0] -= xs[0].mean() for i in range(1, len(xs)): xs[0] += xs[i] - xs[i].mean() + max_x = np.max(np.abs(xs[0])) + if max_x > max_value: + xs[0] *= max_value / max_x + return xs[0] def make_babble_noise_audio_files( - input_path, + recordings_file, output_path, - output_script, - write_time_durs_spec, + output_recordings_file, + write_time_durs, min_spks=3, max_spks=7, num_reuses=5, random_seed=112358, - **kwargs + **kwargs, ): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") - rng = np.random.RandomState(seed=random_seed) + rng = np.random.default_rng(seed=random_seed) - if write_time_durs_spec is not None: + if write_time_durs is not None: okeys = [] info = [] count = 0 t1 = time.time() - with AR(input_path, **input_args) as reader: + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: keys = reader.keys - with Writer(output_path, output_script, **output_args) as writer: - - for iters in range(num_reuses): - keys = rng.permutation(keys) - - cur_spks = min_spks + for iters in range(num_reuses): + keys = rng.permutation(keys) + + cur_spks = min_spks + utt_list = [] + for utt_idx in range(len(keys)): + if len(utt_list) < cur_spks: + utt_list.append(keys[utt_idx]) + continue + + x, fs = reader.read(utt_list) + fs = fs[0] + y = make_noise(x, reader.wav_scale) + babble_id = "babble-%05d" % (count) + logging.info("writing file %s", babble_id) + writer.write([babble_id], [y], [fs]) + if write_time_durs is not None: + okeys.append(babble_id) + info.append(y.shape[0] / fs) + + count += 1 utt_list = [] - for utt_idx in range(len(keys)): - if len(utt_list) < cur_spks: - utt_list.append(keys[utt_idx]) - continue - - x, fs = reader.read(utt_list) - fs = fs[0] - y = make_noise(x) - babble_id = "babble-%05d" % (count) - logging.info("writing file % s" % (babble_id)) - writer.write([babble_id], [y], [fs]) - if write_time_durs_spec is not None: - okeys.append(babble_id) - info.append(y.shape[0] / fs) - - count += 1 - utt_list = [] - cur_spks += 1 - if cur_spks > max_spks: - cur_spks = min_spks - - if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) + cur_spks += 1 + if cur_spks > max_spks: + cur_spks = min_spks + + if write_time_durs is not 
None: + logging.info("writing time durations to %s", write_time_durs) u2td = Utt2Info.create(okeys, info) - u2td.save(write_time_durs_spec) + u2td.save(write_time_durs) - logging.info("finished making babble files, elapsed-time=%f" % (time.time() - t1)) + logging.info("finished making babble files, elapsed-time=%f", time.time() - t1) if __name__ == "__main__": @@ -102,10 +110,10 @@ def make_babble_noise_audio_files( parser = ArgumentParser(description="Creates babble noise by adding speech files") parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) - parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) + parser.add_argument("--output-recordings-file", required=True) + parser.add_argument("--write-time-durs", default=None) AR.add_class_args(parser) Writer.add_class_args(parser) diff --git a/hyperion/bin/make_wav2xvector.py b/hyperion/bin/make_wav2xvector.py new file mode 100755 index 00000000..b5972d1b --- /dev/null +++ b/hyperion/bin/make_wav2xvector.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +""" + Copyright 2023 Jesus Villalba (Johns Hopkins University) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import os +import sys +import time + +import numpy as np +import pandas as pd +import torch +from hyperion.hyp_defs import config_logger + +# from hyperion.torch import TorchModelLoader as TML +from hyperion.torch import TorchModel + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import ResNet1dXVector as R1dXVec +from hyperion.torch.models import ResNetXVector as RXVec +from hyperion.torch.models import Wav2ResNet1dXVector as W2R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as W2RXVec +from hyperion.torch.narchs import AudioFeatsMVN as AF +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def init_feats(feats): + feat_args = AF.filter_args(**feats) + logging.info(f"feat args={feat_args}") + logging.info("initializing feature extractor") + feat_extractor = AF(trans=True, **feat_args) + logging.info(f"feat-extractor={feat_extractor}") + return feat_extractor + + +def load_model(model_path): + logging.info("loading model %s", model_path) + model = TorchModel.auto_load(model_path) + logging.info(f"xvector-model={model}") + return model + + +def make_wav2xvector(feats, xvector_path, output_path): + + feats = init_feats(feats) + xvector_model = load_model(xvector_path) + if isinstance(xvector_model, RXVec): + model = W2RXVec(feats, xvector_model) + elif isinstance(xvector_model, R1dXVec): + model = W2R1dXVec(feats, xvector_model) + else: + TypeError( + "Conversion of xvector class=%s not available", xvector_model.__class__ + ) + + logging.info("saving model of class %s to %s", model.__class__, output_path) + model.save(output_path) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="""Combines the feature extractor config with XVector model + to produce a Wav2XVector model with integrated feature extraction""" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) 
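[Editor's note] The isinstance chain in make_wav2xvector above grows with every x-vector class that gains a Wav2* counterpart. An equivalent table-driven sketch (hypothetical helper, using the same class aliases imported above) keeps the unsupported-class error path in one place:

wav2_map = {RXVec: W2RXVec, R1dXVec: W2R1dXVec}

def to_wav2xvector(feats, xvector_model):
    # pick the Wav2* wrapper registered for this x-vector class, if any
    for cls, wav2_cls in wav2_map.items():
        if isinstance(xvector_model, cls):
            return wav2_cls(feats, xvector_model)
    raise TypeError(
        f"Conversion of xvector class={xvector_model.__class__} not available"
    )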
+ AF.add_class_args(parser, prefix="feats") + parser.add_argument("--xvector-path", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + config_logger(args.verbose) + del args.verbose + del args.cfg + logging.debug(args) + + make_wav2xvector(**namespace_to_dict(args)) diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py new file mode 100755 index 00000000..6a275f5c --- /dev/null +++ b/hyperion/bin/merge_scores.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path + +from hyperion.hyp_defs import config_logger + +from hyperion.utils import TrialScores +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + + +def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): + + output_file = Path(output_file) + output_file.parent.mkdir(exist_ok=True, parents=True) + + ext = output_file.suffix + + if input_files is None: + input_file_base = output_file.with_suffix("") + input_files = [] + for i in range(num_enroll_parts): + idx_i = base_idx + i + for j in range(num_test_parts): + idx_j = base_idx + j + input_file_i = input_file_base.with_suffix(f".{idx_i}.{idx_j}{ext}") + input_files.append(input_file_i) + + if ext == ".h5": + # if files are h5 we need to load everything in RAM + score_list = [] + for score_file in input_files: + scores = TrialScores.load(score_file) + score_list.append(scores) + + scores = TrialScores.merge(score_list) + scores.save(output_file) + else: + has_header = ext in [".csv", ".tsv"] + write_header = True + with open(output_file, "w", encoding="utf-8") as f_out: + for score_file in input_files: + with open(score_file) as f_in: + for i, line in enumerate(f_in): + if i == 0 and has_header and not write_header: + continue + f_out.write(line) + write_header = False + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--input-files", default=None, nargs="+", help="optional list of input files" + ) + parser.add_argument( + "--output-file", + required=True, + help="""output file, if input-files is None, input files names are derived from it""", + ) + parser.add_argument( + "--num-enroll-parts", + default=1, + type=int, + help="""number of parts we divided the enrollment set""", + ) + parser.add_argument( + "--num-test-parts", + default=1, + type=int, + help="""number of parts we divided the test set""", + ) + + parser.add_argument( + "--base-idx", + default=1, + type=int, + help="""index of the first job, typically 0 or 1""", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + ) + + args = parser.parse_args() + kwargs = namespace_to_dict(args) + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + merge_scores(**kwargs) diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index 78ac59c1..b2a1bc2b 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -13,8 +13,12 @@ from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import 
SequentialAudioReader as AR -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def pack_wav_rirs(input_path, output_spec, **kwargs): @@ -32,12 +36,15 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): h[h < 1e-3] = 0 h = np.trim_zeros(h) logging.info( - "Packing rir %s h_max=%f h_delay=%d h-length=%d" - % (key, h_max, h_delay, len(h)) + "Packing rir %s h_max=%f h_delay=%d h-length=%d", + key, + h_max, + h_delay, + len(h), ) writer.write([key], [h]) - logging.info("Packed RIRS elapsed-time=%.f" % (time.time() - t1)) + logging.info("Packed RIRS elapsed-time=%.f", time.time() - t1) if __name__ == "__main__": diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 6af0202c..14da4d07 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -18,9 +18,14 @@ from hyperion.np.clustering import AHC from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet -from hyperion.utils.math import cosine_scoring -from jsonargparse import (ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) +from hyperion.utils.math_funcs import cosine_scoring +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index e90ad0f7..f6723c7d 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -8,8 +8,12 @@ from hyperion.data_prep import DataPrep from hyperion.hyp_defs import config_logger -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) def make_parser(data_prep_class): @@ -33,6 +37,5 @@ def make_parser(data_prep_class): config_logger(1) data_prep_class = DataPrep.registry[args.subcommand] args = namespace_to_dict(args)[args.subcommand] - data_prep = data_prep_class(**args) data_prep.prepare() diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index e8adfd16..bda9a503 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -15,13 +15,26 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from scipy import ndimage, signal +def resample_vad(vad, length): + step = (len(vad) - 1) / length + assert step < 1 + idx = step * np.arange(length, dtype=float) + idx = np.round(idx).astype(int) + return vad[idx] + + def process_vad(vad, length, fs, dilation, erosion): - vad = signal.resample(vad, length) > 0.5 + # vad = signal.resample(vad, length) > 0.5 + vad = resample_vad(vad, length) if dilation > 0: iters = int(dilation * fs) vad = ndimage.binary_dilation(vad, iterations=iters) @@ -34,9 +47,9 @@ def process_vad(vad, length, fs, dilation, erosion): def process_audio_files( - input_path, + recordings_file, output_path, - output_script, + output_recordings_file, 
write_time_durs_spec, vad_spec, vad_path_prefix, @@ -44,86 +57,92 @@ def process_audio_files( vad_dilation=0, vad_erosion=0, remove_dc_offset=False, - **kwargs + **kwargs, ): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info("input_args={}".format(input_args)) - logging.info("output_args={}".format(output_args)) + logging.info(f"input_args={input_args}") + logging.info(f"output_args={output_args}") if write_time_durs_spec is not None: keys = [] info = [] - with AR(input_path, **input_args) as reader: - with Writer(output_path, output_script, **output_args) as writer: + with AR(recordings_file, **input_args) as reader, Writer( + output_path, output_recordings_file, **output_args + ) as writer: - if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) + if vad_spec is not None: + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - t1 = time.time() - for data in reader: - key, x, fs = data - logging.info("Processing audio %s" % (key)) - t2 = time.time() - - tot_samples = x.shape[0] - if vad_spec is not None: - num_vad_frames = int(round(tot_samples * vad_fs / fs)) - vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( - "bool", copy=False - ) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) - logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) - x = x[vad] - - logging.info( - "utt %s detected %f/%f secs (%.2f %%) speech " - % ( - key[0], - x.shape[0] / fs, - tot_samples / fs, - x.shape[0] / tot_samples * 100, - ) - ) + t1 = time.time() + for data in reader: + key, x, fs = data + logging.info("Processing audio %s", key) + t2 = time.time() - if x.shape[0] > 0: - if remove_dc_offset: - x -= np.mean(x) - - writer.write([key], [x], [fs]) - if write_time_durs_spec is not None: - keys.append(key) - info.append(x.shape[0] / fs) - - xmax = np.max(x) - xmin = np.min(x) - else: - xmax = 0 - xmin = 0 - - t3 = time.time() - dt2 = (t2 - t1) * 1000 - dt3 = (t3 - t1) * 1000 - time_dur = len(x) / fs - rtf = (time_dur * 1000) / dt3 - logging.info( - ( - "Packed audio %s length=%0.3f secs " - "elapsed-time=%.2f ms. " - "read-time=%.2f ms. write-time=%.2f ms. " - "real-time-factor=%.2f" - "x-range=[%f-%f]" - ) - % (key, time_dur, dt3, dt2, dt3 - dt2, rtf, xmin, xmax) + tot_samples = x.shape[0] + if vad_spec is not None: + num_vad_frames = int(round(tot_samples * vad_fs / fs)) + vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( + "bool", copy=False ) - t1 = time.time() + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) + logging.info("vad=%d/%d", np.sum(vad == 1), len(vad)) + x = x[vad] + + logging.info( + "utt %s detected %f/%f secs (%.2f %%) speech ", + key[0], + x.shape[0] / fs, + tot_samples / fs, + x.shape[0] / tot_samples * 100, + ) + + if x.shape[0] > 0: + if remove_dc_offset: + x -= np.mean(x) + + writer.write([key], [x], [fs]) + if write_time_durs_spec is not None: + keys.append(key) + info.append(x.shape[0] / fs) + + xmax = np.max(x) + xmin = np.min(x) + else: + xmax = 0 + xmin = 0 + + t3 = time.time() + dt2 = (t2 - t1) * 1000 + dt3 = (t3 - t1) * 1000 + time_dur = len(x) / fs + rtf = (time_dur * 1000) / dt3 + logging.info( + ( + "Packed audio %s length=%0.3f secs " + "elapsed-time=%.2f ms. " + "read-time=%.2f ms. write-time=%.2f ms. 
" + "real-time-factor=%.2f " + "x-range=[%f - %f]" + ), + key, + time_dur, + dt3, + dt2, + dt3 - dt2, + rtf, + xmin, + xmax, + ) + t1 = time.time() if write_time_durs_spec is not None: - logging.info("writing time durations to %s" % (write_time_durs_spec)) + logging.info("writing time durations to %s", write_time_durs_spec) u2td = Utt2Info.create(keys, info) u2td.save(write_time_durs_spec) @@ -135,9 +154,9 @@ def process_audio_files( ) parser.add_argument("--cfg", action=ActionConfigFile) - parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--recordings-file", required=True) parser.add_argument("--output-path", required=True) - parser.add_argument("--output-script", required=True) + parser.add_argument("--output-recordings-file", required=True) parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 8e1653b1..f132a35c 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -5,6 +5,7 @@ """ import logging import multiprocessing + # import sys import os import time @@ -17,13 +18,19 @@ from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy -from hyperion.torch.models import (HFHubert2ResNet1dXVector, - HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) +from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -95,7 +102,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: - logging.info("trainer args={}".format(trn_args)) + logging.info(f"trainer args={trn_args}") metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py new file mode 100755 index 00000000..ddf292b8 --- /dev/null +++ b/hyperion/bin/train_wav2xvector.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +from pathlib import Path + +import torch +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import AudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.trainers import XVectorTrainer as Trainer +from hyperion.torch.utils import ddp 
+from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["model"]) + if rank == 0: + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["xvector"]["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info("x-vector-model={}".format(model)) + return model + + +def train_xvec(gpu_id, args): + + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + model = init_xvector(list(train_loader.dataset.num_classes.values())[0], **kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + ) + trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset", skip={}) + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + 
"data.train.dataset.class_files", "data.val.dataset.class_files" + ) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_class_args(parser, prefix="model") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train Wav2XVector from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) diff --git a/hyperion/data_prep/__init__.py b/hyperion/data_prep/__init__.py index e978e219..9d885718 100644 --- a/hyperion/data_prep/__init__.py +++ b/hyperion/data_prep/__init__.py @@ -4,6 +4,8 @@ """ from .data_prep import DataPrep +from .musan import MusanDataPrep +from .rirs import RIRSDataPrep from .voxceleb2 import VoxCeleb2DataPrep from .voxceleb1 import VoxCeleb1DataPrep from .voxsrc22 import VoxSRC22DataPrep diff --git a/hyperion/data_prep/data_prep.py b/hyperion/data_prep/data_prep.py index d9828674..0f654676 100644 --- a/hyperion/data_prep/data_prep.py +++ b/hyperion/data_prep/data_prep.py @@ -67,7 +67,8 @@ def _get_recording_duration(recordings, i, n): def get_recording_duration(self, recording_set): import itertools - from ..utils import SCPList + + # from ..utils import SCPList #don't remember why I put this here futures = [] logging.info("submitting threats...") diff --git a/hyperion/data_prep/musan.py b/hyperion/data_prep/musan.py new file mode 100644 index 00000000..abf7a46c --- /dev/null +++ b/hyperion/data_prep/musan.py @@ -0,0 +1,107 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class MusanDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + subset: subset of the data noise, music, speech + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + subset: str, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + self.subset = subset + + @staticmethod + def dataset_name(): + return "musan" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + parser.add_argument( + "--subset", + choices=["noise", "music", "speech"], + help="""musan subset in [noise, music, speech]""", + required=True, + ) + + def prepare(self): + logging.info( + "Peparing Musan %s corpus_dir:%s -> data_dir:%s", + self.subset, + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir / self.subset + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + rec_ids = [f.with_suffix("").name for f in rec_files] + storage_paths = [str(f) for f in rec_files] + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + { + "id": rec_ids, + "duration": recs.loc[rec_ids, "duration"].values, + "noise_type": self.subset, + } + ) + segments = SegmentSet(segments) + segments.sort() + logging.info("making dataset") + dataset = Dataset( + segments, + recordings=recs, + ) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", + len(segments), + ) diff --git a/hyperion/data_prep/rirs.py b/hyperion/data_prep/rirs.py new file mode 100644 index 00000000..066819a8 --- /dev/null +++ b/hyperion/data_prep/rirs.py @@ -0,0 +1,103 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import glob +import re +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ActionYesNo +from tqdm import tqdm + +from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils.misc import PathLike, urlretrieve_progress +from .data_prep import DataPrep + + +class RIRSDataPrep(DataPrep): + """Class for preparing Musan database into tables + + Attributes: + corpus_dir: input data directory + output_dir: output data directory + target_sample_freq: target sampling frequency to convert the audios to. 
+ """ + + def __init__( + self, + corpus_dir: PathLike, + output_dir: PathLike, + target_sample_freq: int, + num_threads: int = 10, + **kwargs, + ): + super().__init__(corpus_dir, output_dir, False, target_sample_freq, num_threads) + + @staticmethod + def dataset_name(): + return "rirs" + + @staticmethod + def add_class_args(parser): + DataPrep.add_class_args(parser) + + def prepare(self): + logging.info( + "Peparing RIRS corpus_dir:%s -> data_dir:%s", + self.corpus_dir, + self.output_dir, + ) + rec_dir = self.corpus_dir + rirs_file = self.corpus_dir / "rir_list" + if rirs_file.exists(): + rirs_table = pd.read_csv( + rirs_file, + sep=" ", + header=None, + names=["dummy1", "rir_id", "dummy2", "room_id", "rec_files"], + ) + rec_files = [Path(f) for f in rirs_table["rec_files"].values] + room_ids = rirs_table["room_id"].values + else: + logging.info("searching audio files in %s", str(rec_dir)) + rec_files = list(rec_dir.glob("**/*.wav")) + room_ids = None + if not rec_files: + # symlinks? try glob + rec_files = [ + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + ] + + assert len(rec_files) > 0, "recording files not found" + + rec_ids = [f.with_suffix("").name for f in rec_files] + storage_paths = [str(f) for f in rec_files] + logging.info("making RecordingSet") + recs = pd.DataFrame({"id": rec_ids, "storage_path": storage_paths}) + recs = RecordingSet(recs) + recs.sort() + + logging.info("getting recording durations") + self.get_recording_duration(recs) + if self.target_sample_freq: + recs["target_sample_freq"] = self.target_sample_freq + + logging.info("making SegmentsSet") + segments = pd.DataFrame( + {"id": rec_ids, "duration": recs.loc[rec_ids, "duration"].values,} + ) + if room_ids is not None: + segments["room_id"] = room_ids + segments = SegmentSet(segments) + segments.sort() + logging.info("making dataset") + dataset = Dataset(segments, recordings=recs,) + logging.info("saving dataset at %s", self.output_dir) + dataset.save(self.output_dir) + logging.info( + "datasets containts %d segments", len(segments), + ) diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py index b3958605..025fad37 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -233,17 +233,19 @@ def prepare(self): Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) ] + assert len(rec_files) > 0, "recording files not found" + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)] lists_cat_dir = self.output_dir / "lists_cat" lists_cat_dir.mkdir(exist_ok=True, parents=True) - uniq_video_ids, uniq_video_idx, video_idx = np.unique( - video_ids, return_index=True, return_inverse=True + rec_ids, uniq_rec_idx, rec_idx = np.unique( + rec_ids, return_index=True, return_inverse=True ) - rec_ids = uniq_video_ids - speakers = [speakers[i] for i in uniq_video_idx] - rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] + speakers = [speakers[i] for i in uniq_rec_idx] + video_ids = [video_ids[i] for i in uniq_rec_idx] file_paths = [] futures = [] @@ -256,15 +258,13 @@ def prepare(self): lists_cat_dir, rec_id, rec_files, - video_idx, + rec_idx, i, ) futures.append(future) logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] - video_ids = uniq_video_ids - else: file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: @@ -331,7 +331,7 @@ def prepare(self): 
dataset = Dataset( segments, classes={"speaker": speakers, "language_est": languages}, - recordings={"recordings": recs}, + recordings=recs, enrollments=enrollments, trials=trials, sparse_trials=False, diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index 29ad3e44..969f2228 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -148,24 +148,27 @@ def prepare(self): df_lang = self._get_langs_est() rec_dir = self.corpus_dir / self.subset logging.info("searching audio files in %s", str(rec_dir)) - rec_files = list(rec_dir.glob("**/*.m4a")) + rec_files1 = list(rec_dir.glob("**/*.m4a")) + rec_files = [Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True)] if not rec_files: # symlinks? try glob rec_files = [ - Path(f) for f in glob.iglob(f"{rec_dir}/**/*.wav", recursive=True) + Path(f) for f in glob.iglob(f"{rec_dir}/**/*.m4a", recursive=True) ] + assert len(rec_files) > 0, "recording files not found" + speakers = [f.parents[1].name for f in rec_files] video_ids = [f.parent.name for f in rec_files] if self.cat_videos: + rec_ids = [f"{s}-{v}" for s, v in zip(speakers, video_ids)] lists_cat_dir = self.output_dir / "lists_cat" lists_cat_dir.mkdir(exist_ok=True, parents=True) - uniq_video_ids, uniq_video_idx, video_idx = np.unique( - video_ids, return_index=True, return_inverse=True + rec_ids, uniq_rec_idx, rec_idx = np.unique( + rec_ids, return_index=True, return_inverse=True ) - rec_ids = uniq_video_ids - speakers = [speakers[i] for i in uniq_video_idx] - rec_ids = [f"{s}-{v}" for s, v in zip(speakers, uniq_video_ids)] + speakers = [speakers[i] for i in uniq_rec_idx] + video_ids = [video_ids[i] for i in uniq_rec_idx] file_paths = [] futures = [] @@ -178,15 +181,13 @@ def prepare(self): lists_cat_dir, rec_id, rec_files, - video_idx, + rec_idx, i, ) futures.append(future) logging.info("waiting threats...") file_paths = [f.result() for f in tqdm(futures)] - video_ids = uniq_video_ids - else: file_names = [f.with_suffix("").name for f in rec_files] if self.use_kaldi_ids: @@ -252,7 +253,7 @@ def prepare(self): dataset = Dataset( segments, {"speaker": speakers, "language_est": languages}, - {"recordings": recs}, + recs, ) logging.info("saving dataset at %s", self.output_dir) dataset.save(self.output_dir) diff --git a/hyperion/data_prep/voxsrc22.py b/hyperion/data_prep/voxsrc22.py index 79369149..f81f6eaf 100644 --- a/hyperion/data_prep/voxsrc22.py +++ b/hyperion/data_prep/voxsrc22.py @@ -127,6 +127,9 @@ def prepare_track12_dev(self): rec_ids = vox22_segmentid + vox1_segmentid rec_files = vox22_rec_files + vox1_rec_files + assert len(vox22_rec_files) > 0, "vox22 recording files not found" + assert len(vox1_rec_files) > 0, "vox1 recording files not found" + recs = pd.DataFrame({"id": rec_ids, "storage_path": rec_files}) recs = RecordingSet(recs) recs.sort() @@ -148,7 +151,7 @@ def prepare_track12_dev(self): logging.info("making dataset") dataset = Dataset( segments, - recordings={"recordings": recs}, + recordings=recs, enrollments=enrollments, trials=trials, sparse_trials=False, @@ -160,50 +163,6 @@ def prepare_track12_dev(self): len(segments), ) - # wav_file = voxsrc22_corpus_dir / file_id - # wav_file = vox1_corpus_dir / "wav" / file_id - # logging.info("searching audio files in %s", self.vox1_corpus_dir) - # vox1_rec_files = list(self.vox1_corpus_dir.glob("**/*.wav")) - # if not vox1_rec_files: - # # symlinks? 
try glob - # vox1_rec_files = [ - # Path(f) for f in glob.iglob(f"{self.vox1_corpus_dir}/**/*.wav", recursive=True) - # ] - - # vox1_rec_ids = [ f.parent.parent.name / f.parent.name / f.name for f in vox1_rec_files] - # rec_files = - - # rec_files = list(self.corpus_dir.glob("**/*.wav")) - # if not rec_files: - # # symlinks? try glob - # rec_files = [ - # Path(f) for f in glob.iglob(f"{self.corpus_dir}/**/*.wav", recursive=True) - # ] - - # u2s_file = output_dir / "utt2spk" - # logging.info("creating utt2spk file %s", u2s_file) - # file_ids = np.unique(np.concatenate((df_trials["enroll"], df_trials["test"]))) - # with open(u2s_file, "w") as f: - # for file_id in file_ids: - # f.write("%s %s\n" % (file_id, file_id)) - - # s2u_file = output_dir / "spk2utt" - # logging.info("creating spk2utt file %s", s2u_file) - # with open(s2u_file, "w") as f: - # for file_id in file_ids: - # f.write("%s %s\n" % (file_id, file_id)) - - # wav_file = output_dir / "wav.scp" - # logging.info("creating wav.scp file %s", wav_file) - # with open(wav_file, "w") as f: - # for file_id in file_ids: - # if "VoxSRC2022_dev" in file_id: - # wav_file = voxsrc22_corpus_dir / file_id - # else: - # wav_file = vox1_corpus_dir / "wav" / file_id - - # f.write("%s %s\n" % (file_id, wav_file)) - def prepare_track12_test(self): logging.info( "Preparing VoxSRC22 %s corpus:%s -> %s", diff --git a/hyperion/helpers/trial_data_reader.py b/hyperion/helpers/trial_data_reader.py index 4f33770b..85904eb2 100644 --- a/hyperion/helpers/trial_data_reader.py +++ b/hyperion/helpers/trial_data_reader.py @@ -16,7 +16,7 @@ from ..utils.utt2info import Utt2Info -class TrialDataReader(object): +class TrialDataReader: """ Loads Ndx, enroll file and x-vectors to evaluate PLDA. """ diff --git a/hyperion/helpers/vector_class_reader.py b/hyperion/helpers/vector_class_reader.py index c4c531ad..a9993768 100644 --- a/hyperion/helpers/vector_class_reader.py +++ b/hyperion/helpers/vector_class_reader.py @@ -49,7 +49,7 @@ def __init__( v[0]: int(v[1]) for v in [line.rstrip().split() for line in f] } - self.rng = np.random.RandomState(vcr_seed) + self.rng = np.random.default_rng(vcr_seed) self.csplit_max_spc = csplit_max_spc self.csplit_min_spc = csplit_min_spc self.csplit_mode = csplit_mode diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 6cf22d5f..eaf76d49 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -223,8 +223,8 @@ def read( self._eof = True break - row_offset_i = row_offset[i] if row_offset_is_list else row_offset - num_rows_i = num_rows[i] if num_rows_is_list else num_rows + row_offset_i = row_offset[count] if row_offset_is_list else row_offset + num_rows_i = num_rows[count] if num_rows_is_list else num_rows binary = init_kaldi_input_stream(self.f) data_i = KaldiMatrix.read( @@ -269,7 +269,7 @@ def __init__( self, file_path: PathLike, path_prefix: Optional[PathLike] = None, **kwargs ): super().__init__(file_path, permissive=False, **kwargs) - self.feature_set = FeatureSet.load(self.file_path, sep=scp_sep) + self.feature_set = FeatureSet.load(self.file_path) if self.num_parts > 1: self.feature_set = self.feature_set.split(self.part_idx, self.num_parts) diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 6c152cc5..a1adaef0 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -55,7 +55,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, 
+ wav_scale: float = 1.0, ): if not isinstance(recordings, RecordingSet): recordings = RecordingSet.load(recordings) @@ -255,7 +255,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, + wav_scale: float = 1.0, part_idx: int = 1, num_parts: int = 1, ): @@ -373,7 +373,8 @@ def add_class_args(parser, prefix: Optional[str] = None): parser.add_argument( "--wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -399,8 +400,7 @@ def add_class_args(parser, prefix: Optional[str] = None): if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args @@ -411,7 +411,7 @@ def __init__( self, recordings: Union[RecordingSet, PathLike], segments: Union[SegmentSet, PathLike, None] = None, - wav_scale: float = 2 ** 15 - 1, + wav_scale: float = 1.0, ): super().__init__(recordings, segments, wav_scale) @@ -524,14 +524,14 @@ def add_class_args(parser, prefix: Optional[str] = None): parser.add_argument( "--wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), + "--" + prefix, action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index e416c209..ca0dde9f 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -27,12 +27,33 @@ "DOUBLE": "float64", "MS_ADPCM": "int16", "ULAW": "int16", - "PCM_U8": "uint8", - "PCM_S8": "int8", + "PCM_S8": "int16", "VORBIS": "float32", "GSM610": "int16", "G721_32": "int16", - "PCM_24": "int24", + "PCM_24": "int32", +} + +scale_32 = 2 ** 31 - 1 +scale_24 = 2 ** 23 - 1 +scale_16 = 2 ** 15 - 1 +scale_8 = 2 ** 7 - 1 + + +subtype_to_scale = { + "PCM_32": scale_32, + "ALAW": scale_16, + "IMA_ADPCM": scale_16, + "FLOAT": 1, + "PCM_16": scale_16, + "DOUBLE": 1, + "MS_ADPCM": scale_16, + "ULAW": scale_16, + "PCM_S8": scale_8, + "VORBIS": 1, + "GSM610": scale_16, + "G721_32": scale_16, + "PCM_24": scale_24, } @@ -45,6 +66,7 @@ class AudioWriter(object): audio_format: audio file format audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) + wav_scale: scale of the input waveform """ def __init__( @@ -53,6 +75,7 @@ def __init__( script_path: Optional[PathLike] = None, audio_format: str = "wav", audio_subtype: Optional[str] = None, + wav_scale: float = 1.0, ): self.output_path = Path(output_path) self.script_path = Path(script_path) if script_path is not None else None @@ -63,9 +86,15 @@ def __init__( if audio_subtype is None: self.subtype = sf.default_subtype(self.audio_format) else: - self.subtype = audio_subtype + self.subtype = audio_subtype.upper() assert sf.check_format(self.audio_format, self.subtype) + self._dtype = subtype_to_npdtype[self.subtype] + + self.wav_scale = wav_scale + # we multiply the audio for this number before saving it. 
+ self._output_wav_scale = subtype_to_scale[self.subtype] / wav_scale + self.script_is_scp = False self.script_sep = None self.f_script = None @@ -78,7 +107,7 @@ def __init__( self.f_script = open(self.script_path, "w") else: self.script_sep = "," if script_ext == ".csv" else "\t" - self.f_script = open(self.script_path, "w", "utf-8") + self.f_script = open(self.script_path, "w", encoding="utf-8") row = self.script_sep.join( ["id", "storage_path", "duration", "sample_freq"] ) @@ -123,8 +152,7 @@ def write( data = [data] fs_is_list = isinstance(fs, (list, np.ndarray)) - assert self.subtype in subtype_to_npdtype - dtype = subtype_to_npdtype[self.subtype] + output_files = [] for i, key_i in enumerate(keys): assert is_token(key_i), "Token %s not valid" % key_i @@ -135,7 +163,7 @@ def write( self.audio_format, ) fs_i = int(fs[i]) if fs_is_list else fs - data_i = data[i].astype(dtype, copy=False) + data_i = (self._output_wav_scale * data[i]).astype(self._dtype, copy=False) sf.write(output_file, data_i, fs_i, subtype=self.subtype) output_files.append(output_file) @@ -156,14 +184,11 @@ def write( @staticmethod def filter_args(**kwargs): valid_args = ( - "output_fs", - "output_wav_scale", - "output_audio_format", - "output_audio_subtype", - ) - return dict( - (re.sub("output_", "", k), kwargs[k]) for k in valid_args if k in kwargs + "wav_scale", + "audio_format", + "audio_subtype", ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -171,23 +196,27 @@ def add_class_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") - # parser.add_argument(p1+'output-wav-scale', default=1, type=float, - # help=('scale to divide the waveform before writing')) - parser.add_argument( - "--output-audio-format", + "--audio-format", default="flac", choices=["flac", "ogg", "wav"], help=("ouput audio format"), ) parser.add_argument( - "--output-audio-subtype", + "--audio-subtype", default=None, - choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], + choices=["pcm_16", "pcm_24", "pcm_32", "float", "double", "vorbis"], help=("coding format for audio file"), ) + try: + parser.add_argument( + "--wav-scale", default="1.0", help=("input waveform scale wrt 1"), + ) + except: + pass + if prefix is not None: outer_parser.add_argument( "--" + prefix, action=ActionParser(parser=parser), diff --git a/hyperion/io/hyp_data_reader.py b/hyperion/io/hyp_data_reader.py index 575c3087..63d463fb 100644 --- a/hyperion/io/hyp_data_reader.py +++ b/hyperion/io/hyp_data_reader.py @@ -76,9 +76,8 @@ def read_random_slice(self, key, num_samples, rng, field=""): dataset = key + field assert dataset in self.f, "Dataset %s not found" % dataset num_rows = self.f[dataset].shape[0] - # print('hola',num_rows,num_samples,num_rows-num_samples) - # index = rng.random_integers(low=0, high=num_rows-num_samples, size=1)[0] - index = rng.randint(low=0, high=num_rows - num_samples + 1) + + index = rng.integers(low=0, high=num_rows - num_samples + 1) X = self.f[dataset][index : index + num_samples] return X, index diff --git a/hyperion/io/packed_audio_reader.py b/hyperion/io/packed_audio_reader.py index 17f78bc2..fb17cb18 100644 --- a/hyperion/io/packed_audio_reader.py +++ b/hyperion/io/packed_audio_reader.py @@ -378,7 +378,8 @@ def add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15 - 1, + default=1.0, + # default=2 ** 15 - 1, type=float, help=("multiplicative factor for waveform"), ) @@ -633,7 +634,8 @@ def 
add_class_args(parser, prefix=None): parser.add_argument( p1 + "wav-scale", - default=2 ** 15, + default=1.0, + # default=2 ** 15, type=float, help=("multiplicative factor for waveform"), ) diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 93123247..60e01ef1 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -387,11 +387,11 @@ def create(cls, rspecifier): if archive.suffix == ".csv": df = pd.read_csv(archive, nrows=2) storage_path = df["storage_path"].values[0] - if re.match(r".*\.h5$", scp_f2) is not None: + if re.match(r".*\.h5$", storage_path) is not None: archive_type = ArchiveType.H5 - elif re.match(r".*\.ark$", scp_f2) is not None: + elif re.match(r".*\.ark$", storage_path) is not None: archive_type = ArchiveType.ARK - elif re.match(r".*[cvg]$", scp_f2) is not None: + elif re.match(r".*[cvg]$", storage_path) is not None: archive_type = ArchiveType.AUDIO else: raise ValueError(f"Unknown format for {storage_path}") diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 799db930..1cc1a0be 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -26,7 +26,7 @@ class SingleNoiseAugment(object): min_snr: mininimum SNR(dB) to sample from. max_snr: maximum SNR(dB) to sample from. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -46,7 +46,7 @@ def __init__( self.cache = None self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -96,7 +96,7 @@ def forward(self, x): while noise is None or noise.shape[0] < num_samples: with self.lock: - noise_idx = self.rng.randint(len(self.noise_keys)) + noise_idx = self.rng.integers(len(self.noise_keys)) key = self.noise_keys[noise_idx] noise_k, fs_k = self.r.read([key]) noise_k = noise_k[0] @@ -112,12 +112,22 @@ def forward(self, x): with self.lock: self.cache = noise_k[need_samples:] + num_zeros = np.sum(noise == 0) with self.lock: + # add dither for noises files with many 0s. + if num_zeros > len(noise) // 3: + noise += 0.0001 * self.rng.standard_normal( + noise.shape, dtype=noise.dtype + ) + target_snr = self.rng.uniform(self.min_snr, self.max_snr) + scale = self._compute_noise_scale(x, noise, target_snr) info = {"noise_type": self.noise_type, "snr": target_snr} - return x + scale * noise, info + y = x + scale * noise + + return y, info def __call__(self, x): return self.forward(x) @@ -136,7 +146,7 @@ class NoiseAugment(object): is proportional to how often we want to sample a given noise type. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): @@ -166,7 +176,7 @@ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -177,7 +187,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). 
Returns: NoiseAugment object @@ -208,7 +218,7 @@ def forward(self, x): # decide whether to add noise or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.noise_prob: # we don't add noise diff --git a/hyperion/np/augment/reverb_augment.py b/hyperion/np/augment/reverb_augment.py index cf4cc6cb..0b1f3596 100644 --- a/hyperion/np/augment/reverb_augment.py +++ b/hyperion/np/augment/reverb_augment.py @@ -39,7 +39,7 @@ class SingleReverbAugment(object): its first sample. preload_rirs: if True all RIRS are loaded into RAM. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -80,7 +80,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -129,7 +129,7 @@ def forward(self, x): num_samples = x.shape[0] with self.lock: - rir_idx = self.rng.randint(len(self.rir_keys)) + rir_idx = self.rng.integers(len(self.rir_keys)) if self.preload_rirs: h = self.rirs[rir_idx] @@ -155,6 +155,7 @@ def forward(self, x): "h_max": h_max, "h_delay": h_delay, } + return y, info def __call__(self, x): @@ -176,7 +177,7 @@ class ReverbAugment(object): max_reverb_context: number of samples required as left context for the convolution operation. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -210,7 +211,7 @@ def __init__( self.lock = multiprocessing.Lock() if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -221,7 +222,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with reverb options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: ReverbAugment object. @@ -267,7 +268,7 @@ def forward(self, x): # decide whether to add reverb or not with self.lock: - p = self.rng.random_sample() + p = self.rng.random() if p > self.reverb_prob: # we don't add reverb diff --git a/hyperion/np/augment/speech_augment.py b/hyperion/np/augment/speech_augment.py index 0b1233f1..c27ca321 100644 --- a/hyperion/np/augment/speech_augment.py +++ b/hyperion/np/augment/speech_augment.py @@ -37,7 +37,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: SpeechAugment object. diff --git a/hyperion/np/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py index 18a15651..a648190d 100644 --- a/hyperion/np/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -22,7 +22,7 @@ class SpeedAugment(object): keep_length: applies padding or cropping to keep the lenght of the signal. random_seed: random seed for random number generator. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). """ def __init__( @@ -34,14 +34,16 @@ def __init__( rng=None, ): logging.info( - "init speed augment with prob={}, speed_ratios={}, keep_length={}". 
- format(speed_prob, speed_ratios, keep_length)) + "init speed augment with prob={}, speed_ratios={}, keep_length={}".format( + speed_prob, speed_ratios, keep_length + ) + ) self.speed_prob = speed_prob self.speed_ratios = speed_ratios self.keep_length = keep_length if rng is None: - self.rng = np.random.RandomState(seed=random_seed) + self.rng = np.random.default_rng(seed=random_seed) else: self.rng = deepcopy(rng) @@ -52,7 +54,7 @@ def create(cls, cfg, random_seed=112358, rng=None): Args: cfg: YAML file path or dictionary with noise options. rng: Random number generator returned by - np.random.RandomState (optional). + np.random.default_rng (optional). Returns: NoiseAugment object. @@ -84,7 +86,7 @@ def forward(self, x): """ # decide whether to add noise or not - p = self.rng.random_sample() + p = self.rng.random() if p > self.speed_prob: # we don't add speed perturbation info = {"speed_ratio": 1} @@ -98,14 +100,12 @@ def forward(self, x): # print(f"1 r={r} {x.shape} {y.shape}", flush=True) if self.keep_length: if r > 1: - dither = np.max(x) / 2**15 # we add some dither in the padding - pad_y = dither * np.ones( - (x.shape[-1] - y.shape[-1], ), dtype=y.dtype) + dither = np.max(x) / 2 ** 15 # we add some dither in the padding + pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype) y = np.concatenate((y, pad_y), axis=-1) elif r < 1: - y = y[:x.shape[-1]] + y = y[: x.shape[-1]] - # print(f"2 r={r} {x.shape} {y.shape}", flush=True) return y, info def __call__(self, x): diff --git a/hyperion/np/classifiers/binary_logistic_regression.py b/hyperion/np/classifiers/binary_logistic_regression.py index 82a84529..e77115cd 100644 --- a/hyperion/np/classifiers/binary_logistic_regression.py +++ b/hyperion/np/classifiers/binary_logistic_regression.py @@ -29,7 +29,7 @@ class BinaryLogisticRegression(LogisticRegression): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. diff --git a/hyperion/np/classifiers/greedy_fusion.py b/hyperion/np/classifiers/greedy_fusion.py index 842b850e..f03a05a0 100644 --- a/hyperion/np/classifiers/greedy_fusion.py +++ b/hyperion/np/classifiers/greedy_fusion.py @@ -42,8 +42,8 @@ class GreedyFusionBinaryLR(NPModel): In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. priors: prior prob for having a positive sample. 
- random_state: int, RandomState instance or None, optional, default: None - The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. + random_state: int, default_rng instance or None, optional, default: None + The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If default_rng instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and diff --git a/hyperion/np/classifiers/linear_gbe.py b/hyperion/np/classifiers/linear_gbe.py index a6b8c7cc..f551af14 100644 --- a/hyperion/np/classifiers/linear_gbe.py +++ b/hyperion/np/classifiers/linear_gbe.py @@ -10,7 +10,7 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel @@ -426,7 +426,8 @@ def add_class_args(parser, prefix=None): parser.add_argument("--name", default="lgbe", help="model name") if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) @staticmethod @@ -468,7 +469,8 @@ def add_eval_args(parser, prefix=None): ) if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/np/classifiers/linear_gbe_up.py b/hyperion/np/classifiers/linear_gbe_up.py index 8566aeab..37ac9656 100644 --- a/hyperion/np/classifiers/linear_gbe_up.py +++ b/hyperion/np/classifiers/linear_gbe_up.py @@ -9,8 +9,13 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import (fullcov_varfloor, int2onehot, invert_pdmat, - logdet_pdmat, softmax) +from ...utils.math_funcs import ( + fullcov_varfloor, + int2onehot, + invert_pdmat, + logdet_pdmat, + softmax, +) from ..np_model import NPModel from .linear_gbe import LinearGBE diff --git a/hyperion/np/classifiers/linear_svmc.py b/hyperion/np/classifiers/linear_svmc.py index 5d743a46..6a977df9 100644 --- a/hyperion/np/classifiers/linear_svmc.py +++ b/hyperion/np/classifiers/linear_svmc.py @@ -10,7 +10,7 @@ from sklearn.svm import LinearSVC as SVC from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ..np_model import NPModel @@ -41,7 +41,7 @@ class LinearSVMC(NPModel): The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None max_iter: int, default: 100 Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge. @@ -61,7 +61,7 @@ class LinearSVMC(NPModel): penalty and dual will be ignored. 
verbose: int, default: 0 balance_class_weight: if True and class_weight is None, it makes class_weight="balanced". - lr_seed: seed form RandomState, used when random_state is None. + lr_seed: seed form default_rng, used when random_state is None. labels: list of class labels """ @@ -93,7 +93,7 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) self.use_bias = use_bias self.bias_scaling = bias_scaling diff --git a/hyperion/np/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py index 8e3d7e2e..4c4c0cfc 100644 --- a/hyperion/np/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression as LR from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ..np_model import NPModel @@ -36,7 +36,7 @@ class LogisticRegression(NPModel): Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - random_state: RandomState instance or None, optional, default: None + random_state: default_rng instance or None, optional, default: None Used when solver == ‘sag’ or ‘liblinear’. solver: {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’ Algorithm to use in the optimization problem. 
@@ -93,7 +93,7 @@ def __init__( super().__init__(**kwargs) if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) if bias_scaling is None: if use_bias and solver == "liblinear": diff --git a/hyperion/np/classifiers/q_scoring_homo_gbe.py b/hyperion/np/classifiers/q_scoring_homo_gbe.py index 9e54e0f4..3345dd72 100644 --- a/hyperion/np/classifiers/q_scoring_homo_gbe.py +++ b/hyperion/np/classifiers/q_scoring_homo_gbe.py @@ -9,7 +9,7 @@ from scipy.special import gammaln from ...hyp_defs import float_cpu -from ...utils.math import int2onehot, invert_pdmat, logdet_pdmat, softmax +from ...utils.math_funcs import int2onehot, invert_pdmat, logdet_pdmat, softmax from ..np_model import NPModel diff --git a/hyperion/np/classifiers/svmc.py b/hyperion/np/classifiers/svmc.py index 6b54034b..ac5211ef 100644 --- a/hyperion/np/classifiers/svmc.py +++ b/hyperion/np/classifiers/svmc.py @@ -12,7 +12,7 @@ from sklearn.svm import SVC from ...hyp_defs import float_cpu -from ...utils.math import softmax +from ...utils.math_funcs import softmax from ...utils.misc import filter_func_args from ..np_model import NPModel @@ -49,7 +49,7 @@ def __init__( class_weight = "balanced" if random_state is None: - random_state = np.random.RandomState(seed=lr_seed) + random_state = np.random.default_rng(seed=lr_seed) self.C = C self.kernel = kernel diff --git a/hyperion/np/feats/energy_vad.py b/hyperion/np/feats/energy_vad.py index 5b9eb751..1d578c68 100644 --- a/hyperion/np/feats/energy_vad.py +++ b/hyperion/np/feats/energy_vad.py @@ -5,6 +5,7 @@ import logging import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.signal import lfilter from ...hyp_defs import float_cpu @@ -19,7 +20,7 @@ class EnergyVAD(object): sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000) frame_length: Frame length in milliseconds (default = 25) frame_shift: Frame shift in milliseconds (default = 10) - dither: Dithering constant (0.0 means no dither) (default = 1) + dither: Dithering constant (0.0 means no dither) (default = 2^(-15)) snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (default = True) vad_energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5) vad_energy_threshold: Constant term in energy threshold for MFCC0 for VAD (also see --vad-energy-mean-scale) (float, default = 5) @@ -32,7 +33,7 @@ def __init__( sample_frequency=16000, frame_length=25, frame_shift=10, - dither=1, + dither=1 / 2 ** 15, snip_edges=True, vad_energy_mean_scale=0.5, vad_energy_threshold=5, @@ -97,7 +98,7 @@ def compute(self, x, return_loge=False): # add dither if self.dither > 0: - n = self.dither * np.random.RandomState(seed=len(x)).randn( + n = self.dither * np.random.default_rng(seed=len(x)).randn( len(x) ).astype(float_cpu(), copy=False) x = x + n @@ -174,14 +175,12 @@ def add_class_args(parser, prefix=None): parser: Arguments parser prefix: Options prefix. """ - - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help=( @@ -191,24 +190,21 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -221,7 +217,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "vad-energy-mean-scale", + "--vad-energy-mean-scale", type=float, default=0.5, help=( @@ -231,13 +227,13 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-energy-threshold", + "--vad-energy-threshold", type=float, default=5, help="Constant term in energy threshold for MFCC0 for VAD", ) parser.add_argument( - p1 + "vad-frames-context", + "--vad-frames-context", type=int, default=0, help=( @@ -246,7 +242,7 @@ def add_class_args(parser, prefix=None): ), ) parser.add_argument( - p1 + "vad-proportion-threshold", + "--vad-proportion-threshold", type=float, default=0.6, help=( @@ -254,5 +250,7 @@ def add_class_args(parser, prefix=None): "the window that need to have more energy than the threshold" ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/np/feats/mfcc.py b/hyperion/np/feats/mfcc.py index cd98840d..b56728b8 100644 --- a/hyperion/np/feats/mfcc.py +++ b/hyperion/np/feats/mfcc.py @@ -6,6 +6,7 @@ from enum import Enum import numpy as np +from jsonargparse import ActionParser, ArgumentParser from scipy.fftpack import dct from scipy.signal import lfilter @@ -72,7 +73,7 @@ class MFCC(object): preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"blackmann") (default = 'povey') use_fft2: If true, it uses |X(f)|^2, if false, it uses |X(f)|, (default = True) - dither: Dithering constant (0.0 means no dither) (default = 1) + dither: Dithering constant (0.0 means no dither) (default = 1/2**15) fb_type: Filter-bank type: mel_kaldi, mel_etsi, mel_librosa, mel_librosa_htk, linear (default = 'mel_kaldi') low_freq: Low cutoff frequency for mel bins (default = 20) high_freq: High cutoff frequency for mel bins (if < 0, offset from Nyquist) (default = 0) @@ -98,7 +99,7 @@ def __init__( preemphasis_coeff=0.97, window_type="povey", use_fft2=True, - dither=1, + dither=1 / 2 ** 15, fb_type="mel_kaldi", low_freq=20, high_freq=0, @@ -256,7 +257,7 @@ def compute(self, x, return_fft=False, return_spec=False, return_logfb=False): # add dither if self.dither > 0: - n = self.dither * np.random.RandomState(seed=len(x)).randn( + n = self.dither * np.random.default_rng(seed=len(x)).randn( len(x) ).astype(float_cpu(), copy=False) x = x + n @@ -400,14 +401,12 @@ def add_class_args(parser, prefix=None): parser: Arguments parser prefix: Options prefix. """ - - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." 
+ if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "sample-frequency", + "--sample-frequency", default=16000, type=int, help="Waveform data sample frequency " @@ -415,27 +414,22 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "frame-length", - type=int, - default=25, - help="Frame length in milliseconds", - ) - parser.add_argument( - p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + "--frame-length", type=int, default=25, help="Frame length in milliseconds", ) parser.add_argument( - p1 + "fft-length", type=int, default=512, help="Length of FFT" + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" ) + parser.add_argument("--fft-length", type=int, default=512, help="Length of FFT") parser.add_argument( - p1 + "remove-dc-offset", + "--remove-dc-offset", default=True, type=str2bool, help="Subtract mean from waveform on each frame", ) parser.add_argument( - p1 + "preemphasis-coeff", + "--preemphasis-coeff", type=float, default=0.97, help="Coefficient for use in signal preemphasis", @@ -444,30 +438,30 @@ def add_class_args(parser, prefix=None): FWF.add_class_args(parser, prefix) parser.add_argument( - p1 + "use-fft2", + "--use-fft2", default=True, type=str2bool, help="If true, it uses |X(f)|^2, if false, it uses |X(f)|", ) parser.add_argument( - p1 + "dither", + "--dither", type=float, - default=1, + default=1 / 2 ** 15, help="Dithering constant (0.0 means no dither)", ) FBF.add_class_args(parser, prefix) parser.add_argument( - p1 + "num-ceps", + "--num-ceps", type=int, default=13, help="Number of cepstra in MFCC computation (including C0)", ) parser.add_argument( - p1 + "snip-edges", + "--snip-edges", default=True, type=str2bool, help=( @@ -480,34 +474,34 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "energy-floor", + "--energy-floor", type=float, default=0, help="Floor on energy (absolute, not relative) in MFCC computation", ) parser.add_argument( - p1 + "raw-energy", + "--raw-energy", default=True, type=str2bool, help="If true, compute energy before preemphasis and windowing", ) parser.add_argument( - p1 + "use-energy", + "--use-energy", default=True, type=str2bool, help="Use energy (not C0) in MFCC computation", ) parser.add_argument( - p1 + "cepstral-lifter", + "--cepstral-lifter", type=float, default=22, help="Constant that controls scaling of MFCCs", ) parser.add_argument( - p1 + "input-step", + "--input-step", default="wave", choices=["wave", "fft", "spec", "log_spec", "logfb"], help=( @@ -516,7 +510,7 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - p1 + "output-step", + "--output-step", default="mfcc", choices=["fft", "spec", "log_spec", "logfb", "mfcc"], help=( @@ -524,4 +518,7 @@ def add_class_args(parser, prefix=None): ), ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + add_argparse_args = add_class_args diff --git a/hyperion/np/metrics/__init__.py b/hyperion/np/metrics/__init__.py index 36afdbf5..d45daba5 100644 --- a/hyperion/np/metrics/__init__.py +++ b/hyperion/np/metrics/__init__.py @@ -5,7 +5,10 @@ from .acc import compute_accuracy from .confusion_matrix import * -from .dcf import (compute_act_dcf, compute_dcf, compute_min_dcf, - fast_eval_dcf_eer) +from .dcf import compute_act_dcf, compute_dcf, compute_min_dcf, fast_eval_dcf_eer from .eer import compute_eer, compute_prbep from .utils import effective_prior +from 
.verification_evaluator import ( + VerificationEvaluator, + VerificationAdvAttackEvaluator, +) diff --git a/hyperion/np/metrics/cllr.py b/hyperion/np/metrics/cllr.py index ec816286..cd97a97c 100644 --- a/hyperion/np/metrics/cllr.py +++ b/hyperion/np/metrics/cllr.py @@ -5,7 +5,7 @@ import numpy as np -from ..utils.math import neglogsigmoid +from ..utils.math_funcs import neglogsigmoid from .utils import opt_loglr diff --git a/hyperion/np/metrics/utils.py b/hyperion/np/metrics/utils.py index 0715d809..e638fd1b 100644 --- a/hyperion/np/metrics/utils.py +++ b/hyperion/np/metrics/utils.py @@ -8,7 +8,7 @@ import numpy as np from ...hyp_defs import float_cpu -from ...utils.math import logsumexp, softmax +from ...utils.math_funcs import logsumexp, softmax def effective_prior(p_tar, c_miss, c_fa): diff --git a/hyperion/np/metrics/verification_evaluator.py b/hyperion/np/metrics/verification_evaluator.py index 2adf15cf..e35e7cf7 100644 --- a/hyperion/np/metrics/verification_evaluator.py +++ b/hyperion/np/metrics/verification_evaluator.py @@ -2,8 +2,6 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - - import copy import logging import re @@ -18,13 +16,13 @@ import matplotlib.pyplot as plt from ...hyp_defs import float_cpu -from ...utils import TrialKey, TrialScores +from ...utils import TrialKey, TrialScores, SparseTrialKey, SparseTrialScores from ...utils.trial_stats import TrialStats from .dcf import fast_eval_dcf_eer from .utils import effective_prior -class VerificationEvaluator(object): +class VerificationEvaluator: """Class computes performance metrics for verification problems. Same metrics can be obtained from fast_eval_dcf_eer functions @@ -34,21 +32,40 @@ class VerificationEvaluator(object): p_tar: target prior float or list/nparray sorted in ascending order c_miss: cost of miss c_fa: cost of false alarm - + key_name: name describing the key + score_name: name describing the score + sparse: use sparse versions of TrialScores and Keys """ - def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): - + def __init__( + self, + key, + scores, + p_tar, + c_miss=None, + c_fa=None, + key_name=None, + score_name=None, + sparse=False, + ): if isinstance(key, str): - logging.info("Load key: %s" % key) - key = TrialKey.load(key) + logging.info("Load key: %s", key) + if sparse: + key = SparseTrialKey.load(key) + else: + key = TrialKey.load(key) if isinstance(scores, str): - logging.info("Load scores: %s" % scores) - scores = TrialScores.load(scores) + logging.info("Load scores: %s", scores) + if sparse: + scores = SparseTrialScores.load(scores) + else: + scores = TrialScores.load(scores) self.key = key self.scores = scores.align_with_ndx(key) + self.key_name = key_name + self.score_name = score_name # compute effective prior is c_miss and c_fa are given if isinstance(p_tar, float): @@ -56,13 +73,16 @@ def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): p_tar = np.asarray(p_tar) if c_miss is not None and c_fa is not None: + assert len(c_miss) == len(p_tar) + assert len(c_fa) == len(p_tar) c_miss = np.asarray(c_miss) c_fa = np.asarray(c_fa) p_tar = effective_prior(p_tar, c_miss, c_fa) + self._p_tar_sort = np.argsort(p_tar) self.p_tar = p_tar - def compute_dcf_eer(self, return_df=False): + def compute_dcf_eer(self, return_df=True): """ Computes DCF/EER @@ -74,24 +94,38 @@ def compute_dcf_eer(self, return_df=False): """ logging.info("separating tar/non") tar, non = self.scores.get_tar_non(self.key) + ntar = 
len(tar)
+        nnon = len(non)
         logging.info("computing EER/DCF")
-        min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(tar, non, self.p_tar)
+        min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(
+            tar, non, self.p_tar[self._p_tar_sort]
+        )
+        min_dcf[self._p_tar_sort] = min_dcf.copy()
+        act_dcf[self._p_tar_sort] = act_dcf.copy()

         if not return_df:
-            return min_dcf, act_dcf, eer
+            return min_dcf, act_dcf, eer, ntar, nnon

         if len(self.p_tar) == 1:
             eer = [eer]
             min_dcf = [min_dcf]
             act_dcf = [act_dcf]

-        df = pd.DataFrame({"eer": eer})
-
+        df = pd.DataFrame(
+            {
+                "scores": [self.score_name],
+                "key": [self.key_name],
+                "eer": np.atleast_1d(eer),
+                "eer(%)": 100 * np.atleast_1d(eer),
+            }
+        )
         for i in range(len(min_dcf)):
             pi = self.p_tar[i]
             df["min-dcf-%.3f" % (pi)] = min_dcf[i]
             df["act-dcf-%.3f" % (pi)] = act_dcf[i]

+        df["num_targets"] = ntar
+        df["num_nontargets"] = nnon
         return df

@@ -116,9 +150,7 @@ class VerificationAdvAttackEvaluator(VerificationEvaluator):
     def __init__(
         self, key, scores, attack_scores, attack_stats, p_tar, c_miss=None, c_fa=None
     ):
-        super(VerificationAdvAttackEvaluator, self).__init__(
-            key, scores, p_tar, c_miss, c_fa
-        )
+        super().__init__(key, scores, p_tar, c_miss, c_fa)
         if not isinstance(attack_scores, list):
             attack_scores = [attack_scores]
         if not isinstance(attack_stats, list):
@@ -133,7 +165,7 @@ def __init__(
         if isinstance(attack_scores[0], str):
             l = []
             for file_path in attack_scores:
-                logging.info("Load attack scores: %s" % file_path)
+                logging.info("Load attack scores: %s", file_path)
                 scores = TrialScores.load(file_path)
                 l.append(scores)
             attack_scores = l
@@ -151,7 +183,7 @@ def __init__(
         if isinstance(attack_stats[0], str):
             l = []
             for file_path in attack_stats:
-                logging.info("Load attack stats: %s" % file_path)
+                logging.info("Load attack stats: %s", file_path)
                 scores = TrialStats.load(file_path)
                 l.append(scores)
             attack_stats = l
@@ -216,7 +248,7 @@ def compute_dcf_eer_vs_stats(
         stat_bins,
         attacked_trials="all",
         higher_better=False,
-        return_df=False,
+        return_df=True,
     ):
         """
         Computes DCF/EER versus SNR/Linf/etc curves
@@ -307,7 +339,7 @@ def find_best_attacks(
         threshold=None,
         prior_idx=0,
         higher_better=False,
-        return_df=False,
+        return_df=True,
     ):
         """
         Find the best attacks from the point of view of some of the stats.
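# The RandomState -> default_rng migration that runs through the pdfs below is
# not a mechanical rename: numpy's Generator API renames several methods, e.g.
# randn() becomes standard_normal(), and the two APIs use different underlying
# bit generators, so identical seeds do not reproduce identical samples across
# them. A minimal sketch, assuming only numpy:
#
#     import numpy as np
#
#     legacy = np.random.RandomState(seed=1024)
#     a = legacy.randn(16)                     # legacy API
#
#     rng = np.random.default_rng(seed=1024)
#     b = rng.standard_normal(16)              # Generator API; a != b despite same seed
#     c = rng.multivariate_normal(np.zeros(2), np.eye(2), size=(4,))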
E.g., diff --git a/hyperion/np/pdfs/core/normal.py b/hyperion/np/pdfs/core/normal.py index b8f8bb54..67872315 100644 --- a/hyperion/np/pdfs/core/normal.py +++ b/hyperion/np/pdfs/core/normal.py @@ -7,11 +7,20 @@ import scipy.linalg as la from ....hyp_defs import float_cpu -from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, - logdet_pdmat, symmat2vec, vec2symmat) -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from .exp_family import ExpFamily @@ -213,7 +222,7 @@ def sample(self, num_samples, rng=None, seed=1024): assert self.is_init if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) return rng.multivariate_normal(self.mu, self.Sigma, size=(num_samples,)).astype( float_cpu() ) diff --git a/hyperion/np/pdfs/core/normal_diag_cov.py b/hyperion/np/pdfs/core/normal_diag_cov.py index c9986f4c..23535112 100644 --- a/hyperion/np/pdfs/core/normal_diag_cov.py +++ b/hyperion/np/pdfs/core/normal_diag_cov.py @@ -7,9 +7,12 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from .exp_family import ExpFamily @@ -183,7 +186,7 @@ def sample(self, num_samples, rng=None, seed=1024): """ assert self.is_init if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) return self.mu + 1.0 / self.cholLambda * x diff --git a/hyperion/np/pdfs/hmm/hmm.py b/hyperion/np/pdfs/hmm/hmm.py index 80232e36..92d9c371 100644 --- a/hyperion/np/pdfs/hmm/hmm.py +++ b/hyperion/np/pdfs/hmm/hmm.py @@ -6,7 +6,7 @@ import numpy as np from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax +from ....utils.math_funcs import logsumexp, softmax from ..core import PDF @@ -232,7 +232,7 @@ def viterbi_decode(self, x, nbest=1): def sample(self, num_seqs, num_steps, rng=None, seed=1024): if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) x = np.zeros((num_seqs, num_steps, self.num_states), dtype=float_cpu()) x[:, 0, :] = rng.multinomial(1, self.pi, size=(num_seqs,)) diff --git a/hyperion/np/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py index 041431fb..6e2b79e3 100644 --- a/hyperion/np/pdfs/jfa/jfa_total.py +++ b/hyperion/np/pdfs/jfa/jfa_total.py @@ -7,8 +7,13 @@ from scipy import linalg as la from ....hyp_defs import float_cpu -from ....utils.math import (invert_pdmat, invert_trimat, logdet_pdmat, - symmat2vec, vec2symmat) +from ....utils.math_funcs import ( + invert_pdmat, + invert_trimat, + logdet_pdmat, + symmat2vec, + vec2symmat, +) from ..core.pdf import PDF diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index 5560882c..2186522e 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -7,7 +7,7 @@ import numpy as np from ....hyp_defs import float_cpu -from 
....utils.math import logsumexp, softmax +from ....utils.math_funcs import logsumexp, softmax from ....utils.queues import GeneratorQueue from ..core import PDF diff --git a/hyperion/np/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py index ca197142..7b080dae 100644 --- a/hyperion/np/pdfs/mixtures/gmm.py +++ b/hyperion/np/pdfs/mixtures/gmm.py @@ -8,12 +8,22 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import (fullcov_varfloor, invert_pdmat, invert_trimat, - logdet_pdmat, logsumexp, softmax, symmat2vec, - vec2symmat) -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import ( + fullcov_varfloor, + invert_pdmat, + invert_trimat, + logdet_pdmat, + logsumexp, + softmax, + symmat2vec, + vec2symmat, +) +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from ..core import Normal from .exp_family_mixture import ExpFamilyMixture @@ -292,7 +302,7 @@ def sample(self, num_samples, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py index 90141573..7589243e 100644 --- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py @@ -8,10 +8,13 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from .exp_family_mixture import ExpFamilyMixture @@ -262,7 +265,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py index 4dc8f46e..6ef7c891 100644 --- a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py @@ -7,10 +7,13 @@ from scipy.special import erf from ....hyp_defs import float_cpu -from ....utils.math import logsumexp, softmax -from ....utils.plotting import (plot_gaussian_1D, plot_gaussian_3D, - plot_gaussian_ellipsoid_2D, - plot_gaussian_ellipsoid_3D) +from ....utils.math_funcs import logsumexp, softmax +from ....utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_3D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, +) from ...clustering import KMeans from .gmm_diag_cov import GMMDiagCov @@ -193,7 +196,7 @@ def sample(self, num_samples=1, rng=None, seed=1024, r=None): Generated samples with shape (num_samples, x_dim). 
""" if rng is None: - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if r is None: r = rng.multinomial(1, self.pi, size=(num_samples,)) diff --git a/hyperion/np/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py index 183725a7..af8c5d8b 100644 --- a/hyperion/np/pdfs/plda/frplda.py +++ b/hyperion/np/pdfs/plda/frplda.py @@ -7,7 +7,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -465,7 +465,7 @@ def sample( assert self.is_init if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) Sb = invert_pdmat(self.B, return_inv=True)[-1] chol_Sb = sla.cholesky(Sb, lower=False) diff --git a/hyperion/np/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py index fd2eb9a9..76299970 100644 --- a/hyperion/np/pdfs/plda/plda.py +++ b/hyperion/np/pdfs/plda/plda.py @@ -7,7 +7,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -674,7 +674,7 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) x_dim = self.mu.shape[0] diff --git a/hyperion/np/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py index f9322d26..5d397183 100644 --- a/hyperion/np/pdfs/plda/splda.py +++ b/hyperion/np/pdfs/plda/splda.py @@ -6,7 +6,7 @@ from scipy import linalg as sla from ....hyp_defs import float_cpu -from ....utils.math import invert_pdmat, invert_trimat, logdet_pdmat +from ....utils.math_funcs import invert_pdmat, invert_trimat, logdet_pdmat from .plda_base import PLDABase @@ -502,7 +502,7 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): Generated samples with shape (num_samples, x_dim). """ if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) Sw = invert_pdmat(self.W, return_inv=True)[-1] chol_Sw = sla.cholesky(Sw, lower=False) diff --git a/hyperion/np/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py index 3f60c4be..ebabc6ec 100644 --- a/hyperion/np/transforms/skl_tsne.py +++ b/hyperion/np/transforms/skl_tsne.py @@ -23,7 +23,7 @@ class SklTSNE(NPModel): metric: the metric to use when calculating distance between instances in ['cosine', 'euclidean', 'l1', 'l2', 'precomputed'] or callable function. init: initialization method in ['random', 'pca'] or embedding matrix of shape (num_samples, num_comp) verbose: verbosity level. 
-      rng: RandomState instance
+      rng: numpy random Generator instance
       rng_seed: seed for random number generator
       method: gradient calculation method in ['barnes_hut', 'exact']
       angle: angle theta in Barnes-Hut TSNE
@@ -53,7 +53,7 @@ def __init__(
         super().__init__(**kwargs)
         self.rng_seed = rng_seed
         if rng is None:
-            rng = np.random.RandomState(seed=rng_seed)
+            rng = np.random.default_rng(seed=rng_seed)

         self._tsne = TSNE(
             n_components=tsne_dim,
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py
index fa675fdb..f91d7d96 100644
--- a/hyperion/torch/data/audio_dataset.py
+++ b/hyperion/torch/data/audio_dataset.py
@@ -304,6 +304,7 @@ def __getitem__(self, segment):
         x, fs = self._read_audio(seg_id, start, duration)
         x, fs = self._resample(x, fs)
         data = {"seg_id": seg_id, "sample_freq": fs}
+
         if self.augmenters:
             # augmentations
             if duration == 0:
@@ -324,6 +325,17 @@ def __getitem__(self, segment):
         seg_info = self._get_segment_info(seg_id)
         data.update(seg_info)

+        if np.any(~np.isfinite(data["x"])):
+            logging.warning(
+                "non-finite values in segment %s: x=(%f, %f, %f) x-aug=(%f, %f, %f)",
+                seg_id,
+                x.max(),
+                x.min(),
+                x.mean(),
+                data["x"].max(),
+                data["x"].min(),
+                data["x"].mean(),
+            )
         return data

     @staticmethod
diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py
index a8398dac..6d0b4df4 100644
--- a/hyperion/torch/layers/audio_feats_factory.py
+++ b/hyperion/torch/layers/audio_feats_factory.py
@@ -315,7 +315,7 @@ def add_class_args(parser, prefix=None):
         parser.add_argument(
             "--dither",
             type=float,
-            default=1,
+            default=1.0 / 2 ** 15,
             help="Dithering constant (0.0 means no dither)",
         )
diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py
index 06838ddd..29b6cdaa 100644
--- a/hyperion/torch/models/__init__.py
+++ b/hyperion/torch/models/__init__.py
@@ -7,11 +7,19 @@
 from .transducer import RNNRNNTransducer, RNNTransducer
 from .vae.vae import VAE
 from .vae.vq_vae import VQVAE
-from .wav2transducer import (HFWav2Vec2ConformerV1RNNTransducer,
-                             HFWav2Vec2RNNRNNTransducer,
-                             HFWav2Vec2RNNTransducer, HFWav2Vec2Transducer)
-from .wav2xvectors import (HFHubert2ResNet1dXVector, HFWav2Vec2ResNet1dXVector,
-                           HFWavLM2ResNet1dXVector)
+from .wav2transducer import (
+    HFWav2Vec2ConformerV1RNNTransducer,
+    HFWav2Vec2RNNRNNTransducer,
+    HFWav2Vec2RNNTransducer,
+    HFWav2Vec2Transducer,
+)
+from .wav2xvectors import (
+    HFHubert2ResNet1dXVector,
+    HFWav2Vec2ResNet1dXVector,
+    HFWavLM2ResNet1dXVector,
+    Wav2ResNetXVector,
+    Wav2ResNet1dXVector,
+)
 from .xvectors.efficient_net_xvector import EfficientNetXVector
 from .xvectors.resnet1d_xvector import ResNet1dXVector
 from .xvectors.resnet_xvector import ResNetXVector
diff --git a/hyperion/torch/models/plda/splda.py b/hyperion/torch/models/plda/splda.py
index 2272793e..3a0f1dee 100644
--- a/hyperion/torch/models/plda/splda.py
+++ b/hyperion/torch/models/plda/splda.py
@@ -8,7 +8,7 @@
 import torch
 import torch.nn as nn

-from ...utils.math import invert_trimat
+from ...utils.math_funcs import invert_trimat
 from .plda_base import PLDABase
diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py
index c2bcdf99..24ab5bbb 100644
--- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py
+++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py
@@ -224,7 +224,7 @@ def extract_embed(
     ):

         if vad_samples is not None:
-            x, x_lengths = remove_silence(x, x_lengths)
+            x, x_lengths = remove_silence(x, vad_samples, x_lengths)

         feats, _, feat_lengths = self.forward_feats(
             x, x_lengths,
chunk_length=hf_chunk_length, detach_chunks=detach_chunks @@ -301,7 +301,7 @@ def set_train_mode(self, mode): logging.info("train mode set to %s", mode) - if "nograd" in mode: + if "nograd" in mode or mode == "ft-embed-affine": logging.info("using torch.no_grad for hf_feats") self._hf_context = torch.no_grad() else: diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index 0d9f1bc4..0e4faded 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -52,3 +52,21 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNet1dXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py index 1f7283a0..11d643af 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -52,3 +52,21 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ResNetXVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNetXVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 4c21f478..4bbc0c4c 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import contextlib import logging from jsonargparse import ActionParser, ArgumentParser @@ -35,6 +36,23 @@ def __init__(self, feats, xvector): self.feats = feats self.xvector = xvector + self._feats_context = contextlib.nullcontext() + + @property + def sample_frequency(self): + return self.feats.sample_frequency + + def compute_prototype_affinity(self): + return self.xvector.compute_prototype_affinity() + + def update_loss_margin(self, epoch): + """Updates the value of the margin in AAM/AM-softmax losses + given the epoch number + + Args: + epoch: epoch which is about to start + """ + self.xvector.update_loss_margin(epoch) def rebuild_output_layer( self, @@ -58,8 +76,9 @@ def rebuild_output_layer( num_subcenters=num_subcenters, ) - def compute_prototype_affinity(self): - return self.xvector.compute_prototype_affinity() + def change_config(self, xvector): + logging.info("changing 
wav2xvector config") + self.xvector.change_config(**xvector) def forward( self, @@ -73,15 +92,28 @@ def forward( return_output=True, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - feats, feat_lengths = self.feats(x, x_lengths) - if vad_feats is not None: - feats, feat_lengths = remove_silence(feats, feat_lengths) - - # feat_lengths = torch.div(x_lengths * feats.size(-1), x.size(-1)) - return self.xvector(feats, feat_lengths, y, enc_layers, classif_layers, - return_output) + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) + + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + n = torch.sum(~torch.isfinite(feats)) + if n > 0: + print( + "feats", + n, + torch.sum(torch.isnan(feats)), + torch.sum(torch.any(torch.isnan(x), dim=-1)), + x.dtype, + feats.dtype, + flush=True, + ) + return self.xvector( + feats, feat_lengths, y, enc_layers, classif_layers, return_output + ) def extract_embed( self, @@ -94,18 +126,54 @@ def extract_embed( detach_chunks=False, ): - if vad_samples is not None: - x, x_lengths = remove_silence(x, x_lengths) - feats, feat_lengths = self.feats(x, x_lengths) - if vad_feats is not None: - feats, feat_lengths = remove_silence(feats, feat_lengths) + with self._feats_context: + if vad_samples is not None: + x, x_lengths = remove_silence(x, vad_samples, x_lengths) - feats = feats.transpose(1, 2) - return self.xvector.extract_embed(feats, feat_lengths, chunk_length, - embed_layer, detach_chunks) + feats, feat_lengths = self.feats(x, x_lengths) + if vad_feats is not None: + feats, feat_lengths = remove_silence(feats, vad_feats, feat_lengths) + + chunk_length = int(chunk_length * feats.shape[1] / x.shape[-1]) + + return self.xvector.extract_embed( + feats, feat_lengths, chunk_length, embed_layer, detach_chunks + ) def set_train_mode(self, mode): - self.xvector.set_train_mode(mode) + if mode == self._train_mode: + return + + if mode == "full-feats-grad": + self._feats_context = contextlib.nullcontext() + xvector_mode = "full" + else: + logging.info("using torch.no_grad for feats") + self._feats_context = torch.no_grad() + + self.xvector.set_train_mode(xvector_mode) + self._train_mode = mode + + def _train(self, train_mode: str): + + self.feats.train() + if train_mode in ["frozen"]: + super()._train(train_mode) + elif train_mode in ["full-feats-grad", "full"]: + self.xvector._train("full") + elif train_mode == "ft-embed-affine": + self.xvector._train("ft-embed_affine") + else: + raise ValueError(f"invalid train_mode={train_mode}") + + @staticmethod + def valid_train_modes(): + return [ + "full", + "frozen", + "ft-embed-affine", + "full-feats-grad", + ] def get_config(self): feat_cfg = self.feats.get_config() @@ -119,7 +187,7 @@ def get_config(self): return dict(list(base_config.items()) + list(config.items())) @staticmethod - def filter_args(*kwargs): + def filter_args(**kwargs): """Filters Wav2XVector class arguments from arguments dictionary. 
Args: @@ -150,5 +218,4 @@ def add_class_args(parser, prefix=None): AudioFeatsMVN.add_class_args(parser, prefix="feats") if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index a9ad224e..440c22b6 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -50,6 +50,10 @@ def __init__( self.trans = trans self.aug_after_mvn = aug_after_mvn + @property + def sample_frequency(self): + return self.audio_feats.fs + @property def fs(self): return self.audio_feats.fs diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 0cb887ca..e7020e1d 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -2,11 +2,11 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import os from collections import OrderedDict as ODict from copy import deepcopy from enum import Enum from typing import Optional +from pathlib import Path import torch import torch.nn as nn @@ -110,13 +110,11 @@ def valid_train_modes(): return ["full", "frozen"] def save(self, file_path): - file_dir = os.path.dirname(file_path) - if not (os.path.isdir(file_dir)): - os.makedirs(file_dir, exist_ok=True) - - config = self.get_config() + file_path = Path(file_path) + file_path.parent.mkdir(parents=True, exist_ok=True) torch.save( - {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()} + {"model_cfg": self.get_config(), "model_state_dict": self.state_dict()}, + file_path, ) @staticmethod @@ -176,7 +174,7 @@ def _fix_cfg_compatibility(class_obj, cfg): Fixed configuration dictionary. 
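# A sketch of the save/auto_load round trip above: save() now creates the parent
# directory with pathlib and stores {"model_cfg", "model_state_dict"} in a single
# checkpoint, and auto_load() rebuilds the model through TorchModel.registry
# keyed by class name. The file path below is hypothetical:
#
#     model.save("exp/xvector_nnet/model_ep0070.pth")
#     model2 = TorchModel.auto_load("exp/xvector_nnet/model_ep0070.pth")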
""" # for compatibility with older x-vector models - XVector = torch_model_registry["xvector"] + XVector = TorchModel.registry["XVector"] if issubclass(class_obj, XVector): # We renamed AM-softmax scale parameer s to cos_scale if "s" in cfg: @@ -195,8 +193,9 @@ def auto_load(file_path, extra_objs={}, map_location=None): cfg = model_data["model_cfg"] class_name = cfg["class_name"] del cfg["class_name"] - if class_name in torch_model_registry: - class_obj = torch_model_registry[class_name] + print(TorchModel.registry) + if class_name in TorchModel.registry: + class_obj = TorchModel.registry[class_name] elif class_name in extra_objs: class_obj = extra_objs[class_name] else: diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index fe72339f..4d4dd55a 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -100,3 +100,19 @@ def cat(cls, tables): ) df["class_idx"].drop(columns=["class_idx"], inplace=True) return cls(df) + + def filter( + self, + predicate=None, + items=None, + iindex=None, + columns=None, + by="id", + keep=True, + rebuild_idx=False, + ): + new_class_info = super().filter(predicate, items, iindex, columns, by, keep) + if rebuild_idx: + new_class_info.add_class_idx() + + return new_class_info diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index d1d969fb..dd446576 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -4,13 +4,14 @@ """ import logging from pathlib import Path -from typing import Dict, Optional, Union +from typing import List, Dict, Optional, Union from copy import deepcopy import math import numpy as np import pandas as pd import yaml +from .info_table import InfoTable from .class_info import ClassInfo from .feature_set import FeatureSet from .misc import PathLike @@ -30,7 +31,7 @@ class Dataset: Attributes: segments: SegmentSet object or path to it. 
classes: Dictionary of ClassInfo objects or paths to then - recordings: Dictionary of RecordingSet objects or paths to then + recordings: RecordingSet object or paths to then features: Dictionary of FeatureSet objects or paths to then enrollments: Dictionary of EnrollmentMap objects or paths to then trials: Dictionary of TrialKey/TrialNdx/SparseTrialKey objects @@ -45,7 +46,7 @@ def __init__( self, segments: Union[SegmentSet, PathLike], classes: Optional[Dict[str, Union[ClassInfo, PathLike]]] = None, - recordings: Optional[Dict[str, Union[RecordingSet, PathLike]]] = None, + recordings: Optional[Union[RecordingSet, PathLike]] = None, features: Optional[Dict[str, Union[FeatureSet, PathLike]]] = None, enrollments: Optional[Dict[str, Union[EnrollmentMap, PathLike]]] = None, trials: Optional[ @@ -65,24 +66,65 @@ def __init__( self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) - self._recordings, self._recordings_paths = self._parse_dict_args( - recordings, RecordingSet - ) + if isinstance(recordings, RecordingSet): + self._recordings = recordings + self._recordings_path = None + else: + assert isinstance(recordings, (str, Path)) + self._recordings = None + self._recordings_path = Path(recordings) + + # self._recordings, self._recordings_paths = self._parse_dict_args( + # recordings, RecordingSet + # ) self._features, self._features_paths = self._parse_dict_args( features, FeatureSet ) self._enrollments, self._enrollments_paths = self._parse_dict_args( - enrollments, - EnrollmentMap, + enrollments, EnrollmentMap, ) self._trials, self._trials_paths = self._parse_dict_args( - trials, - (TrialKey, TrialNdx, SparseTrialKey), + trials, (TrialKey, TrialNdx, SparseTrialKey), ) self.sparse_trials = sparse_trials self.table_sep = table_sep + self._files_to_delete = [] + + def get_dataset_files(self): + file_paths = [] + for file_path in [self._segments_path, self._recordings_path]: + if file_path is not None: + file_paths.append(file_path) + + for path_dict in [ + self._features_paths, + self._enrollments_paths, + self._trials_paths, + ]: + if path_dict is None: + continue + for k, v in path_dict.items(): + file_paths.append(v) + + return file_paths + + def _delete_files(self, dataset_dir): + if not self._files_to_delete: + return + + dataset_files = self.get_dataset_files() + for file_path in self._files_to_delete: + file_path = Path(file_path) + # if the file has been added again we don't delete + if file_path in dataset_files: + continue + + # if we are saving the dataset to another location + # we don't delete the one in the original + if file_path.parent == dataset_dir and file_path.is_file(): + file_path.unlink() def _parse_dict_args(self, data, types): if data is None: @@ -109,17 +151,38 @@ def segments(self, keep_loaded: bool = True): return self._segments - def recordings_value(self, key: str, keep_loaded: bool = True): - if self._recordings[key] is None: - assert self._recordings_paths[key] is not None - recordings = RecordingSet.load( - self._recordings_paths[key], sep=self.table_sep - ) + def __len__(self): + return len(self.segments()) + + def recordings(self, keep_loaded: bool = True): + if self._recordings is None: + assert self._recordings_path is not None + recordings = RecordingSet.load(self._recordings_path, sep=self.table_sep) if keep_loaded: - self._recordings[key] = recordings + self._recordings = recordings return recordings - return self._recordings[key] + return self._recordings + + # def recordings_value(self, key: str, keep_loaded: bool = True): + # 
if self._recordings[key] is None:
+    #         assert self._recordings_paths[key] is not None
+    #         recordings = RecordingSet.load(
+    #             self._recordings_paths[key], sep=self.table_sep
+    #         )
+    #         if keep_loaded:
+    #             self._recordings[key] = recordings
+    #         return recordings
+
+    #     return self._recordings[key]
+
+    def features_keys(self):
+        if self._features is not None:
+            return self._features.keys()
+        elif self._features_paths is not None:
+            return self._features_paths.keys()
+        else:
+            return {}

     def features_value(self, key: str, keep_loaded: bool = True):
         if self._features[key] is None:
@@ -131,6 +194,14 @@ def features_value(self, key: str, keep_loaded: bool = True):

         return self._features[key]

+    def classes_keys(self):
+        if self._classes is not None:
+            return self._classes.keys()
+        elif self._classes_paths is not None:
+            return self._classes_paths.keys()
+        else:
+            return {}
+
     def classes_value(self, key: str, keep_loaded: bool = True):
         if self._classes[key] is None:
             assert self._classes_paths[key] is not None
@@ -170,12 +241,12 @@ def trials_value(self, key: str, keep_loaded: bool = True):

         return self._trials[key]

-    def recordings(self, keep_loaded: bool = True):
-        if self._recordings is None:
-            yield from ()
-        else:
-            for key in self._recordings.keys():
-                yield key, self.recordings_value(key, keep_loaded)
+    # def recordings(self, keep_loaded: bool = True):
+    #     if self._recordings is None:
+    #         yield from ()
+    #     else:
+    #         for key in self._recordings.keys():
+    #             yield key, self.recordings_value(key, keep_loaded)

     def features(self, keep_loaded: bool = True):
         if self._features is None:
@@ -299,7 +370,6 @@ def save_changed(
         dataset_path: PathLike,
         update_paths: bool = True,
         table_sep: Optional[str] = None,
-        force_save_all: bool = False,
     ):
         """Saves the tables that changed or that are not yet in the output directory.
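# A sketch of the reworked Dataset API above: table accessors are lazy (the
# first call loads from the stored path and caches when keep_loaded=True),
# remove_* methods only queue paths in _files_to_delete, and _delete_files()
# unlinks them when the dataset is saved. Paths and keys below are hypothetical:
#
#     dataset = Dataset.load("data/train/dataset.yaml")
#     recs = dataset.recordings()               # loaded once, then cached
#     segs = dataset.segments(keep_loaded=True)
#     dataset.remove_trials("trials_o")         # queued only, nothing unlinked yet
#     dataset.save_changed("data/train")        # tables saved, stale files deleted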
@@ -330,24 +400,36 @@ def save_changed( if update_paths: self._segments_path = file_path - if self._recordings is not None: - file_names = {} - for k in self._recordings.keys(): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - if ( - self._recordings[k] is not None - or file_path != self._recordings_paths[k] - or not file_path.exists() - ): - v = self.recordings_value(k, keep_loaded=False) - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path - - if file_names: - dataset["recordings"] = file_names + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + if ( + self._recordings is not None + or file_path != self._recordings_path + or not file_path.exists() + ): + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path + + # if self._recordings is not None: + # file_names = {} + # for k in self._recordings.keys(): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # if ( + # self._recordings[k] is not None + # or file_path != self._recordings_paths[k] + # or not file_path.exists() + # ): + # v = self.recordings_value(k, keep_loaded=False) + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names if self._features is not None: file_names = {} @@ -428,6 +510,8 @@ def save_changed( with open(dataset_file, "w") as f: yaml.dump(dataset, f) + self._delete_files(dataset_dir) + def save_all( self, dataset_path: PathLike, @@ -457,17 +541,24 @@ def save_all( if update_paths: self._segments_path = file_path - file_names = {} - for k, v in self.recordings(keep_loaded=False): - file_name = k + table_ext - file_names[k] = file_name - file_path = dataset_dir / file_name - v.save(file_path, sep=table_sep) - if update_paths: - self._recordings_paths[k] = file_path + file_name = f"recordings{table_ext}" + dataset["recordings"] = file_name + file_path = dataset_dir / file_name + self.recordings(keep_loaded=False).save(file_path, sep=table_sep) + if update_paths: + self._recordings_path = file_path - if file_names: - dataset["recordings"] = file_names + # file_names = {} + # for k, v in self.recordings(keep_loaded=False): + # file_name = k + table_ext + # file_names[k] = file_name + # file_path = dataset_dir / file_name + # v.save(file_path, sep=table_sep) + # if update_paths: + # self._recordings_paths[k] = file_path + + # if file_names: + # dataset["recordings"] = file_names file_names = {} for k, v in self.features(keep_loaded=False): @@ -520,10 +611,13 @@ def save_all( with open(dataset_file, "w") as f: yaml.dump(dataset, f) + self._delete_files(dataset_dir) + def update_from_disk(self): self.segments() - for k, v in self.recordings(): - pass + self.recordings() + # for k, v in self.recordings(): + # pass for k, v in self.features(): pass @@ -568,9 +662,10 @@ def load( classes[k] = Dataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: - recordings = {} - for k, v in dataset["recordings"].items(): - recordings[k] = Dataset.resolve_file_path(dataset_dir, v) + recordings = Dataset.resolve_file_path(dataset_dir, dataset["recordings"]) + # recordings = {} + # for k, v in dataset["recordings"].items(): + # recordings[k] = Dataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} @@ -615,32 +710,42 @@ def 
add_features(self, features_name: str, features: Union[PathLike, FeatureSet]
         else:
             raise ValueError()

-    def add_recordings(
-        self,
-        recordings_name: str,
-        recordings: Union[PathLike, RecordingSet],
+    def set_segments(
+        self, segments: Union[PathLike, SegmentSet], update_seg_durs: bool = False,
     ):
-        if self._recordings is None:
-            self._recordings = {}
-            self._recordings_paths = {}
+        if isinstance(segments, (str, Path)):
+            self._segments = None
+            self._segments_path = segments
+        elif isinstance(segments, SegmentSet):
+            self._segments = segments
+            self._segments_path = None
+        else:
+            raise ValueError()

-        if isinstance(features, (str, Path)):
-            self._recordings[features_name] = None
-            self._recordings_paths[recordings_name] = recordings
+    def set_recordings(
+        self, recordings: Union[PathLike, RecordingSet], update_seg_durs: bool = False,
+    ):
+        if isinstance(recordings, (str, Path)):
+            self._recordings = None
+            self._recordings_path = Path(recordings)
         elif isinstance(recordings, RecordingSet):
-            self._recordings[recordings_name] = recordings
-            self._recordings_paths[recordings_name] = None
+            self._recordings = recordings
+            self._recordings_path = None
         else:
             raise ValueError()

+        if update_seg_durs:
+            rec_ids = self.segments(keep_loaded=True).recordings()
+            self.segments()["duration"] = self.recordings().loc[rec_ids, "duration"]
+
     def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]):
         if self._classes is None:
             self._classes = {}
             self._classes_paths = {}

         if isinstance(classes, (str, Path)):
-            self._classes[features_name] = None
-            self._classes_paths[classes_name] = classes
+            self._classes[classes_name] = None
+            self._classes_paths[classes_name] = Path(classes)
         elif isinstance(classes, ClassInfo):
             self._classes[classes_name] = classes
             self._classes_paths[classes_name] = None
@@ -648,9 +753,7 @@ def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]):
             raise ValueError()

     def add_enrollments(
-        self,
-        enrollments_name: str,
-        enrollments: Union[PathLike, EnrollmentMap],
+        self, enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap],
     ):
         if self._enrollments is None:
             self._enrollments = {}
@@ -658,7 +761,7 @@ def add_enrollments(

         if isinstance(enrollments, (str, Path)):
             self._enrollments[enrollments_name] = None
-            self._enrollments_paths[enrollments_name] = enrollments
+            self._enrollments_paths[enrollments_name] = Path(enrollments)
         elif isinstance(enrollments, EnrollmentMap):
             self._enrollments[enrollments_name] = enrollments
             self._enrollments_paths[enrollments_name] = None
@@ -675,8 +778,8 @@ def add_trials(
             self._trials_paths = {}

         if isinstance(trials, (str, Path)):
-            self._trials[features_name] = None
-            self._trials_paths[trials_name] = trials
+            self._trials[trials_name] = None
+            self._trials_paths[trials_name] = Path(trials)
         elif isinstance(trials, (TrialKey, TrialNdx, SparseTrialKey)):
             self._trials[trials_name] = trials
             self._trials_paths[trials_name] = None
@@ -685,85 +788,104 @@
     def remove_features(self, features_name: str):
         if self._features_paths[features_name] is not None:
-            file_path = Path(self._features_paths[features_name])
-            if file_path.is_file():
-                file_path.unlink()
+            self._files_to_delete.append(self._features_paths[features_name])

         del self._features[features_name]
         del self._features_paths[features_name]

-    def remove_recordings(
-        self,
-        recordings_name: str,
-    ):
-        if self._recordingsr_paths[recordings_name] is not None:
-            file_path = Path(self._recordings_paths[recordings_name])
-            if file_path.is_file():
-                file_path.unlink()
+    def remove_recordings(self,):
+        if self._recordings_path is not None:
+            self._files_to_delete.append(self._recordings_path)

-        del self._recordings[recordings_name]
-        del self._recordings_paths[recordings_name]
+        self._recordings = None
+        self._recordings_path = None
+
+    # def remove_recordings(
+    #     self,
+    #     recordings_name: str,
+    # ):
+    #     if self._recordingsr_paths[recordings_name] is not None:
+    #         file_path = Path(self._recordings_paths[recordings_name])
+    #         if file_path.is_file():
+    #             file_path.unlink()
+
+    #     del self._recordings[recordings_name]
+    #     del self._recordings_paths[recordings_name]

     def remove_classes(self, classes_name: str):
         if self._classes_paths[classes_name] is not None:
-            file_path = Path(self._classes_paths[classes_name])
-            if file_path.is_file():
-                file_path.unlink()
+            self._files_to_delete.append(self._classes_paths[classes_name])

         del self._classes[classes_name]
         del self._classes_paths[classes_name]

     def remove_enrollments(
-        self,
-        enrollments_name: str,
+        self, enrollments_name: str,
     ):
         if self._enrollments_paths[enrollments_name] is not None:
-            file_path = Path(self._enrollments_paths[enrollments_name])
-            if file_path.is_file():
-                file_path.unlink()
+            self._files_to_delete.append(self._enrollments_paths[enrollments_name])

         del self._enrollments[enrollments_name]
         del self._enrollments_paths[enrollments_name]

     def remove_trials(
-        self,
-        trials_name: str,
+        self, trials_name: str,
     ):
         if self._trials_paths[trials_name] is not None:
-            file_path = Path(self._trials_paths[trials_name])
-            if file_path.is_file():
-                file_path.unlink()
+            self._files_to_delete.append(self._trials_paths[trials_name])

         del self._trials[trials_name]
         del self._trials_paths[trials_name]

-    def set_segments(self, segments: Union[PathLike, SegmentSet]):
-        if isinstance(segments, SegmentSet):
-            self._segments = segments
-        else:
-            self._segments_path = segments
+    def add_cols_to_segments(
+        self,
+        right_table: Union[InfoTable, pd.DataFrame, PathLike],
+        column_names: Union[None, str, List[str], np.ndarray] = None,
+        on: Union[str, List[str], np.ndarray] = "id",
+        right_on: Union[None, str, List[str], np.ndarray] = None,
+    ):
+        if isinstance(right_table, (str, Path)):
+            file_path = Path(right_table)
+            if file_path.is_file():
+                right_table = InfoTable.load(file_path)
+            else:
+                if right_table == "recordings":
+                    right_table = self.recordings()
+                elif right_table in self.features_keys():
+                    right_table = self.features_value(right_table)
+                elif right_table in self.classes_keys():
+                    right_table = self.classes_value(right_table)
+                else:
+                    raise ValueError(f"{right_table} not found")
+
+        segments = self.segments(keep_loaded=True)
+        segments.add_columns(right_table, column_names, on=on, right_on=right_on)

-    def clean(self):
-        rec_ids = self.segments().recording_ids()
-        for k, table in self.recordings():
-            table = table.loc[table["id"].isin(rec_ids)].copy()
-            self._recordings[k] = RecordingSet(table)
+    def clean(self, rebuild_class_idx=False):
+        rec_ids = self.segments().recordings()
+        # for k, table in self.recordings():
+        #     # table = table.loc[table["id"].isin(rec_ids)].copy()
+        #     # self._recordings[k] = RecordingSet(table)
+        self._recordings = self.recordings().filter(lambda df: df["id"].isin(rec_ids))

         ids = self.segments()["id"].values
         for k, table in self.features():
-            table = table.loc[table["id"].isin(ids)].copy()
-            self._features[k] = FeatureSet(table)
+            self._features[k] = table.filter(lambda df: df["id"].isin(ids))
+            # table = table.loc[table["id"].isin(ids)].copy()
+            # self._features[k] =
FeatureSet(table) for k, table in self.classes(): class_ids = self.segments()[k].unique() - table = table[table["id"].isin(class_ids)].copy() - self._classes[k] = ClassInfo(table) + self._classes[k] = table.filter(lambda df: df["id"].isin(class_ids)) + # table = table[table["id"].isin(class_ids)].copy() + # self._classes[k] = ClassInfo(table) remove_keys = [] for k, table in self.enrollments(): - table = table.loc[table["segmentid"].isin(ids)].copy() + # table = table.loc[table["segmentid"].isin(ids)].copy() + table = table.filter(lambda df: df["segmentid"].isin(ids)) if len(table) > 0: - self._enrollments[k] = EnrollmentMap(table) + self._enrollments[k] = table else: remove_keys.append(k) @@ -790,7 +912,7 @@ def _split_into_trials_and_cohort( seed: int, ): # select test speakers - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) spks = segments["speaker"].unique() trial_spks = rng.choice(spks, size=(num_trial_speakers,), replace=False) @@ -859,20 +981,14 @@ def split_into_trials_and_cohort( segments_male = SegmentSet(segments[segments["gender"] == "m"]) segments_female = SegmentSet(segments[segments["gender"] == "f"]) trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( - segments_male, - num_tar_trials, - num_trial_speakers, - seed, + segments_male, num_tar_trials, num_trial_speakers, seed, ) ( trials_female, enroll_female, cohort_female, ) = self._split_into_trials_and_cohort( - segments_female, - num_tar_trials, - num_trial_speakers, - seed, + segments_female, num_tar_trials, num_trial_speakers, seed, ) trials = TrialKey.merge([trials_male, trials_female]) enroll = EnrollmentMap.cat([enroll_male, enroll_female]) @@ -880,10 +996,7 @@ def split_into_trials_and_cohort( else: segments = self.segments() trials, enroll, cohort = self._split_into_trials_and_cohort( - segments, - num_tar_trials, - num_trial_speakers, - seed, + segments, num_tar_trials, num_trial_speakers, seed, ) dataset_trials = self.clone() @@ -899,3 +1012,176 @@ def split_into_trials_and_cohort( dataset_cohort.clean() return dataset_trials, dataset_cohort + + def remove_short_segments(self, min_length: float, length_name: str = "duration"): + segments = self.segments() + self._segments = segments.filter(lambda df: df[length_name] >= min_length) + self.clean() + + def remove_classes_few_segments( + self, class_name: str, min_segs: int, rebuild_idx: bool = False, + ): + segments = self.segments() + classes, counts = np.unique(segments[class_name], return_counts=True) + keep_classes = classes[counts >= min_segs] + self._segments = segments.filter(lambda df: df[class_name].isin(keep_classes)) + self.clean() + if rebuild_idx: + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def rebuild_class_idx(self, class_name: str): + class_info = self.classes_value(class_name) + class_info.add_class_idx() + + def _segments_split(self, val_prob: float, rng: np.random.Generator): + segments = self.segments() + p = rng.permutation(len(segments)) + num_train = int(round((1 - val_prob) * len(p))) + + train_idx = p[:num_train] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_idx = p[num_train:] + val_segs = segments.filter(iindex=val_idx) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_classes( + self, + val_prob: float, + joint_classes: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[joint_classes].apply("-".join, axis=1) + u_classes, 
class_ids = np.unique(classes, return_inverse=True) + train_mask = np.zeros(len(segments), dtype=bool) + kk = 0 + for c_id in range(len(u_classes)): + idx = (class_ids == c_id).nonzero()[0] + count = len(idx) + p = rng.permutation(count) + num_train = max( + int(round((1 - val_prob) * count)), min(min_train_samples, count) + ) + kk += count - num_train + train_idx = idx[p[:num_train]] + train_mask[train_idx] = True + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_disjoint_classes( + self, val_prob: float, disjoint_classes: List[str], rng: np.random.Generator, + ): + segments = self.segments() + classes = segments[disjoint_classes].apply("-".join, axis=1) + u_classes, class_ids = np.unique(classes, return_inverse=True) + p = rng.permutation(len(u_classes)) + class_ids = p[class_ids] + num_train = int(round((1 - val_prob) * len(segments))) + train_mask = np.zeros(len(segments), dtype=bool) + count_acc = 0 + for c_id in range(len(u_classes)): + idx = (class_ids == c_id).nonzero()[0] + train_mask[idx] = True + count = len(idx) + count_acc += count + if count_acc >= num_train: + break + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def _segments_split_joint_and_disjoint_classes( + self, + val_prob: float, + joint_classes: List[str], + disjoint_clases: List[str], + min_train_samples: int, + rng: np.random.Generator, + ): + raise NotImplementedError("I'll implement this when I need it") + segments = self.segments() + j_classes = segments[joint_classes].apply("-".join, axis=1) + ju_classes, j_class_ids = np.unique(j_classes, return_inverse=True) + d_classes = segments[disjoint_classes].apply("-".join, axis=1) + du_classes, d_class_ids = np.unique(d_classes, return_inverse=True) + d_p = rng.permutation(len(du_classes)) + d_class_ids = d_p[d_class_ids] + d_sort_idx = np.argsort(d_class_ids) + d_sort_j_class_ids = j_class_ids[d_sort_idx] + + train_d_classes = set() + for c_id in range(len(ju_classes)): + idx = (j_sort_class_ids == c_id).nonzero()[0] + count = len(idx) + num_train = max( + int(round((1 - val_prob) * count)), min(min_train_samples, count) + ) + sel_d_class_ids = set(d_sort_idx[:num_train]) + train_d_classes = train_d_classes.union(sel_d_class_ids) + + train_mask = np.zeros(len(segments), dtype=bool) + for c_id in train_d_classes: + mask = d_class_ids == c_id + train_mask[mask] = True + + train_idx = train_mask.nonzero()[0] + train_segs = segments.filter(iindex=train_idx) + train_segs.sort() + + val_segs = segments.filter(iindex=train_idx, keep=False) + val_segs.sort() + + return train_segs, val_segs + + def split_train_val( + self, + val_prob: float, + joint_classes: Optional[List[str]] = None, + disjoint_classes: Optional[List[str]] = None, + min_train_samples: int = 1, + seed: int = 11235813, + ): + rng = np.random.default_rng(seed) + if joint_classes is None and disjoint_classes is None: + train_segs, val_segs = self._segments_split(val_prob, rng) + elif joint_classes is not None and disjoint_classes is None: + train_segs, val_segs = self._segments_split_joint_classes( + val_prob, joint_classes, min_train_samples, rng, + ) + elif joint_classes is None and disjoint_classes is not None: + train_segs, val_segs = 
self._segments_split_disjoint_classes( + val_prob, disjoint_classes, rng, + ) + else: + train_segs, val_segs = self._segments_split_joint_and_disjoint_classes( + val_prob, joint_classes, disjoint_classes, min_train_samples, rng, + ) + + train_ds = self.clone() + train_ds.set_segments(train_segs) + train_ds.clean() + + val_ds = self.clone() + val_ds.set_segments(val_segs) + val_ds.clean() + + return train_ds, val_ds diff --git a/hyperion/utils/fold_list.py b/hyperion/utils/fold_list.py index f22263cf..80b818d6 100644 --- a/hyperion/utils/fold_list.py +++ b/hyperion/utils/fold_list.py @@ -176,7 +176,7 @@ def create( FoldList object. """ if shuffle: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) if group_by_key is None: group_by_key = segment_key diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 45eab05f..57f3faf2 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -8,6 +8,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path +from typing import Optional, Union, List import numpy as np import pandas as pd @@ -192,14 +193,41 @@ def cat(cls, tables): ].is_unique, """there are duplicated ids in the tables we are concatenating""" return cls(df) - def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): + def filter( + self, predicate=None, items=None, iindex=None, columns=None, by="id", keep=True + ): + """Filters the table and produce a new table with the elements to keep + + Args: + predicate: callable function that defines the filtering criterion e.g.: + lambda df: df["duration"] > 1.0. + items: filters the table based in column value with pandas command: + df.loc[items, by], used only if predicate is None + iindex: filters the table based on integer index with pandas command: + df.iloc[iiindex], used if predicate and items are None + columns: columns to keep of remove. + by: column id to use with itmes criterion + keep: if True, the criterion is used to keep rows, if False it is used + to remove rows + + Returns + InfoTable of the same class as the input. + """ assert ( - items is None or iindex is None - ), "items and iindex cannot be not None at the same time" + predicate is not None + or items is not None + or iindex is not None + or columns is not None + ), "predicate, items, iindex and columns cannot be not None at the same time" df = self.df + if predicate is not None: + mask = predicate(self.df) + if not keep: - if items is not None: + if predicate is not None: + mask = np.logical_not(mask) + elif items is not None: items = np.setdiff1d(df[by], items) elif iindex is not None: iindex = np.setdiff1d(np.arange(len(df)), iindex) @@ -207,7 +235,12 @@ def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): if columns is not None: columns = np.setdiff1d(df.columns, columns) - if items is not None: + if predicate is not None: + if columns is None: + df = df.loc[mask] + else: + df = df.loc[mask, columns] + elif items is not None: if by != "id": missing = [False if v in df[by] else True for v in items] if any(missing): @@ -225,7 +258,7 @@ def filter(self, items=None, iindex=None, columns=None, by="id", keep=True): if columns is not None: df = df[columns] - return self.__class__(df) + return self.__class__(df.copy()) def __eq__(self, other): """Equal operator""" @@ -255,7 +288,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
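# A sketch of the extended InfoTable.filter above: a callable predicate builds a
# boolean mask from the underlying dataframe, and keep=False inverts the
# selection; the column names below are hypothetical:
#
#     long_segs = segments.filter(lambda df: df["duration"] >= 2.0)
#     short_segs = segments.filter(lambda df: df["duration"] >= 2.0, keep=False)
#     two_cols = segments.filter(columns=["id", "duration"])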
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.df)) rng.shuffle(index) self.df = self.df.iloc[index] @@ -279,14 +312,33 @@ def get_loc(self, keys): loc = self.df.index.get_loc(keys) if isinstance(loc, int): return loc - elif isinstance(loc, np.ndarray) and loc.dtype == np.bool: + + if isinstance(loc, np.ndarray) and loc.dtype == np.bool: return np.nonzero(loc)[0] - else: - return list(range(loc.start, loc.stop, loc.step)) + + return list(range(loc.start, loc.stop, loc.step)) def get_col_idx(self, keys): return self.df.columns.get_loc(keys) + def add_columns( + self, + right_table, + column_names: Union[None, str, List[str], np.ndarray] = None, + on: Union[str, List[str], np.ndarray] = "id", + right_on: Union[None, str, List[str], np.ndarray] = None, + ): + if isinstance(right_table, InfoTable): + right_table = right_table.df + + if column_names is not None: + right_table = right_table[column_names] + + if right_on is None: + right_on = on + + self.df = self.df.merge(right_table, how="left", left_on=on, right_on=right_on) + # def __len__(self): # """Returns the number of elements in the list.""" diff --git a/hyperion/utils/math.py b/hyperion/utils/math_funcs.py similarity index 93% rename from hyperion/utils/math.py rename to hyperion/utils/math_funcs.py index 84596f7d..5ee510b9 100644 --- a/hyperion/utils/math.py +++ b/hyperion/utils/math_funcs.py @@ -346,10 +346,26 @@ def int2onehot(class_ids, num_classes=None): return p -def cosine_scoring(x1, x2): +def average_vectors(x, ids): + assert x.shape[0] == len(ids) + num_ids = np.max(ids) + 1 + x_avg = np.zeros((num_ids, x.shape[1]), dtype=x.dtype) + for i in range(num_ids): + mask = ids == i + x_avg[i] = np.mean(x[mask], axis=0) - l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True)) - l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True)) + return x_avg + + +def cosine_scoring(x1, x2, ids1=None, ids2=None): + if ids1 is not None: + x1 = average_vectors(x1, ids1) + + if ids2 is not None: + x2 = average_vectors(x2, ids2) + + l2_1 = np.sqrt(np.sum(x1 ** 2, axis=-1, keepdims=True) + 1e-10) + l2_2 = np.sqrt(np.sum(x2 ** 2, axis=-1, keepdims=True) + 1e-10) x1 = x1 / l2_1 x2 = x2 / l2_2 diff --git a/hyperion/utils/plotting.py b/hyperion/utils/plotting.py index 2341beb4..ec617975 100644 --- a/hyperion/utils/plotting.py +++ b/hyperion/utils/plotting.py @@ -4,6 +4,7 @@ """ import matplotlib + # matplotlib.use('Agg') import matplotlib.pyplot as plt import numpy as np @@ -11,7 +12,7 @@ import scipy.stats as stats from mpl_toolkits.mplot3d import Axes3D as plt3d -from .math import invert_pdmat +from .math_funcs import invert_pdmat def plot_gaussian_1D(mu, C, num_sigmas=3, num_pts=100, weight=1, **kwargs): diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 5abf76f2..070e4f53 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -384,7 +384,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) diff --git a/hyperion/utils/segment_set.py b/hyperion/utils/segment_set.py index 6aef5bb2..a99b4e1e 100644 --- a/hyperion/utils/segment_set.py +++ b/hyperion/utils/segment_set.py @@ -13,42 +13,48 @@ class SegmentSet(InfoTable): def __init__(self, df): super().__init__(df) - if "start" in df and "recording_id" not in df: - df["recording_id"] = df["id"] + if "start" in df and "recordings" not in df: + df["recordings"] = df["id"] - if "start" not in df and "recording_id" in df: + if "start" not in df and "recordings" in df: df["start"] = 0.0 @property def has_time_marks(self): - return ( - "recording_id" in self.df and "start" in self.df and "duration" in self.df - ) + return "recordings" in self.df and "start" in self.df and "duration" in self.df @property def has_recording_ids(self): - return "recording_id" in self.df + return "recordings" in self.df - def recording_ids(self, ids=None): + @property + def has_recordings(self): + return "recordings" in self.df + + def recordings(self, ids=None): if ids is None: - if "recording_id" in self.df: - return self.df["recording_id"] + if "recordings" in self.df: + return self.df["recordings"] else: return self.df["id"] - if "recording_id" in self.df: - return self.df.loc[ids, "recording_id"] + if "recordings" in self.df: + return self.df.loc[ids, "recordings"] return ids - def recording_time_marks(self, ids): - if "recording" in self.df: - rec_col = "recording_id" - else: - rec_col = "id" + def recording_ids(self, ids=None): + return self.recordings(ids) + + def recording_time_marks(self, ids, recordings_name: str = "recordings"): + if recordings_name == "recordings": + if "recordings" in self.df: + recordings_name = "recordings" + else: + recordings_name = "id" assert "duration" in self.df if "start" not in self.df: self.df["start"] = 0.0 - return self.df.loc[ids, [rec_col, "start", "duration"]] + return self.df.loc[ids, [recordings_name, "start", "duration"]] diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index 1bc321a7..62fcd446 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -145,7 +145,7 @@ def load_table(cls, file_path, sep=None): file_path: File to read the list. Returns: - TrialKey object. + SparseTrialKey object. 
""" file_path = Path(file_path) ext = file_path.suffix @@ -156,19 +156,15 @@ def load_table(cls, file_path, sep=None): models = df["modelid"].values segments = df["segmentid"].values is_tar = (df["targettype"] == "target").values - model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True - ) - seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True - ) + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") - for item in zip(model_idx, seg_idx, is_tar): - if item[2]: - tar[item[0], item[1]] = True + for i, j, target_type in zip(model_idx, seg_idx, is_tar): + if target_type: + tar[i, j] = True else: - non[item[0], item[1]] = True + non[i, j] = True return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) @classmethod diff --git a/hyperion/utils/sparse_trial_scores.py b/hyperion/utils/sparse_trial_scores.py index 7ed9a1d1..760bd1f1 100644 --- a/hyperion/utils/sparse_trial_scores.py +++ b/hyperion/utils/sparse_trial_scores.py @@ -3,12 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import copy import logging -import os.path as path +from pathlib import Path import numpy as np +import pandas as pd import scipy.sparse as sparse from ..hyp_defs import float_cpu @@ -18,9 +18,6 @@ from .trial_ndx import TrialNdx from .trial_scores import TrialScores -# import h5py - - class SparseTrialScores(TrialScores): @@ -55,6 +52,26 @@ def save_txt(self, file_path): % (self.model_set[r], self.seg_set[c], self.scores[r, c]) ) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + self.score_mask.eliminate_zeros() + score_mask = self.score_mask.tocoo() + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR\n") + for i, j in zip(score_mask.row, score_mask.col): + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + ) + @classmethod def load_h5(cls, file_path): raise NotImplementedError() @@ -90,6 +107,35 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. 
+ """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + scores = sparse.lil_matrix((len(model_set), len(seg_set)), dtype=float_cpu()) + score_mask = sparse.lil_matrix(scores.shape, dtype="bool") + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) + @classmethod def merge(cls, scr_list): raise NotImplementedError() @@ -160,9 +206,9 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): if not (np.all(f_mod) and np.all(f_seg)): for i in (f_mod == 0).nonzero()[0]: - logging.info("model %s not found" % model_set[i]) + logging.info("model %s not found", model_set[i]) for i in (f_seg == 0).nonzero()[0]: - logging.info("segment %s not found" % seg_set[i]) + logging.info("segment %s not found", seg_set[i]) if raise_missing: raise Exception("some scores were not computed") @@ -172,18 +218,36 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): scores = self.scores.tocoo() new_data = scores.data new_row = scores.row.copy() + # for i, r in enumerate(mod_idx): + # if f_mod[i] and i != r: + # idx = scores.row == r + # new_row[idx] = i + + # new_col = scores.col.copy() + # for j, c in enumerate(seg_idx): + # if f_seg[j] and j != c: + # idx = scores.col == c + # new_col[idx] = j + + # idx = np.logical_and(new_row < num_mod, new_col < num_seg) + # if not np.all(idx): + # new_data = new_data[idx] + # new_row = new_row[idx] + # new_col = new_col[idx] + + new_row = -1 * np.ones_like(scores.row) for i, r in enumerate(mod_idx): - if f_mod[i] and i != r: + if f_mod[i]: idx = scores.row == r new_row[idx] = i - new_col = scores.col.copy() + new_col = -1 * np.ones_like(scores.col) for j, c in enumerate(seg_idx): - if f_seg[j] and j != c: + if f_seg[j]: idx = scores.col == c new_col[idx] = j - idx = np.logical_and(new_row < num_mod, new_col < num_seg) + idx = np.logical_and(new_row != -1, new_col != -1) if not np.all(idx): new_data = new_data[idx] new_row = new_row[idx] @@ -193,19 +257,37 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): score_mask = self.score_mask.tocoo() new_data = score_mask.data - new_row = score_mask.row.copy() + # new_row = score_mask.row.copy() + # for i, r in enumerate(mod_idx): + # if f_mod[i] and i != r: + # idx = score_mask.row == r + # new_row[idx] = i + + # new_col = score_mask.col.copy() + # for j, c in enumerate(seg_idx): + # if f_seg[j] and j != c: + # idx = score_mask.col == c + # new_col[idx] = j + + # idx = np.logical_and(new_row < num_mod, new_col < num_seg) + # if not np.all(idx): + # new_data = new_data[idx] + # new_row = new_row[idx] + # new_col = new_col[idx] + + new_row = -1 * np.ones_like(score_mask.row) for i, r in enumerate(mod_idx): - if f_mod[i] and i != r: + if f_mod[i]: idx = score_mask.row == r new_row[idx] = i - new_col = score_mask.col.copy() + new_col = -1 * np.ones_like(score_mask.col) for j, c in enumerate(seg_idx): - if f_seg[j] and j != c: + if f_seg[j]: idx = score_mask.col == c new_col[idx] = j - idx = np.logical_and(new_row < num_mod, new_col < num_seg) + idx = np.logical_and(new_row != -1, new_col != -1) if not np.all(idx): new_data = new_data[idx] 
new_row = new_row[idx] @@ -249,7 +331,7 @@ def align_with_ndx(self, ndx, raise_missing=True): if not scr.score_mask[r, c]: missing_scores = True logging.info( - "missing-scores for %s %s" % (scr.model_set[r], scr.seg_set[c]) + "missing-scores for %s %s", scr.model_set[r], scr.seg_set[c] ) if missing_scores and raise_missing: @@ -291,7 +373,7 @@ def set_valid_scores(self, scores, ndx=None): self.scores = scr.scores self.score_mat = scr.score_mat - self.scores[self.score_mask]=scores + self.scores[self.score_mask] = scores @classmethod def from_trial_scores(cls, scr): @@ -302,6 +384,12 @@ def from_trial_scores(cls, scr): score_mask.eliminate_zeros() return cls(scr.model_set, scr.seg_set, scores, score_mask) + def to_trial_scores(self): + scores = self.scores.toarray("C") + score_mask = self.score_mask.toarray("C") + # scores[~score_mask] = 0.0 + return TrialScores(self.model_set, self.seg_set, scores, score_mask) + def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. diff --git a/hyperion/utils/train_val_eval_list.py b/hyperion/utils/train_val_eval_list.py index fd17e240..cbccf093 100644 --- a/hyperion/utils/train_val_eval_list.py +++ b/hyperion/utils/train_val_eval_list.py @@ -207,7 +207,7 @@ def create( part_names = ["train", "eval"] if shuffle: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) if group_by_key is None: group_by_key = segment_key diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index 4a99461b..5d8019b6 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -11,7 +11,8 @@ import numpy as np import pandas as pd -from .list_utils import * +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray from .trial_ndx import TrialNdx @@ -178,7 +179,8 @@ def load(cls, file_path, sep=None): Returns: TrialKey object. """ - _, file_ext = path.splitext(file_path) + file_path = Path(file_path) + file_ext = file_path.suffix if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) elif file_ext in ("", ".txt"): @@ -268,7 +270,7 @@ def load_txt(cls, file_path): @classmethod def load_table(cls, file_path, sep=None): - """Loads object from txt file + """Loads object from pandas table file Args: file_path: File to read the list. 
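This patch repeatedly simplifies np.unique calls by dropping return_index (the SparseTrialKey hunk above, and the TrialKey hunk that follows); a quick standalone check of why return_inverse alone is enough to index the trial matrix:

    import numpy as np

    models = np.array(["spk2", "spk1", "spk2"])
    model_set, model_idx = np.unique(models, return_inverse=True)
    # model_set -> ['spk1' 'spk2'], model_idx -> [1 0 1]
    assert np.all(model_set[model_idx] == models)  # inverse maps each trial to its row
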
@@ -285,12 +287,8 @@ def load_table(cls, file_path, sep=None): models = df["modelid"].values segments = df["segmentid"].values is_tar = (df["targettype"] == "target").values - model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True - ) - seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True - ) + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) tar = np.zeros((len(model_set), len(seg_set)), dtype="bool") non = np.zeros((len(model_set), len(seg_set)), dtype="bool") for i, j, target_type in zip(model_idx, seg_idx, is_tar): diff --git a/hyperion/utils/trial_ndx.py b/hyperion/utils/trial_ndx.py index e26d19e2..b7b873df 100644 --- a/hyperion/utils/trial_ndx.py +++ b/hyperion/utils/trial_ndx.py @@ -4,12 +4,14 @@ """ import copy -import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd -from .list_utils import * +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray class TrialNdx(object): @@ -46,17 +48,20 @@ def sort(self): self.seg_set, s_idx = sort(self.seg_set, return_index=True) self.trial_mask = self.trial_mask[np.ix_(m_idx, s_idx)] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in [".txt", ""]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -71,15 +76,6 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("trial_mask", data=self.trial_mask.astype("uint8")) - # model_set = self.model_set.astype('S') - # f.create_dataset('ID/row_ids', self.model_set.shape, dtype=model_set.dtype) - # f['ID/row_ids'] = model_set - # seg_set = self.seg_set.astype('S') - # f.create_dataset('ID/column_ids', self.seg_set.shape, dtype=seg_set.dtype) - # f['ID/column_ids'] = seg_set - # f.create_dataset('trial_mask', self.trial_mask.shape, dtype='uint8') - # f['trial_mask'] = self.trial_mask.astype('uint8') - def save_txt(self, file_path): """Saves object to txt file. @@ -91,8 +87,25 @@ def save_txt(self, file_path): for item in zip(idx[0], idx[1]): f.write("%s %s\n" % (self.model_set[item[1]], self.seg_set[item[0]])) + def save_table(self, file_path, sep=None): + """Saves object to pandas table file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid\n") + I, J = self.trial_mask.nonzero() + for i, j in zip(I, J): + f.write(f"{self.model_set[i]}{sep}{self.seg_set[j]}\n") + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -101,11 +114,14 @@ def load(cls, file_path): Returns: TrialNdx object. 
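A sketch of the suffix-based dispatch that save()/load() now share across TrialNdx, TrialKey and TrialScores (the helper name is illustrative): .h5/.hdf5 go to the binary format, bare names and .txt to the legacy text format, and anything else to the table format, with the separator inferred from the extension:

    from pathlib import Path

    def infer_format(file_path, sep=None):
        ext = Path(file_path).suffix
        if ext in (".h5", ".hdf5"):
            return "h5", None
        if ext in ("", ".txt"):
            return "txt", None
        if sep is None:
            sep = "\t" if ".tsv" in ext else ","  # same rule as the save_table hunks
        return "table", sep

    assert infer_format("trials.tsv") == ("table", "\t")
    assert infer_format("trials.h5") == ("h5", None)
    assert infer_format("trials") == ("txt", None)
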
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -148,6 +164,36 @@ def load_txt(cls, file_path): trial_mask[item[0], item[1]] = True return cls(model_set, seg_set, trial_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialNdx object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + model_set, _, model_idx = np.unique( + models, return_index=True, return_inverse=True + ) + seg_set, _, seg_idx = np.unique( + segments, return_index=True, return_inverse=True + ) + trial_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + for i, j in zip(model_idx, seg_idx): + trial_mask[i, j] = True + + return cls(model_set, seg_set, trial_mask) + @classmethod def merge(cls, ndx_list): """Merges several index objects. diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index a486647d..9e7fcd5d 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -3,16 +3,18 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import copy import logging -import os.path as path +from pathlib import Path import h5py import numpy as np +import pandas as pd from ..hyp_defs import float_cpu -from .list_utils import * + +# from .list_utils import * +from .list_utils import sort, intersect, ismember, split_list, list2ndarray from .trial_key import TrialKey from .trial_ndx import TrialNdx @@ -56,17 +58,20 @@ def sort(self): self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] - def save(self, file_path): + def save(self, file_path, sep=None): """Saves object to txt/h5 file. Args: file_path: File to write the list. """ - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in [".h5", ".hdf5"]: self.save_h5(file_path) - else: + elif file_ext in ["", ".txt"]: self.save_txt(file_path) + else: + self.save_table(file_path, sep=sep) def save_h5(self, file_path): """Saves object to h5 file. @@ -100,8 +105,27 @@ def save_txt(self, file_path): ) ) + def save_table(self, file_path, sep=None): + """Saves object to pandas tabnle file. + + Args: + file_path: File to write the list. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + with open(file_path, "w", encoding="utf-8") as f: + f.write(f"modelid{sep}segmentid{sep}LLR\n") + I, J = self.score_mask.nonzero() + for i, j in zip(I, J): + f.write( + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + ) + @classmethod - def load(cls, file_path): + def load(cls, file_path, sep=None): """Loads object from txt/h5 file Args: @@ -110,11 +134,14 @@ def load(cls, file_path): Returns: TrialScores object. 
""" - file_base, file_ext = path.splitext(file_path) - if file_ext == ".h5" or file_ext == ".hdf5": + file_path = Path(file_path) + file_ext = file_path.suffix + if file_ext in (".h5", ".hdf5"): return cls.load_h5(file_path) - else: + elif file_ext in ("", ".txt"): return cls.load_txt(file_path) + else: + return cls.load_table(file_path, sep) @classmethod def load_h5(cls, file_path): @@ -163,6 +190,35 @@ def load_txt(cls, file_path): scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores, score_mask) + @classmethod + def load_table(cls, file_path, sep=None): + """Loads object from pandas table file + + Args: + file_path: File to read the list. + + Returns: + TrialScores object. + """ + file_path = Path(file_path) + ext = file_path.suffix + if sep is None: + sep = "\t" if ".tsv" in ext else "," + + df = pd.read_csv(file_path, sep=sep) + models = df["modelid"].values + segments = df["segmentid"].values + score_list = df["LLR"].values + model_set, model_idx = np.unique(models, return_inverse=True) + seg_set, seg_idx = np.unique(segments, return_inverse=True) + score_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") + scores = np.zeros((len(model_set), len(seg_set)), dtype=float_cpu()) + for i, j, score in zip(model_idx, seg_idx, score_list): + score_mask[i, j] = True + scores[i, j] = score + + return cls(model_set, seg_set, scores, score_mask) + @classmethod def merge(cls, scr_list): """Merges several score objects. @@ -235,7 +291,7 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): Filtered TrialScores object. """ - if not (keep): + if not keep: model_set = np.setdiff1d(self.model_set, model_set) seg_set = np.setdiff1d(self.model_set, seg_set) @@ -244,15 +300,15 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): if np.all(f_mod) and np.all(f_seg): model_set = self.model_set[mod_idx] - set_set = self.seg_set[seg_idx] + seg_set = self.seg_set[seg_idx] ix = np.ix_(mod_idx, seg_idx) scores = self.scores[ix] score_mask = self.score_mask[ix] else: for i in (f_mod == 0).nonzero()[0]: - logging.info("model %s not found" % model_set[i]) + logging.info("model %s not found", model_set[i]) for i in (f_seg == 0).nonzero()[0]: - logging.info("segment %s not found" % seg_set[i]) + logging.info("segment %s not found", seg_set[i]) if raise_missing: raise Exception("some scores were not computed") diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index edf2c23a..c1c429f2 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -261,7 +261,7 @@ def shuffle(self, seed=1024, rng=None): Index used to shuffle the list. 
""" if rng is None: - rng = np.random.RandomState(seed=seed) + rng = np.random.default_rng(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) self.utt_info = self.utt_info.iloc[index] From 89c6e2016b391818c35ab91644bbd091db4f9986 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 8 Sep 2023 11:24:03 -0400 Subject: [PATCH 106/154] finished vox v1.2 except plda --- egs/voxceleb/v1.2/run_007_eval_be.sh | 321 ++++++++++++++++++ .../eval_cosine_scoring_backend_with_qmf.py | 253 +++++++++++--- hyperion/bin/merge_scores.py | 19 +- hyperion/bin/train_qmf.py | 135 ++++++++ .../np/classifiers/logistic_regression.py | 3 +- hyperion/torch/utils/misc.py | 4 +- hyperion/utils/trial_scores.py | 138 +++++++- 7 files changed, 800 insertions(+), 73 deletions(-) create mode 100755 egs/voxceleb/v1.2/run_007_eval_be.sh create mode 100755 hyperion/bin/train_qmf.py diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh new file mode 100755 index 00000000..9084d35b --- /dev/null +++ b/egs/voxceleb/v1.2/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +plda_label=${plda_type}y${plda_y_dim}_v1 +be_name=lda${lda_dim}_${plda_label}_${plda_data} + +xvector_dir=exp/xvectors/$nnet_name +be_dir=exp/be/$nnet_name/$be_name +score_dir=exp/scores/$nnet_name +score_plda_dir=$score_dir/${be_name}/plda +score_cosine_dir=$score_dir/cosine +score_cosine_snorm_dir=$score_dir/cosine_snorm +score_cosine_qmf_dir=$score_dir/cosine_qmf + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + merge_scores.py --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + +if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring" + $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + 
--feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_dir/voxsrc22_dev_scores.csv + + # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ + # hyp_utils/conda_env.sh \ + # eval_cosine_scoring_backend.py \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_dir/voxsrc22_dev_results.csv + + cat $score_cosine_dir/voxsrc22_dev_results.csv + +fi + +if [ "$do_snorm" == "true" ];then + if [ $stage -le 5 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + merge_scores.py --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_snorm_dir/voxceleb1_results.csv + + cat $score_cosine_snorm_dir/voxceleb1_results.csv + fi + + if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + AS-Norm" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend.py \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + merge_scores.py --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 
$score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv + + fi + +fi + +if [ "$do_qmf" == "true" ];then + if [ $stage -le 7 ];then + echo "Train QMF in Vox2" + echo "...Calculating quality measures for Vox2" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --ndx-file data/voxceleb2cat_train_trials/trials.csv \ + --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ + --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + train_qmf.py --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + + fi + + if [ $stage -le 8 ];then + echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + sleep 5s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv + ) & + done + wait + fi + + if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then + echo "Eval voxsrc2 with Cosine scoring + QMF" + num_parts=16 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd 
$score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + eval_cosine_scoring_backend_with_qmf.py \ + --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ + --ndx-file data/voxsrc22_dev/trials.csv \ + --enroll-map-file data/voxsrc22_dev/enrollment.csv \ + --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \ + --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \ + --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --cohort-nbest 1000 --avg-cohort-by speaker \ + --qmf-file $score_cosine_qmf_dir/qmf.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + sleep 5s + done + sleep 10s + done + wait + for suffix in "" .snorm .snorm.qmf + do + ( + merge_scores.py --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ + eval_verification_metrics.py \ + --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + ) & + done + wait + fi + +fi + diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py index f567dd81..0333669f 100755 --- a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -30,6 +30,7 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.classifiers import BinaryLogisticRegression as LR def get_precomp_qm_names(quality_measures): @@ -38,7 +39,6 @@ def get_precomp_qm_names(quality_measures): def normalize_duration(q, min_dur, max_dur, frame_rate): - q = q / frame_rate q = np.log(np.clip(q / frame_rate, a_min=min_dur, a_max=max_dur)) log_min_dur = np.log(min_dur) @@ -99,6 +99,9 @@ def load_trial_data( test_segments.add_columns(test_feats_set) if enroll_feats_set != test_feats_set or enroll_segments != test_segments: enroll_segments.add_columns(enroll_feats_set) + else: + test_segments = test_feats_set + enroll_segments = enroll_feats_set # now we retrive the quality measures q_e = [] @@ -132,7 +135,6 @@ def load_trial_data( def load_cohort_data(segments_file, feats_file): - segments = SegmentSet.load(segments_file) feats_reader = DRF.create(feats_file) x = feats_reader.read(segments["id"], squeeze=True) @@ -160,16 +162,13 @@ def get_score_filepath( test_part_idx, num_test_parts, ): - score_file = Path(score_file) new_suffix = "" if score_name is not None: new_suffix = f".{score_name}" if num_enroll_parts > 1 or num_test_parts > 1: - new_suffix = ( - f"{new_suffix}.{enroll_part_idx}.{test_part_idx}{score_file.suffix}" - ) + new_suffix = f"{new_suffix}.{enroll_part_idx}.{test_part_idx}" if new_suffix: new_suffix = f"{new_suffix}{score_file.suffix}" @@ -177,25 +176,58 @@ def get_score_filepath( return score_file -def save_scores(ndx, scores, score_file, score_name, enroll_part_idx, + +def save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, num_enroll_parts, test_part_idx, - num_test_parts): + num_test_parts, 
+): + score_file = get_score_filepath( + score_file, + score_name, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + logging.info("saving scores with to %s", score_file) + scores = TrialScores( + ndx.model_set, ndx.seg_set, scores, ndx.trial_mask, q_measures=q_measures + ) + scores.save(score_file) + -def save_empty_scores(ndx, score_file, score_name, enroll_part_idx, +def save_empty_scores( + ndx, + score_file, + score_name, + q_measures, + enroll_part_idx, num_enroll_parts, test_part_idx, - num_test_parts): + num_test_parts, +): scores = np.zeros(ndx.trial_mask.shape, dtype="float32") - score_file = get_score_filepath(score_file, score_name,enroll_part_idx, - num_enroll_parts, - test_part_idx, - num_test_parts) - - scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) - scores.save(score_file) - + if q_measures is not None: + q_measures = {k: scores for k in q_measures} + save_scores( + ndx, + scores, + score_file, + score_name, + q_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) def segment_to_trial_qm(q_e, q_t): @@ -226,31 +258,29 @@ def align_scores_to_ndx(enroll_set, ndx, scores, scores_norm, q_trial): return scores, scores_norm, q_trial -def make_qm_table(ndx, scores, scores_norm, q_trial): - if scores_norm is None: - scores = scores[ndx.trial_mask] - else: - scores = scores_norm[ndx.trial_mask] - - for qm in q_trial: - q_trial[qm] = q_trial[qm][ndx.trial_mask] +# def make_qm_table(ndx, scores, scores_norm, q_trial): +# if scores_norm is None: +# scores = scores[ndx.trial_mask] +# else: +# scores = scores_norm[ndx.trial_mask] - I, J = np.nonzero(ndx.trial_mask) - modelid = ndx.model_set[I] - segmentid = ndx.seg_set[J] - unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] - - q_dict = { - "id": unique_id, - "modelid": modelid, - "segmentid": segmentid, - "scores": scores, - } - q_dict.update(q_trial) - df = pd.DataFrame(q_dict) - return InfoTable(df) +# for qm in q_trial: +# q_trial[qm] = q_trial[qm][ndx.trial_mask] +# I, J = np.nonzero(ndx.trial_mask) +# modelid = ndx.model_set[I] +# segmentid = ndx.seg_set[J] +# unique_id = [f"{a}-{b}" for a, b in zip(modelid, segmentid)] +# q_dict = { +# "id": unique_id, +# "modelid": modelid, +# "segmentid": segmentid, +# "scores": scores, +# } +# q_dict.update(q_trial) +# df = pd.DataFrame(q_dict) +# return InfoTable(df) def eval_backend( @@ -276,7 +306,6 @@ def eval_backend( test_part_idx, num_test_parts, ): - logging.info("loading data") enroll_map, ndx, x_e, x_t, q_e, q_t = load_trial_data( enroll_map_file, @@ -297,8 +326,43 @@ def eval_backend( if not np.any(ndx.trial_mask): # this part doesn't have any trials, save empty files - - + if qmf_file is not None: + quality_measures = None + save_empty_scores( + ndx, + score_file, + "snorm.qmf" if cohort_segments_file is not None else "qmf", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + save_empty_scores( + ndx, + score_file, + None, + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + + if cohort_segments_file is not None: + save_empty_scores( + ndx, + score_file, + "snorm", + quality_measures, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + return + enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True) q_e = average_qm(q_e, enroll_set, enroll_ids) @@ -362,46 +426,123 @@ def eval_backend( enroll_set, ndx, scores, scores_norm, q_trial ) if qmf_file 
is None: - qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) - qm_file = get_score_filepath( + save_scores( + ndx, + scores, score_file, - "qm", + None, + q_trial, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - qm_table.save(qm_file) + + if scores_norm is not None: + save_scores( + ndx, + scores_norm, + score_file, + "snorm", + q_trial, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) + # qm_table = make_qm_table(ndx, scores, scores_norm, q_trial) + # qm_file = get_score_filepath( + # score_file, + # "qm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # qm_table.save(qm_file) return - score_file_nonorm = get_score_filepath( + save_scores( + ndx, + scores, score_file, None, + None, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - logging.info("saving scores to %s", score_file_nonorm) - scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) - scores.save(score_file_nonorm) if scores_norm is not None: - score_file_snorm = get_score_filepath( + save_scores( + ndx, + scores_norm, score_file, "snorm", + None, enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts, ) - logging.info("saving scores with AS-Norm to %s", score_file_snorm) - scores.scores = scores_norm - scores.save(score_file_snorm) + logging.info("applying qmf") + if scores_norm is None: + score_name = "qmf" + scores_fus = [scores.ravel()] + else: + score_name = "snorm.qmf" + scores_fus = [scores_norm.ravel()] + + q_names = list(q_trial.keys()) + q_names.sort() + for q_name in q_names: + scores_fus.append(q_trial[q_name].ravel()) + + scores_fus = np.vstack(scores_fus).T + lr = LR.load(qmf_file) + scores_fus = lr.predict(scores_fus) + scores_fus = np.reshape(scores_fus, (ndx.num_models, ndx.num_tests)) + save_scores( + ndx, + scores_fus, + score_file, + score_name, + None, + enroll_part_idx, + num_enroll_parts, + test_part_idx, + num_test_parts, + ) -if __name__ == "__main__": + # score_file_nonorm = get_score_filepath( + # score_file, + # None, + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # logging.info("saving scores to %s", score_file_nonorm) + # scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask) + # scores.save(score_file_nonorm) + + # if scores_norm is not None: + # score_file_snorm = get_score_filepath( + # score_file, + # "snorm", + # enroll_part_idx, + # num_enroll_parts, + # test_part_idx, + # num_test_parts, + # ) + # logging.info("saving scores with AS-Norm to %s", score_file_snorm) + # scores.scores = scores_norm + # scores.save(score_file_snorm) + +if __name__ == "__main__": parser = ArgumentParser( description="Eval cosine-scoring with optional AS-Norm and QMF" ) diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py index 6a275f5c..cb8524b7 100755 --- a/hyperion/bin/merge_scores.py +++ b/hyperion/bin/merge_scores.py @@ -18,14 +18,19 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): - output_file = Path(output_file) output_file.parent.mkdir(exist_ok=True, parents=True) ext = output_file.suffix if input_files is None: - input_file_base = output_file.with_suffix("") + if ext in [".h5", ".csv", ".tsv"]: + input_file_base = output_file + else: + input_file_base = output_file.parent / (output_file.name + ".txt") + ext = "" + + logging.info("merging %s* -> %s", input_file_base.with_suffix(""), output_file) input_files = [] for i in 
range(num_enroll_parts): idx_i = base_idx + i @@ -33,6 +38,8 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas idx_j = base_idx + j input_file_i = input_file_base.with_suffix(f".{idx_i}.{idx_j}{ext}") input_files.append(input_file_i) + else: + logging.info("merging %s -> %s", " + ".join(input_files), output_file) if ext == ".h5": # if files are h5 we need to load everything in RAM @@ -57,7 +64,6 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas if __name__ == "__main__": - parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument( @@ -88,7 +94,12 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas help="""index of the first job, typically 0 or 1""", ) parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) args = parser.parse_args() diff --git a/hyperion/bin/train_qmf.py b/hyperion/bin/train_qmf.py new file mode 100755 index 00000000..a97e8a5f --- /dev/null +++ b/hyperion/bin/train_qmf.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + + Trains calibration for SRE18 tel condition +""" + +import sys +import os +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) +import time +import logging +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import float_cpu, config_logger +from hyperion.utils.trial_scores import TrialScores +from hyperion.utils.trial_key import TrialKey +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.np.classifiers import BinaryLogisticRegression as LR + + +def print_q_stats(scr, q_names): + for k in q_names: + q_vec = scr.q_measures[k][scr.score_mask] + s = f"{k} stats mean={np.mean(q_vec)} min={np.min(q_vec)} max={np.max(q_vec)} median={np.median(q_vec)}" + logging.info(s) + + +def train_qmf( + score_file, key_file, model_file, prior, lambda_reg, quality_measures, verbose +): + logging.info("load key: %s", key_file) + key = TrialKey.load(key_file) + logging.info("load scores: %s", score_file) + scr = TrialScores.load(score_file) + tar, non = scr.get_tar_non(key) + ntar = len(tar) + nnon = len(non) + + if quality_measures is None: + quality_measures = list(scr.q_measures.keys()) + quality_measures.sort() + + print_q_stats(scr, quality_measures) + q_tar, q_non = scr.get_tar_non_q_measures(key, quality_measures) + + min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + min_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + logging.info("train calibration") + # tar = np.vstack((tar, maxnf_tar, minnf_tar, maxcohmu_tar, mincohmu_tar)).T + # non = np.vstack((non, maxnf_non, minnf_non, maxcohmu_non, mincohmu_non)).T + tar = np.hstack((tar[:, None], q_tar)) + non = np.hstack((non[:, None], q_non)) + + x = np.vstack((tar, non)) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) + lr.fit(x, y) + logging.info(f"A={lr.A} 
b={lr.b}") + logging.info("save calibration at %s", model_file) + lr.save(model_file) + + logging.info("calibrate scores") + tar_cal = lr.predict(tar) + non_cal = lr.predict(non) + act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f", + act_dcf, + p_miss * 100, + p_fa * 100, + n_miss, + n_fa, + ) + + score_file = Path(score_file) + output_file = score_file.with_suffix(f".qmf{score_file.suffix}") + scr_out = TrialScores(key.model_set, key.seg_set) + scr_out.scores[key.tar] = tar_cal + scr_out.scores[key.non] = non_cal + scr_out.score_mask = np.logical_or(key.tar, key.non) + scr_out.save(output_file) + + +if __name__ == "__main__": + parser = ArgumentParser(description="Trains QMF calibration") + + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--model-file", required=True) + parser.add_argument("--prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument( + "--quality-measures", + default=None, + nargs="+", + choices=["snorm-mu/s", "snorm-mu", "speech_duration", "num_speech_frames"], + ) + + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + + train_qmf(**namespace_to_dict(args)) diff --git a/hyperion/np/classifiers/logistic_regression.py b/hyperion/np/classifiers/logistic_regression.py index 4c4c0cfc..03d9fd13 100644 --- a/hyperion/np/classifiers/logistic_regression.py +++ b/hyperion/np/classifiers/logistic_regression.py @@ -93,7 +93,8 @@ def __init__( super().__init__(**kwargs) if random_state is None: - random_state = np.random.default_rng(seed=lr_seed) + # random_state = np.random.default_rng(seed=lr_seed) + random_state = np.random.RandomState(seed=lr_seed) if bias_scaling is None: if use_bias and solver == "liblinear": diff --git a/hyperion/torch/utils/misc.py b/hyperion/torch/utils/misc.py index b2a3810f..46c09080 100644 --- a/hyperion/torch/utils/misc.py +++ b/hyperion/torch/utils/misc.py @@ -4,8 +4,8 @@ """ import torch -import torch.cuda.amp as amp import torch.nn as nn +import torch.cuda.amp as amp def l2_norm(x, dim=1, axis=None): @@ -104,3 +104,5 @@ def get_selfsim_tarnon(y, return_mask=False): mask = torch.triu(torch.ones_like(y_bin, dtype=torch.bool), diagonal=1) return y_bin, mask + + diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index 9e7fcd5d..4a5e59da 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -14,7 +14,7 @@ from ..hyp_defs import float_cpu # from .list_utils import * -from .list_utils import sort, intersect, ismember, split_list, list2ndarray +from .list_utils import intersect, ismember, list2ndarray, sort, split_list from .trial_key import TrialKey from .trial_ndx import TrialNdx @@ -28,13 +28,22 @@ class TrialScores(object): seg_set: List of test segment names. scores: Matrix with the scores (num_models x num_segments). score_mask: Boolean matrix with the trials with valid scores to True (num_models x num_segments). 
+ q_measures: optional dictionary of quality measure matrices """ - def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None): + def __init__( + self, + model_set=None, + seg_set=None, + scores=None, + score_mask=None, + q_measures=None, + ): self.model_set = model_set self.seg_set = seg_set self.scores = scores self.score_mask = score_mask + self.q_measures = q_measures if (model_set is not None) and (seg_set is not None): self.validate() @@ -57,6 +66,9 @@ def sort(self): ix = np.ix_(m_idx, s_idx) self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] + if self.q_measures is not None: + for k in self.q_measures.keys(): + self.q_measures[k] = self.q_measures[k][ix] def save(self, file_path, sep=None): """Saves object to txt/h5 file. @@ -86,6 +98,10 @@ def save_h5(self, file_path): f.create_dataset("ID/column_ids", data=seg_set) f.create_dataset("scores", data=self.scores) f.create_dataset("score_mask", data=self.score_mask.astype("uint8")) + if self.q_measures is not None: + q_grp = f.create_group("q_measures") + for k, v in self.q_measures.items(): + q_grp.create_dataset(k, data=v) def save_txt(self, file_path): """Saves object to txt file. @@ -105,6 +121,9 @@ def save_txt(self, file_path): ) ) + if self.q_measures is not None: + logging.warning("q_measures cannot be saved to txt file") + def save_table(self, file_path, sep=None): """Saves object to pandas table file. @@ -116,12 +135,20 @@ def save_table(self, file_path, sep=None): if sep is None: sep = "\t" if ".tsv" in ext else "," + q_str = "" + if self.q_measures is not None: + q_str = sep + sep.join(self.q_measures.keys()) + with open(file_path, "w", encoding="utf-8") as f: - f.write(f"modelid{sep}segmentid{sep}LLR\n") + f.write(f"modelid{sep}segmentid{sep}LLR{q_str}\n") I, J = self.score_mask.nonzero() for i, j in zip(I, J): + if self.q_measures is not None: + q_str = sep + sep.join( + [str(v[i, j]) for k, v in self.q_measures.items()] + ) f.write( - f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}\n" + f"{self.model_set[i]}{sep}{self.seg_set[j]}{sep}{self.scores[i,j]}{q_str}\n" ) @classmethod @@ -158,7 +185,12 @@ def load_h5(cls, file_path): seg_set = [t.decode("utf-8") for t in f["ID/column_ids"]] scores = np.asarray(f["scores"], dtype=float_cpu()) score_mask = np.asarray(f["score_mask"], dtype="bool") - return cls(model_set, seg_set, scores, score_mask) + if "q_measures" in f: + q_grp = f["q_measures"] + q_measures = {k: np.asarray(q_grp[k]) for k in q_grp} + else: + q_measures = None + return cls(model_set, seg_set, scores, score_mask, q_measures) @classmethod def load_txt(cls, file_path): @@ -217,7 +249,21 @@ def load_table(cls, file_path, sep=None): score_mask[i, j] = True scores[i, j] = score - return cls(model_set, seg_set, scores, score_mask) + if len(df.columns) > 3: + q_names = df.columns[3:] + q_vals = df.iloc[:, 3:].values + q_measures = {} + for q_name in q_names: + q_measures[q_name] = np.zeros(scores.shape, dtype=float_cpu()) + + for i, j, q_row in zip(model_idx, seg_idx, q_vals): + for col, q_name in enumerate(q_names): + q_measures[q_name][i, j] = q_row[col] + + else: + q_measures = None + + return cls(model_set, seg_set, scores, score_mask, q_measures) @classmethod def merge(cls, scr_list): @@ -234,6 +280,7 @@ def merge(cls, scr_list): seg_set = scr_list[0].seg_set scores = scr_list[0].scores score_mask = scr_list[0].score_mask + q_measures = scr_list[0].q_measures for i in range(1, num_scr): scr_i = scr_list[i] new_model_set = np.union1d(model_set, scr_i.model_set) @@ 
-252,6 +299,10 @@ def merge(cls, scr_list): scores_1[ix_a] = scores[ix_b] score_mask_1 = np.zeros(shape, dtype="bool") score_mask_1[ix_a] = score_mask[ix_b] + if q_measures is not None: + q_measures_1 = {k: np.zeros(shape) for k in q_measures.keys()} + for k in q_measures.keys(): + q_measures_1[k][ix_a] = q_measures[k][ix_b] trial_mask_2 = np.zeros( (len(new_model_set), len(new_seg_set)), dtype="bool" @@ -268,14 +319,21 @@ def merge(cls, scr_list): scores_2[ix_a] = scr_i.scores[ix_b] score_mask_2 = np.zeros(shape, dtype="bool") score_mask_2[ix_a] = scr_i.score_mask[ix_b] + if q_measures is not None: + q_measures_2 = {k: np.zeros(shape) for k in q_measures.keys()} + for k in q_measures.keys(): + q_measures_2[k][ix_a] = scr_i.q_measures[k][ix_b] model_set = new_model_set seg_set = new_seg_set scores = scores_1 + scores_2 assert not (np.any(np.logical_and(score_mask_1, score_mask_2))) score_mask = np.logical_or(score_mask_1, score_mask_2) + if q_measures is not None: + for k in q_measures.keys(): + q_measures[k] = q_measures_1[k] + q_measures_2[k] - return cls(model_set, seg_set, scores, score_mask) + return cls(model_set, seg_set, scores, score_mask, q_measures) def filter(self, model_set, seg_set, keep=True, raise_missing=True): """Removes elements from TrialScores object. @@ -297,13 +355,17 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): f_mod, mod_idx = ismember(model_set, self.model_set) f_seg, seg_idx = ismember(seg_set, self.seg_set) - + q_measures = None if np.all(f_mod) and np.all(f_seg): model_set = self.model_set[mod_idx] seg_set = self.seg_set[seg_idx] ix = np.ix_(mod_idx, seg_idx) scores = self.scores[ix] score_mask = self.score_mask[ix] + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = self.q_measures[k][ix] else: for i in (f_mod == 0).nonzero()[0]: logging.info("model %s not found", model_set[i]) @@ -318,8 +380,13 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): ix2 = np.ix_(mod_idx[f_mod], seg_idx[f_seg]) scores[ix1] = self.scores[ix2] score_mask[ix1] = self.score_mask[ix2] + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = np.zeros(scores.shape, dtype=float_cpu()) + q_measures[k][ix1] = self.q_measures[k][ix2] - return TrialScores(model_set, seg_set, scores, score_mask) + return TrialScores(model_set, seg_set, scores, score_mask, q_measures) def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): """Splits the TrialScores into num_model_parts x num_seg_parts and returns part @@ -340,7 +407,13 @@ def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): ix = np.ix_(model_idx1, seg_idx1) scores = self.scores[ix] score_mask = self.score_mask[ix] - return TrialScores(model_set, seg_set, scores, score_mask) + q_measures = None + if self.q_measures is not None: + q_measures = {} + for k in self.q_measures.keys(): + q_measures[k] = self.q_measures[k][ix] + + return TrialScores(model_set, seg_set, scores, score_mask, q_measures) def validate(self): """Validates the attributes of the TrialScores object.""" @@ -362,6 +435,10 @@ def validate(self): else: assert self.score_mask.shape == (len(self.model_set), len(self.seg_set)) + if self.q_measures is not None: + for k in self.q_measures.keys(): + assert self.q_measures[k].shape == self.scores.shape + def align_with_ndx(self, ndx, raise_missing=True): """Aligns scores, model_set and seg_set with TrialNdx or TrialKey. 
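The q_measures plumbing above is what lets train_qmf.py stack each trial's LLR with its quality measures and learn a linear fusion; a hedged sketch of that idea on synthetic scores, with scikit-learn standing in for hyperion's prior-weighted BinaryLogisticRegression:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(0)
    # columns: [LLR, speech_duration]; synthetic target and non-target trials
    tar = np.column_stack([rng.normal(3.0, 1.0, 500), rng.uniform(2, 30, 500)])
    non = np.column_stack([rng.normal(0.0, 1.0, 5000), rng.uniform(2, 30, 5000)])
    x = np.vstack([tar, non])
    y = np.concatenate([np.ones(len(tar), dtype=int), np.zeros(len(non), dtype=int)])

    lr = LogisticRegression(solver="liblinear").fit(x, y)
    # calibrated score = A @ [llr, qm] + b, the A and b that train_qmf.py logs
    cal = x @ lr.coef_.ravel() + lr.intercept_
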
@@ -412,6 +489,34 @@ def get_tar_non(self, key): non = scr.scores[non_mask] return tar, non + def get_tar_non_q_measures(self, key, q_names=None, return_dict=False): + """Returns target and non target scores. + + Args: + key: TrialKey object. + q_names: names of quality measures to return, if None it will return all + + Returns: + Numpy array with target scores. + Numpy array with non-target scores. + """ + scr = self.align_with_ndx(key) + tar_mask = np.logical_and(scr.score_mask, key.tar) + if q_names is None: + q_names = self.q_measures.keys() + tar = {} + for k in q_names: + tar[k] = self.q_measures[k][tar_mask] + non_mask = np.logical_and(scr.score_mask, key.non) + non = {} + for k in q_names: + non[k] = self.q_measures[k][non_mask] + + if not return_dict: + tar = np.vstack(tuple(tar[k] for k in q_names)).T + non = np.vstack(tuple(non[k] for k in q_names)).T + return tar, non + def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. @@ -450,6 +555,18 @@ def __eq__(self, other): eq = eq and np.all(self.seg_set == other.seg_set) eq = eq and np.all(np.isclose(self.scores, other.scores, atol=1e-5)) eq = eq and np.all(self.score_mask == other.score_mask) + if self.q_measures is not None: + eq = eq and other.q_measures is not None + if eq: + eq = self.q_measures.keys() == other.q_measures.keys() + if eq: + for k in self.q_measures.keys(): + eq = eq and np.all( + np.isclose( + self.q_measures[k], other.q_measures[k], atol=1e-5 + ) + ) + return eq def __ne__(self, other): @@ -463,7 +580,6 @@ def __cmp__(self, other): return 1 def test(key_file="core-core_det5_key.h5"): - key = TrialKey.load(key_file) mask = np.logical_or(key.tar, key.non) From 44f085a86b8c6e9206431cdfbb4f26954dfb4672 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sun, 10 Sep 2023 11:16:43 -0400 Subject: [PATCH 107/154] introduce entry points --- README.md | 4 +- egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml | 34 +++ egs/voxceleb/v1.2/run_001_prepare_data.sh | 26 +- egs/voxceleb/v1.2/run_002_compute_evad.sh | 16 +- .../v1.2/run_003_prepare_noises_rirs.sh | 102 +++---- .../v1.2/run_004_prepare_xvec_train_data.sh | 46 +-- egs/voxceleb/v1.2/run_005_train_xvector.sh | 4 +- egs/voxceleb/v1.2/run_006_extract_xvectors.sh | 16 +- egs/voxceleb/v1.2/run_007_eval_be.sh | 80 ++--- hyperion/bin/__init__.py | 0 hyperion/bin/adv_finetune_xvector_from_wav.py | 18 +- hyperion/bin/apply_mvn_select_frames.py | 36 ++- hyperion/bin/audio_to_duration.py | 17 +- hyperion/bin/compute_energy_vad.py | 17 +- hyperion/bin/compute_mfcc_feats.py | 21 +- hyperion/bin/copy_feats.py | 7 +- hyperion/bin/decode_wav2transducer.py | 20 +- hyperion/bin/decode_wav2vec2rnn_transducer.py | 92 +++--- hyperion/bin/eval_cosine_scoring_backend.py | 27 +- .../eval_cosine_scoring_backend_with_qmf.py | 38 +-- hyperion/bin/eval_verification_metrics.py | 25 +- ...l_xvec_cosine_scoring_from_adv_test_wav.py | 23 +- ...osine_scoring_from_adv_test_wav_wavegan.py | 26 +- ...l_xvec_cosine_scoring_from_art_test_wav.py | 27 +- .../eval_xvec_cosine_scoring_from_test_wav.py | 22 +- ...sine_scoring_from_transfer_adv_test_wav.py | 20 +- ...sine_scoring_from_transfer_art_test_wav.py | 27 +- hyperion/bin/eval_xvec_logits_from_wav.py | 28 +- hyperion/bin/extract_wav2vec2xvectors.py | 28 +- hyperion/bin/extract_wav2xvectors.py | 23 +- hyperion/bin/extract_xvectors_from_feats.py | 22 +- hyperion/bin/extract_xvectors_from_wav.py | 23 +- .../extract_xvectors_slidwin_from_feats.py | 34 ++- 
.../bin/extract_xvectors_slidwin_from_wav.py | 36 ++- hyperion/bin/finetune_wav2vec2transducer.py | 52 ++-- hyperion/bin/finetune_wav2vec2xvector.py | 26 +- hyperion/bin/finetune_wav2xvector.py | 22 +- .../bin/finetune_xvector_dfr_from_feats.py | 17 +- hyperion/bin/finetune_xvector_dfr_from_wav.py | 20 +- hyperion/bin/finetune_xvector_from_feats.py | 16 +- hyperion/bin/finetune_xvector_from_wav.py | 18 +- .../generate_adv_attacks_xvector_classif.py | 31 +- .../bin/generate_adv_attacks_xvector_verif.py | 18 +- hyperion/bin/hyperion_dataset.py | 62 ++-- hyperion/bin/hyperion_tables.py | 21 +- hyperion/bin/make_babble_noise_audio_files.py | 20 +- hyperion/bin/make_wav2xvector.py | 21 +- hyperion/bin/merge_scores.py | 12 +- hyperion/bin/pack_wav_rirs.py | 15 +- hyperion/bin/plot_embedding_tsne.py | 17 +- hyperion/bin/plot_embedding_tsne_per_class.py | 23 +- hyperion/bin/prepare_data.py | 11 +- hyperion/bin/preprocess_audio_files.py | 20 +- .../split_dataset_into_trials_and_cohort.py | 11 +- hyperion/bin/train_qmf.py | 28 +- hyperion/bin/train_wav2rnn_transducer.py | 77 ++--- hyperion/bin/train_wav2vec2rnn_transducer.py | 90 +++--- hyperion/bin/train_wav2vec2transducer.py | 79 ++--- hyperion/bin/train_wav2vec2xvector.py | 28 +- hyperion/bin/train_wav2xvector.py | 28 +- hyperion/bin/train_xvector_from_feats.py | 18 +- hyperion/bin/train_xvector_from_wav.py | 18 +- hyperion/io/__init__.py | 8 +- .../np/pdfs/mixtures/exp_family_mixture.py | 165 ---------- .../torch/lr_schedulers/red_lr_on_plateau.py | 6 +- hyperion/utils/queues.py | 287 ------------------ setup.py | 33 +- 67 files changed, 1110 insertions(+), 1193 deletions(-) create mode 100644 egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml create mode 100644 hyperion/bin/__init__.py delete mode 100644 hyperion/utils/queues.py diff --git a/README.md b/README.md index 7132a031..4838157b 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,11 @@ The full API is described in the documentation page [https://hyperion-ml.readthe We use anaconda or miniconda, though you should be able to make it work in other python distributions To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.: ``` -conda create --name ${your_env} python=3.8 +conda create --name ${your_env} python=3.11 conda activate ${your_env} conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch +conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia ``` -In next Hyperion versions, we will upgrade to Pytorch>=1.9 and drop compatibility with older PyTorch versions. 
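[Annotation, not part of the patch] The setup.py hunk that actually registers the new commands is not shown in this excerpt; below is a minimal sketch of how such console entry points are typically declared with setuptools, assuming each bin script exposes a main() function as the refactors further down show. The command names are taken from the recipe changes in this patch and the module paths from the hyperion/bin layout in the diffstat, but the real stanza may differ.

    # setup.py (sketch)
    from setuptools import find_packages, setup

    setup(
        name="hyperion-ml",
        packages=find_packages(),
        entry_points={
            "console_scripts": [
                # command = module:function
                "hyperion-prepare-data = hyperion.bin.prepare_data:main",
                "hyperion-compute-energy-vad = hyperion.bin.compute_energy_vad:main",
                "hyperion-dataset = hyperion.bin.hyperion_dataset:main",
                "hyperion-tables = hyperion.bin.hyperion_tables:main",
                "hyperion-merge-scores = hyperion.bin.merge_scores:main",
            ],
        },
    )

After pip install (or pip install -e . for development), these commands are on the PATH, which is why the recipe scripts can invoke hyperion-compute-energy-vad and friends instead of the old *.py script names.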
### Installing Hyperion diff --git a/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/v1.2/run_001_prepare_data.sh b/egs/voxceleb/v1.2/run_001_prepare_data.sh index aef70e96..563d3c2d 100755 --- a/egs/voxceleb/v1.2/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.2/run_001_prepare_data.sh @@ -16,31 +16,31 @@ config_file=default_config.sh if [ $stage -le 1 ];then # Prepare the VoxCeleb2 dataset for training. - prepare_data.py voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ - --cat-videos --use-kaldi-ids \ - --output-dir data/voxceleb2cat_train + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train fi if [ $stage -le 2 ];then # prepare voxceleb1 for test - prepare_data.py voxceleb1 --task test --corpus-dir $voxceleb1_root \ - --use-kaldi-ids \ - --output-dir data/voxceleb1_test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test fi if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then - prepare_data.py voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ - --vox1-corpus-dir $voxceleb1_root \ - --output-dir data/voxsrc22_dev + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev fi # if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then -# prepare_data.py voxsrc22 --subset test --corpus-dir $voxsrc22_root \ -# --vox1-corpus-dir $voxceleb1_root \ -# --output-dir data/voxsrc22_test + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test # fi if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then # split vox2 into 2 parts, for cohort and qmf training - split_dataset_into_trials_and_cohort.py --data-dir data/voxceleb2cat_train + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train fi diff --git a/egs/voxceleb/v1.2/run_002_compute_evad.sh b/egs/voxceleb/v1.2/run_002_compute_evad.sh index e7593df2..acccace3 100755 --- a/egs/voxceleb/v1.2/run_002_compute_evad.sh +++ b/egs/voxceleb/v1.2/run_002_compute_evad.sh @@ -48,18 +48,18 @@ if [ $stage -le 2 ];then echo "compute vad for $name" $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ hyp_utils/conda_env.sh \ - compute_energy_vad.py --cfg $vad_config \ + hyperion-compute-energy-vad --cfg $vad_config \ --recordings-file data/$name/recordings.csv \ --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ --part-idx JOB --num-parts $nj || exit 1 - hyperion_tables.py cat \ - 
--table-type features \ - --output-file $vad_dir/$name/vad.csv --num-tables $nj - hyperion_dataset.py add_features \ - --dataset data/$name \ - --features-name vad \ - --features-file $vad_dir/$name/vad.csv + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv done fi diff --git a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh index aed1dae4..73c7ed82 100755 --- a/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh +++ b/egs/voxceleb/v1.2/run_003_prepare_noises_rirs.sh @@ -18,10 +18,10 @@ config_file=default_config.sh if [ $stage -le 1 ]; then for name in noise music speech do - prepare_data.py musan \ - --corpus-dir $musan_root \ - --subset $name \ - --output-dir data/musan_$name + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name done fi @@ -37,66 +37,66 @@ if [ $stage -le 2 ]; then output_dir=exp/proc_audio/$name $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py \ + hyperion-preprocess-audio-files \ --audio-format flac \ --part-idx JOB --num-parts $nj \ --recordings-file $input_data_dir/recordings.csv \ --output-path $output_dir \ --output-recordings-file $output_dir/recordings.JOB.csv - - hyperion_tables.py cat \ - --table-type recordings \ - --output-file $output_dir/recordings.csv --num-tables $nj - hyperion_dataset.py set_recordings \ - --dataset $input_data_dir \ - --recordings-file $output_dir/recordings.csv \ - --output-dataset $output_data_dir - + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + done fi if [ $stage -le 3 ]; then - # Create Babble noise from MUSAN speech files - for name in musan_speech - do - input_data_dir=data/$name - output_data_dir=data/${name}_babble - output_dir=exp/proc_audio/${name}_babble - $train_cmd $output_dir/log/make_babble_noise_${name}.log \ - hyp_utils/conda_env.sh \ - make_babble_noise_audio_files.py \ - --audio-format flac \ - --min-spks 3 --max-spks 10 --num-reuses 5 \ - --recordings-file $input_data_dir/recordings.csv \ - --output-path $output_dir \ - --output-recordings-file $output_data_dir/recordings.csv - hyperion_dataset.py make_from_recordings \ - --dataset $output_data_dir \ - --recordings-file $output_data_dir/recordings.csv - done + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done fi if [ $stage -le 4 ]; then - if [ ! 
-d "RIRS_NOISES" ]; then - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip - fi - prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom - prepare_data.py rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom - prepare_data.py rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real - for rirs in rirs_smallroom rirs_mediumroom rirs_real - do - output_dir=exp/rirs/$rirs - data_dir=data/$rirs - $train_cmd $output_dir/log/pack_rirs_${name}.log \ - hyp_utils/conda_env.sh \ - pack_wav_rirs.py ${args} --input $data_dir/recordings.csv \ - --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; - hyperion_dataset.py add_features --dataset $data_dir \ - --features-name rirs --features-file $output_dir/rirs.csv + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv - done + done fi diff --git a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh index 7649ff22..4e0c5b19 100755 --- a/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh +++ b/egs/voxceleb/v1.2/run_004_prepare_xvec_train_data.sh @@ -35,42 +35,42 @@ if [ $stage -le 2 ];then $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ hyp_utils/conda_env.sh \ - preprocess_audio_files.py \ + hyperion-preprocess-audio-files \ --audio-format flac --remove-dc-offset $vad_args \ --part-idx JOB --num-parts $nj \ --recordings-file data/$nnet_data/recordings.csv \ --output-path $output_dir \ --output-recordings-file $output_dir/recordings.JOB.csv - hyperion_tables.py cat \ - --table-type recordings \ - --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj - hyperion_dataset.py set_recordings $update_durs \ - --dataset data/$nnet_data \ - --recordings-file $output_dir/recordings.csv \ - --output-dataset data/${nnet_data}_proc_audio \ - --remove-features vad + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad fi if [ $stage -le 3 ];then - hyperion_dataset.py remove_short_segments \ - --dataset data/${nnet_data}_proc_audio \ - --output-dataset data/${nnet_data}_filtered \ - 
--length-name duration --min-length 2.0 + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 - hyperion_dataset.py remove_classes_few_segments \ - --dataset data/${nnet_data}_filtered \ - --class-name speaker --min-segs 4 + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 fi if [ $stage -le 4 ];then - hyperion_dataset.py split_train_val \ - --dataset data/${nnet_data}_filtered \ - --val-prob 0.03 \ - --joint-classes speaker --min-train-samples 1 \ - --seed 1123581321 \ - --train-dataset data/${nnet_data}_xvector_train \ - --val-dataset data/${nnet_data}_xvector_val + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val fi diff --git a/egs/voxceleb/v1.2/run_005_train_xvector.sh b/egs/voxceleb/v1.2/run_005_train_xvector.sh index d2f31ea1..2479d565 100755 --- a/egs/voxceleb/v1.2/run_005_train_xvector.sh +++ b/egs/voxceleb/v1.2/run_005_train_xvector.sh @@ -44,7 +44,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - train_wav2xvector.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - finetune_wav2xvector.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ diff --git a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh index 09b8c8e9..0dc58048 100755 --- a/egs/voxceleb/v1.2/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v1.2/run_006_extract_xvectors.sh @@ -58,15 +58,15 @@ if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qm echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --random-utt-length --min-utt-length 2 --max-utt-length 30 \ --model-path $nnet \ --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv - hyperion_tables.py cat \ - --table-type features \ - --output-file $output_dir/xvector.csv --num-tables $nj + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj done fi @@ -88,14 +88,14 @@ if [ $stage -le 2 ]; then echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj 
$output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - extract_wav2xvectors.py ${xvec_args} ${vad_args} \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --model-path $nnet \ --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv - hyperion_tables.py cat \ - --table-type features \ - --output-file $output_dir/xvector.csv --num-tables $nj + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj done fi diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh index 9084d35b..53621488 100755 --- a/egs/voxceleb/v1.2/run_007_eval_be.sh +++ b/egs/voxceleb/v1.2/run_007_eval_be.sh @@ -56,7 +56,7 @@ if [ $stage -le 3 ];then do $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ --enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -66,11 +66,11 @@ if [ $stage -le 3 ];then done done wait - merge_scores.py --output-file $score_cosine_dir/voxceleb1_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_dir/voxceleb1_scores.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -85,22 +85,22 @@ if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then echo "Eval voxsrc2 with Cosine scoring" $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ --enroll-map-file data/voxsrc22_dev/enrollment.csv \ --score-file $score_cosine_dir/voxsrc22_dev_scores.csv # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \ - # hyp_utils/conda_env.sh \ - # eval_cosine_scoring_backend.py \ - # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ - # --ndx-file data/voxsrc22_eval/trials.csv \ - # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ - # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv + # hyp_utils/conda_env.sh \ + # hyperion-eval-cosine-scoring-backend \ + # --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \ + # --ndx-file data/voxsrc22_eval/trials.csv \ + # --enroll-map-file data/voxsrc22_eval/enrollment.csv \ + # --score-file $score_cosine_dir/voxsrc22_eval_scores.csv $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \ --key-files data/voxsrc22_dev/trials.csv \ --score-names voxsrc22_dev \ @@ -121,7 +121,7 @@ if [ "$do_snorm" == "true" ];then do $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ 
--enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -135,11 +135,11 @@ if [ "$do_snorm" == "true" ];then sleep 5s done wait - merge_scores.py --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -159,7 +159,7 @@ if [ "$do_snorm" == "true" ];then do $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend.py \ + hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ --enroll-map-file data/voxsrc22_dev/enrollment.csv \ @@ -174,16 +174,16 @@ if [ "$do_snorm" == "true" ];then sleep 10s done wait - merge_scores.py --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \ - eval_verification_metrics.py \ - --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ - --key-files data/voxsrc22_dev/trials.csv \ - --score-names voxsrc22_dev \ - --key-names all \ - --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv + hyperion-eval-verification-metrics \ + --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \ + --key-files data/voxsrc22_dev/trials.csv \ + --score-names voxsrc22_dev \ + --key-names all \ + --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv @@ -202,7 +202,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend_with_qmf.py \ + hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ --ndx-file data/voxceleb2cat_train_trials/trials.csv \ --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \ @@ -216,13 +216,13 @@ if [ "$do_qmf" == "true" ];then sleep 5s done wait - merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts - train_qmf.py --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ - --key-file data/voxceleb2cat_train_trials/trials.csv \ - --model-file $score_cosine_qmf_dir/qmf.h5 - + hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \ + --key-file data/voxceleb2cat_train_trials/trials.csv \ + --model-file $score_cosine_qmf_dir/qmf.h5 + fi if [ $stage -le 8 ];then @@ -234,7 +234,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend_with_qmf.py \ + 
hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ --ndx-file data/voxceleb1_test/trials.csv \ --enroll-map-file data/voxceleb1_test/enrollment.csv \ @@ -252,11 +252,11 @@ if [ "$do_qmf" == "true" ];then for suffix in "" .snorm .snorm.qmf do ( - merge_scores.py --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \ --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ --score-names voxceleb1 \ @@ -280,7 +280,7 @@ if [ "$do_qmf" == "true" ];then do $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ hyp_utils/conda_env.sh \ - eval_cosine_scoring_backend_with_qmf.py \ + hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ --ndx-file data/voxsrc22_dev/trials.csv \ --enroll-map-file data/voxsrc22_dev/enrollment.csv \ @@ -299,11 +299,11 @@ if [ "$do_qmf" == "true" ];then for suffix in "" .snorm .snorm.qmf do ( - merge_scores.py --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ - --num-enroll-parts $num_parts --num-test-parts $num_parts + hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \ - eval_verification_metrics.py \ + hyperion-eval-verification-metrics \ --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \ --key-files data/voxsrc22_dev/trials.csv \ --score-names voxsrc22_dev \ diff --git a/hyperion/bin/__init__.py b/hyperion/bin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperion/bin/adv_finetune_xvector_from_wav.py b/hyperion/bin/adv_finetune_xvector_from_wav.py index f45b84a0..ea3d3b80 100755 --- a/hyperion/bin/adv_finetune_xvector_from_wav.py +++ b/hyperion/bin/adv_finetune_xvector_from_wav.py @@ -13,6 +13,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.adv_attacks import AttackFactory @@ -29,8 +36,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorAdvTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -43,7 +48,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -138,7 +142,6 @@ def init_attack(feat_extractor, model, wav_scale, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -231,8 +234,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Fine-tune x-vector model from audio files 
with adversarial training""" @@ -266,6 +268,10 @@ def make_parser(xvec_class): train_xvec(gpu_id, args_sc) +if __name__ == "__main__": + main() + + # def init_data( # audio_path, # train_list, diff --git a/hyperion/bin/apply_mvn_select_frames.py b/hyperion/bin/apply_mvn_select_frames.py index bdf53786..f8299edc 100755 --- a/hyperion/bin/apply_mvn_select_frames.py +++ b/hyperion/bin/apply_mvn_select_frames.py @@ -10,6 +10,13 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import RandomAccessDataReaderFactory as RDRF @@ -18,8 +25,6 @@ from hyperion.np.feats import MeanVarianceNorm as MVN from hyperion.utils import Utt2Info from hyperion.utils.kaldi_matrix import compression_methods -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def process_feats( @@ -35,7 +40,6 @@ def process_feats( compression_method, **kwargs ): - logging.info("initializing") mvn_args = MVN.filter_args(**kwargs) mvn = MVN(**mvn_args) @@ -49,16 +53,23 @@ def process_feats( logging.info("opening output stream: %s" % (output_spec)) with DWF.create( - output_spec, compress=compress, compression_method=compression_method, + output_spec, + compress=compress, + compression_method=compression_method, ) as writer: - logging.info("opening input stream: %s" % (output_spec)) with DRF.create( - input_spec, path_prefix=path_prefix, part_idx=part_idx, num_parts=num_parts, + input_spec, + path_prefix=path_prefix, + part_idx=part_idx, + num_parts=num_parts, ) as reader: if vad_spec is not None: logging.info("opening VAD stream: %s" % (vad_spec)) - v_reader = RDRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = RDRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): key, data = reader.read(1) @@ -91,8 +102,7 @@ def process_feats( u2nf.save(write_num_frames_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Apply CMVN and remove silence") parser.add_argument("--input", dest="input_spec", required=True) @@ -105,7 +115,9 @@ def process_feats( "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") ) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument( "--part-idx", @@ -150,3 +162,7 @@ def process_feats( logging.debug(args) process_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/audio_to_duration.py b/hyperion/bin/audio_to_duration.py index ac8852a4..8ef6b5c1 100755 --- a/hyperion/bin/audio_to_duration.py +++ b/hyperion/bin/audio_to_duration.py @@ -9,15 +9,19 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import SequentialAudioReader as AR from hyperion.utils import SegmentSet -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def audio_to_duration(audio_file, output_file, **kwargs): - input_args = AR.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -36,8 +40,7 @@ def audio_to_duration(audio_file, output_file, **kwargs): seg_set.save(output_file) -if __name__ == "__main__": - +def main(): 
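    # [Annotation, not part of the patch] The same refactor is applied to every
    # script in hyperion/bin: the body of the old `if __name__ == "__main__":`
    # guard moves into main(), and the guard is reduced to a bare main() call,
    # so that a setup.py console_scripts entry (e.g. the hypothetical
    # `hyperion-audio-to-duration = hyperion.bin.audio_to_duration:main`)
    # has a named function to target.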
parser = ArgumentParser(description="Writes audio file durations to table") parser.add_argument("--cfg", action=ActionConfigFile) @@ -59,3 +62,7 @@ def audio_to_duration(audio_file, output_file, **kwargs): logging.debug(args) audio_to_duration(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_energy_vad.py b/hyperion/bin/compute_energy_vad.py index 9d50388c..fe0b1d8e 100755 --- a/hyperion/bin/compute_energy_vad.py +++ b/hyperion/bin/compute_energy_vad.py @@ -9,10 +9,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialAudioReader as AR -from hyperion.np.feats import EnergyVAD from jsonargparse import ( ActionConfigFile, ActionParser, @@ -20,9 +16,13 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR +from hyperion.np.feats import EnergyVAD -def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): +def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): vad_args = EnergyVAD.filter_args(**kwargs) vad = EnergyVAD(**vad_args) @@ -78,8 +78,7 @@ def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): f_num_frames.close() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Compute Kaldi Energy VAD") parser.add_argument("--cfg", action=ActionConfigFile) @@ -105,3 +104,7 @@ def compute_vad(recordings_file, output_spec, write_num_frames, **kwargs): logging.debug(args) compute_vad(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/compute_mfcc_feats.py b/hyperion/bin/compute_mfcc_feats.py index 442e4141..f42f260d 100755 --- a/hyperion/bin/compute_mfcc_feats.py +++ b/hyperion/bin/compute_mfcc_feats.py @@ -9,20 +9,24 @@ import time import numpy as np +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.io import compression_methods from hyperion.np.feats import MFCC -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def compute_mfcc_feats( input_path, output_path, compress, compression_method, write_num_frames, **kwargs ): - mfcc_args = MFCC.filter_args(**kwargs) mfcc = MFCC(**mfcc_args) @@ -34,7 +38,9 @@ def compute_mfcc_feats( reader = DRF.create(input_path, **input_args) writer = DWF.create( - output_path, compress=compress, compression_method=compression_method, + output_path, + compress=compress, + compression_method=compression_method, ) if write_num_frames is not None: @@ -68,8 +74,7 @@ def compute_mfcc_feats( f_num_frames.close() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Compute MFCC features") parser.add_argument("--cfg", action=ActionConfigFile) @@ -109,3 +114,7 @@ def compute_mfcc_feats( logging.debug(args) compute_mfcc_feats(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/copy_feats.py b/hyperion/bin/copy_feats.py index 4549caec..4ffc1a58 100755 --- a/hyperion/bin/copy_feats.py +++ b/hyperion/bin/copy_feats.py @@ -12,11 +12,12 @@ import time import numpy as np + from hyperion.hyp_defs import 
config_logger from hyperion.io import CopyFeats as CF -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, fromfile_prefix_chars="@", @@ -37,3 +38,7 @@ logging.debug(args) CF(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2transducer.py b/hyperion/bin/decode_wav2transducer.py index 972b247c..bcf9e05c 100755 --- a/hyperion/bin/decode_wav2transducer.py +++ b/hyperion/bin/decode_wav2transducer.py @@ -15,18 +15,22 @@ import sentencepiece as spm import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -118,7 +122,6 @@ def decode_one_batch( def decode_transducer( input_spec, output_spec, model_path, bpe_model, use_gpu, **kwargs ): - device = init_device(use_gpu) model = load_model(model_path, device) @@ -202,8 +205,7 @@ def decode_transducer( ) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing " "acoustic features on the fly" @@ -235,3 +237,7 @@ def decode_transducer( logging.debug(args) decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/decode_wav2vec2rnn_transducer.py b/hyperion/bin/decode_wav2vec2rnn_transducer.py index 4fdc3140..33aea8c3 100755 --- a/hyperion/bin/decode_wav2vec2rnn_transducer.py +++ b/hyperion/bin/decode_wav2vec2rnn_transducer.py @@ -15,19 +15,23 @@ import sentencepiece as spm import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR from hyperion.np.augment import SpeechAugment from hyperion.torch import TorchModelLoader as TML from hyperion.torch.models import HFWav2Vec2RNNTransducer -from hyperion.torch.models.wav2transducer.beam_search import (beam_search, - greedy_search) +from hyperion.torch.models.wav2transducer.beam_search import beam_search, greedy_search from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -48,10 +52,11 @@ def load_model(model_path, device): def decode_one_batch( - model: nn.Module, - sp: spm.SentencePieceProcessor, - x: torch.Tensor, - decoding_method="beam_search") -> Dict[str, List[List[str]]]: + model: nn.Module, + sp: spm.SentencePieceProcessor, + x: torch.Tensor, + decoding_method="beam_search", +) -> Dict[str, List[List[str]]]: """Decode one batch and 
return the result in a dict. The dict has the following format: - key: It indicates the setting used for decoding. For example, @@ -77,7 +82,7 @@ def decode_one_batch( the returned dict. """ device = model.device - feature = x #batch["inputs"] + feature = x # batch["inputs"] assert x.shape[0] == 1 assert feature.ndim == 2 @@ -87,7 +92,8 @@ def decode_one_batch( feature_lens = torch.Tensor([x.shape[1]]).int() encoder_out, hid_feats, encoder_out_lens = model.forward_feats( - x=feature, x_lengths=feature_lens) + x=feature, x_lengths=feature_lens + ) hyps = [] batch_size = encoder_out.size(0) @@ -114,9 +120,16 @@ def decode_one_batch( return hyps[0] -def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, - infer_args, use_gpu, **kwargs): - +def decode_transducer( + input_spec, + output_spec, + scp_sep, + model_path, + bpe_model, + infer_args, + use_gpu, + **kwargs, +): device = init_device(use_gpu) model = load_model(model_path, device) @@ -142,8 +155,9 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, t2 = time.time() logging.info("processing utt %s", key) with torch.no_grad(): - x = torch.tensor( - x[None, :], dtype=torch.get_default_dtype()).to(device) + x = torch.tensor(x[None, :], dtype=torch.get_default_dtype()).to( + device + ) tot_frames = x.shape[1] logging.info( @@ -157,10 +171,10 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, if x.shape[1] == 0: y = [""] else: - #y = decode_one_batch(model=model, sp=sp, x=x) - x_lengths = torch.tensor((x.shape[1], ), - dtype=torch.long, - device=device) + # y = decode_one_batch(model=model, sp=sp, x=x) + x_lengths = torch.tensor( + (x.shape[1],), dtype=torch.long, device=device + ) y = model.infer(x, x_lengths, **infer_args) y = sp.decode(y[0]) @@ -172,10 +186,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, tot_time = t4 - t1 infer_time = t3 - t2 logging.info( - ("utt %s total-time=%.3f read-time=%.3f " - "infer-time=%.3f " - "write-time=%.3f " - "infer-rt-factor=%.2f tot-rt-factor=%.2f"), + ( + "utt %s total-time=%.3f read-time=%.3f " + "infer-time=%.3f " + "write-time=%.3f " + "infer-rt-factor=%.2f tot-rt-factor=%.2f" + ), key, tot_time, t2 - t1, @@ -186,16 +202,14 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, ) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( - description=("ASR decoding for RNN-T with Wav2vec features")) + description=("ASR decoding for RNN-T with Wav2vec features") + ) parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--input", dest="input_spec", required=True) - parser.add_argument("--scp-sep", - default=" ", - help=("scp file field separator")) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) AR.add_class_args(parser) parser.add_argument("--model-path", required=True) @@ -203,16 +217,12 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, HFWav2Vec2RNNTransducer.add_infer_args(parser, "infer-args") parser.add_argument("--output", dest="output_spec", required=True) - parser.add_argument("--use-gpu", - default=False, - action="store_true", - help="extract xvectors in gpu") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, 
choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -220,3 +230,7 @@ def decode_transducer(input_spec, output_spec, scp_sep, model_path, bpe_model, logging.debug(args) decode_transducer(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend.py b/hyperion/bin/eval_cosine_scoring_backend.py index 1a740024..835cae0b 100755 --- a/hyperion/bin/eval_cosine_scoring_backend.py +++ b/hyperion/bin/eval_cosine_scoring_backend.py @@ -4,24 +4,24 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging +import time +from pathlib import Path + +import numpy as np from jsonargparse import ( - ArgumentParser, ActionConfigFile, ActionParser, + ArgumentParser, namespace_to_dict, ) -import time -import logging -from pathlib import Path - -import numpy as np from hyperion.hyp_defs import config_logger -from hyperion.utils import TrialNdx, TrialKey, TrialScores, EnrollmentMap, SegmentSet -from hyperion.utils.math_funcs import cosine_scoring from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.transforms import TransformList from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList +from hyperion.utils import EnrollmentMap, SegmentSet, TrialKey, TrialNdx, TrialScores +from hyperion.utils.math_funcs import cosine_scoring def load_trial_data( @@ -58,7 +58,6 @@ def load_trial_data( def load_cohort_data(segments_file, feats_file): - segments = SegmentSet.load(segments_file) feats_reader = DRF.create(feats_file) x = feats_reader.read(segments["id"], squeeze=True) @@ -81,7 +80,6 @@ def eval_backend( cohort_nbest, avg_cohort_by, ): - logging.info("loading data") enroll_map, ndx, x_e, x_t = load_trial_data( enroll_map_file, @@ -151,8 +149,7 @@ def eval_backend( scores.save(score_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Eval cosine-scoring with optional AS-Norm") parser.add_argument("--enroll-feats-file", default=None) @@ -198,3 +195,7 @@ def eval_backend( logging.debug(args) eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py index 0333669f..4fecf2f3 100755 --- a/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py +++ b/hyperion/bin/eval_cosine_scoring_backend_with_qmf.py @@ -4,33 +4,33 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ( - ArgumentParser, - ActionConfigFile, - ActionParser, - namespace_to_dict, -) -import time import logging +import time from pathlib import Path import numpy as np import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.score_norm import AdaptSNorm +from hyperion.np.transforms import TransformList from hyperion.utils import ( - TrialNdx, - TrialKey, - TrialScores, EnrollmentMap, - SegmentSet, InfoTable, + SegmentSet, + TrialKey, + TrialNdx, + TrialScores, ) -from hyperion.utils.math_funcs import cosine_scoring, average_vectors -from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.transforms import TransformList -from hyperion.np.score_norm import AdaptSNorm -from 
hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.utils.math_funcs import average_vectors, cosine_scoring def get_precomp_qm_names(quality_measures): @@ -542,7 +542,7 @@ def eval_backend( # scores.save(score_file_snorm) -if __name__ == "__main__": +def main(): parser = ArgumentParser( description="Eval cosine-scoring with optional AS-Norm and QMF" ) @@ -611,3 +611,7 @@ def eval_backend( logging.debug(args) eval_backend(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_verification_metrics.py b/hyperion/bin/eval_verification_metrics.py index 83227558..98fd37e2 100755 --- a/hyperion/bin/eval_verification_metrics.py +++ b/hyperion/bin/eval_verification_metrics.py @@ -5,19 +5,19 @@ """ import logging from pathlib import Path -import pandas as pd - -from hyperion.hyp_defs import config_logger -from hyperion.np.metrics import VerificationEvaluator as VE +import pandas as pd from jsonargparse import ( ActionConfigFile, - ActionYesNo, ActionParser, + ActionYesNo, ArgumentParser, namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.np.metrics import VerificationEvaluator as VE + def eval_verification_metrics( key_files, @@ -30,7 +30,6 @@ def eval_verification_metrics( sparse, output_file, ): - assert len(key_files) == len(key_names) assert len(score_files) == len(score_names) dfs = [] @@ -61,8 +60,7 @@ def eval_verification_metrics( print(df.to_string(), flush=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Evaluate speaker verification metrics") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument("--key-files", required=True, nargs="+") @@ -85,7 +83,12 @@ def eval_verification_metrics( parser.add_argument("--sparse", default=False, action=ActionYesNo) parser.add_argument("--output-file", required=True) parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) args = parser.parse_args() @@ -94,3 +97,7 @@ def eval_verification_metrics( del kwargs["verbose"] del kwargs["cfg"] eval_verification_metrics(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py index 7c9d4104..1baad913 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav.py @@ -12,6 +12,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -26,8 +33,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -44,7 +49,6 @@ def __init__( self.sigma = sigma def forward(self, s_t): - if self.sigma > 0: s_t = s_t + self.sigma * torch.randn_like(s_t) @@ -107,7 +111,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) 
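    # [Annotation, not part of the patch] DRF is RandomAccessDataReaderFactory
    # (see the imports above), so r is a random-access reader over the x-vector
    # archive in v_file; the rest of read_data presumably fetches enrollment and
    # test vectors from it by utterance key.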
enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -143,7 +146,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) xvector_model = load_model(model_path) @@ -319,8 +321,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -336,7 +337,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -415,3 +418,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py index fb0d402c..3e4e9229 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_adv_test_wav_wavegan.py @@ -7,6 +7,7 @@ import os import sys import time + # [Added Sonal May21] from pathlib import Path @@ -14,6 +15,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -29,8 +37,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) torch.backends.cudnn.enabled = False @@ -45,7 +51,7 @@ def __init__( sigma=0, smoothing_after_wavegan=None, wave_gan_defender=None, - wav_scale=2 ** 15 - 1, + wav_scale=2**15 - 1, ): super().__init__() self.feat_extractor = feat_extractor @@ -61,7 +67,6 @@ def __init__( self.apply_wavegan = False if wave_gan_defender is None else True def forward(self, s_t): - # Pre-proceessing defense, wavegan + smoothing [Added Sonal May21] s_t = s_t / self.wav_scale if self.smoothing_after_wavegan: @@ -149,7 +154,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -188,7 +192,6 @@ def eval_cosine_scoring_wavegan( wave_gan_model_ckpt, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -374,8 +377,7 @@ def eval_cosine_scoring_wavegan( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -391,7 +393,9 @@ def eval_cosine_scoring_wavegan( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -488,3 +492,7 @@ def eval_cosine_scoring_wavegan( logging.debug(args) 
eval_cosine_scoring_wavegan(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py index 2d5baa17..781cdbdf 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_art_test_wav.py @@ -15,6 +15,13 @@ import torch.nn as nn from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -22,16 +29,15 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -69,7 +75,6 @@ def load_calibrator(cal_file): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -156,7 +161,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -343,8 +347,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector " @@ -363,7 +366,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -431,3 +436,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py index 76af5d75..2ebb7e3d 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_test_wav.py @@ -12,6 +12,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import RandomAccessAudioReader as AR from hyperion.io import RandomAccessDataReaderFactory as DRF @@ -24,8 +31,6 @@ from hyperion.torch.utils.misc import l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from 
hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_device(use_gpu): @@ -66,7 +71,6 @@ def load_calibrator(cal_file, device): def read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) try: @@ -104,7 +108,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -199,8 +202,7 @@ def eval_cosine_scoring( s.save_txt(score_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Eval cosine-scoring given enroll x-vector and test wave" ) @@ -216,7 +218,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -266,3 +270,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index f33402a1..a6f8efa4 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -12,6 +12,13 @@ import pandas as pd import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -26,8 +33,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -104,7 +109,6 @@ def load_calibrator(cal_file, threshold): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -146,7 +150,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device = init_device(use_gpu) # load victim model feat_extractor = init_feats(**kwargs["feats"]) @@ -204,7 +207,7 @@ def eval_cosine_scoring( if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) scores = np.zeros((key.num_models, key.num_tests), dtype="float32") attack_stats = pd.DataFrame( @@ -337,8 +340,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector and " @@ -435,3 +437,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py index f94dc497..7b8bc245 100755 --- 
a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_art_test_wav.py @@ -15,6 +15,13 @@ import torch.nn as nn from art.classifiers import PyTorchClassifier from art.estimators.classification import PyTorchClassifier +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -22,16 +29,15 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.np.classifiers import BinaryLogisticRegression as LR from hyperion.torch import TorchModelLoader as TML -from hyperion.torch.adv_attacks.art_attack_factory import \ - ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) from hyperion.torch.layers import LinBinCalibrator as Calibrator from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -113,7 +119,6 @@ def load_calibrator(cal_file): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -155,7 +160,6 @@ def eval_cosine_scoring( num_seg_parts, **kwargs ): - device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) # load victim model @@ -361,8 +365,7 @@ def eval_cosine_scoring( attack_stats.to_csv(stats_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Eval cosine-scoring given enroll x-vector and " @@ -384,7 +387,9 @@ def eval_cosine_scoring( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -456,3 +461,7 @@ def eval_cosine_scoring( logging.debug(args) eval_cosine_scoring(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/eval_xvec_logits_from_wav.py b/hyperion/bin/eval_xvec_logits_from_wav.py index f60c7508..b2e6a665 100755 --- a/hyperion/bin/eval_xvec_logits_from_wav.py +++ b/hyperion/bin/eval_xvec_logits_from_wav.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -110,7 +111,6 @@ def eval_xvec( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, 
**kwargs) @@ -131,15 +131,16 @@ def eval_xvec( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) ) with AR(input_spec, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -224,8 +225,7 @@ def eval_xvec( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Evaluates x-vectors logits from waveform computing " @@ -299,3 +299,7 @@ def eval_xvec( logging.debug(args) eval_xvec(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 5eba1b99..f2df9581 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -13,6 +13,13 @@ import pandas as pd import torch import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) resamplers = {} @@ -122,7 +123,6 @@ def extract_xvectors( use_gpu, **kwargs, ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model = load_model(model_path, device) @@ -143,13 +143,14 @@ def extract_xvectors( ar_args["wav_scale"] = 1.0 logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info(f"opening input stream: {recordings_file} with args={ar_args}") with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -252,8 +253,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing " "acoustic features on the fly" @@ -340,3 +340,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py index 7b04fcc8..763df3fc 100755 --- a/hyperion/bin/extract_wav2xvectors.py +++ b/hyperion/bin/extract_wav2xvectors.py @@ -13,6 +13,13 @@ import pandas as pd import torch import torchaudio.transforms as tat +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch import 
TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) resamplers = {} @@ -121,7 +122,6 @@ def extract_xvectors( use_gpu, **kwargs, ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) model = load_model(model_path, device) @@ -143,10 +143,8 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s with args=%s", output_spec, str(ar_args)) with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: - logging.info(f"opening input stream: {recordings_file} with args={ar_args}") with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -255,8 +253,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Extracts x-vectors from waveform computing acoustic features on the fly""" ) @@ -331,3 +328,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_feats.py b/hyperion/bin/extract_xvectors_from_feats.py index b02db70c..e70225c2 100755 --- a/hyperion/bin/extract_xvectors_from_feats.py +++ b/hyperion/bin/extract_xvectors_from_feats.py @@ -11,6 +11,13 @@ import numpy as np import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -19,12 +26,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -82,7 +83,6 @@ def extract_xvectors( use_gpu, **kwargs ): - logging.info("initializing") rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) @@ -96,7 +96,6 @@ def extract_xvectors( dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info("opening input stream: %s" % (input_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: @@ -174,8 +173,7 @@ def extract_xvectors( u2nf.save(write_num_frames_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Extracts x-vectors from features") parser.add_argument("--cfg", action=ActionConfigFile) @@ -244,3 +242,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_from_wav.py b/hyperion/bin/extract_xvectors_from_wav.py index 6a8130d3..71a24bd4 100755 --- a/hyperion/bin/extract_xvectors_from_wav.py +++ b/hyperion/bin/extract_xvectors_from_wav.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from 
hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -21,12 +28,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -111,7 +112,6 @@ def extract_xvectors( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -132,12 +132,10 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info( "opening input stream: {} with args={}".format(recordings_file, ar_args) ) with AR(recordings_file, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -235,8 +233,7 @@ def extract_xvectors( aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extracts x-vectors from waveform computing acoustic features on the fly" @@ -317,3 +314,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_feats.py b/hyperion/bin/extract_xvectors_slidwin_from_feats.py index bcec5133..a1186ed2 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_feats.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_feats.py @@ -12,6 +12,13 @@ import numpy as np import torch import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialDataReaderFactory as DRF @@ -20,12 +27,6 @@ from hyperion.torch import TorchModelLoader as TML from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -73,7 +74,6 @@ def extract_xvectors( use_gpu, **kwargs ): - logging.info("initializing") rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) @@ -86,7 +86,6 @@ def extract_xvectors( dr_args = DRF.filter_args(**kwargs) logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info("opening input stream: %s" % (output_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: @@ -118,7 +117,13 @@ def extract_xvectors( t4 = time.time() if x.shape[0] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) with torch.no_grad(): @@ -195,8 +200,7 @@ def extract_xvectors( yaml.dump(params, f) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Extract x-vectors over a sliding window") parser.add_argument("--cfg", action=ActionConfigFile) @@ -208,7 +212,9 @@ def extract_xvectors( ) parser.add_argument("--slidwin-params-path", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp 
file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) MVN.add_class_args(parser, prefix="mvn") @@ -298,3 +304,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/extract_xvectors_slidwin_from_wav.py b/hyperion/bin/extract_xvectors_slidwin_from_wav.py index f1a64e1b..f973b566 100755 --- a/hyperion/bin/extract_xvectors_slidwin_from_wav.py +++ b/hyperion/bin/extract_xvectors_slidwin_from_wav.py @@ -13,6 +13,13 @@ import pandas as pd import torch import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import DataWriterFactory as DWF from hyperion.io import SequentialAudioReader as AR @@ -22,12 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_device(use_gpu): @@ -99,7 +100,6 @@ def extract_xvectors( use_gpu, **kwargs ): - rng = np.random.default_rng(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -124,15 +124,16 @@ def extract_xvectors( ar_args = AR.filter_args(**kwargs) logging.info("opening output stream: %s", output_spec) with DWF.create(output_spec) as writer: - logging.info( "opening input stream: {} with args={}".format(input_spec, ar_args) ) with AR(input_spec, **ar_args) as reader: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix,) + v_reader = VRF.create( + vad_spec, + path_prefix=vad_path_prefix, + ) while not reader.eof(): t1 = time.time() @@ -172,7 +173,13 @@ def extract_xvectors( t6 = time.time() if x.shape[1] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu(),) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: x = x.transpose(1, 2).contiguous() y = ( @@ -255,8 +262,7 @@ def extract_xvectors( yaml.dump(params, f) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Extract x-vectors over a sliding window" @@ -347,3 +353,7 @@ def extract_xvectors( logging.debug(args) extract_xvectors(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2transducer.py b/hyperion/bin/finetune_wav2vec2transducer.py index 6f17f800..138f18f7 100755 --- a/hyperion/bin/finetune_wav2vec2transducer.py +++ b/hyperion/bin/finetune_wav2vec2transducer.py @@ -14,6 +14,14 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -22,9 +30,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, @@ -43,8 +48,7 
@@ def transducer_collate(batch): audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) - return torch.transpose(audio,0,1), audio_length, target - + return torch.transpose(audio, 0, 1), audio_length, target def init_data(partition, rank, num_gpus, **kwargs): @@ -73,7 +77,9 @@ def init_data(partition, rank, num_gpus, **kwargs): largs = ( {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} ) - data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -89,11 +95,7 @@ def init_model(in_model_file, rank, model_class, **kwargs): return model - - - def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -119,7 +121,7 @@ def train_model(gpu_id, args): trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} + metrics = {} trainer = Trainer( model, device=device, @@ -135,7 +137,7 @@ def train_model(gpu_id, args): def make_parser(model_class): parser = ArgumentParser() - + parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") AD.add_class_args(train_parser, prefix="dataset", skip={}) @@ -161,27 +163,23 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument( "--data.train.dataset.text_file", - type=str, + type=str, ) - - parser.add_argument("--data.val.dataset.text_file", type=str) - + + parser.add_argument("--data.val.dataset.text_file", type=str) + parser.add_argument( "--data.train.dataset.bpe_model", - type=str, + type=str, ) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" ) - parser.link_arguments( - "data.train.dataset.bpe_model", "data.val.dataset.bpe_model" - ) - + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") parser.add_argument("--in-model-file", required=True) model_class.add_finetune_args(parser, prefix="model") @@ -198,8 +196,10 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - parser = ArgumentParser(description="Fine-tune Wav2Vec2Transducer model from audio files") +def main(): + parser = ArgumentParser( + description="Fine-tune Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -228,3 +228,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2vec2xvector.py b/hyperion/bin/finetune_wav2vec2xvector.py index fc3c7084..7020e32f 100755 --- a/hyperion/bin/finetune_wav2vec2xvector.py +++ b/hyperion/bin/finetune_wav2vec2xvector.py @@ -13,18 +13,25 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics 
import CategoricalAccuracy -from hyperion.torch.models import (HFHubert2ResNet1dXVector, - HFWav2Vec2ResNet1dXVector, - HFWavLM2ResNet1dXVector) +from hyperion.torch.models import ( + HFHubert2ResNet1dXVector, + HFWav2Vec2ResNet1dXVector, + HFWavLM2ResNet1dXVector, +) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -34,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -99,7 +105,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -182,8 +187,7 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Finetunes Wav2Vec2XVector model from audio files" ) @@ -215,3 +219,7 @@ def make_parser(model_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_wav2xvector.py b/hyperion/bin/finetune_wav2xvector.py index b100b544..97356c01 100755 --- a/hyperion/bin/finetune_wav2xvector.py +++ b/hyperion/bin/finetune_wav2xvector.py @@ -11,6 +11,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -27,12 +34,6 @@ # from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) xvec_dict = { "resnet": RXVec, @@ -45,7 +46,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -115,7 +115,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -196,8 +195,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -226,3 +224,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_feats.py b/hyperion/bin/finetune_xvector_dfr_from_feats.py index 17cafb85..140cc3a2 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_feats.py +++ b/hyperion/bin/finetune_xvector_dfr_from_feats.py @@ -14,6 +14,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import 
ClassWeightedSeqSampler as Sampler @@ -22,8 +29,6 @@ from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainerDeepFeatReg as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): @@ -60,7 +65,6 @@ def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **k def init_xvector( num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs ): - xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: logging.info("xvector network ft args={}".format(xvec_args)) @@ -194,8 +198,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Fine-tune x-vector model with deep feature loss regularization" ) @@ -278,3 +281,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_dfr_from_wav.py b/hyperion/bin/finetune_xvector_dfr_from_wav.py index f7832a47..9d745e67 100755 --- a/hyperion/bin/finetune_xvector_dfr_from_wav.py +++ b/hyperion/bin/finetune_xvector_dfr_from_wav.py @@ -8,10 +8,18 @@ import os import sys import time +from pathlib import Path import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -21,8 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerDeepFeatRegFromWav as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data( @@ -36,7 +42,6 @@ def init_data( rank, **kwargs ): - ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: @@ -82,7 +87,6 @@ def init_feats(rank, **kwargs): def init_xvector( num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs ): - xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: logging.info("xvector network ft args={}".format(xvec_args)) @@ -103,7 +107,6 @@ def init_xvector( def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -231,8 +234,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Fine-tune x-vector model with deep feature loss " @@ -327,3 +329,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_feats.py b/hyperion/bin/finetune_xvector_from_feats.py index ac9c2d0b..01e0c778 100755 --- a/hyperion/bin/finetune_xvector_from_feats.py +++ b/hyperion/bin/finetune_xvector_from_feats.py @@ -12,6 +12,13 @@ import numpy as np import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import ClassWeightedSeqSampler as Sampler @@ -20,8 +27,6 @@ 
from hyperion.torch.models import XVector as XVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp, open_device -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): @@ -161,8 +166,7 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model") parser.add_argument("--cfg", action=ActionConfigFile) @@ -230,3 +234,7 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/finetune_xvector_from_wav.py b/hyperion/bin/finetune_xvector_from_wav.py index 1c7cbe58..2c884d0b 100755 --- a/hyperion/bin/finetune_xvector_from_wav.py +++ b/hyperion/bin/finetune_xvector_from_wav.py @@ -11,6 +11,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch import TorchModelLoader as TML from hyperion.torch.data import AudioDataset as AD @@ -25,8 +32,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -39,7 +44,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -120,7 +124,6 @@ def init_hard_prototype_mining(model, train_loader, val_loader, rank): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -208,8 +211,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Fine-tune x-vector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -238,3 +240,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 4336b7b9..00452695 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -14,6 +14,13 @@ import torch import torch.nn as nn import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -24,12 +31,6 @@ from hyperion.torch.utils import open_device from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialNdx, Utt2Info -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def read_utt_list(list_file, class2int_file, part_idx, num_parts): @@ -156,14 +157,13 @@ def generate_attacks( num_parts, **kwargs ): - device = init_device(use_gpu) model = init_model(model_path, **kwargs) model.to(device) logging.info("opening 
audio read stream: %s" % (wav_file)) audio_args = AR.filter_args(**kwargs) - audio_reader = AR(wav_file ** audio_args) + audio_reader = AR(wav_file**audio_args) wav_scale = audio_reader.wav_scale logging.info("opening audio write stream: %s" % (output_wav_dir)) @@ -207,7 +207,7 @@ def generate_attacks( s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) target = torch.as_tensor([class_id], dtype=torch.long).to(device) if vad_spec is not None: - vad = v_reader.read([key.seg_set[j]])[0] + vad = v_reader.read([key])[0] tot_frames = len(vad) speech_frames = np.sum(vad) vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( @@ -217,7 +217,7 @@ def generate_attacks( logging.info( "utt %s detected %d/%d (%.2f %%) speech frames" % ( - key.seg_set[j], + key, speech_frames, tot_frames, speech_frames / tot_frames * 100, @@ -315,8 +315,7 @@ def generate_attacks( yaml.dump(attacks_info, f, sort_keys=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Generate Attacks for speaker classification with x-vectors" ) @@ -332,7 +331,9 @@ def generate_attacks( parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), + "--vad-path-prefix", + default=None, + help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) @@ -413,3 +414,7 @@ def generate_attacks( logging.debug(args) generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index 363e3afc..ab7d907b 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -14,6 +14,13 @@ import torch import torch.nn as nn import yaml +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, float_cpu, set_float_cpu from hyperion.io import AudioWriter as AW from hyperion.io import RandomAccessAudioReader as AR @@ -28,8 +35,6 @@ from hyperion.torch.utils.misc import compute_stats_adv_attack, l2_norm from hyperion.utils import TrialKey, TrialNdx, TrialScores, Utt2Info from hyperion.utils.list_utils import ismember -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) class MyModel(nn.Module): @@ -73,7 +78,6 @@ def forward(self, s_t): def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): - r = DRF.create(v_file) enroll = Utt2Info.load(enroll_file) key = TrialKey.load(key_file) @@ -173,7 +177,6 @@ def generate_attacks( random_seed, **kwargs ): - device = init_device(use_gpu) model = init_model(model_path, embed_layer, cal_file, threshold, **kwargs) model.to(device) @@ -346,8 +349,7 @@ def generate_attacks( yaml.dump(attacks_info, f, sort_keys=True) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Generate Attacks for speaker verification with x-vectors+cos+calibration" ) @@ -442,3 +444,7 @@ def generate_attacks( logging.debug(args) generate_attacks(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 2e3a35ec..17fff2ba 100755 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -7,6 +7,14 @@ from pathlib import Path from typing import List, 
Optional, Union +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.utils import ( ClassInfo, @@ -18,13 +26,6 @@ RecordingSet, SegmentSet, ) -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, - ActionYesNo, -) subcommand_list = [ "add_features", @@ -41,7 +42,12 @@ def add_common_args(parser): parser.add_argument( - "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int, + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, ) @@ -145,7 +151,8 @@ def make_make_from_recordings_parser(): def make_from_recordings( - dataset: PathLike, recordings_file: PathLike, + dataset: PathLike, + recordings_file: PathLike, ): output_dataset = dataset import pandas as pd @@ -186,7 +193,10 @@ def make_remove_short_segments_parser(): def remove_short_segments( - dataset: PathLike, min_length: float, length_name: str, output_dataset: PathLike, + dataset: PathLike, + min_length: float, + length_name: str, + output_dataset: PathLike, ): if output_dataset is None: output_dataset = dataset @@ -216,7 +226,9 @@ def make_rebuild_class_idx_parser(): def rebuild_class_idx( - dataset: PathLike, class_name: str, output_dataset: PathLike, + dataset: PathLike, + class_name: str, + output_dataset: PathLike, ): if output_dataset is None: output_dataset = dataset @@ -301,14 +313,21 @@ def make_split_train_val_parser(): help="""types of classes that need to have different classes in train and val""", ) parser.add_argument( - "--seed", default=11235813, type=int, help="""random seed""", + "--seed", + default=11235813, + type=int, + help="""random seed""", ) parser.add_argument( - "--train-dataset", required=True, help="""output train dataset dir""", + "--train-dataset", + required=True, + help="""output train dataset dir""", ) parser.add_argument( - "--val-dataset", required=True, help="""output val dataset dir""", + "--val-dataset", + required=True, + help="""output val dataset dir""", ) add_common_args(parser) @@ -361,7 +380,8 @@ def make_copy_parser(): def copy( - dataset: PathLike, output_dataset: PathLike, + dataset: PathLike, + output_dataset: PathLike, ): dataset = Dataset.load(dataset, lazy=True) dataset.save(output_dataset) @@ -383,7 +403,10 @@ def make_add_cols_to_segments_parser(): help="""columns to copy to segments table""", ) parser.add_argument( - "--on", default=["id"], nargs="+", help="""columns to match both tables rows""", + "--on", + default=["id"], + nargs="+", + help="""columns to match both tables rows""", ) parser.add_argument( "--right-on", @@ -418,8 +441,7 @@ def add_cols_to_segments( dataset.save(output_dataset) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion dataset") parser.add_argument("--cfg", action=ActionConfigFile) @@ -436,3 +458,7 @@ def add_cols_to_segments( del kwargs["verbose"] del kwargs["cfg"] globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index 7f61b35a..59472d83 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -7,6 +7,13 @@ from pathlib import Path from typing import List, Optional, Union +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from 
hyperion.utils import ( ClassInfo, @@ -17,12 +24,6 @@ RecordingSet, SegmentSet, ) -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) subcommand_list = ["cat"] table_dict = { @@ -87,7 +88,6 @@ def cat( num_tables: int, base_idx: int = 1, ): - assert input_files is not None or num_tables != 0 output_file = Path(output_file) if input_files is None: @@ -108,8 +108,7 @@ def cat( output_table.save(output_file) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) @@ -126,3 +125,7 @@ def cat( del kwargs["verbose"] del kwargs["cfg"] globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make_babble_noise_audio_files.py b/hyperion/bin/make_babble_noise_audio_files.py index 68e5b22b..43d6ab91 100755 --- a/hyperion/bin/make_babble_noise_audio_files.py +++ b/hyperion/bin/make_babble_noise_audio_files.py @@ -10,11 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import AudioWriter as Writer -from hyperion.io import RandomAccessAudioReader as AR -from hyperion.io import VADReaderFactory as VRF -from hyperion.utils import Utt2Info from jsonargparse import ( ActionConfigFile, ActionParser, @@ -22,9 +17,14 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import AudioWriter as Writer +from hyperion.io import RandomAccessAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info + def make_noise(xs, max_value): - lens = np.array([x.shape[0] for x in xs]) max_len = np.max(lens) num_tiles = np.ceil(max_len / lens) @@ -53,7 +53,6 @@ def make_babble_noise_audio_files( random_seed=112358, **kwargs, ): - input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -105,8 +104,7 @@ def make_babble_noise_audio_files( logging.info("finished making babble files, elapsed-time=%f", time.time() - t1) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Creates babble noise by adding speech files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -137,3 +135,7 @@ def make_babble_noise_audio_files( logging.debug(args) make_babble_noise_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/make_wav2xvector.py b/hyperion/bin/make_wav2xvector.py index b5972d1b..b3a1a2d5 100755 --- a/hyperion/bin/make_wav2xvector.py +++ b/hyperion/bin/make_wav2xvector.py @@ -12,6 +12,13 @@ import numpy as np import pandas as pd import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger # from hyperion.torch import TorchModelLoader as TML @@ -26,12 +33,6 @@ from hyperion.torch.models import Wav2ResNet1dXVector as W2R1dXVec from hyperion.torch.models import Wav2ResNetXVector as W2RXVec from hyperion.torch.narchs import AudioFeatsMVN as AF -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) def init_feats(feats): @@ -51,7 +52,6 @@ def load_model(model_path): def make_wav2xvector(feats, xvector_path, output_path): - feats = init_feats(feats) xvector_model = load_model(xvector_path) if isinstance(xvector_model, RXVec): @@ -67,8 +67,7 @@ def make_wav2xvector(feats, xvector_path, 
output_path): model.save(output_path) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="""Combines the feature extractor config with XVector model to produce a Wav2XVector model with integrated feature extraction""" @@ -89,3 +88,7 @@ def make_wav2xvector(feats, xvector_path, output_path): logging.debug(args) make_wav2xvector(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/merge_scores.py b/hyperion/bin/merge_scores.py index cb8524b7..72ab6010 100755 --- a/hyperion/bin/merge_scores.py +++ b/hyperion/bin/merge_scores.py @@ -6,9 +6,6 @@ import logging from pathlib import Path -from hyperion.hyp_defs import config_logger - -from hyperion.utils import TrialScores from jsonargparse import ( ActionConfigFile, ActionParser, @@ -16,6 +13,9 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.utils import TrialScores + def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, base_idx): output_file = Path(output_file) @@ -63,7 +63,7 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas write_header = False -if __name__ == "__main__": +def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) parser.add_argument( @@ -108,3 +108,7 @@ def merge_scores(input_files, output_file, num_enroll_parts, num_test_parts, bas del kwargs["verbose"] del kwargs["cfg"] merge_scores(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/pack_wav_rirs.py b/hyperion/bin/pack_wav_rirs.py index b2a1bc2b..bf88d674 100755 --- a/hyperion/bin/pack_wav_rirs.py +++ b/hyperion/bin/pack_wav_rirs.py @@ -10,9 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import DataWriterFactory as DWF -from hyperion.io import SequentialAudioReader as AR from jsonargparse import ( ActionConfigFile, ActionParser, @@ -20,9 +17,12 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import DataWriterFactory as DWF +from hyperion.io import SequentialAudioReader as AR -def pack_wav_rirs(input_path, output_spec, **kwargs): +def pack_wav_rirs(input_path, output_spec, **kwargs): writer = DWF.create(output_spec, compress=False) t1 = time.time() with AR(input_path, wav_scale=1) as reader: @@ -47,8 +47,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): logging.info("Packed RIRS elapsed-time=%.f", time.time() - t1) -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Packs RIRs in wave format to h5/ark files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -69,3 +68,7 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): logging.debug(args) pack_wav_rirs(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/plot_embedding_tsne.py b/hyperion/bin/plot_embedding_tsne.py index e2157e3e..60d7ac5c 100755 --- a/hyperion/bin/plot_embedding_tsne.py +++ b/hyperion/bin/plot_embedding_tsne.py @@ -13,12 +13,18 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.transforms import PCA, LNorm, SklTSNE from hyperion.utils import SegmentSet -from jsonargparse import 
(ActionConfigFile, ActionParser, ActionYesNo, - ArgumentParser, namespace_to_dict) matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] @@ -40,7 +46,6 @@ def plot_embedding_tsne( output_dir, **kwargs, ): - output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("loading data") @@ -126,8 +131,7 @@ def plot_embedding_tsne( # plt.clf() -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Projects embeddings using TSNE") parser.add_argument("--train-v-file", required=True) @@ -162,6 +166,9 @@ def plot_embedding_tsne( plot_embedding_tsne(**namespace_to_dict(args)) +if __name__ == "__main__": + main() + # #!/usr/bin/env python # """ # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) diff --git a/hyperion/bin/plot_embedding_tsne_per_class.py b/hyperion/bin/plot_embedding_tsne_per_class.py index 14da4d07..08e4ef70 100755 --- a/hyperion/bin/plot_embedding_tsne_per_class.py +++ b/hyperion/bin/plot_embedding_tsne_per_class.py @@ -13,12 +13,6 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from hyperion.hyp_defs import config_logger -from hyperion.io import RandomAccessDataReaderFactory as DRF -from hyperion.np.clustering import AHC -from hyperion.np.transforms import PCA, LNorm, SklTSNE -from hyperion.utils import SegmentSet -from hyperion.utils.math_funcs import cosine_scoring from jsonargparse import ( ActionConfigFile, ActionParser, @@ -27,6 +21,13 @@ namespace_to_dict, ) +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.clustering import AHC +from hyperion.np.transforms import PCA, LNorm, SklTSNE +from hyperion.utils import SegmentSet +from hyperion.utils.math_funcs import cosine_scoring + matplotlib.use("Agg") colors = ["b", "g", "r", "c", "m", "y", "k"] markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] @@ -50,7 +51,6 @@ def plot_embedding_tsne( output_dir, **kwargs, ): - output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("loading data") @@ -92,7 +92,7 @@ def plot_embedding_tsne( if do_ahc: if cluster_tsne: # in the low dim space, we cannot use cosine scoring - x2 = np.sum(x_tsne ** 2, axis=1)[:, None] + x2 = np.sum(x_tsne**2, axis=1)[:, None] d2 = x2 - 2 * np.dot(x_tsne, x_tsne.T) + x2.T d2 = np.clip(d2, a_min=0, a_max=None) scores = -np.sqrt(d2) @@ -140,8 +140,7 @@ def plot_embedding_tsne( train_segs.save(output_dir / "segments.csv") -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description=( "Projects embeddings using TSNE, " @@ -194,3 +193,7 @@ def plot_embedding_tsne( logging.debug(args) plot_embedding_tsne(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/prepare_data.py b/hyperion/bin/prepare_data.py index f6723c7d..dd1bde27 100755 --- a/hyperion/bin/prepare_data.py +++ b/hyperion/bin/prepare_data.py @@ -6,8 +6,6 @@ import logging from pathlib import Path -from hyperion.data_prep import DataPrep -from hyperion.hyp_defs import config_logger from jsonargparse import ( ActionConfigFile, ActionParser, @@ -15,6 +13,9 @@ namespace_to_dict, ) +from hyperion.data_prep import DataPrep +from hyperion.hyp_defs import config_logger + def make_parser(data_prep_class): parser = ArgumentParser() @@ -22,7 +23,7 @@ def make_parser(data_prep_class): return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( description="""Prepares a dataset into relational database tables""" ) @@ 
-39,3 +40,7 @@ def make_parser(data_prep_class): args = namespace_to_dict(args)[args.subcommand] data_prep = data_prep_class(**args) data_prep.prepare() + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/preprocess_audio_files.py b/hyperion/bin/preprocess_audio_files.py index bda9a503..5e98a477 100755 --- a/hyperion/bin/preprocess_audio_files.py +++ b/hyperion/bin/preprocess_audio_files.py @@ -10,11 +10,6 @@ import time import numpy as np -from hyperion.hyp_defs import config_logger -from hyperion.io import AudioWriter as Writer -from hyperion.io import SequentialAudioReader as AR -from hyperion.io import VADReaderFactory as VRF -from hyperion.utils import Utt2Info from jsonargparse import ( ActionConfigFile, ActionParser, @@ -23,6 +18,12 @@ ) from scipy import ndimage, signal +from hyperion.hyp_defs import config_logger +from hyperion.io import AudioWriter as Writer +from hyperion.io import SequentialAudioReader as AR +from hyperion.io import VADReaderFactory as VRF +from hyperion.utils import Utt2Info + def resample_vad(vad, length): step = (len(vad) - 1) / length @@ -59,7 +60,6 @@ def process_audio_files( remove_dc_offset=False, **kwargs, ): - input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) logging.info(f"input_args={input_args}") @@ -72,7 +72,6 @@ def process_audio_files( with AR(recordings_file, **input_args) as reader, Writer( output_path, output_recordings_file, **output_args ) as writer: - if vad_spec is not None: logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) @@ -147,8 +146,7 @@ def process_audio_files( u2td.save(write_time_durs_spec) -if __name__ == "__main__": - +def main(): parser = ArgumentParser( description="Process pipes in wav.scp file, optionally applies vad and save all audios in the same format" ) @@ -204,3 +202,7 @@ def process_audio_files( logging.debug(args) process_audio_files(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py index 24ec10bf..50c2f1f2 100755 --- a/hyperion/bin/split_dataset_into_trials_and_cohort.py +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -6,8 +6,6 @@ import logging from pathlib import Path -from hyperion.hyp_defs import config_logger -from hyperion.utils import Dataset from jsonargparse import ( ActionConfigFile, ActionParser, @@ -16,8 +14,11 @@ namespace_to_dict, ) -if __name__ == "__main__": +from hyperion.hyp_defs import config_logger +from hyperion.utils import Dataset + +def main(): parser = ArgumentParser( description=( """Split speakers in dataset into test speaker to create ASV trials and @@ -66,3 +67,7 @@ trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) trials_dataset.save(trials_dir) cohort_dataset.save(cohort_dir) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_qmf.py b/hyperion/bin/train_qmf.py index a97e8a5f..42aabe0c 100755 --- a/hyperion/bin/train_qmf.py +++ b/hyperion/bin/train_qmf.py @@ -6,25 +6,25 @@ Trains calibration for SRE18 tel condition """ -import sys +import logging import os +import sys +import time +from pathlib import Path + +import numpy as np from jsonargparse import ( - ArgumentParser, ActionConfigFile, ActionParser, + ArgumentParser, namespace_to_dict, ) -import time -import logging -from pathlib import Path -import numpy as np - -from hyperion.hyp_defs import float_cpu, config_logger -from 
hyperion.utils.trial_scores import TrialScores -from hyperion.utils.trial_key import TrialKey -from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.hyp_defs import config_logger, float_cpu from hyperion.np.classifiers import BinaryLogisticRegression as LR +from hyperion.np.metrics import compute_act_dcf, compute_min_dcf +from hyperion.utils.trial_key import TrialKey +from hyperion.utils.trial_scores import TrialScores def print_q_stats(scr, q_names): @@ -110,7 +110,7 @@ def train_qmf( scr_out.save(output_file) -if __name__ == "__main__": +def main(): parser = ArgumentParser(description="Trains QMF calibration") parser.add_argument("--score-file", required=True) @@ -133,3 +133,7 @@ def train_qmf( logging.debug(args) train_qmf(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 8930b299..c00c4633 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -14,15 +14,20 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.models import Wav2RNNRNNTransducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "rnn_rnn_transducer": Wav2RNNRNNTransducer, @@ -72,14 +77,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -97,7 +100,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -105,8 +107,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -115,8 +117,11 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id("<blk>"), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id("<blk>"), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: @@ -159,8 +164,7 @@ def make_parser(model_class):
help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -176,34 +180,27 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": - parser = ArgumentParser( - description="Train RNN Transducer model from audio files") +def main(): + parser = ArgumentParser(description="Train RNN Transducer model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -232,3 +229,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 7018c406..5b802454 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -14,23 +14,29 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory -from hyperion.torch.models import (HFWav2Vec2ConformerV1RNNTransducer, - HFWav2Vec2RNNRNNTransducer, - HFWav2Vec2RNNTransducer) +from hyperion.torch.models import ( + HFWav2Vec2ConformerV1RNNTransducer, + HFWav2Vec2RNNRNNTransducer, + HFWav2Vec2RNNTransducer, +) from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2rnn_transducer": HFWav2Vec2RNNTransducer, "hf_wav2vec2rnn_rnn_transducer": HFWav2Vec2RNNRNNTransducer, - "hf_wav2vec2conformer_v1_rnn_transducer": - HFWav2Vec2ConformerV1RNNTransducer, + "hf_wav2vec2conformer_v1_rnn_transducer": HFWav2Vec2ConformerV1RNNTransducer, # "hf_hubert2rnn_transducer": HFWav2Vec2RNNTransducer, # "hf_hubert2rnn_rnn_transducer": 
Hubert2RNNRNNTransducer, # "hf_wavlm2rnn_transducer": HFHubert2RNNTransducer, @@ -88,14 +94,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -113,7 +117,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -121,8 +124,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -137,13 +140,16 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} # {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -181,8 +187,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -198,34 +203,29 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": +def main(): 
parser = ArgumentParser( - description="Train Wav2Vec2Transducer model from audio files") + description="Train Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -254,3 +254,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2transducer.py b/hyperion/bin/train_wav2vec2transducer.py index 55f3b996..77a22bb8 100755 --- a/hyperion/bin/train_wav2vec2transducer.py +++ b/hyperion/bin/train_wav2vec2transducer.py @@ -14,6 +14,14 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) +from torch.nn.utils.rnn import pad_sequence + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -21,9 +29,6 @@ from hyperion.torch.models import HFWav2Vec2Transducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) -from torch.nn.utils.rnn import pad_sequence model_dict = { "hf_wav2vec2transducer": HFWav2Vec2Transducer, @@ -73,14 +78,12 @@ def init_data(partition, rank, num_gpus, **kwargs): num_workers = data_kwargs["data_loader"]["num_workers"] num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = ({ - "num_workers": num_workers_per_gpu, - "pin_memory": True - } if num_gpus > 0 else {}) - data_loader = torch.utils.data.DataLoader(dataset, - batch_sampler=sampler, - **largs, - collate_fn=transducer_collate) + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + ) return data_loader @@ -98,7 +101,6 @@ def init_model(blank_id, vocab_size, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -106,8 +108,8 @@ def train_model(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) set_float_cpu("float32") - #torch.backends.cudnn.deterministic = True - #torch.backends.cudnn.benchmark = False + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) @@ -122,13 +124,16 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) - model = init_model(train_loader.dataset.sp.piece_to_id(""), - train_loader.dataset.sp.get_piece_size(), **kwargs) + model = init_model( + train_loader.dataset.sp.piece_to_id(""), + train_loader.dataset.sp.get_piece_size(), + **kwargs, + ) trn_args = Trainer.filter_args(**kwargs["trainer"]) if rank == 0: logging.info("trainer args={}".format(trn_args)) - metrics = {} #{"acc": CategoricalAccuracy()} + metrics = {} # {"acc": CategoricalAccuracy()} trainer = Trainer( model, device=device, @@ -166,8 +171,7 @@ def make_parser(model_class): help="num_workers of data loader", ) data_parser = ArgumentParser(prog="") - data_parser.add_argument("--train", - action=ActionParser(parser=train_parser)) + 
data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) @@ -183,34 +187,29 @@ def make_parser(model_class): type=str, ) - parser.link_arguments("data.train.data_loader.num_workers", - "data.val.data_loader.num_workers") + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) - parser.link_arguments("data.train.dataset.bpe_model", - "data.val.dataset.bpe_model") + parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model") - Trainer.add_class_args(parser, - prefix="trainer", - train_modes=model_class.valid_train_modes()) + Trainer.add_class_args( + parser, prefix="trainer", train_modes=model_class.valid_train_modes() + ) ddp.add_ddp_args(parser) - parser.add_argument("--seed", - type=int, - default=1123581321, - help="random seed") - parser.add_argument("-v", - "--verbose", - dest="verbose", - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == "__main__": +def main(): parser = ArgumentParser( - description="Train Wav2Vec2Transducer model from audio files") + description="Train Wav2Vec2Transducer model from audio files" + ) parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -239,3 +238,7 @@ def make_parser(model_class): # torch docs recommend using forkserver # multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index f132a35c..e6dd3d3e 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -14,6 +14,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -25,12 +32,6 @@ ) from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) model_dict = { "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, @@ -40,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -83,7 +83,6 @@ def init_model(num_classes, rank, model_class, **kwargs): def train_model(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -105,7 +104,11 @@ def train_model(gpu_id, args): logging.info(f"trainer args={trn_args}") metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) @@ -162,8 +165,7 @@ def make_parser(model_class): return parser -if __name__ == "__main__": - +def main(): parser = 
ArgumentParser(description="Train Wav2Vec2XVector model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -193,3 +195,7 @@ def make_parser(model_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_model(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py index ddf292b8..7373a338 100755 --- a/hyperion/bin/train_wav2xvector.py +++ b/hyperion/bin/train_wav2xvector.py @@ -9,6 +9,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -23,12 +30,6 @@ # from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import ( - ActionConfigFile, - ActionParser, - ArgumentParser, - namespace_to_dict, -) xvec_dict = { "resnet": RXVec, @@ -41,7 +42,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -84,7 +84,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -107,7 +106,11 @@ def train_xvec(gpu_id, args): logging.info("trainer args={}".format(trn_args)) metrics = {"acc": CategoricalAccuracy()} trainer = Trainer( - model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args, + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, ) trainer.load_last_checkpoint() trainer.fit(train_loader, val_loader) @@ -164,8 +167,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train Wav2XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -194,3 +196,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index 71bba080..a2acdf4c 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -13,6 +13,13 @@ import numpy as np import torch import torch.nn as nn +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.data import FeatSeqDataset as SD @@ -25,8 +32,6 @@ from hyperion.torch.models import TransformerXVectorV1 as TFXVec from hyperion.torch.trainers import XVectorTrainer as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -39,7 +44,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] sd_args = SD.filter_args(**kwargs["dataset"]) sampler_args = Sampler.filter_args(**kwargs["sampler"]) @@ -80,7 +84,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def 
train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -164,8 +167,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -196,3 +198,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index b2e36cac..c3f6170d 100755 --- a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -9,6 +9,13 @@ from pathlib import Path import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory @@ -22,8 +29,6 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer from hyperion.torch.utils import ddp -from jsonargparse import (ActionConfigFile, ActionParser, ArgumentParser, - namespace_to_dict) xvec_dict = { "resnet": RXVec, @@ -36,7 +41,6 @@ def init_data(partition, rank, num_gpus, **kwargs): - kwargs = kwargs["data"][partition] ad_args = AD.filter_args(**kwargs["dataset"]) sampler_args = kwargs["sampler"] @@ -90,7 +94,6 @@ def init_xvector(num_classes, rank, xvec_class, **kwargs): def train_xvec(gpu_id, args): - config_logger(args.verbose) del args.verbose logging.debug(args) @@ -176,8 +179,7 @@ def make_parser(xvec_class): return parser -if __name__ == "__main__": - +def main(): parser = ArgumentParser(description="Train XVector from audio files") parser.add_argument("--cfg", action=ActionConfigFile) @@ -206,3 +208,7 @@ def make_parser(xvec_class): # torch docs recommend using forkserver multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/io/__init__.py b/hyperion/io/__init__.py index 14b1b35f..aa5ac653 100644 --- a/hyperion/io/__init__.py +++ b/hyperion/io/__init__.py @@ -16,10 +16,10 @@ from .hyp_data_reader import * from .hyp_data_writer import * from .kaldi_data_reader import * -from .packed_audio_reader import (RandomAccessPackedAudioReader, - SequentialPackedAudioReader) +from .packed_audio_reader import ( + RandomAccessPackedAudioReader, + SequentialPackedAudioReader, +) from .packed_audio_writer import PackedAudioWriter from .segment_vad_reader import SegmentVADReader from .vad_rw_factory import VADReaderFactory - -# from .queues import * diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index 2186522e..d1cf7f68 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -8,7 +8,6 @@ from ....hyp_defs import float_cpu from ....utils.math_funcs import logsumexp, softmax -from ....utils.queues import GeneratorQueue from ..core import PDF @@ -110,86 +109,6 @@ def fit( else: return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - def fit_generator( - self, - generator, - train_steps, - epochs=10, - val_data=None, - val_steps=0, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - """Trains the model from data read by a generator 
function. - This function is deprecated. - - Args: - generator: train data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - train_steps: number of training steps / epoch - epochs: number of epochs. - val_data: val. data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - val_steps: number of validation steps / epoch - max_queue_size: max. size of the generator queue. - workers: number of workers in the generator. - use_multiprocessing: use multi-processing in the generator queue. - - Returns: - log p(X) of the training data. - log p(x) per sample. - log p(X) of the val. data, if present. - log p(x) of the val. data per sample, if present. - """ - - do_validation = bool(val_data) - val_gen = hasattr(val_data, "next") or hasattr(val_data, "__next__") - if val_gen and not val_steps: - raise ValueError( - "When using a generator for validation data, " - "you must specify a value for " - "`val_steps`." - ) - - if do_validation and not val_gen: - x, u_x_val, sample_weight_val = self.tuple2data(val_data) - log_h_val = self.accum_log_h(x, sample_weight_val) - - elbo = np.zeros((epochs,), dtype=float_cpu()) - elbo_val = np.zeros((epochs,), dtype=float_cpu()) - for epoch in range(epochs): - N, u_x, log_h = self.Estep_generator( - generator, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - - self.Mstep(N, u_x) - elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) - - if val_data is not None: - if val_gen: - N, u_x, log_h_val = self.Estep_generator( - val_data, - train_steps, - return_log_h=True, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing, - ) - else: - N, u_x = self.Estep(val_data, u_x_val, sample_weight_val) - elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) - - if val_data is None: - return elbo, elbo / x.shape[0] - else: - return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - def log_h(self, x): """Computes log h(x) of the exp. family.""" return 0 @@ -404,7 +323,6 @@ def _accum_suff_stats_segments_prob_1batch( def _accum_suff_stats_segments_prob_nbatches( self, x, prob, sample_weight, batch_size ): - sw_i = None for i1 in range(0, x.shape[0], batch_size): i2 = np.minimum(i1 + batch_size, x.shape[0]) @@ -458,7 +376,6 @@ def accum_suff_stats_sorttime( def _accum_suff_stats_sorttime_1batch( self, x, frame_length, frame_shift, u_x=None, sample_weight=None ): - K = len(self.pi) num_frames = x.shape[0] num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) @@ -494,7 +411,6 @@ def _accum_suff_stats_sorttime_1batch( def _accum_suff_stats_sorttime_nbatches( self, x, frame_length, frame_shift, sample_weight, batch_size ): - K = len(self.pi) num_frames = x.shape[0] num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) @@ -539,65 +455,6 @@ def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): """ return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - def Estep_generator( - self, - generator, - num_steps, - return_log_h, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - ): - """Expectation step, where data is read from a generator function. - - Args: - generator: data generator function returning a tuple - (x, u_x, sample_weight), (x, u_x), (x, sample_weight) or x. - num_steps: number of steps / epoch - return_log_h: returns accumlated log h(x). 
- max_queue_size: max. size of the generator queue. - workers: number of workers in the generator. - use_multiprocessing: use multi-processing in the generator queue. - - Returns: - N zero order sufficient statistics (number of samples). - Accumlated sufficient statistics \sum u(x). - Accumlated log h(x) (optional). - """ - wait_time = 0.01 # in secs - queue = None - N = None - acc_u_x = None - log_h = 0 - try: - queue = GeneratorQueue( - generator, use_multiprocessing=use_multiprocessing, wait_time=wait_time - ) - queue.start(workers=workers, max_queue_size=max_queue_size) - queue_generator = queue.get() - - cur_step = 0 - for cur_step in range(num_steps): - data = next(queue_generator) - x, u_x, sample_weight = self.tuple2data(data) - N_i, u_x_i = self.Estep(x, u_x, sample_weight) - if return_log_h: - log_h += self.accum_log_h(x) - if cur_step == 0: - N = N_i - acc_u_x = u_x_i - else: - N += N_i - acc_u_x += u_x_i - finally: - if queue is not None: - queue.stop() - - if return_log_h: - return N, acc_u_x, log_h - else: - return N, acc_u_x - def sum_suff_stats(self, N, u_x): """Sums suff. stats from multiple sub-processes. @@ -754,28 +611,6 @@ def get_config(self): base_config = super(ExpFamilyMixture, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @staticmethod - def tuple2data(data): - if isinstance(data, tuple): - if len(data) == 2: - x, u_x = data - if u_x.ndim == 2: - sample_weight = None - elif u_x.ndim == 1: - sample_weight = u_x - u_x = None - else: - raise ValueError("Generator output: " + str(data)) - elif len(data) == 3: - x, u_x, sample_weight = data - else: - raise ValueError("Generator output: " + str(data)) - else: - x = data - u_x = None - sample_weight = None - return x, u_x, sample_weight - @staticmethod def compute_A_nat(eta): """Computes A_theta from the natural param.""" diff --git a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py index 7a2e82f8..3f7b2ec7 100644 --- a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py +++ b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py @@ -7,7 +7,11 @@ from functools import partial import torch -from torch._six import inf + +try: + from torch import inf +except ImportError: + from torch._six import inf from .lr_scheduler import LRScheduler diff --git a/hyperion/utils/queues.py b/hyperion/utils/queues.py deleted file mode 100644 index 8bfd0166..00000000 --- a/hyperion/utils/queues.py +++ /dev/null @@ -1,287 +0,0 @@ -""" - Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import copy -import multiprocessing -import threading -import time -import warnings -from abc import abstractmethod - -import numpy as np -import six - -try: - import queue -except ImportError: - import Queue as queue - - -class SequenceQueue(object): - """Base class to enqueue inputs. - - The task of an Queue is to use parallelism to speed up preprocessing. - This is done with processes or threads. - - # Examples - - ```python - enqueuer = SequenceQueue(...) - enqueuer.start() - datas = enqueuer.get() - for data in datas: - # Use the inputs; training, evaluating, predicting. - # ... stop sometime. - enqueuer.close() - ``` - - The `enqueuer.get()` should be an infinite stream of datas. - - """ - - @abstractmethod - def is_running(self): - raise NotImplemented - - @abstractmethod - def start(self, workers=1, max_queue_size=10): - """Starts the handler's workers.
- - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`). - """ - raise NotImplemented - - @abstractmethod - def stop(self, timeout=None): - """Stop running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called start(). - - # Arguments - timeout: maximum time to wait on thread.join() - """ - raise NotImplemented - - @abstractmethod - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples `(inputs, targets)` - or `(inputs, targets, sample_weights)`. - """ - raise NotImplemented - - -class OrderedQueue(SequenceQueue): - """Builds a Queue from a Sequence. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. - - # Arguments - sequence: A `keras.utils.data_utils.Sequence` object. - use_multiprocessing: use multiprocessing if True, otherwise threading - scheduling: Sequential querying of datas if 'sequential', random otherwise. - """ - - def __init__(self, sequence, use_multiprocessing=False, scheduling="sequential"): - self.sequence = sequence - self.use_multiprocessing = use_multiprocessing - self.scheduling = scheduling - self.workers = 0 - self.executor = None - self.queue = None - self.run_thread = None - self.stop_signal = None - - def is_running(self): - return self.stop_signal is not None and not self.stop_signal.is_set() - - def start(self, workers=1, max_queue_size=10): - """Start the handler's workers. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, workers could block on `put()`) - """ - if self.use_multiprocessing: - self.executor = multiprocessing.Pool(workers) - else: - self.executor = ThreadPool(workers) - self.queue = queue.Queue(max_queue_size) - self.stop_signal = threading.Event() - self.run_thread = threading.Thread(target=self._run) - self.run_thread.daemon = True - self.run_thread.start() - - def _run(self): - """Function to submit request to the executor and queue the `Future` objects.""" - sequence = list(range(len(self.sequence))) - while True: - if self.scheduling is not "sequential": - random.shuffle(sequence) - for i in sequence: - if self.stop_signal.is_set(): - return - self.queue.put( - self.executor.apply_async(get_index, (self.sequence, i)), block=True - ) - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - Generator yielding tuples (inputs, targets) - or (inputs, targets, sample_weights) - """ - try: - while self.is_running(): - inputs = self.queue.get(block=True).get() - if inputs is not None: - yield inputs - except Exception as e: - self.stop() - raise StopIteration(e) - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()` - """ - self.stop_signal.set() - with self.queue.mutex: - self.queue.queue.clear() - self.queue.unfinished_tasks = 0 - self.queue.not_full.notify() - self.executor.close() - self.executor.join() - self.run_thread.join(timeout) - - -class GeneratorQueue(SequenceQueue): - """Builds a queue out of a data generator. - - Used in `fit_generator`, `evaluate_generator`, `predict_generator`. 
- - # Arguments - generator: a generator function which endlessly yields data - use_multiprocessing: use multiprocessing if True, otherwise threading - wait_time: time to sleep in-between calls to `put()` - random_seed: Initial seed for workers, - will be incremented by one for each workers. - """ - - def __init__( - self, generator, use_multiprocessing=False, wait_time=0.05, random_seed=None - ): - self.wait_time = wait_time - self._generator = generator - self._use_multiprocessing = use_multiprocessing - self._threads = [] - self._stop_event = None - self.queue = None - self.random_seed = random_seed - - def start(self, workers=1, max_queue_size=10): - """Kicks off threads which add data from the generator into the queue. - - # Arguments - workers: number of worker threads - max_queue_size: queue size - (when full, threads could block on `put()`) - """ - - def data_generator_task(): - while not self._stop_event.is_set(): - try: - if self._use_multiprocessing or self.queue.qsize() < max_queue_size: - generator_output = next(self._generator) - self.queue.put(generator_output) - else: - time.sleep(self.wait_time) - except Exception: - self._stop_event.set() - raise - - try: - if self._use_multiprocessing: - self.queue = multiprocessing.Queue(maxsize=max_queue_size) - self._stop_event = multiprocessing.Event() - else: - self.queue = queue.Queue() - self._stop_event = threading.Event() - - for _ in range(workers): - if self._use_multiprocessing: - # Reset random seed else all children processes - # share the same seed - np.random.seed(self.random_seed) - thread = multiprocessing.Process(target=data_generator_task) - thread.daemon = True - if self.random_seed is not None: - self.random_seed += 1 - else: - thread = threading.Thread(target=data_generator_task) - self._threads.append(thread) - thread.start() - except: - self.stop() - raise - - def is_running(self): - return self._stop_event is not None and not self._stop_event.is_set() - - def stop(self, timeout=None): - """Stops running threads and wait for them to exit, if necessary. - - Should be called by the same thread which called `start()`. - - # Arguments - timeout: maximum time to wait on `thread.join()`. - """ - if self.is_running(): - self._stop_event.set() - - for thread in self._threads: - if thread.is_alive(): - if self._use_multiprocessing: - thread.terminate() - else: - thread.join(timeout) - - if self._use_multiprocessing: - if self.queue is not None: - self.queue.close() - - self._threads = [] - self._stop_event = None - self.queue = None - - def get(self): - """Creates a generator to extract data from the queue. - - Skip the data if it is `None`. - - # Returns - A generator - """ - while self.is_running(): - if not self.queue.empty(): - inputs = self.queue.get() - if inputs is not None: - yield inputs - else: - time.sleep(self.wait_time) diff --git a/setup.py b/setup.py index 9780586d..e1fb35cc 100644 --- a/setup.py +++ b/setup.py @@ -15,15 +15,26 @@ # limitations under the License. 
# -import setuptools from pathlib import Path +import setuptools + project_root = Path(__file__).parent -with open(project_root / "apps.txt") as f: - apps = f.read().splitlines() +# with open(project_root / "apps.txt") as f: +# apps = f.read().splitlines() -apps = [str(project_root / "hyperion" / "bin" / app) for app in apps] +# apps = [str(project_root / "hyperion" / "bin" / app) for app in apps] +binaries = (project_root / "hyperion" / "bin").glob("*.py") +console_scripts = [] +for binary in binaries: + stem = binary.stem + script_name = stem.replace("hyperion_", "").replace("_", "-") + if script_name[0] == "-": + continue + module = f"hyperion.bin.{stem}:main" + console_script = f"hyperion-{script_name} = {module}" + console_scripts.append(console_script) with open(project_root / "requirements.txt") as f: requirements = f.read().splitlines() @@ -77,10 +88,22 @@ def get_version(): "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], python_requires=">=3.7", install_requires=requirements, - scripts=apps, + entry_points={ + "console_scripts": console_scripts, + } + # entry_points={ + # "console_scripts": [ + # "hyperion-prepare-data = hyperion.bin.prepare_data:main", + # "hyperion-train-wav2xvector = hyperion.bin.train_wav2xvector:main", + # ] + # }, + # scripts=apps, ) From 610547682764789844af201c1a16bccc6b8d34ab Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Sun, 10 Sep 2023 20:01:36 -0400 Subject: [PATCH 108/154] make it work with cuda 11 --- README.md | 11 +++++++++-- hyp_utils/conda_env.sh | 32 +++++++++++++++++--------------- hyperion/torch/utils/ddp.py | 13 +++++-------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 4838157b..d56406d7 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,21 @@ The full API is described in the documentation page [https://hyperion-ml.readthe ### Prerequisites We use anaconda or miniconda, though you should be able to make it work in other python distributions - To start, you should create a new enviroment and install PyTorch>=1.9, (older versions are not supported any longer) e.g.: + To start, you should create a new environment and install PyTorch: ``` conda create --name ${your_env} python=3.11 conda activate ${your_env} -conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=10.2 -c pytorch conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia ``` +For systems with a CUDA 10.2 driver: +``` +conda create --name ${your_env} python=3.10 +conda activate ${your_env} +conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=10.2 -c pytorch +``` + + ### Installing Hyperion - First, clone the repo: diff --git a/hyp_utils/conda_env.sh b/hyp_utils/conda_env.sh index 8d5c67c1..90ffa369 100755 --- a/hyp_utils/conda_env.sh +++ b/hyp_utils/conda_env.sh @@ -52,22 +52,24 @@ fi # echo "LRU_CACHE_CAPACITY=$LRU_CACHE_CAPACITY" conda activate $conda_env -command="python" +command="" if [ $num_gpus -gt 0 ];then - # set CUDA_VISIBLE_DEVICES - if [ !
-z "$SGE_HGR_gpu" ]; then - echo "SGE_HGR_gpu=$SGE_HGR_gpu" - export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') - else - # seach location of free-gpu program in the PATH or hyp_utils directory - free_gpu=$(which free-gpu) - if [ -z "$free_gpu" ];then - free_gpu=$(which hyp_utils/free-gpu) - fi - - if [ ! -z "$free_gpu" ];then - # if free-gpu found set env var, otherwise we assume that you can use any gpu - export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + if [ -z "$CUDA_VISIBLE_DEVICES" ];then + # set CUDA_VISIBLE_DEVICES + if [ ! -z "$SGE_HGR_gpu" ]; then + echo "SGE_HGR_gpu=$SGE_HGR_gpu" + export CUDA_VISIBLE_DEVICES=$(echo $SGE_HGR_gpu | sed 's@ @,@g') + else + # seach location of free-gpu program in the PATH or hyp_utils directory + free_gpu=$(which free-gpu) + if [ -z "$free_gpu" ];then + free_gpu=$(which hyp_utils/free-gpu) + fi + + if [ ! -z "$free_gpu" ];then + # if free-gpu found set env var, otherwise we assume that you can use any gpu + export CUDA_VISIBLE_DEVICES=$($free_gpu -n $num_gpus) + fi fi fi echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 1aefb3d4..4f006c0a 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -6,19 +6,16 @@ import logging import os -from fairscale.nn.data_parallel import \ - FullyShardedDataParallel as FullyShardedDDP -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP - import torch import torch.distributed as dist import torch.nn as nn +from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP from .devices import open_device def add_ddp_args(parser): - parser.add_argument( "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" ) @@ -50,7 +47,6 @@ def filter_ddp_args(**kwargs): def ddp_init( gpu_id, num_gpus, node_id=0, num_nodes=1, master_addr="localhost", master_port=None ): - rank = node_id * num_gpus + gpu_id world_size = num_nodes * num_gpus @@ -62,15 +58,16 @@ def ddp_init( os.environ["MASTER_PORT"] = master_port logging.info( - f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" + f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" ) dist.init_process_group( "nccl", rank=rank, world_size=world_size, ) + torch.cuda.set_device(rank) torch.tensor([0]).to(gpu_id) - device = torch.device('cuda', gpu_id) + device = torch.device("cuda", gpu_id) return device, rank, world_size # return gpu_id, rank, world_size From 392cd30f6bae594e9121bde48379aae787d16e6f Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 11 Sep 2023 11:41:35 -0400 Subject: [PATCH 109/154] started vox/v2.1 recipe and fix some readmes --- egs/voxceleb/v1.1/README.md | 2 + egs/voxceleb/v1.2/README.md | 249 ++++++-------- .../train_cfwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_cfwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_cwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_cwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_fwseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ .../train_fwseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ ...rain_idrnd_resnet100_xvec_stage1_v3.0.yaml | 73 ++++ ...rain_idrnd_resnet100_xvec_stage2_v3.0.yaml | 69 ++++ .../conf/train_resnet34_xvec_stage1_v3.0.yaml | 71 ++++ .../conf/train_resnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../train_tseresnet34_xvec_stage1_v3.0.yaml | 72 ++++ 
.../train_tseresnet34_xvec_stage2_v3.0.yaml | 69 ++++ .../config_fbank80_stmn_cfwseresnet34.v3.0.sh | 44 +++ .../config_fbank80_stmn_cwseresnet34.v3.0.sh | 45 +++ .../config_fbank80_stmn_fwseresnet34.v3.0.sh | 44 +++ ...onfig_fbank80_stmn_idrnd_resnet100.v3.0.sh | 44 +++ .../config_fbank80_stmn_resnet34.v3.0.sh | 44 +++ .../config_fbank80_stmn_tseresnet34.v3.0.sh | 44 +++ egs/voxceleb/v2.1/cmd.sh | 28 ++ egs/voxceleb/v2.1/conf/clsp.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_long.conf | 13 + egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_short.conf | 11 + egs/voxceleb/v2.1/conf/coe_gpu_v100.conf | 11 + egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml | 35 ++ ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml | 59 ++++ ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ ...avlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml | 63 ++++ ...avlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml | 73 ++++ ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 59 ++++ ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 63 ++++ ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 73 ++++ egs/voxceleb/v2.1/conf/vad_16k.yaml | 8 + ...v2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml | 45 +++ ...wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml | 44 +++ .../wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml | 44 +++ .../wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml | 45 +++ .../wavlmbaseplus_ecapatdnn512x3_v2.0.yaml | 44 +++ .../wavlmlarge12l_ecapatdnn512x3_v2.0.yaml | 45 +++ .../conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml | 44 +++ egs/voxceleb/v2.1/datapath.sh | 23 ++ egs/voxceleb/v2.1/default_config.sh | 1 + ...wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | 54 +++ ...g_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh | 54 +++ ...ig_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | 54 +++ ...fig_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | 54 +++ ...onfig_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | 54 +++ ...onfig_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | 54 +++ .../config_wavlmlarge_ecapatdnn512x3_v2.0.sh | 54 +++ egs/voxceleb/v2.1/hyp_utils | 1 + egs/voxceleb/v2.1/path.sh | 5 + egs/voxceleb/v2.1/run_001_prepare_data.sh | 46 +++ egs/voxceleb/v2.1/run_002_compute_evad.sh | 66 ++++ .../v2.1/run_003_prepare_noises_rirs.sh | 102 ++++++ .../v2.1/run_004_prepare_xvec_train_data.sh | 76 +++++ egs/voxceleb/v2.1/run_005_train_xvector.sh | 78 +++++ egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 103 ++++++ egs/voxceleb/v2.1/run_007_eval_be.sh | 321 ++++++++++++++++++ egs/voxceleb/v2/README.md | 10 +- egs/voxceleb/v2/default_config.sh | 2 +- 71 files changed, 3829 insertions(+), 152 deletions(-) create mode 100644 egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml create 
mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh create mode 100755 egs/voxceleb/v2.1/cmd.sh create mode 100644 egs/voxceleb/v2.1/conf/clsp.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_long.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_short.conf create mode 100644 egs/voxceleb/v2.1/conf/coe_gpu_v100.conf create mode 100644 egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/vad_16k.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml create mode 100644 
egs/voxceleb/v2.1/datapath.sh create mode 120000 egs/voxceleb/v2.1/default_config.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh create mode 100644 egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh create mode 120000 egs/voxceleb/v2.1/hyp_utils create mode 100755 egs/voxceleb/v2.1/path.sh create mode 100755 egs/voxceleb/v2.1/run_001_prepare_data.sh create mode 100755 egs/voxceleb/v2.1/run_002_compute_evad.sh create mode 100755 egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh create mode 100755 egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh create mode 100755 egs/voxceleb/v2.1/run_005_train_xvector.sh create mode 100755 egs/voxceleb/v2.1/run_006_extract_xvectors.sh create mode 100755 egs/voxceleb/v2.1/run_007_eval_be.sh diff --git a/egs/voxceleb/v1.1/README.md b/egs/voxceleb/v1.1/README.md index 3b9eeaa9..efdb77c1 100644 --- a/egs/voxceleb/v1.1/README.md +++ b/egs/voxceleb/v1.1/README.md @@ -1,5 +1,7 @@ # VoxCeleb V1.1 +This recipe will be deprecated; use V1.2 instead. + Recipe for the VoxCeleb Speaker Verification Task ## Differences w.r.t VoxCeleb V1 recipe diff --git a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md index 1ee9468f..6e8ba07a 100644 --- a/egs/voxceleb/v1.2/README.md +++ b/egs/voxceleb/v1.2/README.md @@ -1,4 +1,4 @@ -# VoxCeleb V1.1 +# VoxCeleb V1.2 Recipe for the VoxCeleb Speaker Verification Task @@ -9,7 +9,7 @@ In recipe version V1: - Augmentation is performed using Kaldi scripts and wav-reverbate tool - Babble noise is created on-the-fly when computing features by mixing 3-7 single speaker files. -In this recipe: +In V1.1: - We compute speech augmentations and acoustic features are computed always on-the-fly, we don't dump any features to disk. - Augmentation is performed using Hyperion SpeechAugment class. @@ -18,6 +18,11 @@ In this recipe: which mimics the proportions of noise and RIR types, and SNRs used in the V1 of the recipe. - Babble noise is created offline by mixing 3-10 single speaker files. +In V1.2: + - Feature extractor is embedded into the PyTorch model in classes derived from the Wav2XVector base class. + - Kaldi format is replaced by a new format based on pandas tables + - Kaldi-style bash scripts are removed and replaced by python scripts + - Most python scripts are called using Hyperion entry points ## Citing @@ -30,13 +35,11 @@ In this recipe: ## Test data - Test data is VoxCeleb 1 - - We evaluate 6 conditions: + - We evaluate the 3 conditions (with cleaned lists): - VoxCeleb-O (Original): Original Voxceleb test set with 40 speakers - - Voxceleb-O-cleaned: VoxCeleb-O cleaned-up of some errors - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 - - Voxceleb-E-cleaned: VoxCeleb-E cleaned-up of some errors - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials.
- - Voxceleb-H-cleaned: VoxCeleb-H cleaned-up of some errors + ## Usage @@ -44,9 +47,9 @@ In this recipe: - By default it will use Light ResNet (16 base channels) - For better performance use full ResNet (64 base channels) using `config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` file as ```bash -run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh -run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true -run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_005_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_006_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true +run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh ``` - To train with mixed precision training use config file `config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh` @@ -66,25 +69,26 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - Creates Babble noise from MUSAN speech to be used by SpeechAugment class. - Prepares RIRs by compacting them into HDF5 files, to be used by SpeechAugment class. - - `run_010_prepare_xvec_train_data.sh` + - `run_004_prepare_xvec_train_data.sh` - Transforms all the audios that we are going to use to train the x-vector into a common format, e.g., .flac. - Removes silence from the audios - Removes utterances shorter than 4secs and speakers with less than 8 utterances. - Creates training and validation lists for x-vector training - - `run_011_train_xvector.sh` + - `run_005_train_xvector.sh` - Trains the x-vector network - - `run_030_extract_xvectors.sh` + - `run_006_extract_xvectors.sh` - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training - Extracts x-vectors for VoxCeleb1 test sets - - `run_040_eval_be.sh` + - `run_007_eval_be.sh` - Trains PLDA and evals PLDA and cosine scoring back-ends ## Results + ### VoxCeleb 1 Original-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -95,9 +99,28 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | | | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | | | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch.
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | || | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 | +| | | | Cosine + QMF | 0.62 | 0.034 | 0.042 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.76 | 0.048 | 0.071 | +| | | | Cosine + AS-Norm | 0.70 | 0.041 | 0.061 | +| | | | Cosine + QMF | 0.62 | 0.037 | 0.056 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.077 | +| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062 | +| | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | +| | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | +| | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.051 | 0.095 | +| | | | Cosine + AS-Norm | 0.72 | 0.046 | 0.070 | +| | | | Cosine + QMF | 0.67 | 0.039 | 0.074 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | +| | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | +| | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | +| | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | +| | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | + ### VoxCeleb 1 Entire-Clean trial list @@ -109,9 +132,27 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | | | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | | | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch.
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | +| | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | +| | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.89 | 0.058 | 0.098 | +| | | | Cosine + AS-Norm | 0.84 | 0.053 | 0.087 | +| | | | Cosine + QMF | 0.80 | 0.050 | 0.081 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | +| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.085 | +| | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | +| | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | +| | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.94 | 0.059 | 0.105 | +| | | | Cosine + AS-Norm | 0.88 | 0.053 | 0.093 | +| | | | Cosine + QMF | 0.84 | 0.051 | 0.088 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076 | +| | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | +| | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | +| | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | +| | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | ### VoxCeleb 1 Hard-Clean trial list @@ -123,9 +164,28 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | | | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | | | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch.
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | | | | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 | +| | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 | +| | | | Cosine + QMF | 1.36 | 0.082 | 0.137 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.70 | 0.1 | 0.165 | +| | | | Cosine + AS-Norm | 1.50 | 0.086 | 0.138 | +| | | | Cosine + QMF | 1.44 | 0.085 | 0.139 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | +| | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | +| | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | +| | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | +| | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.76 | 0.104 | 0.174 | +| | | | Cosine + AS-Norm | 1.58 | 0.092 | 0.152 | +| | | | Cosine + QMF | 1.51 | 0.089 | 0.149 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | +| | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | +| | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | +| | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | +| | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | + ### VoxSRC2022 dev @@ -137,127 +197,24 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | | | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | | | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | -| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | || | -| | | | Cosine + AS-Norm | | | | -| | | | Cosine + QMF | | | | - -## Results before 2023 - -### VoxCeleb 1 Original-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.00 | 0.129 | 0.216 | -| | | | Cosine | 2.04 | 0.138 | 0.210 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.35 | 0.091 | 0.159 | -| | | | Cosine | 1.22 | 0.082 | 0.129 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.074 | 0.124 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.090 | 0.160 | -| | | | Cosine | 1.44 | 0.100 | 0.173 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.091 | 0.143 | -| | | | Cosine | 1.17 | 0.081 | 0.110 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.37 | 0.104 | 0.179 | -| | | | Cosine | 1.31 | 0.080 | 0.139 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.29 | 0.088 | 0.129 | -| | | | Cosine | 1.23 | 0.083 | 0.136 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.095 | 0.156 | -| | | | Cosine | 1.29 | 0.089 | 0.146 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.20 | 0.084 | 0.136 | -| | | | Cosine | 1.18 | 0.078 | 0.115 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.084 | 0.145 | -| | | | Cosine | 1.12 | 0.073 | 0.131 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.53 | 0.104 | 0.189 | -| | | | Cosine | 1.31 | 0.084 | 0.132 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 0.98 | 0.066 | 0.116 | -| | | | Cosine | 1.12 | 0.071 | 0.103 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.05 | 0.077 | 0.123 | -| | | | Cosine | 0.96 | 0.065 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.04 | 0.071 | 0.118 | -| | | | Cosine | 0.93 | 0.067 | 0.108 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 0.90 | 0.067 | 0.118 | -| | | | Cosine | 0.85 | 0.060 | 0.094 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.44 | 0.102 | 0.169 | -| | | | Cosine | 1.29 | 0.084 | 0.140 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.071 | 0.116 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.074 | 0.116 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.09 | 0.081 | 0.150 | - - -### VoxCeleb 1 Entire-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.86 | 0.124 | 0.210 | -| | | | Cosine | 1.93 | 0.122 | 0.201 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.43 | 0.091 | 0.159 | -| | | | Cosine | 1.24 | 0.080 | 0.136 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 1.19 | 0.077 | 0.132 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.084 | 0.150 | -| | | | Cosine | 1.30 | 0.082 | 0.150 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 1.30 | 0.083 | 0.146 | -| | | | Cosine | 1.09 | 0.071 | 0.124 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 1.45 | 0.097 | 0.165 | -| | | | Cosine | 1.15 | 0.076 | 0.132 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 1.47 | 0.094 | 0.165 | -| | | | Cosine | 1.27 | 0.082 | 0.148 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 1.31 | 0.086 | 0.149 | -| | | | Cosine | 1.22 | 0.079 | 0.134 | -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.27 | 0.082 | 0.145 | -| | | | Cosine | 1.16 | 0.074 | 0.130 | -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.077 | 0.136 | -| | | | Cosine | 1.11 | 0.071 | 0.125 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 1.46 | 0.097 | 0.173 | -| | | | Cosine | 1.24 | 0.080 | 0.140 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 1.11 | 0.071 | 0.127 | -| | | | Cosine | 1.05 | 0.067 | 0.117 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 1.23 | 0.078 | 0.134 | -| | | | Cosine | 1.05 | 0.069 | 0.121 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 1.18 | 0.075 | 0.131 | -| | | | Cosine | 0.98 | 0.063 | 0.110 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | Res2Net50 width=26x8 | + SWA | PLDA | 1.17 | 0.072 | 0.123 | -| | | | Cosine | 0.94 | 0.061 | 0.107 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 1.56 | 0.095 | 0.166 | -| | | | Cosine | 1.27 | 0.079 | 0.142 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 1.19 | 0.077 | 0.137 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.12 | 0.073 | 0.129 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | TSE-Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 1.05 | 0.068 | 0.120 | - - -### VoxCeleb 1 Hard-Clean trial list - -| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | -| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | -| config_fbank80_stmn_lresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | LResNet34 | ArcFace s=30/m=0.3 | PLDA | 3.29 | 0.195 | 0.318 | -| | | | Cosine | 3.27 | 0.188 | 0.303 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.66 | 0.160 | 0.258 | -| | | | Cosine | 2.32 | 0.139 | 0.232 | -| config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp_swa.v1.sh | ResNet34 | + SWA | Cosine | 2.19 | 0.133 | 0.215 | -| config_fbank80_stmn_resnet50_arcs30m0.3_adam_lr0.05_amp.v1.sh | ResNet50 | ArcFace s=30/m=0.3 | PLDA | 2.33 | 0.139 | 0.227 | -| | | | Cosine | 2.33 | 0.142 | 0.235 | -| config_fbank80_stmn_tseresnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-ResNet34 | ArcFace s=30/m=0.3 | PLDA | 2.46 | 0.142 | 0.237 | -| | | | Cosine | 2.14 | 0.126 | 0.203 | -| config_fbank80_stmn_effnetb4_v2_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b4 v2 | EfficientNet-b4 with strides=1122121
ArcFace s=30/m=0.3 | 2.57 | 0.153 | 0.255 | -| | | | Cosine | 2.11 | 0.127 | 0.205 | -| config_fbank80_stmn_effnetb7_v2_eina_hln_arcs30m0.3_adam_lr0.01_amp.v1.sh | EfficientNet-b7 v2 | EfficientNet-b7 with strides=1122121
Instance-Norm with affine transform in Encoder
Layer-Norm in head
ArcFace s=30/m=0.3 | 2.64 | 0.157 | 0.244 | -| | | | Cosine | 2.33 | 0.141 | 0.232 | -| config_fbank80_stmn_res2net34w16s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=16x4 | ArcFace s=30/m=0.3 | PLDA | 2.42 | 0.144 | 0.245 | -| | | | Cosine | 2.26 | 0.133 | 0.224 -| config_fbank80_stmn_res2net34w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net34 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.39 | 0.141 | 0.235 | -| | | | Cosine | 2.17 | 0.128 | 0.215 -| config_fbank80_stmn_res2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x4 | ArcFace s=30/m=0.3 | PLDA | 2.28 | 0.131 | 0.225 | -| | | | Cosine | 2.11 | 0.124 | 0.204 | -| config_fbank80_stmn_seres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | SE-Res2Net50 | se-r=16
ArcFace s=30/m=0.3 | PLDA | 2.77 | 0.172 | 0.271 | -| | | | Cosine | 2.45 | 0.141 | 0.225 | -| config_fbank80_stmn_tseres2net50w26s4_arcs30m0.3_adam_lr0.05_amp.v1.sh | Time-SE-Res2Net50 | se-r=256
ArcFace s=30/m=0.3 | PLDA | 2.07 | 0.124 | 0.201 | -| | | | Cosine | 1.95 | 0.113 | 0.181 | -| config_fbank80_stmn_res2net50w13s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=13x8 | ArcFace s=30/m=0.3 | PLDA | 2.34 | 0.136 | 0.230 | -| | | | Cosine | 1.99 | 0.119 | 0.196 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1.sh | Res2Net50 width=26x8 | ArcFace s=30/m=0.3 | PLDA | 2.18 | 0.127 | 0.211 | -| | | | Cosine | 1.89 | 0.112 | 0.184 | -| config_fbank80_stmn_res2net50w26s8_arcs30m0.3_adam_lr0.05_amp.v1_swa.sh | Res2Net50 width=26x8 | + SWA | PLDA | 2.14 | 0.125 | 0.209 | -| | | | Cosine | 1.84 | 0.110 | 0.186 | -| config_fbank80_stmn_spinenet49s_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49S | ArcFace s=30/m=0.3 | PLDA | 2.78 | 0.156 | 0.252 | -| | | | Cosine | 2.26 | 0.134 | 0.214 | -| config_fbank80_stmn_spinenet49_arcs30m0.3_adam_lr0.05_amp.v1.sh | SpineNet49 | ArcFace s=30/m=0.3 | Cosine | 2.24 | 0.134 | 0.221 | -| config_fbank80_stmn_spine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.20 | 0.132 | 0.219 | -| config_fbank80_stmn_tsespine2net49_arcs30m0.3_adam_lr0.05_amp.v1.sh | Spine2Net49 | ArcFace s=30/m=0.3 | Cosine | 2.02 | 0.123 | 0.203 | +| config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | +| | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | +| | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | +| config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.34 | 0.145 | 0.246 | +| | | | Cosine + AS-Norm | 2.10 | 0.135 | 0.248 | +| | | | Cosine + QMF | 2.01 | 0.127 | 0.218 | +| config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | +| | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | +| | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | +| config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | +| | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | +| | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | +| config_fbank80_stmn_cfwseresnet34.v3.0.sh | CwFwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.49 | 0.158 | 0.265 | +| | | | Cosine + AS-Norm | 2.29 | 0.145 | 0.251 | +| | | | Cosine + QMF | 2.17 | 0.133 | 0.230 | +| config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | +| | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | +| | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | +| config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | +| | | | Cosine + AS-Norm | 1.79 | 0.118 | 0.239 | +| | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..f4306e2e --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: cfwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + 
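  # Editor's annotation, not part of the committed config: the swa_* keys
  # just below enable stochastic weight averaging; averaging starts at
  # epoch 10 and the LR anneals to swa_lr=1e-4 over 2 epochs, which is
  # presumably why the global configs point at swa_model_ep0016.pth.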
epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..b5458f9d --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: seresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 32 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..01b2cc50 --- /dev/null +++ 
b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..74553395 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + 
min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..11d33ae2 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..6659b2f6 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + 
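      # Editor's annotation, not part of the committed config: with min and
      # max chunk length both set to 2.0 s, seg_chunk_sampler draws fixed
      # 2-second chunks in batches of at least 64 during stage-1 training.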
data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..58d22733 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -0,0 +1,72 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: tseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + 
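    # Editor's annotation, not part of the committed config: cos_scale and
    # margin map to the ArcFace s and m hyper-parameters; here m ramps from
    # 0 to 0.2 over the first 5 epochs, and the stage-2 configs raise it to
    # 0.3 and add intertop_margin=0.1, apparently an extra margin applied
    # against the top competing classes.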
dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 256 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 25 + eff_batch_size: 256 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml new file mode 100644 index 00000000..0923a608 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh new file mode 100644 index 00000000..56d18bd0 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cfwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Channel-freq-wise-SE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cfwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh new file mode 100644 index 
00000000..68849f78 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_cwseresnet34.v3.0.sh @@ -0,0 +1,45 @@ +# Channel-wise ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_cwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_cwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + + +nnet_s2_base_cfg=conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh new file mode 100644 index 00000000..f962c2b3 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# Freq-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh new file mode 100644 index 00000000..6ea334b4 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.0.sh @@ -0,0 +1,44 @@ +# IdRnd ResNet100 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet100.v3.0 + +nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0029.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + 
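  # Editor's annotation, not part of the committed script: with
  # plda_num_augs=0 the back-end trains on the clean voxceleb2cat_train
  # embeddings; a positive value would instead select an augmented list
  # voxceleb2cat_train_augx${plda_num_augs}.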
plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh new file mode 100644 index 00000000..bb5d990c --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -0,0 +1,44 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34.v3.0 + +nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh new file mode 100644 index 00000000..2528d13f --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_tseresnet34.v3.0.sh @@ -0,0 +1,44 @@ +# TSE-ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_tseresnet34.v3.0 + +nnet_s1_base_cfg=conf/train_tseresnet34_xvec_stage1_v3.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0025.pth + +nnet_s2_base_cfg=conf/train_tseresnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/cmd.sh b/egs/voxceleb/v2.1/cmd.sh new file mode 100755 index 00000000..040f458b --- /dev/null +++ b/egs/voxceleb/v2.1/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/v2.1/conf/clsp.conf b/egs/voxceleb/v2.1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_long.conf b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff 
--git a/egs/voxceleb/v2.1/conf/coe_gpu_short.conf b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..ad991124 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + 
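  # Editor's annotation, not part of the committed config:
  # eff_batch_size=1024 with min_batch_size=128 implies gradient
  # accumulation across optimizer steps/GPUs, and train_mode
  # hf-feats-frozen-nograd appears to keep the HuggingFace speech encoder's
  # feature extractor frozen, with no gradients computed through it.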
log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml new file mode 100644 index 00000000..0b1d0454 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..254ff796 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..52be6db5 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + 
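      # Editor's annotation, not part of the committed config:
      # target_sample_freq resamples all audio to 16 kHz, and wav_scale=1
      # leaves waveforms in their native [-1, 1] float range, which is
      # presumably what the HuggingFace wav2vec2/WavLM front-ends expect.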
sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..bd3e7f86 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + 
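    # Editor's annotation, not part of the committed config: stage 2
    # fine-tunes the whole network (train_mode: full below), so the learning
    # rate drops to 5e-2 from the 0.4 used while the speech encoder was
    # frozen in stage 1, and only 8 epochs are run.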
lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..abe5da6e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git 
a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..7287188c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..3443591a --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + 
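+      # The dataset layout is shared by all the stage configs: segments carry
+      # a class_id (speaker) label, reverb/noise augmentation is applied on
+      # the fly, and audio is resampled to 16 kHz.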
class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml new file mode 100644 index 00000000..2addaa1e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - 
class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: wavlmlarge_ecapatdnn512x3_v2.0.yaml +trainer: + optim: + opt_type: sgd + lr: 0.4 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 2600 + #min_lr: 4e-4 + min_lr: 1e-6 + warmup_steps: 2600 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..69a8322b --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,63 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: class_id + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..5e1260ad --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: class_id + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + 
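+    # Stage 3 is the large-margin fine-tuning step: the ArcFace margin grows
+    # from 0.2 to 0.4 with no warmup, and training uses 6 s chunks with
+    # class-weighted sampling and hard-prototype mining (see the sampler above).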
margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/vad_16k.yaml b/egs/voxceleb/v2.1/conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/voxceleb/v2.1/conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..c3466259 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml new file mode 100644 index 00000000..d9c9b782 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..dc3737e3 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 
4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..d7e3388f --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus + drop_layers_gt: 9 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..b2430d97 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-base-plus +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..5025f047 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,45 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large + drop_layers_gt: 12 +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + 
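+  # intertop_k / intertop_margin apply an extra margin penalty to the
+  # highest-scoring competing speakers (inter-top-k penalty), on top of the
+  # ArcFace margin above.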
intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml new file mode 100644 index 00000000..0a6303f5 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -0,0 +1,44 @@ +hf_feats: + pretrained_model_path: microsoft/wavlm-large +xvector: + resnet_enc: + in_feats: 765 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 2 + intertop_margin: 0.1 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/datapath.sh b/egs/voxceleb/v2.1/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/v2.1/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/v2.1/default_config.sh b/egs/voxceleb/v2.1/default_config.sh new file mode 120000 index 00000000..f2d8812d --- /dev/null +++ b/egs/voxceleb/v2.1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..67a4665e --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params layers 2-12 + +# hugging face model +hf_model_name=wav2vec2xlsr300m12l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + 
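+# Note: stages 2 and 3 reuse the wavlmlarge fine-tuning YAMLs; those configs
+# only override the x-vector margins, sampler and trainer, so they are
+# independent of the HF front-end chosen in stage 1.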
+nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh new file mode 100644 index 00000000..b4130fad --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn1024x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..80ee785b --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + 
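+    # with plda_num_augs > 0, the back-end is trained on the augmented copy
+    # of vox2, voxceleb2cat_train_augx${plda_num_augs}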
plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..c2b30f68 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus9l + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..373535c2 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,54 @@ +# WavLM base trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3 + +# hugging face model +hf_model_name=wavlmbaseplus + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wavlm2resnet1d + +nnet_s1_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..530096cc 
--- /dev/null
+++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh
@@ -0,0 +1,54 @@
+# WavLM large (layers 2-12) trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3
+
+# hugging face model
+hf_model_name=wavlmlarge12l
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+
+nnet_type=hf_wavlm2resnet1d
+
+nnet_s1_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml
+nnet_s1_args=""
+
+nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0
+nnet_s1_name=$nnet_name.s1
+
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml
+nnet_s2_args=""
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0008.pth
+
+nnet_s3_base_cfg=conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml
+nnet_s3_args=""
+nnet_s3_name=${nnet_name}.s3
+nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name
+nnet_s3=$nnet_s3_dir/model_ep0004.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+    plda_data=voxceleb2cat_train
+else
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh
new file mode 100644
index 00000000..1b276bcd
--- /dev/null
+++ b/egs/voxceleb/v2.1/global_conf/config_wavlmlarge_ecapatdnn512x3_v2.0.sh
@@ -0,0 +1,54 @@
+# WavLM large trained on 60k LibriLight + 10k GigaSpeech + 24k Voxpopuli + ECAPA-TDNN 512x3
+
+# hugging face model
+hf_model_name=wavlmlarge
+
+#vad
+vad_config=conf/vad_16k.yaml
+
+# x-vector training
+nnet_data=voxceleb2cat_train
+
+# x-vector cfg
+
+nnet_type=hf_wavlm2resnet1d
+
+nnet_s1_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml
+nnet_s1_args=""
+
+nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0
+nnet_s1_name=$nnet_name.s1
+
+nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0035.pth
+
+nnet_s2_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml
+nnet_s2_args=""
+nnet_s2_name=${nnet_name}.s2
+nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name
+nnet_s2=$nnet_s2_dir/model_ep0008.pth
+
+nnet_s3_base_cfg=conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml
+nnet_s3_args=""
+nnet_s3_name=${nnet_name}.s3
+nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name
+nnet_s3=$nnet_s3_dir/model_ep0004.pth
+
+# back-end
+do_plda=false
+do_snorm=true
+do_qmf=true
+do_voxsrc22=true
+
+plda_aug_config=conf/reverb_noise_aug.yaml
+plda_num_augs=0
+if [ $plda_num_augs -eq 0 ]; then
+    plda_data=voxceleb2cat_train
+else
+    plda_data=voxceleb2cat_train_augx${plda_num_augs}
+fi
+plda_type=splda
+lda_dim=200
+plda_y_dim=150
+plda_z_dim=200
+
diff --git a/egs/voxceleb/v2.1/hyp_utils b/egs/voxceleb/v2.1/hyp_utils
new file mode 120000
index 00000000..f6d1eb7a
--- /dev/null
+++ b/egs/voxceleb/v2.1/hyp_utils
@@ -0,0 +1 @@
+../../../hyp_utils
\ No newline at end of file
diff --git a/egs/voxceleb/v2.1/path.sh b/egs/voxceleb/v2.1/path.sh
new file mode 100755
index 00000000..6994fdab
--- /dev/null
+++ b/egs/voxceleb/v2.1/path.sh
@@ -0,0 +1,5 @@
+
+export HYP_ROOT=$(readlink -f `pwd -P`/../../..)
+export TOOLS_ROOT=$HYP_ROOT/tools
+
+. 
$TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/v2.1/run_001_prepare_data.sh b/egs/voxceleb/v2.1/run_001_prepare_data.sh new file mode 100755 index 00000000..563d3c2d --- /dev/null +++ b/egs/voxceleb/v2.1/run_001_prepare_data.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh +. $config_file + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # split vox2 into 2 parts, for cohort and qmf training + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/v2.1/run_002_compute_evad.sh b/egs/voxceleb/v2.1/run_002_compute_evad.sh new file mode 100755 index 00000000..acccace3 --- /dev/null +++ b/egs/voxceleb/v2.1/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ -z "$vad_config" ];then + echo "We are not using VAD in this configuration" + exit 0 +fi + +if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" +fi + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + hyp_utils/create_data_split_dirs.sh \ + $vad_dir/$name \ + $USER/hyp-data/voxceleb/v1.2/vad $nodes + done +fi + +#Train datasets +if [ $stage -le 2 ];then + for name in voxceleb2cat_train voxceleb1_test $extra_data + do + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj + echo "compute vad for $name" + $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-compute-energy-vad --cfg $vad_config \ + --recordings-file data/$name/recordings.csv \ + --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \ + --part-idx JOB --num-parts $nj || exit 1 + + hyperion-tables cat \ + --table-type features \ + --output-file $vad_dir/$name/vad.csv --num-tables $nj + hyperion-dataset add_features \ + --dataset data/$name \ + --features-name vad \ + --features-file $vad_dir/$name/vad.csv + done +fi + + diff --git a/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/voxceleb/v2.1/run_003_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh
+
+# We prepare the noise files and RIR for online speech augmentation
+if [ $stage -le 1 ]; then
+  for name in noise music speech
+  do
+    hyperion-prepare-data musan \
+      --corpus-dir $musan_root \
+      --subset $name \
+      --output-dir data/musan_$name
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # # Prepare to distribute data over multiple machines
+  # # This only does something at CLSP grid
+  # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes
+
+  for name in musan_noise musan_music
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_proc_audio
+    output_dir=exp/proc_audio/$name
+    $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \
+      hyp_utils/conda_env.sh \
+      hyperion-preprocess-audio-files \
+      --audio-format flac \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_dir/recordings.JOB.csv
+
+    hyperion-tables cat \
+      --table-type recordings \
+      --output-file $output_dir/recordings.csv --num-tables $nj
+    hyperion-dataset set_recordings \
+      --dataset $input_data_dir \
+      --recordings-file $output_dir/recordings.csv \
+      --output-dataset $output_data_dir
+
+
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  # Create Babble noise from MUSAN speech files
+  for name in musan_speech
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_babble
+    output_dir=exp/proc_audio/${name}_babble
+    $train_cmd $output_dir/log/make_babble_noise_${name}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-make-babble-noise-audio-files \
+      --audio-format flac \
+      --min-spks 3 --max-spks 10 --num-reuses 5 \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_data_dir/recordings.csv
+    hyperion-dataset make_from_recordings \
+      --dataset $output_data_dir \
+      --recordings-file $output_data_dir/recordings.csv
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  if [ ! -d "RIRS_NOISES" ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real
+  for rirs in rirs_smallroom rirs_mediumroom rirs_real
+  do
+    output_dir=exp/rirs/$rirs
+    data_dir=data/$rirs
+    $train_cmd $output_dir/log/pack_rirs_${rirs}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-pack-wav-rirs --input $data_dir/recordings.csv \
+      --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1;
+    hyperion-dataset add_features --dataset $data_dir \
+      --features-name rirs --features-file $output_dir/rirs.csv
+
+  done
+fi
+
diff --git a/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh
new file mode 100755
index 00000000..4e0c5b19
--- /dev/null
+++ b/egs/voxceleb/v2.1/run_004_prepare_xvec_train_data.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright
+#  2020  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+nodes=b1
+nj=40
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --joint-classes speaker --min-train-samples 1 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/v2.1/run_005_train_xvector.sh b/egs/voxceleb/v2.1/run_005_train_xvector.sh new file mode 100755 index 00000000..2479d565 --- /dev/null +++ b/egs/voxceleb/v2.1/run_005_train_xvector.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh new file mode 100755 index 00000000..0dc58048 --- /dev/null +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? $num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + + diff --git a/egs/voxceleb/v2.1/run_007_eval_be.sh b/egs/voxceleb/v2.1/run_007_eval_be.sh new file mode 100755 index 00000000..53621488 --- /dev/null +++ b/egs/voxceleb/v2.1/run_007_eval_be.sh @@ -0,0 +1,321 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh
+
+if [ $nnet_stage -eq 1 ];then
+  nnet=$nnet_s1
+  nnet_name=$nnet_s1_name
+elif [ $nnet_stage -eq 2 ];then
+  nnet=$nnet_s2
+  nnet_name=$nnet_s2_name
+elif [ $nnet_stage -eq 3 ];then
+  nnet=$nnet_s3
+  nnet_name=$nnet_s3_name
+elif [ $nnet_stage -eq 4 ];then
+  nnet=$nnet_s4
+  nnet_name=$nnet_s4_name
+elif [ $nnet_stage -eq 5 ];then
+  nnet=$nnet_s5
+  nnet_name=$nnet_s5_name
+elif [ $nnet_stage -eq 6 ];then
+  nnet=$nnet_s6
+  nnet_name=$nnet_s6_name
+fi
+
+plda_label=${plda_type}y${plda_y_dim}_v1
+be_name=lda${lda_dim}_${plda_label}_${plda_data}
+
+xvector_dir=exp/xvectors/$nnet_name
+be_dir=exp/be/$nnet_name/$be_name
+score_dir=exp/scores/$nnet_name
+score_plda_dir=$score_dir/${be_name}/plda
+score_cosine_dir=$score_dir/cosine
+score_cosine_snorm_dir=$score_dir/cosine_snorm
+score_cosine_qmf_dir=$score_dir/cosine_qmf
+
+if [ $stage -le 3 ];then
+
+  echo "Eval Voxceleb 1 with Cosine scoring"
+  num_parts=8
+  for((i=1;i<=$num_parts;i++));
+  do
+    for((j=1;j<=$num_parts;j++));
+    do
+      $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \
+        hyp_utils/conda_env.sh \
+        hyperion-eval-cosine-scoring-backend \
+        --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+        --ndx-file data/voxceleb1_test/trials.csv \
+        --enroll-map-file data/voxceleb1_test/enrollment.csv \
+        --score-file $score_cosine_dir/voxceleb1_scores.csv \
+        --enroll-part-idx $i --num-enroll-parts $num_parts \
+        --test-part-idx $j --num-test-parts $num_parts &
+    done
+  done
+  wait
+  hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \
+    --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+  $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \
+    hyperion-eval-verification-metrics \
+    --score-files $score_cosine_dir/voxceleb1_scores.csv \
+    --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+    --score-names voxceleb1 \
+    --key-names O E H \
+    --sparse \
+    --output-file $score_cosine_dir/voxceleb1_results.csv
+
+  cat $score_cosine_dir/voxceleb1_results.csv
+fi
+
+if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then
+  echo "Eval voxsrc22 dev with Cosine scoring"
+  $train_cmd $score_cosine_dir/log/voxsrc22_dev.log \
+    hyp_utils/conda_env.sh \
+    hyperion-eval-cosine-scoring-backend \
+    --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+    --ndx-file data/voxsrc22_dev/trials.csv \
+    --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+    --score-file $score_cosine_dir/voxsrc22_dev_scores.csv
+
+  # $train_cmd $score_cosine_dir/log/voxsrc22_eval.log \
+  #   hyp_utils/conda_env.sh \
+  #   hyperion-eval-cosine-scoring-backend \
+  #   --feats-file csv:$xvector_dir/voxsrc22_eval/xvector.csv \
+  #   --ndx-file data/voxsrc22_eval/trials.csv \
+  #   --enroll-map-file data/voxsrc22_eval/enrollment.csv \
+  #   --score-file $score_cosine_dir/voxsrc22_eval_scores.csv
+
+  $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxsrc22_dev.log \
+    hyperion-eval-verification-metrics \
+    --score-files $score_cosine_dir/voxsrc22_dev_scores.csv \
+    --key-files data/voxsrc22_dev/trials.csv \
+    --score-names voxsrc22_dev \
+    --key-names all \
+    --output-file $score_cosine_dir/voxsrc22_dev_results.csv
+
+  cat $score_cosine_dir/voxsrc22_dev_results.csv
+
+fi
+
+if [ "$do_snorm" == "true" ];then
+  if [ $stage -le 5 ];then
+    echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxceleb1_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend \
+          --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+          --ndx-file data/voxceleb1_test/trials.csv \
+          --enroll-map-file data/voxceleb1_test/enrollment.csv \
+          --score-file $score_cosine_snorm_dir/voxceleb1_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxceleb1_scores.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxceleb1.log \
+      hyperion-eval-verification-metrics \
+      --score-files $score_cosine_snorm_dir/voxceleb1_scores.csv \
+      --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+      --score-names voxceleb1 \
+      --key-names O E H \
+      --sparse \
+      --output-file $score_cosine_snorm_dir/voxceleb1_results.csv
+
+    cat $score_cosine_snorm_dir/voxceleb1_results.csv
+  fi
+
+  if [ $stage -le 6 ] && [ "$do_voxsrc22" == "true" ];then
+    echo "Eval voxsrc22 dev with Cosine scoring + AS-Norm"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend \
+          --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+          --ndx-file data/voxsrc22_dev/trials.csv \
+          --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+          --score-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+        sleep 5s
+      done
+      sleep 10s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    $train_cmd --mem 12G --num-threads 6 $score_cosine_snorm_dir/log/score_voxsrc22_dev.log \
+      hyperion-eval-verification-metrics \
+      --score-files $score_cosine_snorm_dir/voxsrc22_dev_scores.csv \
+      --key-files data/voxsrc22_dev/trials.csv \
+      --score-names voxsrc22_dev \
+      --key-names all \
+      --output-file $score_cosine_snorm_dir/voxsrc22_dev_results.csv
+
+    cat $score_cosine_snorm_dir/voxsrc22_dev_results.csv
+
+  fi
+
+fi
+
+if [ "$do_qmf" == "true" ];then
+  if [ $stage -le 7 ];then
+    echo "Train QMF in Vox2"
+    echo "...Calculating quality measures for Vox2"
+    num_parts=8
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd $score_cosine_qmf_dir/log/voxceleb2_trials_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --ndx-file data/voxceleb2cat_train_trials/trials.csv \
+          --enroll-map-file data/voxceleb2cat_train_trials/enrollments.csv \
+          --score-file $score_cosine_qmf_dir/voxceleb2_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \
+      --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+    hyperion-train-qmf --score-file $score_cosine_qmf_dir/voxceleb2_scores.snorm.csv \
+      --key-file data/voxceleb2cat_train_trials/trials.csv \
+      --model-file $score_cosine_qmf_dir/qmf.h5
+
+  fi
+
+  if [ $stage -le 8 ];then
+    echo "Eval Voxceleb 1 with Cosine scoring + Adaptive SNorm + QMF"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxceleb1_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \
+          --ndx-file data/voxceleb1_test/trials.csv \
+          --enroll-map-file data/voxceleb1_test/enrollment.csv \
+          --score-file $score_cosine_qmf_dir/voxceleb1_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --qmf-file $score_cosine_qmf_dir/qmf.h5 \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+      done
+      sleep 5s
+    done
+    wait
+    for suffix in "" .snorm .snorm.qmf
+    do
+      (
+        hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \
+          --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+        $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxceleb1$suffix.log \
+          hyperion-eval-verification-metrics \
+          --score-files $score_cosine_qmf_dir/voxceleb1_scores$suffix.csv \
+          --key-files data/voxceleb1_test/trials_{o,e,h}.csv \
+          --score-names voxceleb1 \
+          --key-names O E H \
+          --sparse \
+          --output-file $score_cosine_qmf_dir/voxceleb1_results$suffix.csv
+
+        echo "$score_cosine_qmf_dir/voxceleb1_results$suffix.csv:"
+        cat $score_cosine_qmf_dir/voxceleb1_results$suffix.csv
+      ) &
+    done
+    wait
+  fi
+
+  if [ $stage -le 9 ] && [ "$do_voxsrc22" == "true" ];then
+    echo "Eval voxsrc22 dev with Cosine scoring + QMF"
+    num_parts=16
+    for((i=1;i<=$num_parts;i++));
+    do
+      for((j=1;j<=$num_parts;j++));
+      do
+        $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \
+          hyp_utils/conda_env.sh \
+          hyperion-eval-cosine-scoring-backend-with-qmf \
+          --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \
+          --ndx-file data/voxsrc22_dev/trials.csv \
+          --enroll-map-file data/voxsrc22_dev/enrollment.csv \
+          --score-file $score_cosine_qmf_dir/voxsrc22_dev_scores.csv \
+          --cohort-segments-file data/voxceleb2cat_train_cohort/segments.csv \
+          --cohort-feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \
+          --cohort-nbest 1000 --avg-cohort-by speaker \
+          --qmf-file $score_cosine_qmf_dir/qmf.h5 \
+          --enroll-part-idx $i --num-enroll-parts $num_parts \
+          --test-part-idx $j --num-test-parts $num_parts &
+        sleep 5s
+      done
+      sleep 10s
+    done
+    wait
+    for suffix in "" .snorm .snorm.qmf
+    do
+      (
+        hyperion-merge-scores --output-file $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \
+          --num-enroll-parts $num_parts --num-test-parts $num_parts
+
+        $train_cmd --mem 12G --num-threads 6 $score_cosine_qmf_dir/log/score_voxsrc22_dev$suffix.log \
+          hyperion-eval-verification-metrics \
+          --score-files $score_cosine_qmf_dir/voxsrc22_dev_scores$suffix.csv \
+          --key-files data/voxsrc22_dev/trials.csv \
+          --score-names voxsrc22_dev \
+          --key-names all \
+          --output-file 
$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + + echo "$score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv:" + cat $score_cosine_qmf_dir/voxsrc22_dev_results$suffix.csv + ) & + done + wait + fi + +fi + diff --git a/egs/voxceleb/v2/README.md b/egs/voxceleb/v2/README.md index a005b6e8..0bafe85e 100644 --- a/egs/voxceleb/v2/README.md +++ b/egs/voxceleb/v2/README.md @@ -26,12 +26,12 @@ Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Huber ## Usage - Run the run_0*.sh scripts in sequence - - By default it will use + - By default it will use config global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh - For better performance use ```bash -run_011_train_xvector.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh -run_030_extract_xvectors.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh --use-gpu true -run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr0.05_amp.v1.sh +run_011_train_xvector.sh --config-file global_conf/other_config.sh +run_030_extract_xvectors.sh --config-file global_conf/other_config.sh --use-gpu true +run_040_eval_be.sh --config-file global_conf/other_config.sh ``` @@ -155,7 +155,7 @@ run_040_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | | config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | | | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | -| | | | Cosine + QMF | 0.242 | 0.144 | 0.231 | +| | | | Cosine + QMF | 2.42 | 0.144 | 0.231 | | config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | | | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | | | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | diff --git a/egs/voxceleb/v2/default_config.sh b/egs/voxceleb/v2/default_config.sh index abcc2a2e..f2d8812d 120000 --- a/egs/voxceleb/v2/default_config.sh +++ b/egs/voxceleb/v2/default_config.sh @@ -1 +1 @@ -global_conf/config_wavlmbaseplus_ecapatdnn512x3_v1.0.sh \ No newline at end of file +global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh \ No newline at end of file From ed35173f534f98cb85b609642226b99d17163ddb Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 11 Sep 2023 12:12:49 -0400 Subject: [PATCH 110/154] vox/v2.1 recipe done, not tested --- egs/voxceleb/v2.1/run_005_train_xvector.sh | 27 ++++++++++++++++--- egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 9 ++++--- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/egs/voxceleb/v2.1/run_005_train_xvector.sh b/egs/voxceleb/v2.1/run_005_train_xvector.sh index 2479d565..eb1c591e 100755 --- a/egs/voxceleb/v2.1/run_005_train_xvector.sh +++ b/egs/voxceleb/v2.1/run_005_train_xvector.sh @@ -44,7 +44,7 @@ if [ $stage -le 1 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + hyperion-train-wav2vec2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -56,7 +56,7 @@ if [ $stage -le 1 ]; then fi -# Large Margin Fine-tuning +# Finetune full model if [ 
$stage -le 2 ]; then if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then $cuda_cmd \ --gpu $ngpu $nnet_s2_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ --data.train.dataset.class-files $train_data_dir/speaker.csv \ @@ -76,3 +76,24 @@ fi + +# Large margin finetuning of the full model +if [ $stage -le 3 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh index 0dc58048..2cfe27fe 100755 --- a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -8,15 +8,16 @@ set -e stage=1 -nnet_stage=2 +nnet_stage=3 config_file=default_config.sh use_gpu=false +hf_chunk_length=120.0 #seconds xvec_chunk_length=120.0 . parse_options.sh || exit 1; . 
$config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" num_gpus=1 else @@ -58,7 +59,7 @@ if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qm echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --random-utt-length --min-utt-length 2 --max-utt-length 30 \ @@ -88,7 +89,7 @@ if [ $stage -le 2 ]; then echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + hyperion-extract-wav2vec2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --model-path $nnet \ From 8760d055520609a57bc69ac9fc05ef159e9f336a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 12 Sep 2023 14:06:02 -0400 Subject: [PATCH 111/154] implemented lora in w2v2, not tested --- hyperion/io/bin_vad_reader.py | 4 +- hyperion/np/augment/noise_augment.py | 2 +- hyperion/torch/layers/__init__.py | 13 +- hyperion/torch/layers/lora.py | 80 +++++ .../models/wav2xvectors/hf_wav2xvector.py | 26 +- hyperion/torch/tpm/hf/hf_wav2vec2.py | 18 +- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 320 ++++++++++++++---- hyperion/utils/dataset.py | 68 +++- requirements.txt | 4 +- 9 files changed, 425 insertions(+), 110 deletions(-) create mode 100644 hyperion/torch/layers/lora.py diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index 82e2a0c5..8ce91d15 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -59,7 +59,7 @@ def read( vad = self.r.read(keys) output_vad = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) offset_i = offset[i] if offset_is_list else offset num_frames_i = num_frames[i] if num_frames_is_list else num_frames vad_i = self._get_bin_vad_slice(vad_i, offset_i, num_frames_i) @@ -77,7 +77,7 @@ def read_timestamps(self, keys, merge_tol=0.001): vad = self.r.read(keys) ts = [] for i in range(len(keys)): - vad_i = vad[i].astype(np.bool, copy=False) + vad_i = vad[i].astype(bool, copy=False) ts_i = bin_vad_to_timestamps( vad_i, self.frame_length / 1000, diff --git a/hyperion/np/augment/noise_augment.py b/hyperion/np/augment/noise_augment.py index 1cc1a0be..92bd57dd 100644 --- a/hyperion/np/augment/noise_augment.py +++ b/hyperion/np/augment/noise_augment.py @@ -55,7 +55,7 @@ def __init__( @staticmethod def _power(x): """Computes power of x in dB.""" - return 10 * np.log10((x ** 2).sum()) + return 10 * np.log10((x**2).sum() + 1e-10) @staticmethod def snr(x, n): diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index 6b508b0e..bea52c95 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -4,20 +4,23 @@ """ from .activation_factory import ActivationFactory -from .attention import (LocalScaledDotProdAttRelPosEncV1, - LocalScaledDotProdAttV1, ScaledDotProdAttRelPosEncV1, - ScaledDotProdAttV1) +from .attention import ( + 
LocalScaledDotProdAttRelPosEncV1, + LocalScaledDotProdAttV1, + ScaledDotProdAttRelPosEncV1, + ScaledDotProdAttV1, +) from .audio_feats import * from .audio_feats_factory import AudioFeatsFactory from .calibrators import LinBinCalibrator from .dropout import DropConnect1d, DropConnect2d, Dropout1d from .global_pool import * from .interpolate import Interpolate +from .lora import LoRAFactory from .margin_losses import ArcLossOutput, CosLossOutput, SubCenterArcLossOutput from .mvn import MeanVarianceNorm from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory from .pool_factory import GlobalPool1dFactory -from .pos_encoder import (ConvPosEncoder, NoPosEncoder, PosEncoder, - RelPosEncoder) +from .pos_encoder import ConvPosEncoder, NoPosEncoder, PosEncoder, RelPosEncoder from .spec_augment import AxisMasker, SpecAugment, SpecWarper from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py new file mode 100644 index 00000000..1436caf5 --- /dev/null +++ b/hyperion/torch/layers/lora.py @@ -0,0 +1,80 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from typing import Union + +import loralib as lora +import torch.nn as nn +from loralib import * + + +class LoRAFactory: + def create_from_pretrained( + layer: Union[nn.Embedding, nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d], + r: int = 8, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + merge_weights: bool = True, + ): + if isinstance(layer, nn.Embedding): + lora_layer = lora.Embedding( + layer.num_embeddings, + layer.embedding_dim, + padding_idx=layer.padding_idx, + max_norm=layer.max_norm, + norm_type=layer.norm_type, + scale_grad_by_freq=layer.scale_grad_by_freq, + sparse=layer.sparse, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + + elif isinstance(layer, nn.Linear): + bias = layer.bias is not None + lora_layer = lora.Linear( + layer.in_features, + layer.out_features, + bias=bias, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + elif isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + if isinstance(layer, nn.Conv1d): + lora_class = lora.Conv1d + elif isinstance(layer, nn.Conv2d): + lora_class = lora.Conv2d + elif isinstance(layer, nn.Conv3d): + lora_class = lora.Conv3d + + bias = layer.bias is not None + lora_layer = lora_class( + layer.in_channels, + layer.out_channels, + layer.kernel_size, + stride=layer.stride, + padding=layer.padding, + dilation=layer.dilation, + groups=layer.groups, + bias=bias, + padding_mode=layer.padding_mode, + r=r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=merge_weights, + ) + lora_layer.weight.data = layer.weight.data + if bias: + lora_layer.bias.data = layer.bias.data + + return lora_layer diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 24ab5bbb..925f1172 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -5,10 +5,9 @@ import contextlib import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser 
from ...torch_model import TorchModel from ...utils import remove_silence @@ -29,7 +28,6 @@ class HFWav2XVector(TorchModel): def __init__( self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" ): - super().__init__() self.hf_feats = hf_feats self.xvector = xvector @@ -222,7 +220,6 @@ def extract_embed( embed_layer=None, detach_chunks=False, ): - if vad_samples is not None: x, x_lengths = remove_silence(x, vad_samples, x_lengths) @@ -256,6 +253,9 @@ def freeze_hf_feats(self): def freeze_hf_feature_encoder(self): self.hf_feats.freeze_feature_encoder() + def freeze_hf_except_lora(self, bias=None): + self.hf_feats.freeze_except_lora(bias) + def has_param_groups(self): return self.hf_feats.has_param_groups() @@ -296,6 +296,15 @@ def set_train_mode(self, mode): elif mode == "hf-feat-extractor-frozen": self.unfreeze() self.freeze_hf_feature_encoder() + elif mode == "hf-lora": + self.unfreeze() + self.freeze_hf_except_lora() + elif mode == "hf-all-bias-lora": + self.unfreeze() + self.freeze_hf_except_lora(bias="all") + elif mode == "hf-lora-with-bias": + self.unfreeze() + self.freeze_hf_except_lora(bias="lora_only") else: raise ValueError(f"invalid train_mode={mode}") @@ -310,7 +319,6 @@ def set_train_mode(self, mode): self._train_mode = mode def _train(self, train_mode: str): - if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode == "ft-embed-affine": @@ -322,6 +330,9 @@ def _train(self, train_mode: str): "ft-xvector-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ]: self.hf_feats.train() self.xvector._train("full") @@ -339,6 +350,9 @@ def valid_train_modes(): "ft-xvector-nograd", "hf-feats-frozen-nograd", "hf-feat-extractor-frozen", + "hf-lora", + "hf-all-bias-lora", + "hf-lora-with-bias", ] @staticmethod @@ -353,7 +367,6 @@ def filter_args(**kwargs): return args def get_config(self): - hf_cfg = self.hf_feats.get_config() xvec_cfg = self.xvector.get_config() del hf_cfg["class_name"] @@ -375,7 +388,6 @@ def change_config(self, hf_feats, xvector): @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index 26da7beb..dd5de2fe 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2Config, Wav2Vec2Model - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2Config, Wav2Vec2Model from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -204,8 +203,13 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -223,6 +227,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + 
lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index a9c4ddef..2c8d239f 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -8,12 +8,13 @@ from turtle import right from typing import List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor +from ....utils.misc import filter_func_args +from ...layers import LoRAFactory from ...torch_model import TorchModel from ...utils import scale_seq_lengths, seq_lengths_to_mask from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs @@ -55,6 +56,12 @@ class HFWav2VecBase(TorchModel): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -75,6 +82,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): super().__init__() self.pretrained_model_path = pretrained_model_path @@ -90,6 +103,12 @@ def __init__( self.left_encoder_context = left_encoder_context self.feat_extract_lr = feat_extract_lr self.encoder_lr = encoder_lr + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights if pretrained_model_path is not None and not ignore_pretrained: rank = ddp_get_rank() @@ -153,6 +172,16 @@ def __init__( self._feature_encoder_context = None self._frame_shift = None + self.hf_model = None + + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) def __deepcopy__(self, memo): """Reimplementation of deepcopy for Hugging Face models. 
@@ -225,18 +254,36 @@ def change_config( self, override_dropouts: bool, override_spec_augment: bool, + override_lora: bool, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, **kwargs, ): if override_spec_augment: - logging.info("overriding speech augment") + logging.info(f"overriding speech augment with args={kwargs}") self.change_spec_augment(**kwargs) if override_dropouts: - logging.info("overriding hf model dropouts") + logging.info(f"overriding hf model dropouts with args={kwargs}") self.change_dropouts(**kwargs) + if override_lora: + logging.info("overriding LoRA config") + self.change_lora( + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, + ) + self.feat_extract_lr = feat_extract_lr self.encoder_lr = encoder_lr @@ -259,12 +306,109 @@ def change_spec_augment( self.hf_model.config.mask_feature_length = mask_feature_length self.hf_model.config.mask_feature_min_masks = mask_feature_min_masks + def change_lora( + self, + use_lora: bool = False, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, + ): + if not self.use_lora: + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + pass + else: + # TODO + pass + else: + if use_lora: + # TODO + pass + else: + # TODO + pass + + self.use_lora = use_lora + self.lora_components = lora_components + self.lora_rank = lora_rank + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + self.lora_merge_weights = lora_merge_weights + + def _make_lora_layers( + self, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + counts = {k: 0 for k in lora_components} + self._recursive_replace_layer_by_lora( + self.hf_model, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + for k, v in counts.items(): + logging.info("count of LoRA layers for %s = %d", k, v) + assert v > 0, f"did not make any {k} LoRA" + + @staticmethod + def _recursive_replace_layer_by_lora( + model: nn.Module, + counts: dict, + lora_components: List[str], + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_merge_weights: bool, + ): + for name, module in model.named_children(): + if len(list(module.children())) > 0: + HFWav2VecBase._recursive_replace_layer_by_lora( + module, + counts, + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + if isinstance(module, nn.Linear) and name in lora_components: + lora_layer = LoRAFactory.create_from_pretrained( + module, + r=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + merge_weights=lora_merge_weights, + ) + setattr(model, name, lora_layer) + counts[name] += 1 + def change_dropouts(self, **kwargs): pass # needs to be overloaded def freeze_feature_encoder(self): self.hf_model.freeze_feature_encoder() + def freeze_except_lora(self, bias=None): + bias = "none" if bias is None else bias + from ...layers.lora import mark_only_lora_as_trainable + + mark_only_lora_as_trainable(self.hf_model, bias=bias) + def has_param_groups(self): return 
self.feat_extract_lr is not None or self.encoder_lr is not None @@ -302,14 +446,14 @@ def _normalize(self, x, x_mask=None): """Normalizes the audio to have zero mean and unit variance.""" if x_mask is None: x = x - x.mean(dim=1, keepdim=True) - std = torch.sqrt((x ** 2).mean(dim=1, keepdim=True) + 1e-7) + std = torch.sqrt((x**2).mean(dim=1, keepdim=True) + 1e-7) x = x / std else: x_mask = x_mask.to(dtype=x.dtype) x_samples = torch.mean(x_mask, dim=1, keepdim=True) x_mean = torch.mean(x * x_mask, dim=1, keepdim=True) / x_samples - x2_mean = torch.mean(x ** 2 * x_mask, dim=1, keepdim=True) / x_samples - std = torch.sqrt(x2_mean - x_mean ** 2 + 1e-7) + x2_mean = torch.mean(x**2 * x_mask, dim=1, keepdim=True) / x_samples + std = torch.sqrt(x2_mean - x_mean**2 + 1e-7) x = (x - x_mean) / std return x @@ -544,14 +688,6 @@ def forward_long_impl( else scale_seq_lengths(x_lengths, max_out_length, max_in_length) ) output["hidden_states_lengths"] = feat_lengths - # print( - # "lens", - # mol0, - # max_out_length, - # output.last_hidden_state.size(1), - # output.hidden_states[0].size(1), - # flush=True, - # ) return output def get_config(self): @@ -572,6 +708,14 @@ def get_config(self): "left_encoder_context": self.left_encoder_context, "right_encoder_context": self.right_encoder_context, "sample_frequency": self.sample_frequency, + "feat_extract_lr": self.feat_extract_lr, + "encoder_lr": self.encoder_lr, + "use_lora": self.use_lora, + "lora_components": self.lora_components, + "lora_rank": self.lora_rank, + "lora_alpha": self.lora_alpha, + "lora_dropout": self.lora_dropout, + "lora_merge_weights": self.lora_merge_weights, } base_config = super().get_config() @@ -584,24 +728,78 @@ def save(self, file_path: str): @staticmethod def filter_args(**kwargs): - valid_args = ( - "pretrained_model_path", - "normalize_input", - "use_input_attention_mask", - "cache_dir", - "force_download", - "resume_download", - "revision", - "drop_layers_gt", - "ignore_pretrained", - "override_dropouts", - "override_spec_augment", - "left_encoder_context", - "right_encoder_context", - "sample_frequency", + return filter_func_args(HFWav2VecBase.__init__, **kwargs) + # valid_args = ( + # "pretrained_model_path", + # "normalize_input", + # "use_input_attention_mask", + # "cache_dir", + # "force_download", + # "resume_download", + # "revision", + # "drop_layers_gt", + # "ignore_pretrained", + # "override_dropouts", + # "override_spec_augment", + # "left_encoder_context", + # "right_encoder_context", + # "sample_frequency", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args + + @staticmethod + def _add_lr_args(parser): + parser.add_argument( + "--feat-extractor-lr", + default=None, + type=float, + help=( + "lr for conv feature extractor, it serves to set a lr " + "different than the global one." + ), + ) + parser.add_argument( + "--encoder-lr", + default=None, + type=float, + help=( + "lr for transformer encoder, it serves to set a lr " + "different than the global one." 
+ ), + ) + + @staticmethod + def _add_lora_args(parser): + parser.add_argument( + "--use-lora", + default=False, + action=ActionYesNo, + help="use low-rank adapters", + ) + parser.add_argument( + "--lora-components", + default=["q_proj", "v_proj"], + nargs="+", + choices=[ + "k_proj", + "q_proj", + "v_proj", + "out_proj", + "intermediate_dense", + "output_dense", + ], + help="list of components where we apply LoRA, eg [Wq, Wv]", + ) + parser.add_argument("--lora-rank", default=4, help="rank of LoRA") + parser.add_argument("--lora-alpha", default=1.0, help="scale for LoRA") + parser.add_argument("--lora-dropout", default=0.0, help="dropout rate for LoRA") + parser.add_argument( + "--lora-merge-weights", + default=True, + action=ActionYesNo, + help="lora weights are merged with the pretrained weights at inference.", ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args @staticmethod def add_class_args(parser, prefix=None, skip=set()): @@ -703,36 +901,22 @@ def add_class_args(parser, prefix=None, skip=set()): "when the signal is evaluated chunk by chunk." ), ) - parser.add_argument( - "--feat-extractor-lr", - default=None, - type=float, - help=( - "lr for conv feature extractor, it serves to set a lr " - "different than the global one." - ), - ) - parser.add_argument( - "--encoder-lr", - default=None, - type=float, - help=( - "lr for transformer encoder, it serves to set a lr " - "different than the global one." - ), - ) + + HFWav2VecBase._add_lr_args(parser) + HFWav2VecBase._add_lora_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def filter_finetune_args(**kwargs): - valid_args = ( - "override_dropouts", - "override_spec_augment", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return args + return filter_func_args(HFWav2VecBase.change_config, **kwargs) + # valid_args = ( + # "override_dropouts", + # "override_spec_augment", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return args @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): @@ -759,23 +943,13 @@ def add_finetune_args(parser, prefix=None, skip=set()): ), ) parser.add_argument( - "--feat-extractor-lr", - default=None, - type=float, - help=( - "lr for conv feature extractor, it serves to set a lr " - "different than the global one." - ), - ) - parser.add_argument( - "--encoder-lr", - default=None, - type=float, - help=( - "lr for transformer encoder, it serves to set a lr " - "different than the global one." 
- ), + "--override-lora", + default=False, + action=ActionYesNo, + help=("whether to change the config of LoRA layers in the model."), ) + HFWav2VecBase._add_lr_args(parser) + HFWav2VecBase._add_lora_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index dd446576..51f0f37a 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -55,7 +55,6 @@ def __init__( sparse_trials: bool = False, table_sep: Optional[str] = None, ): - if isinstance(segments, SegmentSet): self._segments = segments self._segments_path = None @@ -82,10 +81,12 @@ def __init__( features, FeatureSet ) self._enrollments, self._enrollments_paths = self._parse_dict_args( - enrollments, EnrollmentMap, + enrollments, + EnrollmentMap, ) self._trials, self._trials_paths = self._parse_dict_args( - trials, (TrialKey, TrialNdx, SparseTrialKey), + trials, + (TrialKey, TrialNdx, SparseTrialKey), ) self.sparse_trials = sparse_trials @@ -711,7 +712,8 @@ def add_features(self, features_name: str, features: Union[PathLike, FeatureSet] raise ValueError() def set_segments( - self, segments: Union[PathLike, SegmentSet], update_seg_durs: bool, + self, + segments: Union[PathLike, SegmentSet], ): if isinstance(segments, (str, Path)): self._segments = None @@ -723,7 +725,9 @@ def set_segments( raise ValueError() def set_recordings( - self, recordings: Union[PathLike, RecordingSet], update_seg_durs: bool, + self, + recordings: Union[PathLike, RecordingSet], + update_seg_durs: bool = False, ): if isinstance(recordings, (str, Path)): self._recordings = None @@ -753,7 +757,9 @@ def add_classes(self, classes_name: str, classes: Union[PathLike, ClassInfo]): raise ValueError() def add_enrollments( - self, enrollments_name: str, enrollments: Union[PathLike, EnrollmentMap], + self, + enrollments_name: str, + enrollments: Union[PathLike, EnrollmentMap], ): if self._enrollments is None: self._enrollments = {} @@ -793,7 +799,9 @@ def remove_features(self, features_name: str): del self._features[features_name] del self._features_paths[features_name] - def remove_recordings(self,): + def remove_recordings( + self, + ): if self._recordings_path is not None: self._files_to_delete.append(self._recordings_path) @@ -820,7 +828,8 @@ def remove_classes(self, classes_name: str): del self._classes_paths[classes_name] def remove_enrollments( - self, enrollments_name: str, + self, + enrollments_name: str, ): if self._enrollments_paths[enrollments_name] is not None: self._files_to_delete.append(self._enrollments_paths[enrollments_name]) @@ -829,7 +838,8 @@ def remove_enrollments( del self._enrollments_paths[enrollments_name] def remove_trials( - self, trials_name: str, + self, + trials_name: str, ): if self._trials_paths[trials_name] is not None: self._files_to_delete.append(self._trials_paths[trials_name]) @@ -981,14 +991,20 @@ def split_into_trials_and_cohort( segments_male = SegmentSet(segments[segments["gender"] == "m"]) segments_female = SegmentSet(segments[segments["gender"] == "f"]) trials_male, enroll_male, cohort_male = self._split_into_trials_and_cohort( - segments_male, num_tar_trials, num_trial_speakers, seed, + segments_male, + num_tar_trials, + num_trial_speakers, + seed, ) ( trials_female, enroll_female, cohort_female, ) = self._split_into_trials_and_cohort( - segments_female, num_tar_trials, num_trial_speakers, seed, + segments_female, + num_tar_trials, + num_trial_speakers, + seed, ) trials = 
TrialKey.merge([trials_male, trials_female]) enroll = EnrollmentMap.cat([enroll_male, enroll_female]) @@ -996,7 +1012,10 @@ def split_into_trials_and_cohort( else: segments = self.segments() trials, enroll, cohort = self._split_into_trials_and_cohort( - segments, num_tar_trials, num_trial_speakers, seed, + segments, + num_tar_trials, + num_trial_speakers, + seed, ) dataset_trials = self.clone() @@ -1019,7 +1038,10 @@ def remove_short_segments(self, min_length: float, length_name: str = "duration" self.clean() def remove_classes_few_segments( - self, class_name: str, min_segs: int, rebuild_idx: bool = False, + self, + class_name: str, + min_segs: int, + rebuild_idx: bool = False, ): segments = self.segments() classes, counts = np.unique(segments[class_name], return_counts=True) @@ -1082,7 +1104,10 @@ def _segments_split_joint_classes( return train_segs, val_segs def _segments_split_disjoint_classes( - self, val_prob: float, disjoint_classes: List[str], rng: np.random.Generator, + self, + val_prob: float, + disjoint_classes: List[str], + rng: np.random.Generator, ): segments = self.segments() classes = segments[disjoint_classes].apply("-".join, axis=1) @@ -1165,15 +1190,24 @@ def split_train_val( train_segs, val_segs = self._segments_split(val_prob, rng) elif joint_classes is not None and disjoint_classes is None: train_segs, val_segs = self._segments_split_joint_classes( - val_prob, joint_classes, min_train_samples, rng, + val_prob, + joint_classes, + min_train_samples, + rng, ) elif joint_classes is None and disjoint_classes is not None: train_segs, val_segs = self._segments_split_disjoint_classes( - val_prob, disjoint_classes, rng, + val_prob, + disjoint_classes, + rng, ) else: train_segs, val_segs = self._segments_split_joint_and_disjoint_classes( - val_prob, joint_classes, disjoint_classes, min_train_samples, rng, + val_prob, + joint_classes, + disjoint_classes, + min_train_samples, + rng, ) train_ds = self.clone() diff --git a/requirements.txt b/requirements.txt index c3410829..1e1aea9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,6 @@ memory_profiler gdown fairscale==0.4.4 tensorboard>=2.5.0 -yapf jsonargparse>=3.5.0 wandb>=0.10.30 librosa>=0.8.1 @@ -22,3 +21,6 @@ twine wheel transformers>=4.16.2 sentencepiece>=0.1.97 +loralib +lhotse + From a75610ee27acf2cd15ecc38151f5efff6fa09623 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 13 Sep 2023 10:59:46 -0400 Subject: [PATCH 112/154] vox2.1 working and lora --- egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml | 13 ++-- ...lsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml | 13 ++-- ...c2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...baseplus9l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...lmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...avlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml | 13 ++-- ...avlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 13 ++-- ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 13 ++-- ...rge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml | 71 ++++++++++++++++++ ...rge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml | 74 +++++++++++++++++++ 
...vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh | 55 ++++++++++++++ hyperion/torch/layers/lora.py | 52 +++++++++++-- hyperion/torch/models/xvectors/xvector.py | 36 +-------- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 22 +++++- 22 files changed, 378 insertions(+), 140 deletions(-) create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml create mode 100644 egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh diff --git a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml index 4fdf8068..86f55073 100644 --- a/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml +++ b/egs/voxceleb/v2.1/conf/reverb_noise_aug.yaml @@ -4,32 +4,31 @@ reverb_aug: rir_types: smallroom: weight: 1 - rir_path: scp:data/rirs_smallroom/rirs.scp + rir_path: csv:data/rirs_smallroom/rirs.csv rir_norm: max mediumroom: weight: 1 - rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_path: csv:data/rirs_mediumroom/rirs.csv rir_norm: max realroom: weight: 1 - rir_path: scp:data/rirs_real/rirs.scp + rir_path: csv:data/rirs_real/rirs.csv rir_norm: max noise_aug: noise_prob: 0.7 noise_types: noise: weight: 1 - noise_path: data/musan_noise_proc_audio/wav.scp + noise_path: data/musan_noise_proc_audio/recordings.csv min_snr: 0 max_snr: 18 music: weight: 1 - noise_path: data/musan_music_proc_audio/wav.scp + noise_path: data/musan_music_proc_audio/recordings.csv min_snr: 3 max_snr: 18 babble: weight: 1 - noise_path: data/musan_speech_babble/wav.scp + noise_path: data/musan_speech_babble/recordings.csv min_snr: 3 max_snr: 18 - diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml index ad991124..ffd2f374 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml index 0b1d0454..7dcc56ef 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - 
speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml index 254ff796..3f5c46bc 100644 --- a/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml index 52be6db5..9e1d0928 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus9l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml index 
bd3e7f86..0d0dc398 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml index 3443591a..dda0c632 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: 
log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml index abe5da6e..46ee7d18 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml index 7287188c..db36f8ee 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 64 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 64 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml index 3443591a..dda0c632 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 
wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml index 5e1260ad..ad56e80d 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml index 2addaa1e..40341a27 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage1_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker 
data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 128 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -55,5 +55,6 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 1024 + target_key: speaker train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml index 69a8322b..8504db9e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -14,17 +14,17 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -32,7 +32,7 @@ data: min_batch_size: 32 max_chunk_length: 3.0 min_chunk_length: 3.0 - class_name: class_id + class_name: speaker data_loader: num_workers: 8 model: @@ -60,4 +60,5 @@ trainer: log_interval: 1000 epochs: 8 eff_batch_size: 512 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml index 5e1260ad..ad56e80d 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -15,7 +15,7 @@ data: max_chunk_length: 6.0 min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -25,11 +25,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker target_sample_freq: 16000 wav_scale: 1 sampler: @@ -38,7 +38,7 @@ data: max_chunk_length: 3.0 min_chunk_length: 3.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker weight_exponent: 0.5 weight_mode: data-prior seg_weight_mode: data-prior @@ -70,4 +70,5 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 + target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml new file mode 100644 index 00000000..b5b9b6b6 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml @@ -0,0 +1,71 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + 
target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + hf_feats: + override_lora: true + use_lora: true + lora_rank: 4 + lora_components: + - q_proj + - v_proj + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 5e-2 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 5e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 8 + eff_batch_size: 512 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml new file mode 100644 index 00000000..a39445ff --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 2e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 5000 + hold_steps: 6000 + min_lr: 1e-4 + warmup_steps: 6000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 4 + eff_batch_size: 256 + target_key: speaker + train_mode: hf-lora diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh new file mode 100644 index 00000000..96ef76c5 --- /dev/null +++ b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh @@ -0,0 +1,55 @@ +# Wav2Vec2 Multilingual 300M params + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + 
+nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage1_v2.0.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn512x3_v2.0 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml +nnet_s2_args="" +nnet_name=${hf_model_name}_loraqv_ecapatdnn512x3_v2.0 +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0004.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/hyperion/torch/layers/lora.py b/hyperion/torch/layers/lora.py index 1436caf5..18401669 100644 --- a/hyperion/torch/layers/lora.py +++ b/hyperion/torch/layers/lora.py @@ -7,7 +7,47 @@ import loralib as lora import torch.nn as nn -from loralib import * +from loralib import mark_only_lora_as_trainable + + +def repr_lora(self, str_base): + if isinstance(self.lora_dropout, nn.Dropout): + lora_dropout = self.lora_dropout.p + else: + lora_dropout = 0 + + str_lora = f", r={self.r}, alpha={self.lora_alpha}, dropout={lora_dropout}, merge_weights={self.merge_weights})" + return str_base[:-1] + str_lora + + +class LinearLoRA(lora.Linear): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class EmbeddingLoRA(lora.Embedding): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv1dLoRA(lora.Conv1d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv2dLoRA(lora.Conv2d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) + + +class Conv3dLoRA(lora.Conv3d): + def __repr__(self): + str_base = super().__repr__() + return repr_lora(self, str_base) class LoRAFactory: @@ -19,7 +59,7 @@ def create_from_pretrained( merge_weights: bool = True, ): if isinstance(layer, nn.Embedding): - lora_layer = lora.Embedding( + lora_layer = EmbeddingLoRA( layer.num_embeddings, layer.embedding_dim, padding_idx=layer.padding_idx, @@ -36,7 +76,7 @@ def create_from_pretrained( elif isinstance(layer, nn.Linear): bias = layer.bias is not None - lora_layer = lora.Linear( + lora_layer = LinearLoRA( layer.in_features, layer.out_features, bias=bias, @@ -51,11 +91,11 @@ def create_from_pretrained( elif isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): if isinstance(layer, nn.Conv1d): - lora_class = lora.Conv1d + lora_class = Conv1dLoRA elif isinstance(layer, nn.Conv2d): - lora_class = lora.Conv2d + lora_class = Conv2dLoRA elif isinstance(layer, nn.Conv3d): - lora_class = lora.Conv3d + lora_class = Conv3dLoRA bias = layer.bias is not None lora_layer = lora_class( diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index d67785d2..9ccd0d31 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -6,10 +6,9 @@ from enum import Enum from typing import Optional -from jsonargparse import ActionParser, 
ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ....utils.misc import filter_func_args from ...layer_blocks import TDNNBlock @@ -52,7 +51,6 @@ def __init__( in_feats=None, proj_feats=None, ): - super().__init__() # encoder network @@ -407,7 +405,6 @@ def extract_embed_slidwin( embed_layer=None, detach_chunks=False, ): - if feat_frame_shift is not None: # assume win_length/shift are in secs, transform to frames # pass feat times from msecs to secs @@ -464,7 +461,6 @@ def compute_slidwin_timestamps( feat_frame_shift=10, feat_snip_edges=False, ): - P = self.compute_slidwin_left_padding( win_length, win_shift, @@ -495,7 +491,6 @@ def compute_slidwin_left_padding( feat_frame_shift=10, feat_snip_edges=False, ): - # pass feat times from msecs to secs feat_frame_shift = feat_frame_shift / 1000 feat_frame_length = feat_frame_length / 1000 @@ -526,7 +521,6 @@ def compute_slidwin_left_padding( return P1 + P2 def get_config(self): - enc_cfg = self.encoder_net.get_config() pool_cfg = PF.get_config(self.pool_net) @@ -694,42 +688,14 @@ def valid_train_modes(): @staticmethod def filter_args(**kwargs): - # get arguments for pooling pool_args = PF.filter_args(**kwargs["pool_net"]) args = filter_func_args(ClassifHead.__init__, kwargs) args["pool_net"] = pool_args return args - # valid_args = ( - # "num_classes", - # "embed_dim", - # "num_embed_layers", - # "hid_act", - # "loss_type", - # "cos_scale", - # "margin", - # "margin_warmup_epochs", - # "intertop_k", - # "intertop_margin", - # "num_subcenters", - # "use_norm", - # "norm_before", - # "in_feats", - # "proj_feats", - # "dropout_rate", - # "norm_layer", - # "head_norm_layer", - # "head_use_in_norm", - # ) - # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - # args["pool_net"] = pool_args - # return args - @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index 2c8d239f..a981d1ec 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -545,6 +545,24 @@ def forward_impl( """ max_in_length = x.size(-1) x, x_mask = self._preprocess(x, x_lengths) + if ddp_get_rank() == 0: + lora_layer = self.hf_model.encoder.layers[0].attention.v_proj + # print( + # "lora\nw=", + # lora_layer.weight[:3, :3], + # "\na=", + # lora_layer.lora_A[:3, :3], + # "\nb=", + # lora_layer.lora_B[:3, :3], + # "\n", + # "merged=", + # lora_layer.merged, + # "training=", + # lora_layer.training, + # flush=True, + # ) + assert self.training == lora_layer.training + assert self.training == (not lora_layer.merged) output = self.hf_model( x, x_mask, @@ -728,7 +746,7 @@ def save(self, file_path: str): @staticmethod def filter_args(**kwargs): - return filter_func_args(HFWav2VecBase.__init__, **kwargs) + return filter_func_args(HFWav2VecBase.__init__, kwargs) # valid_args = ( # "pretrained_model_path", # "normalize_input", @@ -910,7 +928,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - return filter_func_args(HFWav2VecBase.change_config, **kwargs) + return filter_func_args(HFWav2VecBase.change_config, kwargs) # valid_args = ( # "override_dropouts", # "override_spec_augment", From c23103ee406a833726516ff8ac35b3a06382e97e Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 13 Sep 2023 
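The `filter_args`/`filter_finetune_args` fixes above change `filter_func_args(HFWav2VecBase.__init__, **kwargs)` to `filter_func_args(HFWav2VecBase.__init__, kwargs)`. Assuming the helper follows the usual pattern of keeping only the keyword arguments a function's signature accepts (a sketch, not hyperion's actual implementation), unpacking the dict with `**` would have passed its contents as separate keyword arguments instead of the dict the helper expects:

```python
import inspect


def filter_func_args(func, kwargs):
    """Sketch of the presumed contract: keep only the entries of `kwargs`
    that appear in func's signature."""
    valid = inspect.signature(func).parameters
    return {k: v for k, v in kwargs.items() if k in valid}


def example(a, b=1):
    pass


print(filter_func_args(example, {"a": 0, "b": 1, "c": 2}))  # {'a': 0, 'b': 1}
# filter_func_args(example, **{"a": 0}) would raise TypeError: the helper
# takes the dict positionally, not arbitrary keyword arguments.
```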
19:32:11 -0400 Subject: [PATCH 113/154] lora in wavlm and hubert --- hyperion/torch/tpm/hf/hf_hubert.py | 25 ++++++++++++++++++++----- hyperion/torch/tpm/hf/hf_wav2vec2.py | 8 +++++++- hyperion/torch/tpm/hf/hf_wavlm.py | 25 ++++++++++++++++++++----- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index 2957e433..32355bf6 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import HubertConfig, HubertModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import HubertConfig, HubertModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -137,6 +136,12 @@ class HFHubert(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -186,8 +191,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -205,6 +214,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -618,7 +633,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index dd5de2fe..bc98f460 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -149,6 +149,12 @@ class HFWav2Vec2(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. 
""" def __init__( @@ -697,7 +703,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index e1b67d81..400e6a8b 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -6,11 +6,10 @@ import os from typing import Callable, List, Optional, Tuple, Union -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from transformers import WavLMConfig, WavLMModel - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from transformers import WavLMConfig, WavLMModel from ...utils.ddp import ddp_get_rank, ddp_wait_for_all_procs from .hf_wav2vec_base import HFWav2VecBase @@ -150,6 +149,12 @@ class HFWavLM(HFWav2VecBase): sample_frequency: (`int`) waveform sample frequency used to train the model. feat_extract_lr: learning rate for conv feature extractor, serves to set a lr different than the global one. encoder_lr: learning rate for the wav2vec encoder, serves to set a lr different than the global one. + use_lora: use low-rank adapters + lora_components: list of components where we apply LoRA, eg [Wq, Wv] + lora_rank: rank of LoRA + lora_alpha: scale for LoRA + lora_dropout: dropout rate for LoRA + lora_merge_weights: lora weights are merged with the pretrained weights at inference. """ def __init__( @@ -204,8 +209,12 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + lora_components: List[str] = ["q_proj", "v_proj"], + lora_rank: int = 4, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + lora_merge_weights: bool = True, ): - super().__init__( pretrained_model_path=pretrained_model_path, normalize_input=normalize_input, @@ -223,6 +232,12 @@ def __init__( sample_frequency=sample_frequency, feat_extract_lr=feat_extract_lr, encoder_lr=encoder_lr, + use_lora=use_lora, + lora_components=lora_components, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_merge_weights=lora_merge_weights, ) if pretrained_model_path is not None and not ignore_pretrained: @@ -687,7 +702,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - args_base = HFWav2VecBase.filter_args(**kwargs) + args_base = HFWav2VecBase.filter_finetune_args(**kwargs) valid_args = ( "hidden_dropout", "activation_dropout", From 81c540b1492ec7b42299f0ebb871f6af66d11304 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 15 Sep 2023 12:35:56 -0400 Subject: [PATCH 114/154] fix bug in w2v constructors with lora --- ...v2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh | 6 +++--- egs/voxceleb/v2.1/run_006_extract_xvectors.sh | 2 +- hyperion/torch/tpm/hf/hf_hubert.py | 10 ++++++++++ hyperion/torch/tpm/hf/hf_wav2vec2.py | 9 +++++++++ hyperion/torch/tpm/hf/hf_wav2vec_base.py | 17 ++++------------- hyperion/torch/tpm/hf/hf_wavlm.py | 10 ++++++++++ 6 files changed, 37 insertions(+), 17 deletions(-) diff --git a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh index 96ef76c5..1985b8e6 100644 --- a/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh +++ 
b/egs/voxceleb/v2.1/global_conf/config_wav2vec2xlsr300m_loraqv_ecapatdnn512x3_v2.0.sh @@ -37,9 +37,9 @@ nnet_s3=$nnet_s3_dir/model_ep0004.pth # back-end do_plda=false -do_snorm=true -do_qmf=true -do_voxsrc22=true +#do_snorm=true +#do_qmf=true +#do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml plda_num_augs=0 diff --git a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh index 2cfe27fe..72b019cd 100755 --- a/egs/voxceleb/v2.1/run_006_extract_xvectors.sh +++ b/egs/voxceleb/v2.1/run_006_extract_xvectors.sh @@ -17,7 +17,7 @@ xvec_chunk_length=120.0 . $config_file if [ "$use_gpu" == "true" ];then - xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_args="--use-gpu --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" num_gpus=1 else diff --git a/hyperion/torch/tpm/hf/hf_hubert.py b/hyperion/torch/tpm/hf/hf_hubert.py index 32355bf6..638bf561 100644 --- a/hyperion/torch/tpm/hf/hf_hubert.py +++ b/hyperion/torch/tpm/hf/hf_hubert.py @@ -191,6 +191,7 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, lora_components: List[str] = ["q_proj", "v_proj"], lora_rank: int = 4, lora_alpha: int = 1, lora_dropout: float = 0.0, lora_merge_weights: bool = True, @@ -298,6 +299,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property diff --git a/hyperion/torch/tpm/hf/hf_wav2vec2.py b/hyperion/torch/tpm/hf/hf_wav2vec2.py index bc98f460..5b59d79a 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec2.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec2.py @@ -322,6 +322,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index a981d1ec..e0bcee1c 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -174,15 +174,6 @@ self._frame_shift = None self.hf_model = None - if use_lora: - self._make_lora_layers( - lora_components, - lora_rank, - lora_alpha, - lora_dropout, - lora_merge_weights, - ) - def __deepcopy__(self, memo): """Reimplementation of deepcopy for Hugging Face models. The weight_norm in the Conv. Pos. Encoder of Wav2Vec models makes the default deepcopy fail.
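This hunk is the actual bug named in the commit message: the LoRA wrapping used to run in `HFWav2VecBase.__init__`, where `self.hf_model` is still `None` (the base class only reserves the attribute; the subclass builds the Hugging Face model afterwards). Moving the `_make_lora_layers` call into the subclass constructors, after the model exists and after `drop_upper_layers`, lets the wrapping act on real layers. A minimal sketch of the ordering constraint (names are illustrative, not hyperion's API):

```python
import torch.nn as nn


class Base(nn.Module):
    def __init__(self):
        super().__init__()
        self.hf_model = None  # filled in by the subclass later, so LoRA
        # wrapping here would have nothing to wrap


class Child(Base):
    def __init__(self, use_lora: bool = False):
        super().__init__()
        self.hf_model = nn.Linear(4, 4)  # stand-in for the HF encoder
        if use_lora:
            # safe: runs after the model (and any layer dropping) exists
            self.wrap_with_lora(self.hf_model)

    def wrap_with_lora(self, module: nn.Module) -> None:
        print(f"wrapping {module} with LoRA adapters")


Child(use_lora=True)  # prints the wrapped module
```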
@@ -545,8 +536,8 @@ def forward_impl( """ max_in_length = x.size(-1) x, x_mask = self._preprocess(x, x_lengths) - if ddp_get_rank() == 0: - lora_layer = self.hf_model.encoder.layers[0].attention.v_proj + # if ddp_get_rank() == 0: + # lora_layer = self.hf_model.encoder.layers[0].attention.v_proj # print( # "lora\nw=", # lora_layer.weight[:3, :3], @@ -561,8 +552,8 @@ def forward_impl( # lora_layer.training, # flush=True, # ) - assert self.training == lora_layer.training - assert self.training == (not lora_layer.merged) + # assert self.training == lora_layer.training + # assert self.training == (not lora_layer.merged) output = self.hf_model( x, x_mask, diff --git a/hyperion/torch/tpm/hf/hf_wavlm.py b/hyperion/torch/tpm/hf/hf_wavlm.py index 400e6a8b..1db5fa23 100644 --- a/hyperion/torch/tpm/hf/hf_wavlm.py +++ b/hyperion/torch/tpm/hf/hf_wavlm.py @@ -209,6 +209,7 @@ def __init__( sample_frequency: int = 16000, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, + use_lora: bool = False, lora_components: List[str] = ["q_proj", "v_proj"], lora_rank: int = 4, lora_alpha: int = 1, @@ -321,6 +322,15 @@ def __init__( if drop_layers_gt is not None: self.drop_upper_layers(drop_layers_gt) + if use_lora: + self._make_lora_layers( + lora_components, + lora_rank, + lora_alpha, + lora_dropout, + lora_merge_weights, + ) + self.ignore_pretrained = True @property From cb9fa3c9a6d1d6293f0869060bc3273f5beb41bb Mon Sep 17 00:00:00 2001 From: System User Date: Mon, 30 Oct 2023 19:07:10 -0400 Subject: [PATCH 115/154] lre22 fixed v1 done --- egs/lre22/fixed.v1.8k/README.md | 43 + egs/lre22/fixed.v1.8k/cmd.sh | 25 + egs/lre22/fixed.v1.8k/conf/clsp.conf | 11 + .../fixed.v1.8k/conf/coe_gpu_bigmem.conf | 11 + egs/lre22/fixed.v1.8k/conf/coe_gpu_long.conf | 13 + egs/lre22/fixed.v1.8k/conf/coe_gpu_rtx.conf | 11 + egs/lre22/fixed.v1.8k/conf/coe_gpu_short.conf | 11 + egs/lre22/fixed.v1.8k/conf/coe_gpu_v100.conf | 11 + .../conf/fbank64_specaug1_stmn_8k.yaml | 24 + .../fixed.v1.8k/conf/fbank64_stmn_8k.yaml | 12 + .../fixed.v1.8k/conf/reverb_noise_aug.yaml | 35 + ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 101 + ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 69 + ...rain_fwseres2net50s8_xvec_stage1_v1.0.yaml | 78 + ...rain_fwseres2net50s8_xvec_stage1_v1.1.yaml | 78 + ...rain_fwseres2net50s8_xvec_stage2_v1.0.yaml | 69 + egs/lre22/fixed.v1.8k/conf/vad_8k.yaml | 9 + egs/lre22/fixed.v1.8k/datapath.sh | 46 + egs/lre22/fixed.v1.8k/default_config.sh | 1 + ...onfig_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | 24 + ...onfig_fbank64_stmn_fwseres2net50s8_v1.0.sh | 28 + egs/lre22/fixed.v1.8k/hyp_utils | 1 + .../apply_tel_codecs_to_kaldi_datadir.py | 215 ++ egs/lre22/fixed.v1.8k/local/download_focal.sh | 27 + .../fixed.v1.8k/local/download_focal.sh~ | 27 + .../local/download_lre22_scorer.sh | 24 + .../local/download_lre22_scorer.sh~ | 25 + .../local/eval_calibration_lre22.sh | 42 + .../fixed.v1.8k/local/eval_fusion_lre22.sh | 46 + egs/lre22/fixed.v1.8k/local/make_musan.py | 189 ++ egs/lre22/fixed.v1.8k/local/make_musan.sh | 48 + egs/lre22/fixed.v1.8k/local/make_rirs_data.sh | 29 + .../fixed.v1.8k/local/make_sre16_train_dev.sh | 65 + .../local/make_sre16_train_eval.sh | 66 + .../local/make_sre18_dev_unlabeled.sh | 38 + .../fixed.v1.8k/local/make_sre18_train_dev.sh | 57 + .../local/make_sre18_train_eval.sh | 60 + .../fixed.v1.8k/local/make_sre19cmn2_eval.sh | 54 + egs/lre22/fixed.v1.8k/local/merge_scores.py | 32 + egs/lre22/fixed.v1.8k/local/prepare_adi17.py | 164 ++ egs/lre22/fixed.v1.8k/local/prepare_ast.py | 144 ++ 
egs/lre22/fixed.v1.8k/local/prepare_babel.py | 108 + .../fixed.v1.8k/local/prepare_common_voice.py | 146 ++ .../local/prepare_common_voice_accents.py | 132 + .../local/prepare_common_voice_accents_cat.py | 174 ++ .../local/prepare_common_voice_cat.py | 180 ++ egs/lre22/fixed.v1.8k/local/prepare_lre17.py | 140 ++ .../fixed.v1.8k/local/prepare_lre22_dev.py | 108 + .../fixed.v1.8k/local/prepare_lre22_eval.py | 98 + .../local/prepare_some_data_for_lre.py | 135 ++ .../local/prepare_some_data_for_lre_cat.py | 204 ++ .../local/prepare_sre21av_dev_audio.py | 215 ++ .../local/prepare_sre21av_eval_audio.py | 243 ++ .../local/prepare_sre_cts_superset.py | 185 ++ .../fixed.v1.8k/local/prepare_voxlingua107.py | 130 + egs/lre22/fixed.v1.8k/local/score_lre22.sh | 29 + egs/lre22/fixed.v1.8k/local/split_dev.py | 80 + .../local/split_segments_train_val.py | 160 ++ .../local/train_calibration_lre22.sh | 46 + .../fixed.v1.8k/local/train_fusion_lre22.sh | 36 + egs/lre22/fixed.v1.8k/local/validate_lre22.sh | 17 + egs/lre22/fixed.v1.8k/path.sh | 5 + .../dev_splits/fold_0/test_segments.csv | 2114 +++++++++++++++++ .../dev_splits/fold_0/train_segments.csv | 2088 ++++++++++++++++ .../dev_splits/fold_1/test_segments.csv | 2088 ++++++++++++++++ .../dev_splits/fold_1/train_segments.csv | 2114 +++++++++++++++++ .../resources/lre17_ara-ary/segs_ara-ary.csv | 1306 ++++++++++ egs/lre22/fixed.v1.8k/run_001_prepare_data.sh | 93 + egs/lre22/fixed.v1.8k/run_002_compute_evad.sh | 58 + .../run_003_prepare_noises_rirs.sh | 66 + egs/lre22/fixed.v1.8k/run_004_apply_codecs.sh | 25 + .../run_010_prepare_xvec_train_data.sh | 96 + .../fixed.v1.8k/run_011_train_xvector.sh | 164 ++ .../fixed.v1.8k/run_011_train_xvector.sh~ | 161 ++ .../fixed.v1.8k/run_030_extract_xvectors.sh | 215 ++ egs/lre22/fixed.v1.8k/run_040_be_final.sh | 434 ++++ egs/lre22/fixed.v1.8k/run_050_fusion_v1.sh | 43 + egs/lre22/fixed.v1.8k/steps | 1 + egs/lre22/fixed.v1.8k/steps_be/eval_be_v1.py | 117 + egs/lre22/fixed.v1.8k/steps_be/eval_be_v2.py | 117 + egs/lre22/fixed.v1.8k/steps_be/eval_be_v5.py | 129 + egs/lre22/fixed.v1.8k/steps_be/eval_fusion.m | 17 + egs/lre22/fixed.v1.8k/steps_be/train_be_v1.py | 136 ++ egs/lre22/fixed.v1.8k/steps_be/train_be_v2.py | 136 ++ egs/lre22/fixed.v1.8k/steps_be/train_be_v3.py | 204 ++ egs/lre22/fixed.v1.8k/steps_be/train_be_v4.py | 199 ++ egs/lre22/fixed.v1.8k/steps_be/train_be_v5.py | 274 +++ egs/lre22/fixed.v1.8k/steps_be/train_be_v6.py | 196 ++ egs/lre22/fixed.v1.8k/steps_be/train_be_v7.py | 315 +++ egs/lre22/fixed.v1.8k/steps_be/train_be_v8.py | 317 +++ egs/lre22/fixed.v1.8k/steps_be/train_be_v9.py | 220 ++ egs/lre22/fixed.v1.8k/steps_be/train_fusion.m | 16 + egs/lre22/fixed.v1.8k/steps_xvec | 1 + egs/lre22/fixed.v1.8k/utils | 1 + egs/lre22/open.v1.8k/README.md | 43 + egs/lre22/open.v1.8k/cmd.sh | 28 + egs/lre22/open.v1.8k/conf/clsp.conf | 11 + egs/lre22/open.v1.8k/conf/coe_gpu_bigmem.conf | 11 + egs/lre22/open.v1.8k/conf/coe_gpu_long.conf | 13 + egs/lre22/open.v1.8k/conf/coe_gpu_rtx.conf | 11 + egs/lre22/open.v1.8k/conf/coe_gpu_short.conf | 11 + egs/lre22/open.v1.8k/conf/coe_gpu_v100.conf | 11 + .../conf/fbank64_specaug1_stmn_8k.yaml | 24 + .../open.v1.8k/conf/fbank64_stmn_8k.yaml | 12 + .../conf/other_conf/fbank64_stmn_8k.yaml | 12 + .../conf/other_conf/fbank80_stmn_16k.yaml | 12 + .../conf/other_conf/reverb_noise_aug.yaml | 35 + ...ecapatdnn2048-5120x8_xvec_stage1_v2.4.yaml | 124 + ...ecapatdnn2048-5120x8_xvec_stage2_v2.4.yaml | 79 + ...ecapatdnn2048-5120x8_xvec_stage3_v2.4.yaml | 79 + 
...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 99 + ...rain_ecapatdnn2048x4_xvec_stage1_v2.1.yaml | 95 + ...rain_ecapatdnn2048x4_xvec_stage1_v2.2.yaml | 97 + ...rain_ecapatdnn2048x4_xvec_stage1_v2.3.yaml | 77 + ...rain_ecapatdnn2048x4_xvec_stage1_v2.4.yaml | 97 + ...rain_ecapatdnn2048x4_xvec_stage1_v3.0.yaml | 98 + ...rain_ecapatdnn2048x4_xvec_stage1_v3.1.yaml | 98 + ...rain_ecapatdnn2048x4_xvec_stage1_v3.2.yaml | 98 + ...rain_ecapatdnn2048x4_xvec_stage1_v3.5.yaml | 98 + ...in_ecapatdnn2048x4_xvec_stage2_v2.1.1.yaml | 79 + ...in_ecapatdnn2048x4_xvec_stage2_v2.1.2.yaml | 79 + ...rain_ecapatdnn2048x4_xvec_stage2_v2.1.yaml | 79 + ...rain_ecapatdnn2048x4_xvec_stage2_v2.3.yaml | 75 + ...rain_ecapatdnn2048x4_xvec_stage2_v2.4.yaml | 79 + ...rain_ecapatdnn2048x4_xvec_stage3_v2.1.yaml | 79 + ...rain_ecapatdnn2048x4_xvec_stage3_v2.4.yaml | 79 + ...train_tseres2net50s8_xvec_stage1_v2.1.yaml | 75 + ...train_tseres2net50s8_xvec_stage1_v2.2.yaml | 77 + ...train_tseres2net50s8_xvec_stage1_v2.3.yaml | 77 + ...train_tseres2net50s8_xvec_stage2_v2.1.yaml | 79 + ...train_tseres2net50s8_xvec_stage2_v2.3.yaml | 75 + .../open.v1.8k/conf/other_conf/vad_16k.yaml | 8 + .../open.v1.8k/conf/other_conf/vad_8k.yaml | 8 + .../open.v1.8k/conf/reverb_noise_aug.yaml | 35 + ...rain_ecapatdnn2048x4_xvec_stage1_v1.0.yaml | 105 + ...rain_fwseres2net50s8_xvec_stage1_v1.0.yaml | 82 + egs/lre22/open.v1.8k/conf/vad_8k.yaml | 9 + egs/lre22/open.v1.8k/datapath.sh | 87 + egs/lre22/open.v1.8k/default_config.sh | 1 + ...onfig_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | 20 + ...onfig_fbank64_stmn_fwseres2net50s8_v1.0.sh | 45 + egs/lre22/open.v1.8k/hyp_utils | 1 + egs/lre22/open.v1.8k/local | 1 + egs/lre22/open.v1.8k/path.sh | 5 + egs/lre22/open.v1.8k/resources | 1 + egs/lre22/open.v1.8k/run_001_prepare_data.sh | 342 +++ egs/lre22/open.v1.8k/run_002_compute_evad.sh | 64 + .../open.v1.8k/run_003_prepare_noises_rirs.sh | 66 + egs/lre22/open.v1.8k/run_004_apply_codecs.sh | 28 + .../run_010_prepare_xvec_train_data.sh | 91 + egs/lre22/open.v1.8k/run_011_train_xvector.sh | 92 + .../open.v1.8k/run_030_extract_xvectors.sh | 219 ++ egs/lre22/open.v1.8k/run_040_be_final.sh | 434 ++++ egs/lre22/open.v1.8k/steps | 1 + egs/lre22/open.v1.8k/steps_be | 1 + egs/lre22/open.v1.8k/steps_xvec | 1 + egs/lre22/open.v1.8k/utils | 1 + egs/lre22/open.v2.8k/cmd.sh | 28 + egs/lre22/open.v2.8k/conf/clsp.conf | 11 + egs/lre22/open.v2.8k/conf/coe_gpu_bigmem.conf | 11 + egs/lre22/open.v2.8k/conf/coe_gpu_long.conf | 13 + egs/lre22/open.v2.8k/conf/coe_gpu_rtx.conf | 11 + egs/lre22/open.v2.8k/conf/coe_gpu_short.conf | 11 + egs/lre22/open.v2.8k/conf/coe_gpu_v100.conf | 11 + .../open.v2.8k/conf/reverb_noise_aug.yaml | 35 + ...2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml | 59 + ...ec2xlsr300m_ecapatdnn1024x3_subcenter.yaml | 47 + egs/lre22/open.v2.8k/datapath.sh | 87 + egs/lre22/open.v2.8k/default_config.sh | 1 + ...ig_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh | 36 + egs/lre22/open.v2.8k/hyp_utils | 1 + egs/lre22/open.v2.8k/local | 1 + egs/lre22/open.v2.8k/path.sh | 5 + egs/lre22/open.v2.8k/resources | 1 + egs/lre22/open.v2.8k/run_001_prepare_data.sh | 330 +++ egs/lre22/open.v2.8k/run_002_compute_evad.sh | 64 + .../open.v2.8k/run_003_prepare_noises_rirs.sh | 66 + egs/lre22/open.v2.8k/run_004_apply_codecs.sh | 28 + .../run_010_prepare_xvec_train_data.sh | 78 + egs/lre22/open.v2.8k/run_011_train_xvector.sh | 128 + egs/lre22/open.v2.8k/steps | 1 + egs/lre22/open.v2.8k/steps_be | 1 + egs/lre22/open.v2.8k/steps_xvec | 1 + egs/lre22/open.v2.8k/utils | 1 + 
egs/sre21-av-a/v1.8k/run_011_train_xvector.sh | 47 - hyperion/np/transforms/skl_tsne.py | 3 +- hyperion/utils/info_table.py | 2 +- hyperion/utils/scp_list.py | 2 +- 188 files changed, 23219 insertions(+), 50 deletions(-) create mode 100644 egs/lre22/fixed.v1.8k/README.md create mode 100755 egs/lre22/fixed.v1.8k/cmd.sh create mode 100644 egs/lre22/fixed.v1.8k/conf/clsp.conf create mode 100644 egs/lre22/fixed.v1.8k/conf/coe_gpu_bigmem.conf create mode 100644 egs/lre22/fixed.v1.8k/conf/coe_gpu_long.conf create mode 100644 egs/lre22/fixed.v1.8k/conf/coe_gpu_rtx.conf create mode 100644 egs/lre22/fixed.v1.8k/conf/coe_gpu_short.conf create mode 100644 egs/lre22/fixed.v1.8k/conf/coe_gpu_v100.conf create mode 100644 egs/lre22/fixed.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml create mode 100644 egs/lre22/fixed.v1.8k/conf/fbank64_stmn_8k.yaml create mode 100644 egs/lre22/fixed.v1.8k/conf/reverb_noise_aug.yaml create mode 100644 egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml create mode 100644 egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml create mode 100644 egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.1.yaml create mode 100644 egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml create mode 100644 egs/lre22/fixed.v1.8k/conf/vad_8k.yaml create mode 100644 egs/lre22/fixed.v1.8k/datapath.sh create mode 120000 egs/lre22/fixed.v1.8k/default_config.sh create mode 100644 egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh create mode 100644 egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh create mode 120000 egs/lre22/fixed.v1.8k/hyp_utils create mode 100755 egs/lre22/fixed.v1.8k/local/apply_tel_codecs_to_kaldi_datadir.py create mode 100755 egs/lre22/fixed.v1.8k/local/download_focal.sh create mode 100755 egs/lre22/fixed.v1.8k/local/download_focal.sh~ create mode 100755 egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh create mode 100755 egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh~ create mode 100755 egs/lre22/fixed.v1.8k/local/eval_calibration_lre22.sh create mode 100755 egs/lre22/fixed.v1.8k/local/eval_fusion_lre22.sh create mode 100755 egs/lre22/fixed.v1.8k/local/make_musan.py create mode 100755 egs/lre22/fixed.v1.8k/local/make_musan.sh create mode 100755 egs/lre22/fixed.v1.8k/local/make_rirs_data.sh create mode 100755 egs/lre22/fixed.v1.8k/local/make_sre16_train_dev.sh create mode 100755 egs/lre22/fixed.v1.8k/local/make_sre16_train_eval.sh create mode 100755 egs/lre22/fixed.v1.8k/local/make_sre18_dev_unlabeled.sh create mode 100755 egs/lre22/fixed.v1.8k/local/make_sre18_train_dev.sh create mode 100755 egs/lre22/fixed.v1.8k/local/make_sre18_train_eval.sh create mode 100755 egs/lre22/fixed.v1.8k/local/make_sre19cmn2_eval.sh create mode 100755 egs/lre22/fixed.v1.8k/local/merge_scores.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_adi17.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_ast.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_babel.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_common_voice.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents_cat.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_common_voice_cat.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_lre17.py create mode 100755 
egs/lre22/fixed.v1.8k/local/prepare_lre22_dev.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_lre22_eval.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre_cat.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_sre21av_dev_audio.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_sre21av_eval_audio.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_sre_cts_superset.py create mode 100755 egs/lre22/fixed.v1.8k/local/prepare_voxlingua107.py create mode 100755 egs/lre22/fixed.v1.8k/local/score_lre22.sh create mode 100755 egs/lre22/fixed.v1.8k/local/split_dev.py create mode 100755 egs/lre22/fixed.v1.8k/local/split_segments_train_val.py create mode 100755 egs/lre22/fixed.v1.8k/local/train_calibration_lre22.sh create mode 100755 egs/lre22/fixed.v1.8k/local/train_fusion_lre22.sh create mode 100755 egs/lre22/fixed.v1.8k/local/validate_lre22.sh create mode 100755 egs/lre22/fixed.v1.8k/path.sh create mode 100644 egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/test_segments.csv create mode 100644 egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/train_segments.csv create mode 100644 egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/test_segments.csv create mode 100644 egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/train_segments.csv create mode 100644 egs/lre22/fixed.v1.8k/resources/lre17_ara-ary/segs_ara-ary.csv create mode 100755 egs/lre22/fixed.v1.8k/run_001_prepare_data.sh create mode 100755 egs/lre22/fixed.v1.8k/run_002_compute_evad.sh create mode 100755 egs/lre22/fixed.v1.8k/run_003_prepare_noises_rirs.sh create mode 100755 egs/lre22/fixed.v1.8k/run_004_apply_codecs.sh create mode 100755 egs/lre22/fixed.v1.8k/run_010_prepare_xvec_train_data.sh create mode 100755 egs/lre22/fixed.v1.8k/run_011_train_xvector.sh create mode 100755 egs/lre22/fixed.v1.8k/run_011_train_xvector.sh~ create mode 100755 egs/lre22/fixed.v1.8k/run_030_extract_xvectors.sh create mode 100755 egs/lre22/fixed.v1.8k/run_040_be_final.sh create mode 100755 egs/lre22/fixed.v1.8k/run_050_fusion_v1.sh create mode 120000 egs/lre22/fixed.v1.8k/steps create mode 100755 egs/lre22/fixed.v1.8k/steps_be/eval_be_v1.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/eval_be_v2.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/eval_be_v5.py create mode 100644 egs/lre22/fixed.v1.8k/steps_be/eval_fusion.m create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v1.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v2.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v3.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v4.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v5.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v6.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v7.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v8.py create mode 100755 egs/lre22/fixed.v1.8k/steps_be/train_be_v9.py create mode 100644 egs/lre22/fixed.v1.8k/steps_be/train_fusion.m create mode 120000 egs/lre22/fixed.v1.8k/steps_xvec create mode 120000 egs/lre22/fixed.v1.8k/utils create mode 100644 egs/lre22/open.v1.8k/README.md create mode 100755 egs/lre22/open.v1.8k/cmd.sh create mode 100644 egs/lre22/open.v1.8k/conf/clsp.conf create mode 100644 egs/lre22/open.v1.8k/conf/coe_gpu_bigmem.conf create mode 100644 egs/lre22/open.v1.8k/conf/coe_gpu_long.conf create mode 100644 egs/lre22/open.v1.8k/conf/coe_gpu_rtx.conf create mode 100644 
egs/lre22/open.v1.8k/conf/coe_gpu_short.conf create mode 100644 egs/lre22/open.v1.8k/conf/coe_gpu_v100.conf create mode 100644 egs/lre22/open.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml create mode 100644 egs/lre22/open.v1.8k/conf/fbank64_stmn_8k.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/fbank64_stmn_8k.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/fbank80_stmn_16k.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/reverb_noise_aug.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage1_v2.4.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage2_v2.4.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage3_v2.4.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.1.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.2.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.3.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.4.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.2.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.5.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.1.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.2.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.3.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.4.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.1.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.4.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.1.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.2.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.3.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.1.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.3.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/vad_16k.yaml create mode 100644 egs/lre22/open.v1.8k/conf/other_conf/vad_8k.yaml create mode 100644 egs/lre22/open.v1.8k/conf/reverb_noise_aug.yaml create mode 100644 egs/lre22/open.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml create mode 100644 egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml create mode 100644 egs/lre22/open.v1.8k/conf/vad_8k.yaml create mode 100644 egs/lre22/open.v1.8k/datapath.sh create mode 120000 egs/lre22/open.v1.8k/default_config.sh create mode 100644 egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh create mode 
100644 egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh create mode 120000 egs/lre22/open.v1.8k/hyp_utils create mode 120000 egs/lre22/open.v1.8k/local create mode 100755 egs/lre22/open.v1.8k/path.sh create mode 120000 egs/lre22/open.v1.8k/resources create mode 100755 egs/lre22/open.v1.8k/run_001_prepare_data.sh create mode 100755 egs/lre22/open.v1.8k/run_002_compute_evad.sh create mode 100755 egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh create mode 100755 egs/lre22/open.v1.8k/run_004_apply_codecs.sh create mode 100755 egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh create mode 100755 egs/lre22/open.v1.8k/run_011_train_xvector.sh create mode 100755 egs/lre22/open.v1.8k/run_030_extract_xvectors.sh create mode 100755 egs/lre22/open.v1.8k/run_040_be_final.sh create mode 120000 egs/lre22/open.v1.8k/steps create mode 120000 egs/lre22/open.v1.8k/steps_be create mode 120000 egs/lre22/open.v1.8k/steps_xvec create mode 120000 egs/lre22/open.v1.8k/utils create mode 100755 egs/lre22/open.v2.8k/cmd.sh create mode 100644 egs/lre22/open.v2.8k/conf/clsp.conf create mode 100644 egs/lre22/open.v2.8k/conf/coe_gpu_bigmem.conf create mode 100644 egs/lre22/open.v2.8k/conf/coe_gpu_long.conf create mode 100644 egs/lre22/open.v2.8k/conf/coe_gpu_rtx.conf create mode 100644 egs/lre22/open.v2.8k/conf/coe_gpu_short.conf create mode 100644 egs/lre22/open.v2.8k/conf/coe_gpu_v100.conf create mode 100644 egs/lre22/open.v2.8k/conf/reverb_noise_aug.yaml create mode 100644 egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml create mode 100644 egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml create mode 100644 egs/lre22/open.v2.8k/datapath.sh create mode 120000 egs/lre22/open.v2.8k/default_config.sh create mode 100644 egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh create mode 120000 egs/lre22/open.v2.8k/hyp_utils create mode 120000 egs/lre22/open.v2.8k/local create mode 100644 egs/lre22/open.v2.8k/path.sh create mode 120000 egs/lre22/open.v2.8k/resources create mode 100755 egs/lre22/open.v2.8k/run_001_prepare_data.sh create mode 100755 egs/lre22/open.v2.8k/run_002_compute_evad.sh create mode 100755 egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh create mode 100755 egs/lre22/open.v2.8k/run_004_apply_codecs.sh create mode 100755 egs/lre22/open.v2.8k/run_010_prepare_xvec_train_data.sh create mode 100755 egs/lre22/open.v2.8k/run_011_train_xvector.sh create mode 120000 egs/lre22/open.v2.8k/steps create mode 120000 egs/lre22/open.v2.8k/steps_be create mode 120000 egs/lre22/open.v2.8k/steps_xvec create mode 120000 egs/lre22/open.v2.8k/utils diff --git a/egs/lre22/fixed.v1.8k/README.md b/egs/lre22/fixed.v1.8k/README.md new file mode 100644 index 00000000..877f99ca --- /dev/null +++ b/egs/lre22/fixed.v1.8k/README.md @@ -0,0 +1,43 @@ +# LRE22 Fixed Condition V1 + +Recipe for the NIST LRE22 fixed condition based on the JHU-MIT Submission. + +## Citing +``` +@inproceedings{villalba23_interspeech, + author={Jesús Villalba and Jonas Borgstrom and Maliha Jahan and Saurabh Kataria and Leibny Paola Garcia and Pedro Torres-Carrasquillo and Najim Dehak}, + title={{Advances in Language Recognition in Low Resource African Languages: The JHU-MIT Submission for NIST LRE22}}, + year=2023, + booktitle={Proc.
INTERSPEECH 2023}, + pages={521--525}, + doi={10.21437/Interspeech.2023-1094} +} +``` + +## Training Data + + - x-Vector networks trained on: + - VoxLingua107 + - NIST LRE17 Train + Dev + Eval / CTS + AfV + - Gaussian back-end trained on: + - NIST LRE22 dev with 2-fold cross-val + x10 augmentations + +## Usage + + - Run the run_0*.sh scripts in sequence + - By default it uses ECAPA-TDNN 4 layers of 2048 dim. + - To change the default network run scripts with the config-file argument: +```bash +run_011_train_xvector.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh +run_030_extract_xvectors.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh --use-gpu true +run_040_be_final.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh +``` + +## Results + +| Config | Model Type | Model Details | Back-end | Dev MinCp | Dev ActCp | Eval MinCp | Eval ActCp | +| ------ | ---------- | ------------- | -------- | :-------: | :-------: | :--------: | :--------: | +| config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-2 | GBE | 0.207 | 0.209 | 0.198 | 0.199 | +| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-2 | GBE | 0.227 | 0.229 | 0.213 | 0.215 | +| Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal | 0.182 | 0.183 | 0.180 | 0.181 | + diff --git a/egs/lre22/fixed.v1.8k/cmd.sh b/egs/lre22/fixed.v1.8k/cmd.sh new file mode 100755 index 00000000..4b4e8ae7 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/cmd.sh @@ -0,0 +1,25 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
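The README above scores every x-vector system with a Gaussian back-end (GBE) trained on the LRE22 dev data. For reference, here is a minimal sketch of that kind of classifier, per-language means with a shared within-class covariance; this is the generic idea, not hyperion's `LinearGBE` implementation:

```python
import numpy as np


def train_gbe(X: np.ndarray, y: np.ndarray, num_classes: int):
    """Class means and shared within-class precision from embeddings X."""
    mu = np.stack([X[y == c].mean(axis=0) for c in range(num_classes)])
    Xc = X - mu[y]  # center each embedding by its class mean
    prec = np.linalg.inv(Xc.T @ Xc / len(X))
    return mu, prec


def gbe_scores(X: np.ndarray, mu: np.ndarray, prec: np.ndarray) -> np.ndarray:
    """Per-class log-likelihoods, up to a shared class-independent constant."""
    return X @ prec @ mu.T - 0.5 * np.sum((mu @ prec) * mu, axis=1)


X = np.random.randn(140, 20)
y = np.repeat(np.arange(14), 10)  # 14 target languages in LRE22
mu, prec = train_gbe(X, y, 14)
print(gbe_scores(X[:2], mu, prec).shape)  # (2, 14)
```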
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi diff --git a/egs/lre22/fixed.v1.8k/conf/clsp.conf b/egs/lre22/fixed.v1.8k/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_bigmem.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_long.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_rtx.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_short.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_short.conf new file mode 100644 index 
00000000..81de5cb7 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/lre22/fixed.v1.8k/conf/coe_gpu_v100.conf b/egs/lre22/fixed.v1.8k/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/lre22/fixed.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml b/egs/lre22/fixed.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml new file mode 100644 index 00000000..fce3804a --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
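+  # (added note) a single frequency mask of up to 8 of the 64 mel bins, plus the
+  # single time mask above of up to 5 frames, both filled with the feature mean
+  # (mask_method below); i.e. standard SpecAugment on the log-filterbanks.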
+ freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/fixed.v1.8k/conf/fbank64_stmn_8k.yaml b/egs/lre22/fixed.v1.8k/conf/fbank64_stmn_8k.yaml new file mode 100644 index 00000000..dfd0d3e5 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/fbank64_stmn_8k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/fixed.v1.8k/conf/reverb_noise_aug.yaml b/egs/lre22/fixed.v1.8k/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..22620f03 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,101 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.2 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 15 + 
#eff_batch_size: 512 + eff_batch_size: 256 diff --git a/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..25e7b213 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + weight_exponent: 0.3 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + weight_exponent: 0.3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + margin_warmup_epochs: 0 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 10000 + use_amp: true + swa_start: 14 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 18 + eff_batch_size: 256 diff --git a/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..d900ec9b --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_type: fwseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.05 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 8 + eff_batch_size: 256 diff --git 
a/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.1.yaml b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.1.yaml new file mode 100644 index 00000000..28b1f641 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_type: fwseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 diff --git a/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml new file mode 100644 index 00000000..2e6d3a6c --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + weight_exponent: 0.3 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + weight_exponent: 0.3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + margin_warmup_epochs: 0 + intertop_margin: 0. 
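The `loss_type: subcenter-arc-softmax` head configured above (cf. `cos_scale`, `margin`, `num_subcenters: 2`) combines ArcFace's additive angular margin with K learnable subcenters per class; stage 2 here runs with `margin: 0.`, so only the subcenter max-pooling is active. A hedged sketch of the forward computation, assuming weight rows grouped per class, not hyperion's implementation:

```python
import torch
import torch.nn.functional as F


def subcenter_arc_logits(x, W, labels, s=30.0, m=0.0, k=2):
    """Illustrative forward pass: cosine to k subcenters per class, max-pooled,
    with an additive angular margin m on the target class, scaled by s.
    W: (num_classes * k, dim), subcenter rows grouped per class (assumed)."""
    cos = F.linear(F.normalize(x), F.normalize(W))  # (batch, num_classes * k)
    cos = cos.view(len(x), -1, k).amax(dim=-1)  # best subcenter per class
    theta = torch.acos(cos.clamp(-1 + 1e-7, 1 - 1e-7))
    theta[torch.arange(len(x)), labels] += m  # margin on the true class only
    return s * torch.cos(theta)


logits = subcenter_arc_logits(
    torch.randn(8, 192), torch.randn(14 * 2, 192), torch.randint(0, 14, (8,))
)
print(logits.shape)  # (8, 14)
```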
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 8000 + hold_steps: 10000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 10000 + use_amp: true + swa_start: 14 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 7 + eff_batch_size: 256 diff --git a/egs/lre22/fixed.v1.8k/conf/vad_8k.yaml b/egs/lre22/fixed.v1.8k/conf/vad_8k.yaml new file mode 100644 index 00000000..1cfe34b0 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/conf/vad_8k.yaml @@ -0,0 +1,9 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: -4.89 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/lre22/fixed.v1.8k/datapath.sh b/egs/lre22/fixed.v1.8k/datapath.sh new file mode 100644 index 00000000..d6a81520 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/datapath.sh @@ -0,0 +1,46 @@ +# Copyright +# 2021 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + +#paths to databases + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + ldc_root3=/export/fs02/corpora3/LDC + ldc_root5=/export/corpora5/LDC + ldc_root=/export/corpora6/LDC + #voxceleb1_root=/export/corpora5/VoxCeleb1_v2 + #voxceleb2_root=/export/corpora5/VoxCeleb2 + sre16_eval_root=$ldc_root5/LDC2018E30/data/eval/R149_0_1 + #janus_root=$ldc_root/LDC2019E55/Janus_Multimedia_Dataset + #sre_superset_root=$ldc_root/LDC2021E08 + #sre21_dev_root=$ldc_root/LDC2021E09 + #sre21_eval_root=$ldc_root/LDC2021E10 + lre17_train_root=$ldc_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$ldc_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$ldc_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=/export/corpora6/lre22_test_data_v2 + voxlingua_root=/export/corpora6/voxlingua107 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + ldc_root=/export/common/data/corpora/LDC + sre_root=/export/common/data/corpora/NIST/SRE + my_root=/exp/jvillalba/corpora + #voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 + #voxceleb2_root=/expscratch/dgromero/corpora/vox2 + sre16_dev_root=/exp/jvillalba/corpora/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=/exp/jvillalba/corpora/LDC2019S20/data/eval/R149_0_1 + #janus_root=$sre_root/SRE19/LDC2019E55_Janus_Multimedia_Dataset + sre_superset_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E08 + sre21_dev_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E09 + sre21_eval_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E10 + lre17_train_root=$my_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$my_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$my_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=$my_root/lre22_test_data_v2 + voxlingua_root=$my_root/voxlingua107 + musan_root=/expscratch/dgromero/corpora/musan +else + echo "Put your database paths here" + exit 1 +fi diff --git a/egs/lre22/fixed.v1.8k/default_config.sh b/egs/lre22/fixed.v1.8k/default_config.sh new file mode 120000 index 00000000..506bebe6 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh \ No newline at end of file diff --git 
a/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh b/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh new file mode 100644 index 00000000..b9cd45a5 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh @@ -0,0 +1,24 @@ +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxlingua107_lre17_noary + +# x-vector cfg +nnet_type=resnet1d +nnet_stages=2 +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml + +nnet_name=${feat_type}_ecapatdnn2048x4_v1.0 +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/swa_model_ep0016.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0007.pth diff --git a/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh b/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh new file mode 100644 index 00000000..afac4198 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh @@ -0,0 +1,28 @@ +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=voxlingua107_lre17_noary + +# x-vector cfg + +nnet_type=resnet +nnet_stages=2 +nnet_s1_base_cfg=conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml + +nnet_name=${feat_type}_fwseres2net50s8_v1.0 +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0008.pth + + +nnet_s2_base_cfg=conf/train_fwseres2net50s8_xvec_stage2_v1.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0007.pth + + diff --git a/egs/lre22/fixed.v1.8k/hyp_utils b/egs/lre22/fixed.v1.8k/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/lre22/fixed.v1.8k/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/lre22/fixed.v1.8k/local/apply_tel_codecs_to_kaldi_datadir.py b/egs/lre22/fixed.v1.8k/local/apply_tel_codecs_to_kaldi_datadir.py new file mode 100755 index 00000000..c0e2b9d3 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/apply_tel_codecs_to_kaldi_datadir.py @@ -0,0 +1,215 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import shutil +from tqdm import tqdm +import time +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import RecordingSet, SegmentSet + +valid_codecs = ["gsm", "g711mu", "g711a", "g722", "g723_1", "g726", "opus"] + +sox_options = {"gsm": "-r 8000 -e gsm-full-rate -t gsm"} +ffmpeg_options = { + "g711a": "-ar 8000 -acodec pcm_alaw -f wav", + "g711mu": "-ar 8000 -acodec pcm_mulaw -f wav", + "g722": "-ar 8000 -acodec g722 -f wav", + "g723_1": "-ar 8000 -acodec g723_1 -b:a 6300 -f wav", + "g726": "-ar 8000 -acodec g726 -f wav", + "opus": "-ar 8000 -acodec libopus -application voip -f opus", +} + + +def apply_sox_codec(storage_path, codec): + + option = sox_options[codec] + storage_path = storage_path.rstrip() + if 
storage_path[-1] == "|": + storage_path = f"{storage_path} sox -t wav - {option} - |" + else: + storage_path = f"sox {storage_path} {option} - |" + + storage_path = f"{storage_path} sox {option} - -t wav -e signed-integer -b 16 - |" + return storage_path + + +def apply_ffmpeg_codec(storage_path, codec, g726_css, opus_brs, rng): + + option = ffmpeg_options[codec] + if codec == "g726": + code_size = rng.choice(g726_css) + option = f"{option} -code_size {code_size}" + elif codec == "opus": + br = rng.choice(opus_brs) + option = f"{option} -b:a {br}" + + storage_path = storage_path.rstrip() + if storage_path[-1] == "|": + storage_path = f"{storage_path} ffmpeg -i - {option} - |" + else: + storage_path = f"ffmpeg -i {storage_path} {option} - |" + + storage_path = f"{storage_path} ffmpeg -i - -ar 8000 -c:a pcm_s16le -f wav - |" + return storage_path + + +def apply_codec(storage_path, codec, g726_css, opus_brs, rng): + + if codec in ["gsm"]: + storage_path = apply_sox_codec(storage_path, codec) + else: + storage_path = apply_ffmpeg_codec(storage_path, codec, g726_css, + opus_brs, rng) + + return storage_path + + +def apply_codecs( + input_dir, + output_dir, + codecs, + keep_orig, + g726_min_code_size, + opus_brs, + seed, + verbose, +): + config_logger(verbose) + logging.info("Applying codecs %s -> %s", input_dir, output_dir) + rng = np.random.RandomState(seed=seed) + input_dir = Path(input_dir) + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + + g726_css = list(range(g726_min_code_size, 6)) + logging.info("making wav.scp") + recs = RecordingSet.load(input_dir / "wav.scp") + recs["orig_id"] = recs["id"] + if keep_orig: + recs_orig = recs.clone() + + codec_idx = 0 + ids = [] + s_paths = [] + for i in tqdm(range(len(recs))): + t1 = time.time() + row = recs.iloc[i] + t2 = time.time() + codec_i = codecs[codec_idx % len(codecs)] + codec_idx += 1 + t3 = time.time() + # recs.loc[row.id, "id"] = f"{row.id}-{codec_i}" + ids.append(f"{row.id}-{codec_i}") + t4 = time.time() + sp = apply_codec(row["storage_path"], codec_i, g726_css, opus_brs, rng) + + t5 = time.time() + # recs.loc[row.id, "storage_path"] = sp + s_paths.append(sp) + t6 = time.time() + + recs["id"] = ids + recs["storage_path"] = s_paths + + mapping = recs[["orig_id", "id"]] + mapping.set_index("orig_id", inplace=True, drop=False) + if keep_orig: + recs = RecordingSet.merge(recs_orig, recs) + recs.sort() + + logging.info("making utt2orig_utt") + recs[["id", "orig_id"]].to_csv(output_dir / "utt2orig_utt", + sep=" ", + header=False, + index=False) + + recs.save(output_dir / "wav.scp") + u2x_files = [] + for pattern in ["utt2*", "vad.scp", "feats.scp"]: + files_p = glob.glob(str(input_dir / pattern)) + u2x_files.extend(files_p) + + for f in u2x_files: + logging.info("making %s", Path(f).name) + u2x = SegmentSet.load(f) + if keep_orig: + u2x_orig = u2x.clone() + + u2x["id"] = mapping.loc[u2x["id"], "id"] + if keep_orig: + u2x = SegmentSet.merge(u2x_orig, u2x) + u2x.sort() + + output_file = output_dir / Path(f).name + u2x.save(output_file) + + spk_files = glob.glob(str(input_dir / "spk2gender")) + for f in spk_files: + logging.info("making %s", Path(f).name) + output_file = output_dir / Path(f).name + shutil.copy2(f, output_file) + + logging.info("making utt2spk") + u2s = SegmentSet.load(output_dir / "utt2spk") + spks = u2s["class_id"].unique() + df_spk = u2s.df.sort_values(by="class_id") + df_spk.set_index("class_id", inplace=True) + + with open(output_dir / "spk2utt", "w") as f: + for spk in spks: + seg_ids = 
df_spk.loc[spk, "id"]
+            # .loc returns a scalar for a single segment and a Series
+            # for several; join only in the multi-segment case
+            if not isinstance(seg_ids, str):
+                seg_ids = " ".join(seg_ids)
+            f.write(f"{spk} {seg_ids}\n")
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(
+        description="Apply telephone codecs to kaldi data dir")
+    parser.add_argument("--input-dir",
+                        required=True,
+                        help="Path to the original kaldi dataset")
+    parser.add_argument("--output-dir",
+                        required=True,
+                        help="Codec augmented directory")
+    parser.add_argument(
+        "--codecs",
+        default=valid_codecs,
+        nargs="+",
+        choices=valid_codecs,
+        help="List of codecs to apply",
+    )
+    parser.add_argument(
+        "--g726-min-code-size",
+        default=2,
+        type=int,
+        choices=[2, 3, 4, 5],
+        help="minimum code-size for g726",
+    )
+    parser.add_argument(
+        "--opus-brs",
+        default=[4500, 5500, 7700, 9500, 12500, 16000, 32000],
+        nargs="+",
+        help="opus codec bit rates",
+    )
+    parser.add_argument("--keep-orig", default=False, action=ActionYesNo)
+    parser.add_argument("--seed", default=1234, type=int, help="random seed")
+    parser.add_argument("-v",
+                        "--verbose",
+                        dest="verbose",
+                        default=1,
+                        choices=[0, 1, 2, 3],
+                        type=int)
+
+    args = parser.parse_args()
+    apply_codecs(**namespace_to_dict(args))
diff --git a/egs/lre22/fixed.v1.8k/local/download_focal.sh b/egs/lre22/fixed.v1.8k/local/download_focal.sh
new file mode 100755
index 00000000..13b86e57
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/download_focal.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright 2022 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+#
+# Downloads Niko Brummer's FoCal Multiclass
+
+set -e
+tool=FoCal_MultiClass_V1
+s_dir=focal_multiclass
+
+# shareable link:
+# https://drive.google.com/file/d/13rPUqS68NdEF5NB0vsL7bDEju5dhmmDZ/view?usp=sharing
+
+
+wget --no-check-certificate "https://drive.google.com/uc?export=download&id=13rPUqS68NdEF5NB0vsL7bDEju5dhmmDZ" -O $tool.zip
+unzip $tool.zip -d $s_dir
+
+if [ ! -f $s_dir/v1.0/readme.txt ];then
+    echo "the focal tool wasn't downloaded correctly, download manually"
+    exit 1
+fi
+
+rm -f $tool.zip
+
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh b/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh
new file mode 100755
index 00000000..344a6a34
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/download_lre22_scorer.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright 2022 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+#
+# Downloads NIST scoring tools for LRE22
+
+set -e
+tool=lre-scorer
+s_dir=lre-scorer
+
+# shareable link:
+# https://drive.google.com/file/d/13pvUhFPGLgqId5yB8i25X__LFXKIU-ju/view?usp=sharing
+
+wget --no-check-certificate "https://drive.google.com/uc?export=download&id=13pvUhFPGLgqId5yB8i25X__LFXKIU-ju" -O $tool.tar.gz
+tar xzvf $tool.tar.gz
+
+if [ ! -f $s_dir/scorerLRE22.py ];then
+    echo "the scoring tool wasn't downloaded correctly, download manually"
+    exit 1
+fi
+
+rm -f $tool.tar.gz
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/eval_calibration_lre22.sh b/egs/lre22/fixed.v1.8k/local/eval_calibration_lre22.sh
new file mode 100755
index 00000000..2c28e70e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/eval_calibration_lre22.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+. path.sh
+
+if [ $# -ne 2 ];then
+    echo "Usage: $0 <score-dir> <model-file>"
+    exit 1
+fi
+
+score_dir=$1
+model_file=$2
+nocal_dir=$score_dir/nocal
+cal_dir=$score_dir/cal_v1
+
+dev_file=$nocal_dir/lre22_dev_scores.tsv
+dev_cal_file=$cal_dir/lre22_dev_scores.tsv
+eval_file=$nocal_dir/lre22_eval_scores.tsv
+eval_cal_file=$cal_dir/lre22_eval_scores.tsv
+mkdir -p $cal_dir
+
+
+if [ "$(hostname --domain)" == "cm.gemini" ];then
+    module load matlab
+fi
+
+if [ -f $dev_file ];then
+    echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({'$dev_file'}, '$dev_cal_file', '$model_file');
+" | matlab -nodisplay -nosplash > $cal_dir/eval_lre22_dev.log
+fi
+
+if [ -f $eval_file ];then
+    echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({'$eval_file'}, '$eval_cal_file', '$model_file');
+" | matlab -nodisplay -nosplash > $cal_dir/eval_lre22_eval.log
+fi
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/eval_fusion_lre22.sh b/egs/lre22/fixed.v1.8k/local/eval_fusion_lre22.sh
new file mode 100755
index 00000000..284cac7e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/eval_fusion_lre22.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+. path.sh
+
+if [ $# -ne 3 ];then
+    echo "Usage: $0 <score-dirs> <model-file> <output-dir>"
+    exit 1
+fi
+
+score_dirs="$1"
+model_file=$2
+output_dir=$3
+mkdir -p $output_dir
+
+dev_files=""
+eval_files=""
+for d in $score_dirs
+do
+    dev_files="$dev_files,'$d/lre22_dev_scores.tsv'"
+    eval_files="$eval_files,'$d/lre22_eval_scores.tsv'"
+done
+dev_files=${dev_files#,}
+eval_files=${eval_files#,}
+dev_file_1=$(echo $dev_files | awk -F "'" '{ print $2 }')
+eval_file_1=$(echo $eval_files | awk -F "'" '{ print $2 }')
+dev_fus_file=$output_dir/lre22_dev_scores.tsv
+eval_fus_file=$output_dir/lre22_eval_scores.tsv
+
+if [ "$(hostname --domain)" == "cm.gemini" ];then
+    module load matlab
+fi
+
+if [ -f $dev_file_1 ];then
+    echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({$dev_files}, '$dev_fus_file', '$model_file');
+" | matlab -nodisplay -nosplash > $output_dir/eval_lre22_dev.log
+fi
+
+if [ -f $eval_file_1 ];then
+    echo "
+addpath('./steps_be');
+addpath(genpath('$PWD/focal_multiclass/v1.0'));
+eval_fusion({$eval_files}, '$eval_fus_file', '$model_file');
+" | matlab -nodisplay -nosplash > $output_dir/eval_lre22_eval.log
+fi
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_musan.py b/egs/lre22/fixed.v1.8k/local/make_musan.py
new file mode 100755
index 00000000..b0ae6846
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_musan.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+# Copyright 2015 David Snyder
+# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support)
+# Apache 2.0.
+#
+# This file is meant to be invoked by make_musan.sh.
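+#
+# Expected invocation, matching main() below and the call in
+# make_musan.sh:
+#   make_musan.py <musan-root> <fs 8|16> <out-data-dir> <use-vocals Y|N>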
+
+import os, sys
+
+
+def process_music_annotations(path):
+    utt2spk = {}
+    utt2vocals = {}
+    lines = open(path, "r").readlines()
+    for line in lines:
+        utt, genres, vocals, musician = line.rstrip().split()[:4]
+        # For this application, the musician ID isn't important
+        utt2spk[utt] = utt
+        utt2vocals[utt] = vocals == "Y"
+    return utt2spk, utt2vocals
+
+
+def prepare_music(root_dir, fs, use_vocals):
+    utt2vocals = {}
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    music_dir = os.path.join(root_dir, "music")
+    for root, dirs, files in os.walk(music_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+            elif str(file) == "ANNOTATIONS":
+                utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
+                utt2spk.update(utt2spk_part)
+                utt2vocals.update(utt2vocals_part)
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2vocals:
+        if utt in utt2wav:
+            if use_vocals or not utt2vocals[utt]:
+                utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+                if fs == 8:
+                    utt2wav_str = (
+                        utt2wav_str
+                        + utt
+                        + " sox -t wav "
+                        + utt2wav[utt]
+                        + " -r 8k -t wav - |\n"
+                    )
+                else:
+                    utt2wav_str = (
+                        utt2wav_str
+                        + utt
+                        + " sox -t wav "
+                        + utt2wav[utt]
+                        + " -r 16k -t wav - |\n"
+                    )
+                num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In music directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
+def prepare_speech(root_dir, fs):
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    speech_dir = os.path.join(root_dir, "speech")
+    for root, dirs, files in os.walk(speech_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+                utt2spk[utt] = utt
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2spk:
+        if utt in utt2wav:
+            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+            if fs == 8:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 8k -t wav - |\n"
+                )
+            else:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 16k -t wav - |\n"
+                )
+            num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In speech directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
+def prepare_noise(root_dir, fs):
+    utt2spk = {}
+    utt2wav = {}
+    num_good_files = 0
+    num_bad_files = 0
+    noise_dir = os.path.join(root_dir, "noise")
+    for root, dirs, files in os.walk(noise_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if file.endswith(".wav"):
+                utt = str(file).replace(".wav", "")
+                utt2wav[utt] = file_path
+                utt2spk[utt] = utt
+    utt2spk_str = ""
+    utt2wav_str = ""
+    for utt in utt2spk:
+        if utt in utt2wav:
+            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+            if fs == 8:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 8k -t wav - |\n"
+                )
+            else:
+                utt2wav_str = (
+                    utt2wav_str
+                    + utt
+                    + " sox -t wav "
+                    + utt2wav[utt]
+                    + " -r 16k -t wav - |\n"
+                )
+            num_good_files += 1
+        else:
+            print("Missing file", utt)
+            num_bad_files += 1
+    print(
+        "In noise directory, processed",
+        num_good_files,
+        "files;",
+        num_bad_files,
+        "had missing wav data",
+    )
+    return utt2spk_str, utt2wav_str
+
+
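+# Note: prepare_music, prepare_speech and prepare_noise above repeat
+# the same sox resampling recipe; a shared helper would express it
+# once. Hypothetical sketch (not used by this script):
+#
+#   def wav_entry(utt, path, fs):
+#       rate = "8k" if fs == 8 else "16k"
+#       return f"{utt} sox -t wav {path} -r {rate} -t wav - |\n"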
+def main(): + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) + + +if __name__ == "__main__": + main() diff --git a/egs/lre22/fixed.v1.8k/local/make_musan.sh b/egs/lre22/fixed.v1.8k/local/make_musan.sh new file mode 100755 index 00000000..4a6d30f9 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/make_musan.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Copyright 2019 Johns Hopkins University (Jesus Villalba) (added fs support) +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +use_vocals='Y' + +. parse_options.sh || exit 1; + +if [ $# -ne 3 ];then + echo "Usage: $0 [options] "; + echo "e.g.: $0 /export/corpora/JHU/musan 8 data" + exit 1; +fi + +in_dir=$1 +fs=$2 +data_dir=$3 + +mkdir -p $data_dir/musan.tmp + +echo "Preparing ${data_dir}/musan..." +mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} $fs ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > $data_dir/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list $data_dir/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf $data_dir/musan.tmp + diff --git a/egs/lre22/fixed.v1.8k/local/make_rirs_data.sh b/egs/lre22/fixed.v1.8k/local/make_rirs_data.sh new file mode 100755 index 00000000..c6652eda --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/make_rirs_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright 2020 Johns Hopkins University (Jesus Villalba) +# +# Apache 2.0. 
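+#
+# Reads $rir_dir/rir_list and writes wav.scp (resampling the RIRs to
+# 8 kHz with sox when fs is 8) plus a rir2room map into the data dir.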
+set -e
+
+if [ $# != 3 ]; then
+    echo "Usage: $0 <rir-dir> <fs 8/16> <data-dir>"
+    echo "e.g.: $0 RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom"
+    exit 1
+fi
+
+rir_dir=$1
+fs=$2
+data_dir=$3
+
+mkdir -p $data_dir
+
+rir_list=$rir_dir/rir_list
+if [ "$fs" -eq 16 ];then
+    awk '{ key=$5; sub(/.*\//,"",key); print key,$5 }' $rir_list > $data_dir/wav.scp
+else
+    awk '{
+key=$5; sub(/.*\//,"",key);
+print key,"sox "$5" -r 8000 -t wav -b 16 -e signed-integer - |" }' \
+        $rir_list > $data_dir/wav.scp
+fi
+awk '{ key=$5; sub(/.*\//,"",key); print key,$4 }' $rir_list > $data_dir/rir2room
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre16_train_dev.sh b/egs/lre22/fixed.v1.8k/local/make_sre16_train_dev.sh
new file mode 100755
index 00000000..f861a8f4
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre16_train_dev.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+    echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+    exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+meta=$input_path/metadata
+call2lang=$meta/calls.tsv
+call2spk=$meta/call_sides.tsv
+spk2gender=$meta/subjects.tsv
+segm_file=$docs/sre16_dev_segment_key.tsv
+
+tel_up=""
+if [ $fs -eq 16 ];then
+    tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+#Dev: Mandarin and Cebuano
+for lang in cmn ceb
+do
+    output_dir=$output_path/sre16_train_dev_$lang
+    mkdir -p $output_dir
+    awk -v c2l=$call2lang -v c2s=$call2spk -v s2g=$spk2gender -v l=$lang -F "\t" 'BEGIN{
+while(getline < c2l)
+{
+    if($2 == l){ calls[$1]=1 }
+}
+while(getline < c2s) { spk[$1]=$3 }
+while(getline < s2g) { gender[$1]=tolower($2) }
+}
+{ if($2 in calls) { s=spk[$2]; print $1, s, gender[s] }}' $segm_file > $output_dir/table
+
+    awk '{ print $2"-"$1,$2}' $output_dir/table | sort -k1,1 > $output_dir/utt2spk
+    utils/utt2spk_to_spk2utt.pl $output_dir/utt2spk > $output_dir/spk2utt
+    awk '{ print $2,$3}' $output_dir/table | sort -k1,1 -u > $output_dir/spk2gender
+    awk -v lang=$lang 'BEGIN{if(lang=="cmn"){lang_ldc="zho-cmn"} else { lang_ldc="ceb-ceb" }} { print $1,lang_ldc}' $output_dir/utt2spk > $output_dir/utt2lang
+
+    find -L $input_path -name "*.sph" > $output_dir/wav.scp.tmp
+
+    awk -v fwav=$output_dir/wav.scp.tmp 'BEGIN{
+while(getline < fwav)
+{
+    bn=$1;
+    sub(/.*\//,"",bn);
+    sub(/\.sph$/,"",bn);
+    wav[bn]=$1;
+}
+}
+{ print $2"-"$1,"sph2pipe -f wav -p -c 1 "wav[$1]" |'"$tel_up"'"}' $output_dir/table | \
+        sort -k1,1 > $output_dir/wav.scp
+
+    rm -f $output_dir/wav.scp.tmp
+    utils/fix_data_dir.sh $output_dir
+    utils/validate_data_dir.sh --no-text --no-feats $output_dir
+done
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre16_train_eval.sh b/egs/lre22/fixed.v1.8k/local/make_sre16_train_eval.sh
new file mode 100755
index 00000000..3589a60e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre16_train_eval.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2020 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+    echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+    exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+meta=$input_path/metadata
+call2lang=$meta/calls.tsv
+call2spk=$meta/call_sides.tsv
+spk2gender=$meta/subjects.tsv
+segm_file=$docs/sre16_eval_segment_key.tsv
+
+tel_up=""
+if [ $fs -eq 16 ];then
+    tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+#Eval: Cantonese and Tagalog
+for lang in yue tgl
+do
+    output_dir=$output_path/sre16_train_eval_$lang
+    mkdir -p $output_dir
+    awk -v c2l=$call2lang -v c2s=$call2spk -v s2g=$spk2gender -v l=$lang -F "\t" 'BEGIN{
+while(getline < c2l)
+{
+    if($2 == l){ calls[$1]=1 }
+}
+while(getline < c2s) { spk[$1]=$3 }
+while(getline < s2g) { gender[$1]=tolower($2) }
+}
+{ if($2 in calls) { s=spk[$2]; print $1, s, gender[s] }}' $segm_file > $output_dir/table
+
+    awk '{ print $2"-"$1,$2}' $output_dir/table | sort -k1,1 > $output_dir/utt2spk
+    utils/utt2spk_to_spk2utt.pl $output_dir/utt2spk > $output_dir/spk2utt
+    awk '{ print $2,$3}' $output_dir/table | sort -k1,1 -u > $output_dir/spk2gender
+    awk -v lang=$lang 'BEGIN{if(lang=="yue"){lang_ldc="zho-yue"} else { lang_ldc="tl-tl" }} { print $1,lang_ldc}' $output_dir/utt2spk > $output_dir/utt2lang
+
+
+    find -L $input_path -name "*.sph" > $output_dir/wav.scp.tmp
+
+    awk -v fwav=$output_dir/wav.scp.tmp 'BEGIN{
+while(getline < fwav)
+{
+    bn=$1;
+    sub(/.*\//,"",bn);
+    sub(/\.sph$/,"",bn);
+    wav[bn]=$1;
+}
+}
+{ print $2"-"$1,"sph2pipe -f wav -p -c 1 "wav[$1]" |'"$tel_up"'"}' $output_dir/table | \
+        sort -k1,1 > $output_dir/wav.scp
+
+    rm -f $output_dir/wav.scp.tmp
+    utils/fix_data_dir.sh $output_dir
+    utils/validate_data_dir.sh --no-text --no-feats $output_dir
+done
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre18_dev_unlabeled.sh b/egs/lre22/fixed.v1.8k/local/make_sre18_dev_unlabeled.sh
new file mode 100755
index 00000000..5d49bba7
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre18_dev_unlabeled.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Copyright 2018 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+    echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+    exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+#enroll_file=$docs/sre18_dev_enrollment.tsv
+#enroll_diar_file=$docs/sre18_dev_enrollment_diarization.tsv
+segm_file=$docs/sre18_dev_segment_key.tsv
+#trial_file=$docs/sre18_dev_trials.tsv
+#key_file=$docs/sre18_dev_trial_key.tsv
+
+tel_up=""
+if [ $fs -eq 16 ];then
+    tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+#Unlabeled
+unlab_dir=$output_path/sre18_dev_unlabeled
+mkdir -p $unlab_dir
+awk '/unlabeled/ { print $1,"sph2pipe -f wav -p -c 1 '$input_path'/data/unlabeled/"$1" |'"$tel_up"'"}' $segm_file | \
+    sort -k1,1 > $unlab_dir/wav.scp
+awk '/unlabeled/ { print $1,$1}' $segm_file | sort -k1,1 > $unlab_dir/utt2spk
+cp $unlab_dir/utt2spk $unlab_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $unlab_dir/utt2spk > $unlab_dir/utt2lang
+
+utils/fix_data_dir.sh $unlab_dir
+utils/validate_data_dir.sh --no-text --no-feats $unlab_dir
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre18_train_dev.sh b/egs/lre22/fixed.v1.8k/local/make_sre18_train_dev.sh
new file mode 100755
index 00000000..9e6ff763
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre18_train_dev.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Copyright 2018 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+    echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+    exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+segm_file=$docs/sre18_dev_segment_key.tsv
+
+tel_up=""
+vid_down=""
+if [ $fs -eq 16 ];then
+    tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+
+#Dev CMN2
+output_dir=$output_path/sre18_train_dev_cmn2
+mkdir -p $output_dir
+awk '$7=="cmn2" && $4 != "unlabeled" { print $2"-"$1,$2}' $segm_file | sort -k1,1 > $output_dir/utt2spk
+utils/utt2spk_to_spk2utt.pl $output_dir/utt2spk > $output_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $output_dir/utt2spk > $output_dir/utt2lang
+
+find -L $input_path -name "*.sph" > $output_dir/wav.scp.tmp
+
+awk -v fwav=$output_dir/wav.scp.tmp 'BEGIN{
+while(getline < fwav)
+{
+    bn=$1;
+    sub(/.*\//,"",bn);
+    wav[bn]=$1;
+}
+}
+$7=="cmn2" && $4 != "unlabeled" { print $2"-"$1,"sph2pipe -f wav -p -c 1 "wav[$1]" |'"$tel_up"'"}' $segm_file | \
+    sort -k1,1 > $output_dir/wav.scp
+
+rm -f $output_dir/wav.scp.tmp
+
+awk -v sf=$segm_file 'BEGIN{
+while(getline < sf)
+{
+    gender[$1]=substr($3,1,1)
+}
+}
+{ sub(/^[^-]*-/,"",$2); print $1,gender[$2] } ' $output_dir/spk2utt > $output_dir/spk2gender
+
+utils/fix_data_dir.sh $output_dir
+utils/validate_data_dir.sh --no-text --no-feats $output_dir
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre18_train_eval.sh b/egs/lre22/fixed.v1.8k/local/make_sre18_train_eval.sh
new file mode 100755
index 00000000..33ff5a5a
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre18_train_eval.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Copyright 2018 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+    echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+    exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+segm_file=$docs/sre18_eval_segment_key.tsv
+
+tel_up=""
+vid_down=""
+if [ $fs -eq 16 ];then
+    tel_up=" sox -t wav - -t wav -r 16k - |"
+    vid_down=" -r 16k "
+elif [ $fs -eq 8 ];then
+    vid_down=" -r 8k "
+fi
+
+
+#Eval CMN2
+output_dir=$output_path/sre18_train_eval_cmn2
+mkdir -p $output_dir
+awk '$7=="cmn2" && $4 != "unlabeled" { print $2"-"$1,$2}' $segm_file | sort -k1,1 > $output_dir/utt2spk
+utils/utt2spk_to_spk2utt.pl $output_dir/utt2spk > $output_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $output_dir/utt2spk > $output_dir/utt2lang
+
+find $input_path -name "*.sph" > $output_dir/wav.scp.tmp
+
+awk -v fwav=$output_dir/wav.scp.tmp 'BEGIN{
+while(getline < fwav)
+{
+    bn=$1;
+    sub(/.*\//,"",bn);
+    wav[bn]=$1;
+}
+}
+$7=="cmn2" && $4 != "unlabeled" { print $2"-"$1,"sph2pipe -f wav -p -c 1 "wav[$1]" |'"$tel_up"'"}' $segm_file | \
+    sort -k1,1 > $output_dir/wav.scp
+
+rm -f $output_dir/wav.scp.tmp
+
+awk -v sf=$segm_file 'BEGIN{
+while(getline < sf)
+{
+    gender[$1]=substr($3,1,1)
+}
+}
+{ sub(/^[^-]*-/,"",$2); print $1,gender[$2] } ' $output_dir/spk2utt > $output_dir/spk2gender
+
+utils/fix_data_dir.sh $output_dir
+utils/validate_data_dir.sh --no-text --no-feats $output_dir
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/make_sre19cmn2_eval.sh b/egs/lre22/fixed.v1.8k/local/make_sre19cmn2_eval.sh
new file mode 100755
index 00000000..d6f877f5
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/make_sre19cmn2_eval.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright 2019 Johns Hopkins University (Jesus Villalba)
+# Apache 2.0
+
+if [ $# != 3 ]; then
+    echo "Usage: $0 <input-path> <fs 8/16> <output-path>"
+    exit 1
+fi
+input_path=$1
+fs=$2
+output_path=$3
+
+docs=$input_path/docs
+enroll_file=$docs/sre19_cts_challenge_enrollment.tsv
+trial_file=$docs/sre19_cts_challenge_trials.tsv
+key_file=$docs/sre19_cts_challenge_trial_key.tsv
+
+tel_up=""
+vid_down=""
+if [ $fs -eq 16 ];then
+    tel_up=" sox -t wav - -t wav -r 16k - |"
+fi
+
+#Enrollment CMN2
+enroll_dir=$output_path/sre19_eval_enroll_cmn2
+mkdir -p $enroll_dir
+awk '/\.sph/ { print $1"-"$2,"sph2pipe -f wav -p -c 1 '$input_path'/data/enrollment/"$2" |'"$tel_up"'"}' $enroll_file | \
+    sort -k1,1 > $enroll_dir/wav.scp
+awk '!/modelid/ && /\.sph/ { print $1"-"$2,$1}' $enroll_file | sort -k1,1 > $enroll_dir/utt2spk
+utils/utt2spk_to_spk2utt.pl $enroll_dir/utt2spk > $enroll_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $enroll_dir/utt2spk > $enroll_dir/utt2lang
+
+utils/fix_data_dir.sh $enroll_dir
+utils/validate_data_dir.sh --no-text --no-feats $enroll_dir
+
+
+#Test set CMN2
+test_dir=$output_path/sre19_eval_test_cmn2
+mkdir -p $test_dir
+awk '/\.sph/ { print $2,"sph2pipe -f wav -p -c 1 '$input_path'/data/test/"$2" |'"$tel_up"'"}' $trial_file | \
+    sort -u -k1,1 > $test_dir/wav.scp
+awk '{ print $1,$1}' $test_dir/wav.scp | sort -k1,1 > $test_dir/utt2spk
+cp $test_dir/utt2spk $test_dir/spk2utt
+awk '{ print $1,"ara-aeb" }' $test_dir/utt2spk > $test_dir/utt2lang
+awk '!/modelid/ { print $1,$2,$4 }' $key_file > $test_dir/trials
+
+cp $trial_file $test_dir/trials.tsv
+cp $key_file $test_dir/trial_key.tsv
+
+utils/fix_data_dir.sh $test_dir
+utils/validate_data_dir.sh --no-text --no-feats $test_dir
+
+
diff --git a/egs/lre22/fixed.v1.8k/local/merge_scores.py b/egs/lre22/fixed.v1.8k/local/merge_scores.py
new file mode 100755
index 00000000..8d0df80e
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/merge_scores.py
@@ -0,0 +1,32 @@
+#!/bin/env python
+"""
+  Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict
+import logging
+from pathlib import Path
+import pandas as pd
+
+
+def merge_scores(in_score_files, out_score_file):
+
+    dfs = []
+    for f in in_score_files:
+        df_f = pd.read_csv(f, sep="\t")
+        dfs.append(df_f)
+
+    df = pd.concat(dfs)
+    df.sort_values(by="segmentid", inplace=True)
+    df.to_csv(out_score_file, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(
+        description="Merges several score files into a single one"
+    )
+    parser.add_argument("--in-score-files", nargs="+", required=True)
+    parser.add_argument("--out-score-file", required=True)
+    args = parser.parse_args()
+    merge_scores(**namespace_to_dict(args))
diff --git a/egs/lre22/fixed.v1.8k/local/prepare_adi17.py b/egs/lre22/fixed.v1.8k/local/prepare_adi17.py
new file mode 100755
index 00000000..c04d988b
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/prepare_adi17.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+# prepare_adi17.py --corpus-dir /export/corpora6/ADI17 --output-dir data/adi17 --map-langs-to-lre-codes --target-fs 8000
+"""
+  Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo
+import logging
+from pathlib import Path
+import glob
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import config_logger
+
+lre_map = {
+    "ALG": "ara-arq",
+    "EGY": "ara-arz",
+    "IRA": "ara-acm",
+    "JOR": "ara-jor",
+    "KSA": "ara-ksa",
+    "KUW": "ara-kuw",
+    "LEB": "ara-leb",
+    "LIB": "ara-ayl",
+    "MAU": "ara-mau",
+    "MOR": "ara-mor",
+    "OMA": "ara-oma",
+    "PAL": "ara-pal",
+    "QAT": "ara-qat",
+    "SUD": "ara-sud",
+    "SYR": "ara-syr",
+    "UAE": "ara-uae",
+    "YEM": "ara-yem"
+}
+
+
+def map_to_lre(langs):
+    return [lre_map[l] for l in langs]
+
+
+def make_kaldi(df, wav_dir, output_dir, target_fs):
+    # make wav.scp
+    logging.info("making wav.scp")
+    with open(output_dir / "wav.scp", "w") as f:
+        for _, row in df.iterrows():
+            segment_id = row["id"]
+            filename = row["filename"]
+            if target_fs != 16000:
+                wav = "sox {} -t wav -r {} - |".format(filename, target_fs)
+            else:
+                wav = filename
+
+            f.write("{} {}\n".format(segment_id, wav))
+
+    # Kaldi data directory files
+    # utt2xxx files
+    logging.info("saving Kaldi utt2xxx files")
+    columns = [
+        "id",
+        "id",
+        "language",
+    ]
+    files = [
+        "utt2spk",
+        "spk2utt",
+        "utt2lang",
+    ]
+    for c, f in zip(columns, files):
+        output_file = output_dir / f
+        if c in df:
+            df.to_csv(output_file,
+                      sep=" ",
+                      columns=["id", c],
+                      header=False,
+                      index=False)
+
+
+def prepare_adi17(corpus_dir,
output_dir, remove_langs, map_langs_to_lre_codes, + target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + train_files = glob.glob(str(corpus_dir / "train_segments/*/*.wav"), + recursive=True) + train_ids = [Path(f).stem for f in train_files] + train_langs = [Path(f).parent.stem for f in train_files] + dev_files = glob.glob(str(corpus_dir / "dev_segments/*.wav"), + recursive=True) + test_files = glob.glob(str(corpus_dir / "test_segments/*.wav"), + recursive=True) + dev_test_files = dev_files + test_files + df_labels = pd.concat([ + pd.read_csv(str(corpus_dir / "adi17_official_dev_label.txt"), + delim_whitespace=True), + pd.read_csv(str(corpus_dir / "adi17_official_test_label.txt"), + delim_whitespace=True) + ]) + df_labels = df_labels.set_index("id") + dev_test_ids = [Path(f).stem for f in dev_test_files] + dev_test_langs = df_labels.loc[dev_test_ids, "label"].values + all_ids = train_ids + dev_test_ids + all_files = train_files + dev_test_files + all_langs = list(train_langs) + list(dev_test_langs) + if map_langs_to_lre_codes: + all_langs = map_to_lre(all_langs) + + all_ids = [f"{a}-{b}" for a, b in zip(all_langs, all_ids)] + df = pd.DataFrame({ + "id": all_ids, + "language": all_langs, + "filename": all_files + }) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = corpus_dir.stem + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares ADI17 for training") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--remove-langs", + default=None, + nargs="+", + help="languages to remove") + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_adi17(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_ast.py b/egs/lre22/fixed.v1.8k/local/prepare_ast.py new file mode 100755 index 00000000..957ee9bf --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_ast.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python +# prepare_ast.py --corpus-dir /export/corpora6/LRE/AST2004 --output-dir data/ast --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + 
"xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-iaf", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf", + "aa": "afr-afr", + "ba": "afr-afr", + "ca": "afr-afr", + "ae": "eng-ens", + "be": "eng-ens", + "ce": "eng-ens", +} + + +def map_to_lre(langs): + return [lre_map[l] if l in lre_map else "{}-{}".format(l, l) for l in langs] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = "sox -t raw -e a-law -r 8000 {} -t wav -e signed-integer -b 16 -r {} - |".format(filename, target_fs) + else: + wav = "sox -t raw -e a-law -r 8000 {} -t wav -e signed-integer -b 16 -r 16000 - |".format(filename) + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_ast( + corpus_dir, output_dir, remove_langs, map_langs_to_lre_codes, target_fs, verbose +): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "*/*/*/*/*.alaw")) + langs = [(Path(f).parent.parent.parent.parent.stem).lower() for f in files] + files2 = glob.glob(str(corpus_dir / "*/*/*/*.alaw")) + langs2 = [(Path(f).parent.parent.parent.stem).lower() for f in files2] + files = files + files2 + langs = langs + langs2 + files = [f for f, l in zip(files, langs) if l not in ['ee']] + langs = [l for l in langs if l not in ['ee']] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "cts" + df["corpus_id"] = corpus_dir.stem + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__":#ast + + parser = ArgumentParser(description="Prepares AST for training") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--remove-langs", default=None, nargs="+", help="languages to remove" + ) + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_ast(**namespace_to_dict(args)) diff --git 
a/egs/lre22/fixed.v1.8k/local/prepare_babel.py b/egs/lre22/fixed.v1.8k/local/prepare_babel.py
new file mode 100755
index 00000000..4eb18945
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/prepare_babel.py
@@ -0,0 +1,108 @@
+#!/bin/env python
+"""
+  Copyright 2021 Johns Hopkins University (Author: Jesus Villalba)
+  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from jsonargparse import ArgumentParser, namespace_to_dict
+import logging
+from pathlib import Path
+import glob
+import re
+import numpy as np
+import pandas as pd
+
+from hyperion.hyp_defs import config_logger
+
+
+def make_kaldi(df, output_dir, target_fs):
+    # make wav.scp
+    logging.info("making wav.scp")
+    with open(output_dir / "wav.scp", "w") as f:
+        for _, row in df.iterrows():
+            segment_id = row["id"]
+            filename = row["filename"]
+            source = row["source"]
+            if source == "cts":
+                wav = f"sph2pipe -f wav -p -c 1 {filename} |"
+                if target_fs != 8000:
+                    wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |"
+            else:
+                wav = f"sox {filename} -t wav -r {target_fs} - |"
+
+            f.write(f"{segment_id} {wav}\n")
+
+    # Kaldi data directory files
+    # utt2xxx files
+    logging.info("saving Kaldi utt2xxx files")
+    columns = [
+        "id",
+        "id",
+        "language",
+    ]
+    files = [
+        "utt2spk",
+        "spk2utt",
+        "utt2lang",
+    ]
+    for c, f in zip(columns, files):
+        output_file = output_dir / f
+        if c in df:
+            df.to_csv(output_file,
+                      sep=" ",
+                      columns=["id", c],
+                      header=False,
+                      index=False)
+
+
+def prepare_babel(corpus_dir, lang_code, output_dir, target_fs, verbose):
+    config_logger(verbose)
+    logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir)
+    corpus_dir = Path(corpus_dir)
+    logging.info("searching audio files")
+    wavs = glob.glob(str(corpus_dir / "**/audio/*.sph"), recursive=True)
+    logging.info("found %d files", len(wavs))
+    # glob already returns paths rooted at corpus_dir; just wrap in Path
+    wavs = [Path(w) for w in wavs]
+    seg_ids = [w.stem for w in wavs]
+    df = pd.DataFrame({"id": seg_ids, "filename": wavs})
+
+    # sort by segment id
+    df.sort_values(by="id", inplace=True)
+    df["corpus_id"] = "babel"
+    df["sample_rate"] = target_fs
+    df["language"] = lang_code
+    df["source"] = "cts"
+    logging.info("saving files")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+    output_file = output_dir / "segments.csv"
+    logging.info("saving %s", output_file)
+    df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False)
+
+    make_kaldi(df, output_dir, target_fs)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(
+        description="Prepares Babel datasets for training in LRE")
+    parser.add_argument("--corpus-dir",
+                        required=True,
+                        help="Path to the original dataset")
+    parser.add_argument(
+        "--lang-code",
+        required=True,
+        help="language code",
+    )
+    parser.add_argument("--output-dir", required=True, help="data path")
+    parser.add_argument("--target-fs",
+                        default=8000,
+                        type=int,
+                        help="Target sampling frequency")
+    parser.add_argument("-v",
+                        "--verbose",
+                        dest="verbose",
+                        default=1,
+                        choices=[0, 1, 2, 3],
+                        type=int)
+    args = parser.parse_args()
+    prepare_babel(**namespace_to_dict(args))
diff --git a/egs/lre22/fixed.v1.8k/local/prepare_common_voice.py b/egs/lre22/fixed.v1.8k/local/prepare_common_voice.py
new file mode 100755
index 00000000..411ae94a
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/local/prepare_common_voice.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# prepare_common_voice.py --corpus-dir /export/corpora6/LRE/CommonVoice2020 --output-dir data/cv --map-langs-to-lre-codes --target-fs 8000
+"""
+  Copyright 2021 Johns Hopkins
University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd +from hyperion.hyp_defs import config_logger + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + "xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-ine", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf" +} + + +def map_to_lre(langs): + return [ + lre_map[l] if l in lre_map else "{}-{}".format(l, l) for l in langs + ] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = "ffmpeg -i {} -acodec pcm_s16le -ar {} -f wav - |".format( + filename, target_fs) + else: + wav = "ffmpeg -i {} -acodec pcm_s16le -f wav - |".format( + filename) + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_common_voice(corpus_dir, output_dir, keep_langs, + map_langs_to_lre_codes, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "**/clips/*.mp3"), recursive=True) + langs = [(Path(f).parent.parent.stem).lower() for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + df = df[df["language"].isin(keep_langs)] + # if remove_langs is not None: + # for lang in remove_langs: + # df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = "cv" + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares Common Voice for training") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--keep-langs", + default=["tir-tir"], + nargs="+", + help="languages to keep") + + # parser.add_argument("--remove-langs", + # default=None, + # nargs="+", + # help="languages to remove") + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + 
dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_common_voice(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents.py b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents.py new file mode 100755 index 00000000..4c44b7f7 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python +# prepare_common_voice.py --corpus-dir /export/corpora6/LRE/CommonVoice2020 --output-dir data/cv --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd +from hyperion.hyp_defs import config_logger + +en_map = {"indian": "eng-ine"} +fr_map = { + "france": "fra-fra", + "canada": "fra-can", + "algeria": "fra-ntf", + "morocco": "fra-ntf", + "tunisia": "fra-ntf", +} + +lre_map = { + "en": en_map, + "fr": fr_map, +} + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = "ffmpeg -i {} -acodec pcm_s16le -ar {} -f wav - |".format( + filename, target_fs) + else: + wav = "ffmpeg -i {} -acodec pcm_s16le -f wav - |".format( + filename) + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_common_voice(corpus_dir, output_dir, lang, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + my_map = lre_map[lang] + df = pd.read_csv(corpus_dir / lang / "validated.tsv", sep="\t") + mask = None + for dialect in my_map.keys(): + mask_d = df["accent"] == dialect + if mask is None: + mask = mask_d + else: + mask = np.logical_or(mask, mask_d) + + df = df.loc[mask] + files = df["path"] + files = [corpus_dir / lang / "clips" / f for f in df["path"]] + langs = [my_map[l] for l in df["accent"]] + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = "cv" + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Prepares Common Voice Accents for training in LRE22") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + 
parser.add_argument("--lang", + default="en", + choices=["en", "fr"], + help="languages") + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_common_voice(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents_cat.py b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents_cat.py new file mode 100755 index 00000000..bf9d79ed --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_accents_cat.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +# prepare_common_voice.py --corpus-dir /export/corpora6/LRE/CommonVoice2020 --output-dir data/cv --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd +from hyperion.hyp_defs import config_logger + +en_map = {"indian": "eng-ine"} +fr_map = { + "france": "fra-fra", + "canada": "fra-can", + "algeria": "fra-ntf", + "morocco": "fra-ntf", + "tunisia": "fra-ntf", +} + +lre_map = { + "en": en_map, + "fr": fr_map, +} + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + list_dir = output_dir / "lists_cat" + list_dir.mkdir(parents=True, exist_ok=True) + for r in range(len(df)): + file_list = df.iloc[r].file_lists + with open(list_dir / f"{df.iloc[r].id}.txt", "w") as f: + for fn in file_list: + f.write("file %s\n" % fn) + + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = list_dir / f"{segment_id}.txt" + if target_fs != 16000: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -ar {target_fs} -f wav - |" + else: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -f wav - |" + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_common_voice(corpus_dir, output_dir, lang, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + my_map = lre_map[lang] + df = pd.read_csv(corpus_dir / lang / "validated.tsv", sep="\t") + mask = None + for dialect in my_map.keys(): + mask_d = df["accent"] == dialect + if mask is None: + mask = mask_d + else: + mask = np.logical_or(mask, mask_d) + + df = df.loc[mask] + files = df["path"] + files = [corpus_dir / lang / "clips" / f for f in df["path"]] + langs = [my_map[l] for l in df["accent"]] + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({ + "id": ids, + "language": langs, + "filename": files, + "speaker": df["client_id"] + }) + + # sort by speaker, id + df.sort_values(by=["speaker", "id"], inplace=True) + + file_lists = [] + file_list = [] + seg_count = 0 + prev_spk = "" + cat_segs = [] + cur_seg = 0 + for r in 
range(len(df)): + row = df.iloc[r] + if seg_count == 5 or (row.speaker != prev_spk and seg_count > 0): + file_lists.append(file_list) + cat_segs.append(cur_seg) + file_list = [] + seg_count = 0 + cur_seg = r + + file_list.append(row.filename) + seg_count += 1 + prev_spk = row.speaker + + if file_list: + file_lists.append(file_list) + cat_segs.append(cur_seg) + + df_cat = df.iloc[cat_segs].drop(["filename"], axis=1) + df_cat["file_lists"] = file_lists + + df_cat["sample_coding"] = "pcm" + df_cat["source"] = "afv" + df_cat["corpus_id"] = "cv" + df_cat["sample_rate"] = target_fs + + # sort by segment id + df_cat.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df_cat.drop(["file_lists"], axis=1).to_csv(output_file, + sep=",", + index=False) + + make_kaldi(df_cat, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Prepares Common Voice Accents for training in LRE22") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--lang", + default="en", + choices=["en", "fr"], + help="languages") + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_common_voice(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_common_voice_cat.py b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_cat.py new file mode 100755 index 00000000..0790be25 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_common_voice_cat.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python +# prepare_common_voice.py --corpus-dir /export/corpora6/LRE/CommonVoice2020 --output-dir data/cv --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd +from hyperion.hyp_defs import config_logger + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + "xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-ine", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf" +} + + +def map_to_lre(langs): + return [ + lre_map[l] if l in lre_map else "{}-{}".format(l, l) for l in langs + ] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + list_dir = output_dir / "lists_cat" + list_dir.mkdir(parents=True, exist_ok=True) + for r in range(len(df)): + file_list = df.iloc[r].file_lists + with open(list_dir / f"{df.iloc[r].id}.txt", "w") as f: + for fn in file_list: + f.write("file %s\n" % fn) + + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = list_dir / f"{segment_id}.txt" + if target_fs != 16000: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -ar {target_fs} -f wav - |" + else: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -f wav - |" + + 
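+            # each list file referenced here follows ffmpeg's concat demuxer syntax, + # one "file <path>" entry per clip, e.g. (hypothetical paths): + #   file clips/common_voice_en_000001.mp3 + #   file clips/common_voice_en_000002.mp3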
f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_common_voice(corpus_dir, output_dir, keep_langs, + map_langs_to_lre_codes, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "**/clips/*.mp3"), recursive=True) + langs = [(Path(f).parent.parent.stem).lower() for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + df = df[df["language"].isin(keep_langs)] + # if remove_langs is not None: + # for lang in remove_langs: + # df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = "cv" + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + file_lists = [] + file_list = [] + seg_count = 0 + prev_lang = "" + cat_segs = [] + cur_seg = 0 + for r in range(len(df)): + row = df.iloc[r] + if seg_count == 5 or (row.language != prev_lang and seg_count > 0): + file_lists.append(file_list) + cat_segs.append(cur_seg) + file_list = [] + seg_count = 0 + cur_seg = r + + file_list.append(row.filename) + seg_count += 1 + prev_lang = row.language + + if file_list: + file_lists.append(file_list) + cat_segs.append(cur_seg) + + df_cat = df.iloc[cat_segs].drop(["filename"], axis=1) + df_cat["file_lists"] = file_lists + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df_cat.drop(["file_lists"], axis=1).to_csv(output_file, + sep=",", + index=False) + + make_kaldi(df_cat, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares Common Voice for training") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--keep-langs", + default=["tir-tir"], + nargs="+", + help="languages to keep") + + # parser.add_argument("--remove-langs", + # default=None, + # nargs="+", + # help="languages to remove") + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_common_voice(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_lre17.py b/egs/lre22/fixed.v1.8k/local/prepare_lre17.py new file mode 100755 index 00000000..18eaa1d2 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_lre17.py @@ -0,0 +1,140 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, 
namespace_to_dict +import logging +from pathlib import Path +import re +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + source = row["source"] + if source == "cts": + wav = f"sph2pipe -f wav -p -c 1 {filename} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + else: + wav = f"sox {filename} -t wav -r {target_fs} - |" + + f.write(f"{segment_id} {wav}\n") + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + "duration", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + "utt2speech_dur", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_lre17(corpus_dir, subset, source, output_dir, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s - %s -> %s", corpus_dir, subset, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / subset + if subset == "eval": + table_info = corpus_dir / "docs" / "lre17_eval_segment_keys.tsv" + else: + table_info = corpus_dir / "docs" / f"{subset}_info.tab" + df = pd.read_csv(table_info, sep="\t") + df.rename( + columns={ + "language_code": "language", + "segmentid": "id", + "file_duration": "duration", + }, + inplace=True, + ) + + if subset == "eval": + df["data_source"] = df["data_source"].str.lower() + df["sample_coding"] = df["data_source"].apply( + lambda x: "mulaw" if x == "mls14" else "pcm" + ) + df.loc[df["speech_duration"].isnull(), "speech_duration"] = 1000 + df["length_condition"] = df.pop("speech_duration").astype("int32") + + if subset in ["dev", "eval"]: + # drop the 3 and 10 sec files since they are contained in the 30 sec files + df = df[df["length_condition"] > 10] + if source != "all": + df = df[df["data_source"] == source] + + # move segment column to first position + first_col = df.pop("id") + df.insert(0, "id", first_col) + + # sort by segment id + df.sort_values(by="id", inplace=True) + + if subset == "train": + df["filename"] = df.apply(lambda x: wav_dir / x.language / x.id, axis=1) + else: + df["filename"] = df.apply(lambda x: wav_dir / x.id, axis=1) + df["source"] = df["id"].apply(lambda x: "cts" if re.match(r".*\.sph", x) else "afv") + df["corpus_id"] = "lre17" + df["sample_rate"] = target_fs + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares LDC2022E16/17 LRE17 for training") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument( + "--subset", + required=True, + help="train/dev/eval", + choices=["train", "dev", "eval"], + ) + parser.add_argument( + "--source", + default="all", + help="all/mls14/vast", + choices=["all", "mls14", "vast"], + ) + + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--target-fs", default=8000, 
type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_lre17(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_lre22_dev.py b/egs/lre22/fixed.v1.8k/local/prepare_lre22_dev.py new file mode 100755 index 00000000..825f9b67 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_lre22_dev.py @@ -0,0 +1,108 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + source = row["source"] + if source == "cts": + wav = f"sph2pipe -f wav -p -c 1 {filename} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + else: + wav = f"sox {filename} -t wav -r {target_fs} - |" + + f.write(f"{segment_id} {wav}\n") + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + "duration", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + "utt2speech_dur", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_lre22(corpus_dir, output_dir, target_fs, verbose): + config_logger(verbose) + subset = "dev" + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / subset + table_info = corpus_dir / "metadata" / "lre22_dev_metadata.tsv" + df = pd.read_csv(table_info, sep="\t") + df.rename( + columns={ + "language_code": "language", + "file_name": "id", + "duration_sec": "duration", + }, + inplace=True, + ) + + # move segment column to first positon + first_col = df.pop("id") + df.insert(0, "id", first_col) + + # sort by segment id + df.sort_values(by="id", inplace=True) + + df["filename"] = df.apply(lambda x: wav_dir / f"{x.id}.sph", axis=1) + df["source_coding"] = "alaw" + df["source"] = "cts" + df["corpus_id"] = "lre22" + df["sample_rate"] = target_fs + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares LDC2022E14 LRE22") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_lre22(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_lre22_eval.py b/egs/lre22/fixed.v1.8k/local/prepare_lre22_eval.py new file mode 100755 index 
00000000..39aa06de --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_lre22_eval.py @@ -0,0 +1,98 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + source = row["source"] + if source == "cts": + wav = f"sph2pipe -f wav -p -c 1 {filename} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + else: + wav = f"sox {filename} -t wav -r {target_fs} - |" + + f.write(f"{segment_id} {wav}\n") + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + ] + files = [ + "utt2spk", + "spk2utt", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_lre22(corpus_dir, output_dir, target_fs, verbose): + config_logger(verbose) + subset = "eval" + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / subset + table_info = corpus_dir / "docs" / "lre22_eval_trials.tsv" + df = pd.read_csv(table_info, sep="\t") + df.rename( + columns={ + "segmentid": "id", + }, + inplace=True, + ) + + # sort by segment id + df.sort_values(by="id", inplace=True) + + df["filename"] = df.apply(lambda x: wav_dir / f"{x.id}.sph", axis=1) + df["source_coding"] = "alaw" + df["source"] = "cts" + df["corpus_id"] = "lre22" + df["sample_rate"] = target_fs + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares LRE22 eval data") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_lre22(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre.py b/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre.py new file mode 100755 index 00000000..d3eb68f1 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# prepare_data.py --corpus-dir /export/corpora6/LRE/FLEURS2022 --output-dir data/fleurs --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/Lwazi2009 --output-dir data/lwazi --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/NCHLT2014 --output-dir data/nchlt --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir 
/export/corpora6/LRE/AMMI2020 --output-dir data/ammi --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + "xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-iaf", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf" +} + + +def map_to_lre(langs): + return [lre_map[l] if l in lre_map else "{}-{}".format(l,l) for l in langs] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = "sox {} -t wav -r {} - |".format(filename, target_fs) + else: + wav = filename + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_data( + corpus_dir, output_dir, remove_langs, map_langs_to_lre_codes, target_fs, verbose +): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "*/*/*/*/*.wav")) + langs = [(Path(f).parent.parent.parent.parent.stem).lower() for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = corpus_dir.stem + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares NCHLT, FLEURS, Lwazi, and AMMI corpus for training") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--remove-langs", default=None, nargs="+", help="languages to remove" + ) + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + 
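+    # namespace_to_dict converts the parsed jsonargparse namespace into a plain + # dict, so each CLI flag is passed to prepare_data as a keyword argument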
prepare_data(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre_cat.py b/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre_cat.py new file mode 100755 index 00000000..df62f18a --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_some_data_for_lre_cat.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +# prepare_data.py --corpus-dir /export/corpora6/LRE/FLEURS2022 --output-dir data/fleurs --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/Lwazi2009 --output-dir data/lwazi --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/NCHLT2014 --output-dir data/nchlt --map-langs-to-lre-codes --target-fs 8000 +# prepare_data.py --corpus-dir /export/corpora6/LRE/AMMI2020 --output-dir data/ammi --map-langs-to-lre-codes --target-fs 8000 +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + +lre_map = { + "afr": "afr-afr", + "ndb": "nbl-nbl", + "oro": "orm-orm", + "tso": "tso-tso", + "ven": "ven-ven", + "xho": "xho-xho", + "zul": "zul-zul", + "tig": "tir-tir", + "sae": "eng-ens", + "ine": "eng-iaf", + "tun": "ara-aeb", + "alg": "ara-arq", + "lib": "ara-ayl", + "naf": "fra-ntf" +} + +buggy_files = { + "200630-192328_tir_c85_elicit_7", + "200701-120846_tir_c85_elicit_35", + "200701-133352_tir_c85_elicit_57", + "200701-134903_tir_c85_elicit_19", + "200701-134903_tir_c85_elicit_32", + "200701-234652_tir_c85_elicit_78", + "200702-083859_tir_c85_elicit_18", + "200702-125252_tir_c85_elicit_46", + "200702-161120_tir_c85_elicit_4", + "200702-161120_tir_c85_elicit_7", + "200702-172026_tir_c85_elicit_31", + "200702-182933_tir_c85_elicit_133", + "200702-182933_tir_c85_elicit_88", + "200702-193310_tir_c85_elicit_2", + "200702-194850_tir_c85_elicit_88", + "200702-200911_tir_c85_elicit_171", +} + + +def map_to_lre(langs): + return [ + lre_map[l] if l in lre_map else "{}-{}".format(l, l) for l in langs + ] + + +def make_kaldi(df, wav_dir, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + list_dir = output_dir / "lists_cat" + list_dir.mkdir(parents=True, exist_ok=True) + for r in range(len(df)): + file_list = df.iloc[r].file_lists + with open(list_dir / f"{df.iloc[r].id}.txt", "w") as f: + for fn in file_list: + f.write("file %s\n" % fn) + + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = list_dir / f"{segment_id}.txt" + if target_fs != 16000: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -ar {target_fs} -f wav - |" + else: + wav = f"ffmpeg -f concat -safe 0 -i {filename} -acodec pcm_s16le -f wav - |" + + f.write("{} {}\n".format(segment_id, wav)) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv(output_file, + sep=" ", + columns=["id", c], + header=False, + index=False) + + +def prepare_data(corpus_dir, output_dir, remove_langs, map_langs_to_lre_codes, + target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", 
corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir + files = glob.glob(str(corpus_dir / "**/*.wav"), recursive=True) + langs = [(Path(f).parent.parent.parent.parent.stem).lower() for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = ["{}-{}".format(l, Path(f).stem) for f, l in zip(files, langs)] + val = np.array([Path(f).stem not in buggy_files for f in files]) + non_val = np.any(~val) + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + if non_val: + df = df.loc[val] + logging.info("removed buggy files, kept %d / %d segments", len(df), + len(val)) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + # sort by segment id + df.sort_values(by=["language", "id"], inplace=True) + + file_lists = [] + file_list = [] + seg_count = 0 + prev_lang = "" + cat_segs = [] + cur_seg = 0 + for r in range(len(df)): + row = df.iloc[r] + if seg_count == 5 or (row.language != prev_lang and seg_count > 0): + file_lists.append(file_list) + cat_segs.append(cur_seg) + file_list = [] + seg_count = 0 + cur_seg = r + + file_list.append(row.filename) + seg_count += 1 + prev_lang = row.language + + if file_list: + file_lists.append(file_list) + cat_segs.append(cur_seg) + + df_cat = df.iloc[cat_segs].drop(["filename"], axis=1) + df_cat["file_lists"] = file_lists + + # sort by segment id + df_cat.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df_cat.drop(["file_lists"], axis=1).to_csv(output_file, + sep=",", + index=False) + + make_kaldi(df_cat, wav_dir, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description= + "Prepares NCHLT, FLEURS, Lwazi, and AMMI corpus for training") + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument("--remove-langs", + default=None, + nargs="+", + help="languages to remove") + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument("--target-fs", + default=8000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_data(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_sre21av_dev_audio.py b/egs/lre22/fixed.v1.8k/local/prepare_sre21av_dev_audio.py new file mode 100755 index 00000000..bc2c3001 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_sre21av_dev_audio.py @@ -0,0 +1,215 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + +from enum import Enum + + +class LangTrialCond(Enum): + ENG_ENG = 1 + ENG_CMN = 2 + ENG_YUE = 3 + CMN_CMN = 4 + CMN_YUE = 5 + YUE_YUE = 6 + OTHER_OTHER = 7 + OTHER_ENG = 8 + OTHER_CMN = 9 + OTHER_YUE = 10 + + @staticmethod + def is_eng(val): + if val in ("ENG", "USE"): + return True + return False + + @staticmethod + def 
get_side_cond(val): + if val == "ENG" or val == "USE": + return "ENG" + if "YUE" in val: + return "YUE" + if "CMN" in val: + return "CMN" + + return "OTHER" + + @staticmethod + def get_trial_cond(enr, test): + enr = LangTrialCond.get_side_cond(enr) + test = LangTrialCond.get_side_cond(test) + trial = enr + "_" + test + try: + return LangTrialCond[trial] + except KeyError: + # the enum names unordered pairs, so retry with the sides swapped + trial = test + "_" + enr + return LangTrialCond[trial] + + +class SourceTrialCond(Enum): + CTS_CTS = 1 + CTS_AFV = 2 + AFV_AFV = 3 + + @staticmethod + def get_trial_cond(enr, test): + trial = enr.upper() + "_" + test.upper() + try: + return SourceTrialCond[trial] + except KeyError: + trial = test.upper() + "_" + enr.upper() + return SourceTrialCond[trial] + + +def write_wav(df, target_fs, wav_dir, output_file): + with open(output_file, "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + ext = segment_id.split(".")[-1] + if ext == "flac": + if target_fs == 16000: + wav = f"{wav_dir}/{segment_id}" + else: + wav = f"sox {wav_dir}/{segment_id} -t wav -r {target_fs} - |" + elif ext == "mp4": + wav = f"ffmpeg -v 8 -i {wav_dir}/{segment_id} -vn -ar {target_fs} -ac 1 -f wav - |" + else: + wav = f"sph2pipe -f wav -p -c 1 {wav_dir}/{segment_id} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + f.write(f"{segment_id} {wav}\n") + + +def make_enroll_dir(df_segms, wav_dir, target_fs, source, output_path): + # fix source: flac segments are audio from video (afv) + df_segms.loc[df_segms["id"].str.match(r".*\.flac$"), "source_type"] = "afv" + enroll_dir = Path(output_path + f"_enroll_{source}") + wav_dir = wav_dir / "enrollment" + logging.info("making enrollment dir %s", enroll_dir) + enroll_dir.mkdir(parents=True, exist_ok=True) + df_segms = (df_segms[(df_segms["partition"] == "enrollment") + & (df_segms["source_type"] == source) & + (df_segms["language"] != "other")].drop( + ["partition"], axis=1).sort_values(by="id")) + segment_file = enroll_dir / "segments.csv" + df_segms.to_csv(segment_file, sep=",", index=False) + + with open(enroll_dir / "utt2spk", "w") as f1, open(enroll_dir / "spk2utt", + "w") as f2: + for u in df_segms["id"]: + f1.write(f"{u} {u}\n") + f2.write(f"{u} {u}\n") + + with open(enroll_dir / "utt2lang", "w") as f: + for u, s in zip(df_segms["id"], df_segms["language"]): + f.write(f"{u} {s}\n") + + write_wav(df_segms, target_fs, wav_dir, enroll_dir / "wav.scp") + + +def make_test_dir(df_segms, wav_dir, target_fs, source, output_path): + if source == "na": + # fix source: mp4 segments are audio from video (afv) + df_segms.loc[df_segms["id"].str.match(r".*\.mp4$"), + "source_type"] = "afv" + source = "afv" + + test_dir = Path(output_path + f"_test_{source}") + wav_dir = wav_dir / "test" + logging.info("making test dir %s", test_dir) + test_dir.mkdir(parents=True, exist_ok=True) + df_segms = (df_segms[(df_segms["partition"] == "test") + & (df_segms["source_type"] == source) & + (df_segms["language"] != "other")].drop( + ["partition"], axis=1).sort_values(by="id")) + + segment_file = test_dir / "segments.csv" + df_segms.to_csv(segment_file, sep=",", index=False) + + with open(test_dir / "utt2spk", "w") as f1, open(test_dir / "spk2utt", + "w") as f2: + for u in df_segms["id"]: + f1.write(f"{u} {u}\n") + f2.write(f"{u} {u}\n") + + with open(test_dir / "utt2lang", "w") as f: + for u, s in zip(df_segms["id"], df_segms["language"]): + f.write(f"{u} {s}\n") + + with open(test_dir / "spk2gender", "w") as f: + for u, g in zip(df_segms["id"], df_segms["gender"]): + g = g[0] + f.write(f"{u} {g}\n") + + write_wav(df_segms, target_fs, wav_dir, test_dir / "wav.scp") + + +def 
prepare_sre21av_dev_audio(corpus_dir, output_path, av_output_path, + target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_path) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / "audio" + segments_file = corpus_dir / "docs" / "sre21_dev_segment_key.tsv" + df_segms = pd.read_csv(segments_file, sep="\t") + df_segms.rename( + columns={ + "segmentid": "id", + "subjectid": "speaker_id" + }, + inplace=True, + ) + df_segms.replace({"language": "english"}, {"language": "eng-zho"}, + inplace=True) + df_segms.replace({"language": "cantonese"}, {"language": "zho-yue"}, + inplace=True) + df_segms.replace({"language": "mandarin"}, {"language": "zho-cmn"}, + inplace=True) + + enroll_file = corpus_dir / "docs" / "sre21_audio_dev_enrollment.tsv" + + make_enroll_dir(df_segms, wav_dir, target_fs, "cts", output_path) + make_enroll_dir(df_segms, wav_dir, target_fs, "afv", output_path) + make_test_dir(df_segms, wav_dir, target_fs, "cts", output_path) + make_test_dir(df_segms, wav_dir, target_fs, "afv", output_path) + + wav_dir = corpus_dir / "data" / "video" + make_test_dir(df_segms, wav_dir, target_fs, "na", av_output_path) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares SRE21 dev audio part") + + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-path", + required=True, + help="Output data path prefix") + parser.add_argument( + "--av-output-path", + required=True, + help="Output data path prefix for audio visual", + ) + parser.add_argument("--target-fs", + default=16000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_sre21av_dev_audio(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_sre21av_eval_audio.py b/egs/lre22/fixed.v1.8k/local/prepare_sre21av_eval_audio.py new file mode 100755 index 00000000..301eebf7 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_sre21av_eval_audio.py @@ -0,0 +1,243 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + +from enum import Enum + + +class LangTrialCond(Enum): + ENG_ENG = 1 + ENG_CMN = 2 + ENG_YUE = 3 + CMN_CMN = 4 + CMN_YUE = 5 + YUE_YUE = 6 + OTHER_OTHER = 7 + OTHER_ENG = 8 + OTHER_CMN = 9 + OTHER_YUE = 10 + + @staticmethod + def is_eng(val): + if val in ("ENG", "USE"): + return True + return False + + @staticmethod + def get_side_cond(val): + if val == "ENG" or val == "USE": + return "ENG" + if "YUE" in val: + return "YUE" + if "CMN" in val: + return "CMN" + + return "OTHER" + + @staticmethod + def get_trial_cond(enr, test): + enr = LangTrialCond.get_side_cond(enr) + test = LangTrialCond.get_side_cond(test) + trial = enr + "_" + test + try: + return LangTrialCond[trial] + except KeyError: + trial = test + "_" + enr + return LangTrialCond[trial] + + +class SourceTrialCond(Enum): + CTS_CTS = 1 + CTS_AFV = 2 + AFV_AFV = 3 + + @staticmethod + def get_trial_cond(enr, test): + trial = enr.upper() + "_" + test.upper() + try: + return SourceTrialCond[trial] + except KeyError: + trial = test.upper() + "_" + enr.upper() + return 
SourceTrialCond[trial] + + +def write_wav(df, target_fs, wav_dir, output_file): + with open(output_file, "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + ext = segment_id.split(".")[-1] + if ext == "flac": + if target_fs == 16000: + wav = f"{wav_dir}/{segment_id}" + else: + wav = f"sox {wav_dir}/{segment_id} -t wav -r {target_fs} - |" + elif ext == "mp4": + wav = f"ffmpeg -v 8 -i {wav_dir}/{segment_id} -vn -ar {target_fs} -ac 1 -f wav - |" + else: + wav = f"sph2pipe -f wav -p -c 1 {wav_dir}/{segment_id} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + f.write(f"{segment_id} {wav}\n") + + +def make_enroll_dir(df_segms, wav_dir, target_fs, source, output_path): + + # fix source + df_segms.loc[df_segms["id"].str.match(r".*\.flac$"), "source_type"] = "afv" + enroll_dir = Path(output_path + f"_enroll_{source}") + wav_dir = wav_dir / "enrollment" + logging.info("making enrollment dir %s", enroll_dir) + enroll_dir.mkdir(parents=True, exist_ok=True) + df_segms = (df_segms[(df_segms["partition"] == "enrollment") + & (df_segms["source_type"] == source) & + (df_segms["language"] != "other")].drop( + ["partition"], axis=1).sort_values(by="id")) + segment_file = enroll_dir / "segments.csv" + df_segms.to_csv(segment_file, sep=",", index=False) + + with open(enroll_dir / "utt2spk", "w") as f1, open(enroll_dir / "spk2utt", + "w") as f2: + for u in df_segms["id"]: + f1.write(f"{u} {u}\n") + f2.write(f"{u} {u}\n") + + with open(enroll_dir / "utt2lang", "w") as f: + for u, s in zip(df_segms["id"], df_segms["language"]): + f.write(f"{u} {s}\n") + + write_wav(df_segms, target_fs, wav_dir, enroll_dir / "wav.scp") + + +def make_test_dir(df_segms, wav_dir, target_fs, source, output_path): + + if source == "na": + # fix source + df_segms.loc[df_segms["id"].str.match(r".*\.mp4$"), + "source_type"] = "afv" + source = "afv" + + test_dir = Path(output_path + f"_test_{source}") + wav_dir = wav_dir / "test" + logging.info("making test dir %s", test_dir) + test_dir.mkdir(parents=True, exist_ok=True) + df_segms = (df_segms[(df_segms["partition"] == "test") + & (df_segms["source_type"] == source) & + (df_segms["language"] != "other")].drop( + ["partition"], axis=1).sort_values(by="id")) + + segment_file = test_dir / "segments.csv" + df_segms.to_csv(segment_file, sep=",", index=False) + + with open(test_dir / "utt2spk", "w") as f1, open(test_dir / "spk2utt", + "w") as f2: + for u in df_segms["id"]: + f1.write(f"{u} {u}\n") + f2.write(f"{u} {u}\n") + + with open(test_dir / "utt2lang", "w") as f: + for u, s in zip(df_segms["id"], df_segms["language"]): + f.write(f"{u} {s}\n") + + with open(test_dir / "spk2gender", "w") as f: + for u, g in zip(df_segms["id"], df_segms["gender"]): + g = g[0] + f.write(f"{u} {g}\n") + + write_wav(df_segms, target_fs, wav_dir, test_dir / "wav.scp") + + +def prepare_sre21av_eval_audio(corpus_dir, output_path, av_output_path, + target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_path) + corpus_dir = Path(corpus_dir) + wav_dir = corpus_dir / "data" / "audio" + segments_file = corpus_dir / "docs" / "sre21_eval_segment_key.tsv" + df_segms = pd.read_csv(segments_file, sep="\t") + df_segms.rename( + columns={ + "segmentid": "id", + "subjectid": "speaker_id" + }, + inplace=True, + ) + df_segms.replace({"language": "english"}, {"language": "eng-zho"}, + inplace=True) + df_segms.replace({"language": "cantonese"}, {"language": "zho-yue"}, + inplace=True) + df_segms.replace({"language": 
"mandarin"}, {"language": "zho-cmn"}, + inplace=True) + + # enroll_file = corpus_dir / "docs" / "sre21_audio_eval_enrollment.tsv" + # df_enr = pd.read_csv(enroll_file, sep="\t") + # df_enr.rename( + # columns={ + # "segmentid": "id", + # "modelid": "model_id" + # }, + # inplace=True, + # ) + # key_file = corpus_dir / "docs" / "sre21_audio_eval_trial_key.tsv" + # df_key = pd.read_csv(key_file, sep="\t") + # df_key.rename( + # columns={ + # "segmentid": "id", + # "modelid": "model_id" + # }, + # inplace=True, + # ) + + make_enroll_dir(df_segms, wav_dir, target_fs, "cts", output_path) + make_enroll_dir(df_segms, wav_dir, target_fs, "afv", output_path) + make_test_dir(df_segms, wav_dir, target_fs, "cts", output_path) + make_test_dir(df_segms, wav_dir, target_fs, "afv", output_path) + + key_file = corpus_dir / "docs" / "sre21_audio-visual_eval_trial_key.tsv" + # df_key = pd.read_csv(key_file, sep="\t") + # df_key.rename( + # columns={ + # "segmentid": "id", + # "modelid": "model_id" + # }, + # inplace=True, + # ) + wav_dir = corpus_dir / "data" / "video" + make_test_dir(df_segms, wav_dir, target_fs, "na", av_output_path) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares SRE21 eval audio part") + + parser.add_argument("--corpus-dir", + required=True, + help="Path to the original dataset") + parser.add_argument("--output-path", + required=True, + help="Output data path prefix") + parser.add_argument( + "--av-output-path", + required=True, + help="Output data path prefix for audio visual", + ) + parser.add_argument("--target-fs", + default=16000, + type=int, + help="Target sampling frequency") + parser.add_argument("-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int) + args = parser.parse_args() + prepare_sre21av_eval_audio(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_sre_cts_superset.py b/egs/lre22/fixed.v1.8k/local/prepare_sre_cts_superset.py new file mode 100755 index 00000000..af299781 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_sre_cts_superset.py @@ -0,0 +1,185 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + +multigender_spks = [ + "111774", + "111781", + "112778", + "112783", + "112879", + "113153", + "113213", + "113603", + "128673", + "128770", +] + +lre_map = { + "USE": "eng-usg", + "AMH": "am-am", + "BEN": "bn-bn", + "FAR": "far-far", + "HIN": "hi-hi", + "INE": "eng-ine", + "ITA": "it-it", + "JPN": "ja-ja", + "KAT": "ka-ka", + "KHM": "km-km", + "KOR": "ko-ko", + "LAO": "lo-lo", + "PAN": "pa-pa", + "POL": "qsl-pol", + "RUS": "qsl-rus", + "TAM": "ta-ta", + "TGL": "tl-tl", + "THA": "th-th", + "TIR": "tir-tir", + "URD": "ur-ur", + "UZB": "uz-uz", + "VIE": "vi-vi", + "CMN": "zho-cmn", + "YUE": "zho-yue", + "WUU": "zho-wuu", + "NAN": "zho-nan", +} + + +def fix_multigender_spks(df): + + logging.info("Fixing multigender speakers") + n0 = len(df) + for spk in multigender_spks: + male_idx = (df["speaker_id"] == spk) & (df["gender"] == "male") + female_idx = (df["speaker_id"] == spk) & (df["gender"] == "female") + num_male = np.sum(male_idx) + num_female = np.sum(female_idx) + if num_male > num_female: + df = df[~female_idx] + else: + df = df[~male_idx] + + logging.info("Fixed multigender 
speakers, %d/%d segments remained", len(df), n0) + return df + + +def prepare_sre_cts_superset(corpus_dir, output_dir, target_fs, verbose): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + wav_dir = Path(corpus_dir) / "data" + table_file = Path(corpus_dir) / "docs/cts_superset_segment_key.tsv" + df = pd.read_csv(table_file, sep="\t") + df.drop(["segmentid", "speakerid"], axis=1, inplace=True) + df.rename( + columns={ + "subjectid": "speaker_id", + "sessionid": "session_id", + "corpusid": "corpus_id", + "phoneid": "phone_id", + }, + inplace=True, + ) + df["speaker_id"] = df["speaker_id"].astype("str") + df = fix_multigender_spks(df) + + logging.info("remove generic ENG or mixed langs") + n0 = len(df) + df = df[df["language"] != "ENG"] + df = df[df["language"] != "SPA"] + df = df[df["language"] != "UND"] + df = df[~df["language"].str.contains(r"\.")] + logging.info("%d out of %d segments remained", len(df), n0) + logging.info("renaming languages like LRE") + for k, v in lre_map.items(): + idx = df["language"] == k + df.loc[idx, "language"] = v + + df["id"] = df["filename"].str.replace("/", "-") + # put segment_id as first column + cols = df.columns.tolist() + cols = cols[-1:] + cols[:-1] + df = df[cols] + logging.info("sorting by segment_id") + df.sort_values(by="id", inplace=True) + + logging.info("saving segments.csv") + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / "segments.csv" + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "speaker_id", + "speech_duration", + "session_id", + "corpus_id", + "phone_id", + "language", + ] + files = [ + "utt2spk", + "utt2speech_dur", + "utt2session", + "utt2corpus", + "utt2phone", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + df.to_csv(output_file, sep=" ", columns=["id", c], header=False, index=False) + + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + wav = f"sph2pipe -f wav -p -c 1 {wav_dir}/{filename} |" + if target_fs != 8000: + wav = f"{wav} sox -t wav - -t wav -r {target_fs} - |" + f.write(f"{segment_id} {wav}\n") + + # speaker table + logging.info("saving speaker files") + spk_df = df[["speaker_id", "gender"]].drop_duplicates() + output_file = output_dir / "speaker.csv" + spk_df.to_csv(output_file, sep=",", index=False) + gender = df["gender"].str.replace("female", "f").str.replace("male", "m") + spk_df["gender"] = gender + output_file = output_dir / "spk2gender" + spk_df.to_csv(output_file, sep=" ", header=False, index=False) + + with open(output_dir / "spk2utt", "w") as f: + for spk in df["speaker_id"].unique(): + segment_ids = " ".join(df[df["speaker_id"] == spk].id.values) + f.write(f"{spk} {segment_ids}\n") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares SRE superset LDC2021E08") + + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="Output data path") + parser.add_argument( + "--target-fs", default=8000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + 
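+    # example invocation (hypothetical paths): + #   prepare_sre_cts_superset.py --corpus-dir /export/corpora/LDC2021E08 \ + #     --output-dir data/sre_cts_superset --target-fs 8000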
prepare_sre_cts_superset(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/prepare_voxlingua107.py b/egs/lre22/fixed.v1.8k/local/prepare_voxlingua107.py new file mode 100755 index 00000000..c4dc3894 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/prepare_voxlingua107.py @@ -0,0 +1,130 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict, ActionYesNo +import logging +from pathlib import Path +import glob +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger + + +lre_map = { + "pl": "qsl-pol", + "ru": "qsl-rus", + "cs": "qsl-cze", + "uk": "qsl-ukr", + "hr": "qsl-cro", + "bg": "qsl-bul", + "be": "qsl-bel", + "sk": "qsl-slk", + "sl": "qsl-slv", + "bs": "qsl-bos", + "sr": "qsl-ser", + "zh": "zho-cmn", + "fr": "fra-mix", + "af": "afr-afr", +} + + +def map_to_lre(langs): + return [lre_map[l] if l in lre_map else f"{l}-{l}" for l in langs] + + +def make_kaldi(df, output_dir, target_fs): + # make wav.scp + logging.info("making wav.scp") + with open(output_dir / "wav.scp", "w") as f: + for _, row in df.iterrows(): + segment_id = row["id"] + filename = row["filename"] + if target_fs != 16000: + wav = f"sox {filename} -t wav -r {target_fs} - |" + else: + wav = filename + + f.write(f"{segment_id} {wav}\n") + + # Kaldi data directory files + # utt2xxx files + logging.info("saving Kaldi utt2xxx files") + columns = [ + "id", + "id", + "language", + ] + files = [ + "utt2spk", + "spk2utt", + "utt2lang", + ] + for c, f in zip(columns, files): + output_file = output_dir / f + if c in df: + df.to_csv( + output_file, sep=" ", columns=["id", c], header=False, index=False + ) + + +def prepare_voxlingua107( + corpus_dir, output_dir, remove_langs, map_langs_to_lre_codes, target_fs, verbose +): + config_logger(verbose) + logging.info("Preparing corpus %s -> %s", corpus_dir, output_dir) + corpus_dir = Path(corpus_dir) + files = glob.glob(str(corpus_dir / "*/*.wav")) + langs = [Path(f).parent.stem for f in files] + if map_langs_to_lre_codes: + langs = map_to_lre(langs) + ids = [f"{l}-{Path(f).stem}" for f, l in zip(files, langs)] + df = pd.DataFrame({"id": ids, "language": langs, "filename": files}) + if remove_langs is not None: + for lang in remove_langs: + df = df[df["language"] != lang] + + df["sample_coding"] = "pcm" + df["source"] = "afv" + df["corpus_id"] = "voxlingua107" + df["sample_rate"] = target_fs + + # sort by segment id + df.sort_values(by="id", inplace=True) + + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + output_file = output_dir / "segments.csv" + logging.info("saving %s", output_file) + df.drop(["filename"], axis=1).to_csv(output_file, sep=",", index=False) + + make_kaldi(df, output_dir, target_fs) + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Prepares Voxlingua107 for training") + parser.add_argument( + "--corpus-dir", required=True, help="Path to the original dataset" + ) + parser.add_argument("--output-dir", required=True, help="data path") + parser.add_argument( + "--remove-langs", default=None, nargs="+", help="languages to remove" + ) + parser.add_argument( + "--map-langs-to-lre-codes", + default=False, + action=ActionYesNo, + help="use LRE17 language codes", + ) + + parser.add_argument( + "--target-fs", default=16000, type=int, help="Target sampling frequency" + ) + parser.add_argument( + "-v", "--verbose", 
dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + prepare_voxlingua107(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/score_lre22.sh b/egs/lre22/fixed.v1.8k/local/score_lre22.sh new file mode 100755 index 00000000..e6564da4 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/score_lre22.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +. path.sh + +if [ $# -ne 3 ];then + echo "Usage: $0 " + exit 1 +fi + +dev_eval=$1 +score_file=$(readlink -f $2) +output_file=$(readlink -f $3) +echo $dev_eval $score_file $output_file +output_dir=$(dirname $output_file) +mkdir -p $output_dir + +conda activate $HYP_ENV + +cd ./lre-scorer +echo "Scoring $score_file -> $output_file" +if [ "$dev_eval" == "dev" ];then + config=config.ini +else + config=config_eval.ini +fi + +python ./scoreit.py -s $score_file -o $output_file -e $config + +cd - diff --git a/egs/lre22/fixed.v1.8k/local/split_dev.py b/egs/lre22/fixed.v1.8k/local/split_dev.py new file mode 100755 index 00000000..5988e245 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/split_dev.py @@ -0,0 +1,80 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from pathlib import Path +from jsonargparse import ArgumentParser, namespace_to_dict +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet + + +def split_dev(segs_file, output_dir, num_folds, verbose): + config_logger(verbose) + segs = SegmentSet.load(segs_file) + assert "subclass_idx" in segs + class_ids = segs["class_id"] + _, class_idx = np.unique(class_ids, return_inverse=True) + logging.info("splitting segments into %d folds", num_folds) + folds = [[] for i in range(num_folds)] + for c in range(np.max(class_idx) + 1): + c_idx = class_idx == c + subclass_idx = segs.loc[c_idx, "subclass_idx"] + num_c = len(subclass_idx) + num_c_pf = num_c / num_folds + _, counts = np.unique(subclass_idx, return_counts=True) + acc_counts = np.cumsum(counts) + logging.info( + f"class {c} subclass-counts={counts}, subclass-acc-counts={acc_counts}" + ) + c_idx = np.nonzero(c_idx)[0] + first = 0 + for f in range(num_folds): + if f < num_folds - 1: + last = np.argmin(np.abs(acc_counts - (f + 1) * num_c_pf)) + else: + last = np.max(subclass_idx) + f_idx = np.logical_and(subclass_idx >= first, subclass_idx <= last) + folds[f].extend(c_idx[f_idx]) + logging.info( + ( + f"class {c} fold {f} add {np.sum(f_idx)} samples," + f"accum {len(folds[f])} samples, " + f"first-subclass={first}, last-subclass={last}" + ) + ) + first = last + 1 + + output_dir = Path(output_dir) + for f in range(num_folds): + logging.info( + "fold %d, train-samples=%d test-samples=%d", + f, + len(segs) - len(folds[f]), + len(folds[f]), + ) + f_dir = output_dir / f"fold_{f}" + f_dir.mkdir(parents=True, exist_ok=True) + mask = np.zeros((len(segs),), dtype=bool) + mask[folds[f]] = True + segs_test = SegmentSet(segs.loc[mask]) + segs_test.save(f_dir / "test_segments.csv") + segs_train = SegmentSet(segs.loc[~mask]) + segs_train.save(f_dir / "train_segments.csv") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Splits LRE22 into folds") + parser.add_argument( + "--segs-file", required=True, help="Segments file with subclass_idx column", + ) + parser.add_argument("--output-dir", required=True, help="output path") + parser.add_argument("--num-folds", default=2, type=int, help="number of folds") + parser.add_argument("-v", 
"--verbose", default=1, choices=[0, 1, 2, 3], type=int) + + args = parser.parse_args() + split_dev(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/split_segments_train_val.py b/egs/lre22/fixed.v1.8k/local/split_segments_train_val.py new file mode 100755 index 00000000..922c868c --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/split_segments_train_val.py @@ -0,0 +1,160 @@ +#!/bin/env python +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from jsonargparse import ArgumentParser, namespace_to_dict +import logging +from pathlib import Path +import re +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import RecordingSet, FeatureSet, SegmentSet, ClassInfo + + +def split_train_val( + segments_file, + recordings_file, + feats_file, + durations_file, + ara_ary_seg_file, + in_class_name, + out_class_name, + val_percent, + remove_langs, + seed, + output_dir, + verbose, +): + + config_logger(verbose) + rng = np.random.RandomState(seed=seed) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + segs = SegmentSet.load(segments_file) + if durations_file is not None: + durs = SegmentSet.load(durations_file) + if "duration" in durs: + segs["duration"] = durs.loc[segs["id"], "duration"] + else: + segs["duration"] = durs.loc[segs["id"], "class_id"].astype(float) + + if remove_langs is not None: + for lang in remove_langs: + segs = segs[segs[in_class_name] != lang] + + segs = SegmentSet(segs) + + if ara_ary_seg_file is not None: + segs_ary = SegmentSet.load(ara_ary_seg_file) + segs.loc[segs_ary["id"], in_class_name] = segs_ary["class_id"] + n1 = len(segs) + noary_idx = segs[in_class_name] != "ara-ary" + segs = SegmentSet(segs.loc[noary_idx]) + logging.info("removing ara-ary segments remained %d / %d", len(segs), n1) + + logging.info("creating class_info file") + class_ids = segs[in_class_name].drop_duplicates().sort_values() + class_info = ClassInfo(pd.DataFrame({"id": class_ids})) + class_info.save(output_dir / "class_file.csv") + + logging.info("splitting segments into train and val") + segs.set_index(in_class_name) + val_mask = np.zeros((len(segs),), dtype=bool) + for c in class_info["id"]: + seg_idx_c = segs.get_loc(c) + num_val = int(val_percent * len(seg_idx_c) / 100) + val_idx = rng.choice(seg_idx_c, size=num_val, replace=False) + val_mask[val_idx] = True + logging.info( + "class %s total=%d train=%d val=%d", + c, + len(seg_idx_c), + len(seg_idx_c) - num_val, + num_val, + ) + + segs.reset_index() + if out_class_name is not None: + segs.rename(columns={in_class_name: out_class_name}, inplace=True) + + train_segs = SegmentSet(segs.loc[~val_mask]) + train_segs.save(output_dir / "train_segments.csv") + val_segs = SegmentSet(segs.loc[val_mask]) + val_segs.save(output_dir / "val_segments.csv") + + if recordings_file is not None: + logging.info("splitting recordings into train and val") + recs = RecordingSet.load(recordings_file) + train_recs = RecordingSet(recs.loc[train_segs.recording_ids(train_segs["id"])]) + train_recs.save(output_dir / "train_recordings.csv") + val_recs = RecordingSet(recs.loc[val_segs.recording_ids(val_segs["id"])]) + val_recs.save(output_dir / "val_recordings.csv") + + if feats_file is not None: + logging.info("splitting features into train and val") + feats = FeatureSet.load(feats_file) + train_feats = FeatureSet(feats.loc[train_segs["id"]]) + train_feats.save(output_dir / "train_feats.csv") + 
val_feats = FeatureSet(feats.loc[val_segs["id"]]) + val_feats.save(output_dir / "val_feats.csv") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Split Segment list into training and validation" + ) + parser.add_argument( + "--segments-file", required=True, help="path to segments file", + ) + parser.add_argument( + "--recordings-file", + default=None, + help="if not None, splits recordings file into train and val", + ) + + parser.add_argument( + "--durations-file", + default=None, + help="if not None, add durations to segments file", + ) + + parser.add_argument( + "--feats-file", + default=None, + help="if not None, splits features file into train and val", + ) + parser.add_argument( + "--ara-ary-seg-file", + default=None, + help="segment-file with labels for Maghrebi Arabic", + ) + + parser.add_argument( + "--in-class-name", + default="class_id", + help="column name containing the class_id that we consider to make the partition", + ) + parser.add_argument( + "--out-class-name", + default=None, + help="if not None, we rename the class_id column in the output file", + ) + parser.add_argument( + "--val-percent", default=5.0, type=float, help="percentage of data used for val" + ) + parser.add_argument( + "--remove-langs", default=None, nargs="+", help="remove languages from training" + ) + parser.add_argument("--seed", default=1123, type=int, help="random seed") + + parser.add_argument("--output-dir", required=True, help="output directory") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + split_train_val(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/local/train_calibration_lre22.sh b/egs/lre22/fixed.v1.8k/local/train_calibration_lre22.sh new file mode 100755 index 00000000..227331b3 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/train_calibration_lre22.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +. path.sh + +if [ $# -ne 1 ];then + echo "Usage: $0 <score-dir>" + exit 1 +fi + +score_dir=$1 +nocal_dir=$score_dir/nocal +cal_dir=$score_dir/cal_v1 + +train_list=data/lre22_dev/utt2lang +train_file=$nocal_dir/lre22_dev_scores.tsv +train_cal_file=$cal_dir/lre22_dev_scores.tsv +eval_file=$nocal_dir/lre22_eval_scores.tsv +eval_cal_file=$cal_dir/lre22_eval_scores.tsv +mkdir -p $cal_dir +model_file=$cal_dir/cal.mat + +if [ "$(hostname --domain)" == "cm.gemini" ];then + module load matlab +fi + +echo " +addpath('steps_be'); +addpath(genpath('$PWD/focal_multiclass/v1.0')); +train_fusion('$train_list', {'$train_file'}, '$model_file'); +" | matlab -nodisplay -nosplash > $cal_dir/train.log + +echo " +addpath('./steps_be'); +addpath(genpath('$PWD/focal_multiclass/v1.0')); +eval_fusion({'$train_file'}, '$train_cal_file', '$model_file'); +" | matlab -nodisplay -nosplash > $cal_dir/eval_lre22_dev.log + +if [ -f $eval_file ];then + echo " +addpath('./steps_be'); +addpath(genpath('$PWD/focal_multiclass/v1.0')); +eval_fusion({'$eval_file'}, '$eval_cal_file', '$model_file'); +" | matlab -nodisplay -nosplash > $cal_dir/eval_lre22_eval.log +fi + + diff --git a/egs/lre22/fixed.v1.8k/local/train_fusion_lre22.sh b/egs/lre22/fixed.v1.8k/local/train_fusion_lre22.sh new file mode 100755 index 00000000..add44362 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/train_fusion_lre22.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +. 
path.sh + +if [ $# -ne 2 ];then + echo "Usage: $0 <score-dirs> <output-dir>" + exit 1 +fi + +score_dirs="$1" +output_dir=$2 + +train_list=data/lre22_dev/utt2lang +train_base=lre22_dev_scores.tsv +train_files=$(echo $score_dirs | awk 'BEGIN{OFS=","}{ for(i=1;i<=NF;i++){ $i="'\''"$i"/'$train_base\''" }; print $0}') + +train_fus_file=$output_dir/$train_base +mkdir -p $output_dir +model_file=$output_dir/fus.mat + +if [ "$(hostname --domain)" == "cm.gemini" ];then + module load matlab +fi + +echo " +addpath('steps_be'); +addpath(genpath('$PWD/focal_multiclass/v1.0')); +train_fusion('$train_list', {$train_files}, '$model_file'); +" | matlab -nodisplay -nosplash > $output_dir/train.log + +echo " +addpath('./steps_be'); +addpath(genpath('$PWD/focal_multiclass/v1.0')); +eval_fusion({$train_files}, '$train_fus_file', '$model_file'); +" | matlab -nodisplay -nosplash > $output_dir/eval_lre22_dev.log + diff --git a/egs/lre22/fixed.v1.8k/local/validate_lre22.sh b/egs/lre22/fixed.v1.8k/local/validate_lre22.sh new file mode 100755 index 00000000..fe039a5a --- /dev/null +++ b/egs/lre22/fixed.v1.8k/local/validate_lre22.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +. path.sh + +if [ $# -ne 1 ];then + echo "Usage: $0 <score-file>" + exit 1 +fi + +score_file=$(readlink -f $1) +conda activate $HYP_ENV + +cd ./lre-scorer +echo "Scoring $score_file -> $score_file.val" +python ./scoreit.py -s $score_file -o $score_file.val -v + +cd - diff --git a/egs/lre22/fixed.v1.8k/path.sh b/egs/lre22/fixed.v1.8k/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/lre22/fixed.v1.8k/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/test_segments.csv b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/test_segments.csv new file mode 100644 index 00000000..6518f24e --- /dev/null +++ b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/test_segments.csv @@ -0,0 +1,2114 @@ +id,class_id,subclass_idx +lre22_dev_aadaq,afr-afr,5 +lre22_dev_aaxdt,xho-xho,14 +lre22_dev_abujj,xho-xho,15 +lre22_dev_acgiu,zul-zul,6 +lre22_dev_acnyv,ven-ven,7 +lre22_dev_adbku,ara-ayl,4 +lre22_dev_ademr,orm-orm,3 +lre22_dev_adgoy,xho-xho,4 +lre22_dev_adnpi,eng-ens,1 +lre22_dev_adqaa,ven-ven,10 +lre22_dev_adwzf,zul-zul,2 +lre22_dev_aeiuj,afr-afr,4 +lre22_dev_afhui,eng-ens,4 +lre22_dev_afuav,nbl-nbl,15 +lre22_dev_afvvg,ven-ven,10 +lre22_dev_afxjf,eng-iaf,10 +lre22_dev_agmwb,ara-aeb,10 +lre22_dev_agnik,eng-ens,3 +lre22_dev_ahcja,orm-orm,14 +lre22_dev_ahobp,afr-afr,13 +lre22_dev_ahupk,eng-ens,11 +lre22_dev_aicjg,xho-xho,12 +lre22_dev_aikrz,eng-ens,9 +lre22_dev_ailwo,orm-orm,7 +lre22_dev_aiqhl,tir-tir,10 +lre22_dev_aiuwf,ara-ayl,5 +lre22_dev_aizyr,ara-arq,0 +lre22_dev_ajbui,zul-zul,12 +lre22_dev_ajigk,ara-aeb,10 +lre22_dev_ajuwq,ara-ayl,3 +lre22_dev_akbly,nbl-nbl,3 +lre22_dev_akhwr,xho-xho,6 +lre22_dev_aksxd,nbl-nbl,6 +lre22_dev_aktcg,afr-afr,1 +lre22_dev_aktzw,eng-ens,11 +lre22_dev_akulq,orm-orm,14 +lre22_dev_alcie,orm-orm,11 +lre22_dev_alunz,xho-xho,6 +lre22_dev_amaec,tir-tir,10 +lre22_dev_amnvo,ara-arq,6 +lre22_dev_amxrk,zul-zul,9 +lre22_dev_anmuv,tso-tso,11 +lre22_dev_aomcz,ara-aeb,7 +lre22_dev_aooht,fra-ntf,11 +lre22_dev_aprbe,ara-arq,3 +lre22_dev_apxxx,orm-orm,12 +lre22_dev_aqdwu,ven-ven,6 +lre22_dev_aqejl,xho-xho,5 +lre22_dev_aqnyy,tso-tso,5 +lre22_dev_arjuc,afr-afr,5 +lre22_dev_arrkp,tir-tir,1 +lre22_dev_atdgp,zul-zul,13 +lre22_dev_atoxn,eng-ens,10 +lre22_dev_audls,afr-afr,6 +lre22_dev_auilj,ven-ven,11 +lre22_dev_auqgt,eng-iaf,3 
+lre22_dev_autlo,zul-zul,7 +lre22_dev_avait,zul-zul,3 +lre22_dev_avvik,nbl-nbl,14 +lre22_dev_awgem,ara-ayl,3 +lre22_dev_awgnb,fra-ntf,14 +lre22_dev_awvym,ara-ayl,9 +lre22_dev_axhbz,tir-tir,12 +lre22_dev_axici,tir-tir,8 +lre22_dev_axtpv,xho-xho,6 +lre22_dev_aygsz,ara-aeb,4 +lre22_dev_ayiif,ven-ven,7 +lre22_dev_azqvo,zul-zul,3 +lre22_dev_basml,eng-ens,11 +lre22_dev_bawje,tir-tir,6 +lre22_dev_bbana,zul-zul,7 +lre22_dev_bbtpz,ven-ven,5 +lre22_dev_bcbrw,eng-iaf,2 +lre22_dev_bchvx,zul-zul,9 +lre22_dev_bcllp,afr-afr,13 +lre22_dev_bcsmi,fra-ntf,6 +lre22_dev_bdqaw,ven-ven,6 +lre22_dev_bdwle,ara-arq,6 +lre22_dev_behbh,ara-ayl,4 +lre22_dev_bexda,ara-arq,6 +lre22_dev_bfbyn,ara-aeb,9 +lre22_dev_bfjgx,ara-ayl,7 +lre22_dev_bgbjo,nbl-nbl,1 +lre22_dev_bgebs,ara-ayl,5 +lre22_dev_bgnod,fra-ntf,3 +lre22_dev_bhezb,ara-ayl,7 +lre22_dev_bhyuy,afr-afr,13 +lre22_dev_bidge,tir-tir,12 +lre22_dev_bimnd,eng-ens,7 +lre22_dev_biyaj,ara-ayl,5 +lre22_dev_bjsgu,afr-afr,10 +lre22_dev_blmfp,eng-iaf,5 +lre22_dev_blohd,ven-ven,4 +lre22_dev_bmebz,ara-arq,4 +lre22_dev_bmjuo,ara-aeb,6 +lre22_dev_bmkrm,fra-ntf,10 +lre22_dev_bmzym,zul-zul,5 +lre22_dev_bnfuu,orm-orm,13 +lre22_dev_bnilb,zul-zul,8 +lre22_dev_bnxna,eng-ens,1 +lre22_dev_boikl,orm-orm,7 +lre22_dev_boisz,ven-ven,2 +lre22_dev_boqxy,zul-zul,13 +lre22_dev_bpqhd,tso-tso,2 +lre22_dev_briiw,ara-aeb,8 +lre22_dev_brohj,fra-ntf,1 +lre22_dev_brqdv,nbl-nbl,3 +lre22_dev_brwcj,afr-afr,6 +lre22_dev_bsclv,orm-orm,8 +lre22_dev_bsdbb,ara-arq,4 +lre22_dev_bstjt,nbl-nbl,10 +lre22_dev_btbke,ara-aeb,0 +lre22_dev_btcfj,ven-ven,12 +lre22_dev_btomw,ven-ven,6 +lre22_dev_btpvy,afr-afr,1 +lre22_dev_btrtb,ara-arq,4 +lre22_dev_btruf,zul-zul,8 +lre22_dev_btsll,ara-ayl,7 +lre22_dev_butrw,ara-ayl,6 +lre22_dev_buwrj,ara-ayl,2 +lre22_dev_bvlhb,fra-ntf,8 +lre22_dev_bvmql,xho-xho,10 +lre22_dev_bvnsc,tir-tir,10 +lre22_dev_bwrej,ven-ven,9 +lre22_dev_bxial,eng-ens,2 +lre22_dev_bxnbf,fra-ntf,9 +lre22_dev_bybim,afr-afr,6 +lre22_dev_byegp,orm-orm,15 +lre22_dev_byngq,ven-ven,9 +lre22_dev_byytf,fra-ntf,6 +lre22_dev_bzies,tso-tso,3 +lre22_dev_bzipd,afr-afr,7 +lre22_dev_cacop,nbl-nbl,5 +lre22_dev_caent,afr-afr,12 +lre22_dev_capsb,ven-ven,0 +lre22_dev_cawbw,orm-orm,12 +lre22_dev_cblep,ven-ven,3 +lre22_dev_cblig,fra-ntf,6 +lre22_dev_ccexy,ven-ven,7 +lre22_dev_ccsye,ara-aeb,8 +lre22_dev_cctyt,eng-iaf,11 +lre22_dev_ccuie,eng-ens,7 +lre22_dev_ccvzf,eng-iaf,1 +lre22_dev_cdlkq,tso-tso,8 +lre22_dev_cdtiu,ara-ayl,9 +lre22_dev_cemyb,tir-tir,12 +lre22_dev_ceprg,eng-iaf,9 +lre22_dev_ceqow,nbl-nbl,15 +lre22_dev_cfdsu,fra-ntf,7 +lre22_dev_cfhbm,ven-ven,3 +lre22_dev_cfsew,afr-afr,12 +lre22_dev_cgges,eng-iaf,11 +lre22_dev_cgjnr,eng-iaf,10 +lre22_dev_cgotg,eng-ens,11 +lre22_dev_cgovb,nbl-nbl,15 +lre22_dev_cgssg,tir-tir,7 +lre22_dev_chhsl,tir-tir,7 +lre22_dev_chjuh,nbl-nbl,9 +lre22_dev_chpoe,nbl-nbl,11 +lre22_dev_chtgu,ara-aeb,10 +lre22_dev_chtlt,eng-iaf,10 +lre22_dev_cigir,eng-ens,9 +lre22_dev_ciyeh,ara-ayl,2 +lre22_dev_cjswm,orm-orm,12 +lre22_dev_cjtdl,ven-ven,13 +lre22_dev_ckzie,ara-aeb,10 +lre22_dev_cldfc,ara-ayl,8 +lre22_dev_clxqz,ara-arq,9 +lre22_dev_cmahj,afr-afr,13 +lre22_dev_cmqxm,tir-tir,6 +lre22_dev_cmrdt,afr-afr,5 +lre22_dev_cmvpq,ara-ayl,2 +lre22_dev_cnbfw,eng-iaf,5 +lre22_dev_cnbvd,afr-afr,9 +lre22_dev_cnomp,orm-orm,15 +lre22_dev_cnrvj,xho-xho,11 +lre22_dev_cnszu,ara-ayl,4 +lre22_dev_cnudd,xho-xho,14 +lre22_dev_cnuoi,orm-orm,14 +lre22_dev_cnxjs,orm-orm,8 +lre22_dev_coarm,xho-xho,4 +lre22_dev_cocyn,zul-zul,6 +lre22_dev_colxc,zul-zul,13 +lre22_dev_cosfn,ara-aeb,10 +lre22_dev_cosgu,ara-ayl,7 +lre22_dev_cpjab,ara-aeb,10 
+lre22_dev_cpple,tso-tso,6 +lre22_dev_cqhjy,ara-ayl,3 +lre22_dev_cqkmy,ara-aeb,10 +lre22_dev_cqukb,tso-tso,9 +lre22_dev_cqusc,orm-orm,6 +lre22_dev_cqyzf,fra-ntf,13 +lre22_dev_crcwu,xho-xho,12 +lre22_dev_crqjz,nbl-nbl,10 +lre22_dev_crtpm,ara-arq,5 +lre22_dev_crucu,tir-tir,6 +lre22_dev_crvby,eng-iaf,12 +lre22_dev_crvoh,eng-ens,7 +lre22_dev_csjxv,ara-arq,3 +lre22_dev_ctfiv,ara-aeb,5 +lre22_dev_ctgpr,ven-ven,12 +lre22_dev_ctlrz,tir-tir,8 +lre22_dev_ctzhm,zul-zul,6 +lre22_dev_cudew,ven-ven,8 +lre22_dev_cusin,ara-arq,10 +lre22_dev_cvaad,eng-iaf,5 +lre22_dev_cvedm,zul-zul,12 +lre22_dev_cvgfx,eng-iaf,8 +lre22_dev_cvujh,ara-ayl,2 +lre22_dev_cweil,ara-aeb,10 +lre22_dev_cweuh,eng-ens,7 +lre22_dev_cwiro,afr-afr,6 +lre22_dev_cwtby,ara-arq,7 +lre22_dev_cxggy,afr-afr,4 +lre22_dev_cxnqr,tso-tso,7 +lre22_dev_cxpan,nbl-nbl,14 +lre22_dev_cxsxl,ara-aeb,10 +lre22_dev_cxyti,tso-tso,8 +lre22_dev_cypcg,zul-zul,12 +lre22_dev_czcmz,zul-zul,10 +lre22_dev_czdzw,orm-orm,7 +lre22_dev_czppj,zul-zul,10 +lre22_dev_czxff,zul-zul,9 +lre22_dev_czxld,fra-ntf,9 +lre22_dev_dajnt,zul-zul,12 +lre22_dev_dbcxi,orm-orm,9 +lre22_dev_dbdbv,tso-tso,8 +lre22_dev_dbdwv,orm-orm,5 +lre22_dev_dbgof,nbl-nbl,15 +lre22_dev_dblhh,eng-iaf,0 +lre22_dev_dbljb,xho-xho,12 +lre22_dev_dcibg,eng-iaf,4 +lre22_dev_dcobk,ara-arq,8 +lre22_dev_dcvcu,afr-afr,4 +lre22_dev_dcvyc,fra-ntf,14 +lre22_dev_ddfeo,ara-ayl,5 +lre22_dev_ddhaq,zul-zul,10 +lre22_dev_ddhes,afr-afr,6 +lre22_dev_ddsds,afr-afr,12 +lre22_dev_ddxvn,ven-ven,5 +lre22_dev_dfdrs,ven-ven,7 +lre22_dev_dfifl,ara-ayl,9 +lre22_dev_dfjek,ven-ven,4 +lre22_dev_dflco,zul-zul,12 +lre22_dev_dftta,tso-tso,6 +lre22_dev_dfxnq,eng-ens,11 +lre22_dev_dgjdi,orm-orm,8 +lre22_dev_dgqwo,tir-tir,9 +lre22_dev_dhapq,ara-aeb,8 +lre22_dev_dhdfk,eng-ens,8 +lre22_dev_dhfjj,ara-arq,4 +lre22_dev_dhlxh,ara-aeb,4 +lre22_dev_dhnne,eng-ens,10 +lre22_dev_dhtlz,eng-ens,6 +lre22_dev_diarz,ara-ayl,2 +lre22_dev_diggg,tir-tir,9 +lre22_dev_diqtw,ara-aeb,8 +lre22_dev_dixuw,orm-orm,9 +lre22_dev_diypf,orm-orm,13 +lre22_dev_djzsk,nbl-nbl,13 +lre22_dev_dksey,nbl-nbl,11 +lre22_dev_dlzwh,fra-ntf,12 +lre22_dev_dmdpv,eng-ens,2 +lre22_dev_dmeea,orm-orm,14 +lre22_dev_dmhdv,xho-xho,10 +lre22_dev_dmics,fra-ntf,14 +lre22_dev_dmiiu,ara-aeb,6 +lre22_dev_dmjxr,xho-xho,10 +lre22_dev_dmzxn,afr-afr,4 +lre22_dev_dngtw,ara-ayl,3 +lre22_dev_dnjdq,eng-ens,7 +lre22_dev_dnprz,zul-zul,12 +lre22_dev_dobdj,fra-ntf,0 +lre22_dev_dobwk,orm-orm,8 +lre22_dev_donqm,ara-arq,3 +lre22_dev_dpbyt,tso-tso,6 +lre22_dev_dpfns,ara-aeb,4 +lre22_dev_dpjjp,fra-ntf,7 +lre22_dev_dpomx,eng-iaf,5 +lre22_dev_dpwhs,eng-ens,8 +lre22_dev_dpygj,eng-iaf,8 +lre22_dev_dqzex,xho-xho,3 +lre22_dev_drcqx,eng-iaf,7 +lre22_dev_drfhb,ara-aeb,10 +lre22_dev_drfte,ara-arq,8 +lre22_dev_driks,eng-ens,11 +lre22_dev_drofs,fra-ntf,1 +lre22_dev_dslxl,ara-ayl,7 +lre22_dev_dsmwd,ven-ven,13 +lre22_dev_dsyyk,tir-tir,9 +lre22_dev_dthcb,zul-zul,12 +lre22_dev_dtumd,fra-ntf,5 +lre22_dev_dtwmj,afr-afr,7 +lre22_dev_duegm,tso-tso,9 +lre22_dev_dvirs,afr-afr,6 +lre22_dev_dvtzf,eng-iaf,7 +lre22_dev_dwcfi,ven-ven,7 +lre22_dev_dwfle,fra-ntf,7 +lre22_dev_dwgsv,tir-tir,6 +lre22_dev_dwlay,ara-arq,3 +lre22_dev_dwnit,xho-xho,15 +lre22_dev_dwvoh,tso-tso,6 +lre22_dev_dxgpq,afr-afr,12 +lre22_dev_dxhpf,ara-ayl,9 +lre22_dev_dxlhq,ara-arq,5 +lre22_dev_dxrcj,zul-zul,5 +lre22_dev_dywox,tir-tir,9 +lre22_dev_dzjrv,eng-iaf,8 +lre22_dev_dzsql,tso-tso,6 +lre22_dev_dzxkv,orm-orm,13 +lre22_dev_eabne,xho-xho,2 +lre22_dev_eacdl,fra-ntf,14 +lre22_dev_eaupg,eng-iaf,11 +lre22_dev_eawug,eng-iaf,6 +lre22_dev_ebbgx,nbl-nbl,15 +lre22_dev_ecber,afr-afr,10 
+lre22_dev_ecdgv,ara-arq,5 +lre22_dev_ecneb,afr-afr,6 +lre22_dev_ecxrr,tir-tir,9 +lre22_dev_edldw,tir-tir,10 +lre22_dev_edofc,afr-afr,6 +lre22_dev_edvaf,xho-xho,13 +lre22_dev_edydw,eng-ens,5 +lre22_dev_eejtn,zul-zul,4 +lre22_dev_eekzc,fra-ntf,4 +lre22_dev_eenhx,eng-iaf,9 +lre22_dev_efcgi,fra-ntf,0 +lre22_dev_efdoz,ven-ven,8 +lre22_dev_efioy,tso-tso,9 +lre22_dev_efiwx,eng-ens,9 +lre22_dev_efrlw,ven-ven,2 +lre22_dev_eghmh,eng-ens,11 +lre22_dev_ehhyu,nbl-nbl,10 +lre22_dev_eiomi,ven-ven,12 +lre22_dev_eisiy,orm-orm,8 +lre22_dev_ejaiq,ara-aeb,1 +lre22_dev_ejkmr,eng-iaf,5 +lre22_dev_ejthv,ven-ven,12 +lre22_dev_ejtyd,fra-ntf,14 +lre22_dev_ekfzq,ara-ayl,6 +lre22_dev_ekgjp,zul-zul,3 +lre22_dev_ekixu,nbl-nbl,2 +lre22_dev_ekjxx,ara-arq,6 +lre22_dev_ekvxc,eng-iaf,4 +lre22_dev_eldrg,orm-orm,11 +lre22_dev_elitc,ara-arq,3 +lre22_dev_emdtf,xho-xho,0 +lre22_dev_emhqx,tir-tir,4 +lre22_dev_emxnm,afr-afr,4 +lre22_dev_emzaa,xho-xho,3 +lre22_dev_engqe,xho-xho,15 +lre22_dev_ennjl,tso-tso,10 +lre22_dev_eokyg,nbl-nbl,2 +lre22_dev_epkwr,tir-tir,5 +lre22_dev_epojj,tir-tir,7 +lre22_dev_epsdk,nbl-nbl,12 +lre22_dev_epsfl,xho-xho,14 +lre22_dev_epuno,eng-ens,1 +lre22_dev_epylu,eng-iaf,10 +lre22_dev_ereen,ara-arq,10 +lre22_dev_eriaf,eng-ens,4 +lre22_dev_ermqx,ara-arq,2 +lre22_dev_escob,fra-ntf,9 +lre22_dev_esjsk,ara-ayl,7 +lre22_dev_esqti,xho-xho,9 +lre22_dev_etaln,zul-zul,12 +lre22_dev_etarn,nbl-nbl,6 +lre22_dev_etndu,ven-ven,13 +lre22_dev_etpdc,afr-afr,3 +lre22_dev_etsam,zul-zul,7 +lre22_dev_etwge,eng-ens,6 +lre22_dev_etxyc,orm-orm,12 +lre22_dev_eumsq,zul-zul,10 +lre22_dev_eusfl,orm-orm,8 +lre22_dev_eutkk,tso-tso,0 +lre22_dev_euxuy,orm-orm,13 +lre22_dev_evaon,ara-aeb,4 +lre22_dev_evkaz,eng-iaf,8 +lre22_dev_evret,fra-ntf,8 +lre22_dev_evvep,tso-tso,9 +lre22_dev_evvvd,tir-tir,10 +lre22_dev_ewems,ven-ven,7 +lre22_dev_ewijw,orm-orm,11 +lre22_dev_ewqpv,eng-iaf,6 +lre22_dev_ewywf,nbl-nbl,10 +lre22_dev_exaia,afr-afr,3 +lre22_dev_exbum,afr-afr,4 +lre22_dev_exhhd,ara-aeb,5 +lre22_dev_exkkf,afr-afr,3 +lre22_dev_extrh,zul-zul,6 +lre22_dev_exzyo,xho-xho,15 +lre22_dev_eyrzt,ara-ayl,1 +lre22_dev_eysdu,zul-zul,4 +lre22_dev_eyshz,xho-xho,1 +lre22_dev_eyuyq,ara-ayl,7 +lre22_dev_ezsyu,ven-ven,3 +lre22_dev_faahr,afr-afr,9 +lre22_dev_fabli,ven-ven,6 +lre22_dev_fatah,zul-zul,12 +lre22_dev_fccpw,orm-orm,12 +lre22_dev_fcpbu,xho-xho,8 +lre22_dev_fcqbx,tso-tso,3 +lre22_dev_fcwnw,fra-ntf,8 +lre22_dev_fdgia,orm-orm,10 +lre22_dev_febnk,eng-ens,5 +lre22_dev_fedau,eng-iaf,5 +lre22_dev_fehxn,xho-xho,8 +lre22_dev_fejsd,ven-ven,8 +lre22_dev_feqjc,eng-iaf,12 +lre22_dev_fesss,nbl-nbl,15 +lre22_dev_feuww,fra-ntf,8 +lre22_dev_fevex,zul-zul,2 +lre22_dev_ffban,ara-arq,6 +lre22_dev_ffefw,orm-orm,13 +lre22_dev_ffsps,fra-ntf,8 +lre22_dev_ffwid,tso-tso,11 +lre22_dev_fgbtr,nbl-nbl,15 +lre22_dev_fgmbr,ara-arq,6 +lre22_dev_fgmxd,eng-ens,9 +lre22_dev_fgnfs,tir-tir,12 +lre22_dev_fgrze,eng-ens,11 +lre22_dev_fhlhy,ara-aeb,7 +lre22_dev_fihvr,eng-iaf,7 +lre22_dev_fiizm,xho-xho,14 +lre22_dev_fiksd,fra-ntf,12 +lre22_dev_fitjt,tso-tso,6 +lre22_dev_fiuun,eng-ens,7 +lre22_dev_fjdul,ara-ayl,3 +lre22_dev_fjgrh,ven-ven,8 +lre22_dev_fkaqj,nbl-nbl,13 +lre22_dev_flfgv,ara-aeb,9 +lre22_dev_flirl,fra-ntf,13 +lre22_dev_fljab,fra-ntf,14 +lre22_dev_flnzm,tir-tir,11 +lre22_dev_flsmp,orm-orm,15 +lre22_dev_fmjvq,ven-ven,2 +lre22_dev_fmmxd,afr-afr,4 +lre22_dev_fnglh,afr-afr,13 +lre22_dev_fnsax,xho-xho,6 +lre22_dev_fojyn,eng-ens,5 +lre22_dev_foqgk,ven-ven,2 +lre22_dev_fovba,ara-arq,4 +lre22_dev_fozyj,ara-arq,2 +lre22_dev_fpavw,ara-aeb,8 +lre22_dev_fptba,eng-ens,3 +lre22_dev_fqdfc,tso-tso,11 
+lre22_dev_fqdhm,eng-iaf,8 +lre22_dev_fqfet,nbl-nbl,7 +lre22_dev_fqgty,fra-ntf,4 +lre22_dev_fqgyd,zul-zul,10 +lre22_dev_fqvup,tso-tso,2 +lre22_dev_frviu,ara-aeb,10 +lre22_dev_frwfk,nbl-nbl,9 +lre22_dev_fsygm,eng-iaf,5 +lre22_dev_ftfjv,orm-orm,11 +lre22_dev_ftjvg,afr-afr,12 +lre22_dev_ftmnu,ara-aeb,10 +lre22_dev_ftrcl,eng-ens,3 +lre22_dev_ftygz,eng-ens,8 +lre22_dev_fughv,eng-iaf,3 +lre22_dev_fuhuk,ara-ayl,5 +lre22_dev_fusyr,ven-ven,13 +lre22_dev_futhm,zul-zul,5 +lre22_dev_fvbzh,ara-ayl,7 +lre22_dev_fvecf,ven-ven,9 +lre22_dev_fvktn,fra-ntf,8 +lre22_dev_fvpts,orm-orm,6 +lre22_dev_fvsmm,eng-iaf,12 +lre22_dev_fvvgc,ara-arq,5 +lre22_dev_fwvzh,zul-zul,2 +lre22_dev_fwwsy,xho-xho,5 +lre22_dev_fxggn,fra-ntf,1 +lre22_dev_fxqfi,orm-orm,10 +lre22_dev_fxuqw,ara-ayl,3 +lre22_dev_fxwfc,eng-iaf,12 +lre22_dev_fymdc,tso-tso,4 +lre22_dev_fywir,tso-tso,10 +lre22_dev_fzjzu,xho-xho,14 +lre22_dev_fzpeh,ara-aeb,10 +lre22_dev_fztdi,tir-tir,9 +lre22_dev_gcced,ven-ven,6 +lre22_dev_gchqj,zul-zul,10 +lre22_dev_gctmk,xho-xho,12 +lre22_dev_gcupw,ven-ven,7 +lre22_dev_gdfdn,tir-tir,5 +lre22_dev_gdlpg,tir-tir,3 +lre22_dev_gdrwq,fra-ntf,14 +lre22_dev_gdvjh,afr-afr,5 +lre22_dev_gdvtc,eng-iaf,13 +lre22_dev_gdxck,orm-orm,4 +lre22_dev_gecgq,afr-afr,12 +lre22_dev_gevbs,nbl-nbl,13 +lre22_dev_gfqxw,tir-tir,11 +lre22_dev_gfujh,eng-ens,8 +lre22_dev_gfwqx,fra-ntf,10 +lre22_dev_ggchj,tir-tir,10 +lre22_dev_ggeie,ara-arq,8 +lre22_dev_ggqob,ara-aeb,9 +lre22_dev_ghllb,eng-ens,8 +lre22_dev_ghlqh,afr-afr,12 +lre22_dev_ghmuk,afr-afr,13 +lre22_dev_ghskg,tso-tso,4 +lre22_dev_ghwmw,ara-arq,2 +lre22_dev_giijn,ven-ven,6 +lre22_dev_gised,xho-xho,9 +lre22_dev_gisrt,tir-tir,9 +lre22_dev_gjptx,nbl-nbl,4 +lre22_dev_gjvkc,ara-arq,7 +lre22_dev_gjxkc,eng-iaf,13 +lre22_dev_gkywh,ara-aeb,7 +lre22_dev_glhtl,eng-iaf,3 +lre22_dev_glulw,ara-aeb,8 +lre22_dev_gmpja,nbl-nbl,3 +lre22_dev_gmpjm,nbl-nbl,12 +lre22_dev_gnkvz,eng-iaf,13 +lre22_dev_gnmcz,nbl-nbl,4 +lre22_dev_goggr,afr-afr,5 +lre22_dev_goqov,ara-aeb,8 +lre22_dev_gpzgq,tso-tso,9 +lre22_dev_gpzuz,fra-ntf,5 +lre22_dev_gqpul,ara-arq,10 +lre22_dev_gratu,tir-tir,7 +lre22_dev_grewx,afr-afr,9 +lre22_dev_grizt,eng-ens,2 +lre22_dev_grsam,afr-afr,11 +lre22_dev_grsyr,zul-zul,1 +lre22_dev_grxus,nbl-nbl,15 +lre22_dev_gsanj,ven-ven,13 +lre22_dev_gsbwz,nbl-nbl,9 +lre22_dev_gtwjj,tso-tso,4 +lre22_dev_gtxwq,orm-orm,12 +lre22_dev_gubts,ara-ayl,0 +lre22_dev_gvawh,xho-xho,11 +lre22_dev_gvfsb,ara-aeb,10 +lre22_dev_gvhgg,afr-afr,9 +lre22_dev_gvnaj,fra-ntf,8 +lre22_dev_gvysc,ara-aeb,10 +lre22_dev_gwfkz,xho-xho,2 +lre22_dev_gwnqp,xho-xho,7 +lre22_dev_gwumi,tso-tso,3 +lre22_dev_gwvcw,xho-xho,11 +lre22_dev_gwwxz,eng-iaf,1 +lre22_dev_gwzrc,eng-ens,11 +lre22_dev_gxtlx,fra-ntf,13 +lre22_dev_gxygl,tso-tso,9 +lre22_dev_gycld,orm-orm,4 +lre22_dev_gzakl,nbl-nbl,15 +lre22_dev_gzrgo,ara-arq,9 +lre22_dev_hbkul,orm-orm,6 +lre22_dev_hbodn,eng-ens,10 +lre22_dev_hbwgy,ara-arq,6 +lre22_dev_hbwyc,nbl-nbl,5 +lre22_dev_hczek,fra-ntf,7 +lre22_dev_hdpsb,nbl-nbl,6 +lre22_dev_hdvsb,ara-aeb,8 +lre22_dev_hetsy,xho-xho,10 +lre22_dev_hfgrm,ven-ven,12 +lre22_dev_hfurz,afr-afr,13 +lre22_dev_hfwyw,nbl-nbl,11 +lre22_dev_hgdqx,tso-tso,3 +lre22_dev_hgwdk,eng-ens,8 +lre22_dev_hgxqf,eng-iaf,8 +lre22_dev_hgyuk,ven-ven,11 +lre22_dev_hhetm,fra-ntf,14 +lre22_dev_hhjki,ara-arq,8 +lre22_dev_hhvtc,ara-arq,10 +lre22_dev_hhxqv,tso-tso,5 +lre22_dev_hiisb,nbl-nbl,15 +lre22_dev_hioxp,tso-tso,3 +lre22_dev_hjqaf,ara-aeb,9 +lre22_dev_hjqid,orm-orm,6 +lre22_dev_hjzwc,eng-iaf,3 +lre22_dev_hkdzu,ara-arq,9 +lre22_dev_hlatl,eng-iaf,12 +lre22_dev_hlywv,nbl-nbl,2 
+lre22_dev_hlzxa,ven-ven,7 +lre22_dev_hmvzg,ara-ayl,3 +lre22_dev_hnjgb,eng-ens,9 +lre22_dev_hntdv,eng-ens,11 +lre22_dev_hoish,tir-tir,2 +lre22_dev_hokbg,ara-ayl,6 +lre22_dev_hondp,eng-iaf,8 +lre22_dev_hpbve,tir-tir,11 +lre22_dev_hpdvc,fra-ntf,8 +lre22_dev_hpgst,orm-orm,5 +lre22_dev_hqbjb,xho-xho,5 +lre22_dev_hqdev,tso-tso,2 +lre22_dev_hqidg,tir-tir,1 +lre22_dev_hqids,afr-afr,9 +lre22_dev_hqltr,tir-tir,4 +lre22_dev_hqqhq,eng-ens,11 +lre22_dev_hrmcg,zul-zul,13 +lre22_dev_hrrcp,afr-afr,8 +lre22_dev_hstgi,xho-xho,9 +lre22_dev_hsvpq,ara-ayl,9 +lre22_dev_hswsy,ara-aeb,4 +lre22_dev_htcgm,eng-iaf,6 +lre22_dev_htedo,xho-xho,13 +lre22_dev_hthkx,eng-iaf,7 +lre22_dev_htohd,afr-afr,6 +lre22_dev_htxik,fra-ntf,0 +lre22_dev_huqbr,xho-xho,10 +lre22_dev_hvdom,afr-afr,8 +lre22_dev_hvkoa,afr-afr,13 +lre22_dev_hvnkg,tir-tir,9 +lre22_dev_hvocp,nbl-nbl,12 +lre22_dev_hvqzj,zul-zul,12 +lre22_dev_hvwph,afr-afr,3 +lre22_dev_hwaqg,zul-zul,8 +lre22_dev_hwgvu,ara-aeb,6 +lre22_dev_hwhlz,ven-ven,11 +lre22_dev_hwkes,fra-ntf,12 +lre22_dev_hwvna,eng-ens,2 +lre22_dev_hxfim,eng-iaf,12 +lre22_dev_hxmdw,afr-afr,10 +lre22_dev_hxrnp,zul-zul,6 +lre22_dev_hxvie,tir-tir,9 +lre22_dev_hxvju,zul-zul,3 +lre22_dev_hxzxm,zul-zul,6 +lre22_dev_hybef,nbl-nbl,14 +lre22_dev_hyfok,eng-ens,2 +lre22_dev_hyscv,ara-arq,4 +lre22_dev_hyzod,eng-iaf,6 +lre22_dev_hzdpb,tso-tso,7 +lre22_dev_hzjwn,ara-aeb,5 +lre22_dev_hzljv,tir-tir,8 +lre22_dev_hzomy,tso-tso,9 +lre22_dev_iaaar,tso-tso,9 +lre22_dev_iaimu,afr-afr,13 +lre22_dev_iakmg,orm-orm,15 +lre22_dev_iarxv,ara-aeb,9 +lre22_dev_iaywv,ara-ayl,6 +lre22_dev_ibcne,eng-ens,11 +lre22_dev_ibeth,zul-zul,2 +lre22_dev_ibwbi,tir-tir,9 +lre22_dev_ibyqr,tso-tso,7 +lre22_dev_iccwp,eng-iaf,6 +lre22_dev_ichmi,afr-afr,12 +lre22_dev_idjrt,zul-zul,8 +lre22_dev_iegng,afr-afr,8 +lre22_dev_iezrr,ara-ayl,7 +lre22_dev_ifaib,ara-ayl,5 +lre22_dev_ifhil,tso-tso,9 +lre22_dev_ifptd,ven-ven,12 +lre22_dev_ifriu,ara-aeb,6 +lre22_dev_ignvp,zul-zul,13 +lre22_dev_igxzy,eng-iaf,12 +lre22_dev_ihdva,fra-ntf,10 +lre22_dev_iiydv,eng-iaf,5 +lre22_dev_ijoyg,ara-ayl,9 +lre22_dev_ikghg,eng-iaf,7 +lre22_dev_ikijv,ven-ven,2 +lre22_dev_ilawb,ara-aeb,8 +lre22_dev_ilgnm,orm-orm,6 +lre22_dev_ilqhp,orm-orm,13 +lre22_dev_imrsx,tso-tso,8 +lre22_dev_inrfz,ara-arq,1 +lre22_dev_inrlw,eng-ens,1 +lre22_dev_inttm,tso-tso,8 +lre22_dev_iorip,ven-ven,13 +lre22_dev_ioryq,ara-aeb,8 +lre22_dev_iosse,afr-afr,1 +lre22_dev_ipahz,tir-tir,12 +lre22_dev_ipaup,tir-tir,10 +lre22_dev_ipllz,tir-tir,12 +lre22_dev_iprih,ara-aeb,4 +lre22_dev_iqkpj,tir-tir,6 +lre22_dev_iqowb,ara-aeb,0 +lre22_dev_iqzfp,orm-orm,15 +lre22_dev_irhue,tso-tso,8 +lre22_dev_irkvo,orm-orm,15 +lre22_dev_irnie,ara-aeb,8 +lre22_dev_irnxg,zul-zul,9 +lre22_dev_irsgt,ven-ven,2 +lre22_dev_isavf,nbl-nbl,0 +lre22_dev_isfpd,nbl-nbl,11 +lre22_dev_iskfd,ara-arq,4 +lre22_dev_isndz,ara-arq,6 +lre22_dev_istwz,nbl-nbl,15 +lre22_dev_isxpy,orm-orm,5 +lre22_dev_iszkk,tir-tir,9 +lre22_dev_itdot,ara-ayl,9 +lre22_dev_itfgh,eng-iaf,9 +lre22_dev_itlqd,tir-tir,12 +lre22_dev_itmbo,ara-aeb,10 +lre22_dev_itznp,ara-aeb,3 +lre22_dev_iucwv,zul-zul,5 +lre22_dev_iuowb,ara-aeb,8 +lre22_dev_iupes,zul-zul,4 +lre22_dev_iurgk,fra-ntf,4 +lre22_dev_ivcpr,nbl-nbl,12 +lre22_dev_ivrwa,ven-ven,3 +lre22_dev_ivvlb,afr-afr,11 +lre22_dev_ivwhm,tir-tir,6 +lre22_dev_iwoya,ara-aeb,4 +lre22_dev_iwpvu,orm-orm,5 +lre22_dev_ixpuq,ara-ayl,5 +lre22_dev_ixpyb,tso-tso,11 +lre22_dev_iyfiz,eng-iaf,5 +lre22_dev_iylyu,xho-xho,12 +lre22_dev_iyuli,zul-zul,13 +lre22_dev_iyupt,orm-orm,5 +lre22_dev_iyxjf,zul-zul,12 +lre22_dev_iyzgz,tso-tso,10 +lre22_dev_izepb,ara-arq,4 
+lre22_dev_izkix,ven-ven,10 +lre22_dev_izknz,ven-ven,12 +lre22_dev_jadfl,ara-arq,9 +lre22_dev_jafja,zul-zul,9 +lre22_dev_jamvn,ven-ven,1 +lre22_dev_jbach,eng-iaf,2 +lre22_dev_jbqcq,ara-aeb,6 +lre22_dev_jcxgo,afr-afr,6 +lre22_dev_jddrh,fra-ntf,13 +lre22_dev_jdjpg,tir-tir,12 +lre22_dev_jdtrb,eng-iaf,11 +lre22_dev_jdwjj,zul-zul,7 +lre22_dev_jdzqw,tir-tir,3 +lre22_dev_jeaev,nbl-nbl,8 +lre22_dev_jeobs,ara-aeb,9 +lre22_dev_jesxq,eng-ens,10 +lre22_dev_jgcla,ara-arq,2 +lre22_dev_jggxv,fra-ntf,3 +lre22_dev_jgntz,orm-orm,5 +lre22_dev_jhcao,ven-ven,7 +lre22_dev_jhgik,eng-ens,11 +lre22_dev_jhpkj,ara-arq,4 +lre22_dev_jhuof,orm-orm,15 +lre22_dev_jignq,ara-ayl,9 +lre22_dev_jjffc,ven-ven,13 +lre22_dev_jjkfe,eng-ens,9 +lre22_dev_jjqxi,ara-aeb,8 +lre22_dev_jjrgq,eng-iaf,4 +lre22_dev_jkacy,tso-tso,3 +lre22_dev_jkmin,orm-orm,15 +lre22_dev_jkobe,xho-xho,7 +lre22_dev_jkosd,zul-zul,10 +lre22_dev_jkovc,tso-tso,3 +lre22_dev_jktcq,zul-zul,7 +lre22_dev_jlodp,eng-ens,9 +lre22_dev_jmbjo,nbl-nbl,9 +lre22_dev_jmccw,ara-arq,3 +lre22_dev_jminj,fra-ntf,5 +lre22_dev_jmmyw,afr-afr,3 +lre22_dev_jobae,fra-ntf,13 +lre22_dev_jobsv,nbl-nbl,14 +lre22_dev_jobxi,ara-arq,5 +lre22_dev_joghi,ara-arq,6 +lre22_dev_johkj,xho-xho,7 +lre22_dev_jolqw,ara-ayl,5 +lre22_dev_jplye,fra-ntf,11 +lre22_dev_jpsmt,ara-arq,9 +lre22_dev_jqdnf,eng-iaf,13 +lre22_dev_jqqpg,orm-orm,5 +lre22_dev_jqqrs,nbl-nbl,11 +lre22_dev_jrmnp,tir-tir,9 +lre22_dev_jsahe,fra-ntf,12 +lre22_dev_jsciw,eng-ens,5 +lre22_dev_jsisu,eng-iaf,4 +lre22_dev_jstjq,zul-zul,4 +lre22_dev_jsxuw,eng-iaf,8 +lre22_dev_jtaxh,ven-ven,4 +lre22_dev_jtgjo,ara-arq,9 +lre22_dev_jtxor,orm-orm,3 +lre22_dev_junyj,orm-orm,5 +lre22_dev_juykt,ara-ayl,7 +lre22_dev_jvqzf,fra-ntf,9 +lre22_dev_jvvxl,afr-afr,7 +lre22_dev_jvxpt,nbl-nbl,1 +lre22_dev_jwfeb,eng-iaf,4 +lre22_dev_jwmmp,eng-ens,3 +lre22_dev_jwyiq,tso-tso,10 +lre22_dev_jxcmp,ara-aeb,10 +lre22_dev_jxfsy,ara-ayl,9 +lre22_dev_jxjar,tso-tso,10 +lre22_dev_jylrr,ara-aeb,9 +lre22_dev_jzciw,orm-orm,5 +lre22_dev_jzcyt,tso-tso,5 +lre22_dev_jzhpf,tso-tso,4 +lre22_dev_jzidh,afr-afr,11 +lre22_dev_jznzw,eng-iaf,6 +lre22_dev_jzoqd,afr-afr,7 +lre22_dev_jzwnu,ven-ven,11 +lre22_dev_kaoyk,afr-afr,6 +lre22_dev_kasoe,zul-zul,12 +lre22_dev_kaygq,eng-ens,9 +lre22_dev_kayqh,fra-ntf,8 +lre22_dev_kbpcw,eng-iaf,3 +lre22_dev_kbtrx,orm-orm,10 +lre22_dev_kcebk,ven-ven,7 +lre22_dev_kdbil,orm-orm,15 +lre22_dev_kddhf,ara-arq,10 +lre22_dev_kdeij,ara-ayl,3 +lre22_dev_kdiak,zul-zul,12 +lre22_dev_kedwl,nbl-nbl,12 +lre22_dev_keouf,fra-ntf,9 +lre22_dev_keozw,ara-aeb,10 +lre22_dev_kervm,eng-ens,7 +lre22_dev_kflpm,xho-xho,1 +lre22_dev_kfqpd,ara-arq,8 +lre22_dev_kgaqj,ara-aeb,8 +lre22_dev_kghnx,fra-ntf,3 +lre22_dev_kgoze,zul-zul,4 +lre22_dev_kgrxe,fra-ntf,9 +lre22_dev_kgsdu,ara-arq,5 +lre22_dev_kheef,xho-xho,15 +lre22_dev_khgyl,xho-xho,8 +lre22_dev_khsgr,tso-tso,7 +lre22_dev_khxvm,nbl-nbl,9 +lre22_dev_kijjo,ara-aeb,3 +lre22_dev_kiush,xho-xho,2 +lre22_dev_kiyso,ara-arq,1 +lre22_dev_kjewo,ven-ven,6 +lre22_dev_kjgkg,ara-ayl,5 +lre22_dev_kjksh,ven-ven,3 +lre22_dev_kjomd,afr-afr,4 +lre22_dev_kjrcy,afr-afr,11 +lre22_dev_kkauw,fra-ntf,10 +lre22_dev_kkiew,orm-orm,15 +lre22_dev_kkyyu,zul-zul,8 +lre22_dev_klafc,ara-ayl,4 +lre22_dev_klalo,eng-ens,5 +lre22_dev_kliip,afr-afr,1 +lre22_dev_klkxg,tso-tso,8 +lre22_dev_klqwc,ara-arq,7 +lre22_dev_kmbgg,tir-tir,12 +lre22_dev_kmgoo,tir-tir,8 +lre22_dev_kmnko,zul-zul,3 +lre22_dev_kmtyc,ara-aeb,8 +lre22_dev_kmxqj,xho-xho,8 +lre22_dev_kmzdw,fra-ntf,3 +lre22_dev_knxsi,ara-arq,9 +lre22_dev_kofob,orm-orm,7 +lre22_dev_kokfk,fra-ntf,14 +lre22_dev_kokir,nbl-nbl,12 
+lre22_dev_kooxu,ara-arq,9 +lre22_dev_korip,tso-tso,7 +lre22_dev_kpbnd,zul-zul,4 +lre22_dev_kpnyf,eng-iaf,3 +lre22_dev_kpwts,ara-ayl,8 +lre22_dev_kpxne,orm-orm,6 +lre22_dev_kpzbl,ven-ven,12 +lre22_dev_kqact,zul-zul,0 +lre22_dev_kqfbl,eng-iaf,12 +lre22_dev_kqfsm,zul-zul,5 +lre22_dev_kqfyp,ara-arq,1 +lre22_dev_kqkqj,ara-ayl,7 +lre22_dev_kqvwr,xho-xho,13 +lre22_dev_kragl,zul-zul,13 +lre22_dev_krbdn,xho-xho,14 +lre22_dev_ksake,ara-aeb,8 +lre22_dev_ksoly,nbl-nbl,11 +lre22_dev_kttyt,orm-orm,5 +lre22_dev_kttzq,tso-tso,9 +lre22_dev_ktwaf,zul-zul,3 +lre22_dev_ktwqf,ven-ven,6 +lre22_dev_ktxef,zul-zul,0 +lre22_dev_ktztb,orm-orm,12 +lre22_dev_kufkm,nbl-nbl,15 +lre22_dev_kuqsu,afr-afr,9 +lre22_dev_kuyka,tir-tir,4 +lre22_dev_kvcpn,ara-ayl,3 +lre22_dev_kvghz,eng-iaf,10 +lre22_dev_kvswv,ven-ven,11 +lre22_dev_kxkos,orm-orm,10 +lre22_dev_kxkzg,ara-ayl,9 +lre22_dev_kxqef,ven-ven,12 +lre22_dev_kyjpf,ven-ven,7 +lre22_dev_kynap,ara-ayl,9 +lre22_dev_kyptg,ven-ven,8 +lre22_dev_kytyr,nbl-nbl,11 +lre22_dev_kywmf,orm-orm,4 +lre22_dev_kzibn,zul-zul,3 +lre22_dev_kzqxx,fra-ntf,1 +lre22_dev_lacgv,tso-tso,7 +lre22_dev_lagpe,tso-tso,6 +lre22_dev_lanuu,tso-tso,9 +lre22_dev_lapag,afr-afr,6 +lre22_dev_larnq,zul-zul,4 +lre22_dev_lbbvq,xho-xho,8 +lre22_dev_lbfca,ara-arq,8 +lre22_dev_lbhoj,orm-orm,11 +lre22_dev_lbiin,ara-ayl,4 +lre22_dev_lcdyj,ara-arq,9 +lre22_dev_ldasz,fra-ntf,9 +lre22_dev_ldbur,tso-tso,1 +lre22_dev_lddhs,orm-orm,12 +lre22_dev_ldedw,ara-aeb,5 +lre22_dev_ldmbr,ara-ayl,5 +lre22_dev_ldmqc,tir-tir,7 +lre22_dev_leadw,eng-iaf,3 +lre22_dev_leaqq,tso-tso,10 +lre22_dev_ledsh,afr-afr,11 +lre22_dev_leovk,afr-afr,6 +lre22_dev_lexlh,ara-aeb,2 +lre22_dev_lfilk,eng-ens,10 +lre22_dev_lfyll,zul-zul,10 +lre22_dev_lgada,zul-zul,6 +lre22_dev_lgcjy,afr-afr,9 +lre22_dev_lgfri,ara-aeb,5 +lre22_dev_lgkbt,xho-xho,4 +lre22_dev_lhbjq,ara-arq,0 +lre22_dev_lhemi,xho-xho,9 +lre22_dev_lhfne,ara-arq,6 +lre22_dev_lhmtg,ara-arq,9 +lre22_dev_lieso,ara-aeb,8 +lre22_dev_likcy,afr-afr,13 +lre22_dev_lipyu,zul-zul,12 +lre22_dev_lisum,ven-ven,4 +lre22_dev_ljevp,ara-ayl,3 +lre22_dev_ljijh,orm-orm,3 +lre22_dev_ljylg,nbl-nbl,13 +lre22_dev_lkfig,ara-ayl,2 +lre22_dev_lklnc,ara-arq,3 +lre22_dev_lkopy,tir-tir,9 +lre22_dev_lllwi,eng-iaf,5 +lre22_dev_llstb,nbl-nbl,10 +lre22_dev_lmeax,eng-iaf,10 +lre22_dev_lmkui,ara-arq,7 +lre22_dev_lmrbp,tir-tir,9 +lre22_dev_lnejh,eng-ens,10 +lre22_dev_lnttv,ven-ven,10 +lre22_dev_loxqz,eng-iaf,8 +lre22_dev_loybq,ara-aeb,10 +lre22_dev_lpadb,fra-ntf,4 +lre22_dev_lpahk,nbl-nbl,11 +lre22_dev_lphgs,tir-tir,7 +lre22_dev_lphoa,eng-ens,2 +lre22_dev_lpkie,eng-iaf,5 +lre22_dev_lpkpc,zul-zul,6 +lre22_dev_lptpx,eng-iaf,4 +lre22_dev_lqwcv,xho-xho,13 +lre22_dev_lrgwx,orm-orm,10 +lre22_dev_lruoj,orm-orm,2 +lre22_dev_lrwee,fra-ntf,10 +lre22_dev_lsess,ven-ven,1 +lre22_dev_lsycj,tir-tir,9 +lre22_dev_ltaoe,eng-ens,8 +lre22_dev_ltish,ara-aeb,5 +lre22_dev_ltqeb,eng-ens,8 +lre22_dev_ltzfg,ven-ven,10 +lre22_dev_luuhd,ara-arq,2 +lre22_dev_lvejl,zul-zul,11 +lre22_dev_lvgsm,tir-tir,10 +lre22_dev_lvwle,xho-xho,7 +lre22_dev_lvxea,tir-tir,8 +lre22_dev_lwsmk,eng-ens,10 +lre22_dev_lwzhq,ara-ayl,3 +lre22_dev_lxbdd,ara-ayl,8 +lre22_dev_lxdgx,nbl-nbl,1 +lre22_dev_lxjij,ara-ayl,7 +lre22_dev_lxldm,tso-tso,8 +lre22_dev_lxmsa,zul-zul,11 +lre22_dev_lxugv,zul-zul,13 +lre22_dev_lxwig,tso-tso,4 +lre22_dev_lyigi,xho-xho,4 +lre22_dev_lymzv,ara-arq,6 +lre22_dev_lyuls,ara-arq,4 +lre22_dev_lyyzw,ara-ayl,5 +lre22_dev_lzhrm,ara-arq,8 +lre22_dev_lzjgb,xho-xho,12 +lre22_dev_lzrpe,xho-xho,8 +lre22_dev_lzvmq,fra-ntf,13 +lre22_dev_maagy,ven-ven,6 +lre22_dev_mabmx,ara-arq,4 
+lre22_dev_macre,zul-zul,7 +lre22_dev_maggb,nbl-nbl,7 +lre22_dev_margf,ara-ayl,6 +lre22_dev_maydg,eng-iaf,4 +lre22_dev_mbsgm,zul-zul,7 +lre22_dev_mbttd,fra-ntf,14 +lre22_dev_mcebh,tso-tso,8 +lre22_dev_mcfve,ara-ayl,3 +lre22_dev_mclrc,zul-zul,12 +lre22_dev_mcvgl,ara-ayl,5 +lre22_dev_mdgok,ara-aeb,5 +lre22_dev_mdilb,ven-ven,3 +lre22_dev_mdzqr,nbl-nbl,11 +lre22_dev_mehfu,ara-arq,3 +lre22_dev_meiyg,eng-ens,11 +lre22_dev_merbq,orm-orm,9 +lre22_dev_mfoys,afr-afr,8 +lre22_dev_mgpfx,xho-xho,8 +lre22_dev_mgtzj,zul-zul,12 +lre22_dev_mgxxc,ven-ven,11 +lre22_dev_mhldj,nbl-nbl,14 +lre22_dev_mhvio,eng-iaf,6 +lre22_dev_mhxgi,tir-tir,9 +lre22_dev_miegc,fra-ntf,6 +lre22_dev_miwyu,ara-aeb,8 +lre22_dev_mjocm,ara-aeb,2 +lre22_dev_mjqij,orm-orm,12 +lre22_dev_mjxgy,afr-afr,8 +lre22_dev_mkeyt,tir-tir,12 +lre22_dev_mklub,ven-ven,4 +lre22_dev_mknzf,ara-aeb,10 +lre22_dev_mlhes,ara-arq,9 +lre22_dev_mlhse,tso-tso,3 +lre22_dev_mlhtc,orm-orm,8 +lre22_dev_mlpuq,ven-ven,10 +lre22_dev_mluow,orm-orm,2 +lre22_dev_mmwtu,ara-arq,4 +lre22_dev_mmwzf,tso-tso,7 +lre22_dev_mnjdq,tir-tir,10 +lre22_dev_mnkfe,nbl-nbl,4 +lre22_dev_mnmcm,ara-arq,3 +lre22_dev_mocss,xho-xho,9 +lre22_dev_mohxo,zul-zul,12 +lre22_dev_mojui,fra-ntf,1 +lre22_dev_mojvy,xho-xho,7 +lre22_dev_molqa,fra-ntf,14 +lre22_dev_mopiq,nbl-nbl,14 +lre22_dev_moqto,tir-tir,12 +lre22_dev_morri,ara-aeb,8 +lre22_dev_mpxyg,eng-ens,4 +lre22_dev_mqiap,xho-xho,14 +lre22_dev_mqxep,ara-ayl,2 +lre22_dev_mrcoe,ara-ayl,7 +lre22_dev_mriiq,tso-tso,4 +lre22_dev_mryoy,eng-ens,11 +lre22_dev_mryzh,ara-arq,4 +lre22_dev_msadm,ven-ven,2 +lre22_dev_msghz,nbl-nbl,11 +lre22_dev_mtpfp,ara-aeb,9 +lre22_dev_mtqft,orm-orm,14 +lre22_dev_mtzvt,ara-aeb,10 +lre22_dev_munim,xho-xho,15 +lre22_dev_murhb,nbl-nbl,1 +lre22_dev_mvbra,xho-xho,4 +lre22_dev_mvhza,afr-afr,13 +lre22_dev_mviud,xho-xho,12 +lre22_dev_mvxjk,afr-afr,9 +lre22_dev_mwnkm,orm-orm,8 +lre22_dev_mwoml,xho-xho,9 +lre22_dev_mxhup,eng-ens,8 +lre22_dev_mykuh,ara-ayl,5 +lre22_dev_myqfn,eng-iaf,4 +lre22_dev_mywmj,ven-ven,9 +lre22_dev_mzbrr,ara-arq,10 +lre22_dev_mzsiq,afr-afr,9 +lre22_dev_mztms,eng-ens,3 +lre22_dev_mzuxc,ara-arq,9 +lre22_dev_nbdbe,ara-ayl,7 +lre22_dev_nbjqz,ara-aeb,9 +lre22_dev_nbyhp,afr-afr,3 +lre22_dev_ncnyb,ven-ven,8 +lre22_dev_ncocl,nbl-nbl,6 +lre22_dev_ndecq,ara-ayl,8 +lre22_dev_ndjsl,nbl-nbl,6 +lre22_dev_nelsk,orm-orm,0 +lre22_dev_nenly,eng-iaf,11 +lre22_dev_neqkb,ven-ven,2 +lre22_dev_nfjid,orm-orm,12 +lre22_dev_nfkqr,orm-orm,8 +lre22_dev_nfoas,orm-orm,15 +lre22_dev_ngjbm,eng-ens,10 +lre22_dev_ngmbz,eng-iaf,9 +lre22_dev_ngnua,fra-ntf,10 +lre22_dev_nguuu,fra-ntf,13 +lre22_dev_ngyse,ven-ven,7 +lre22_dev_nhfso,fra-ntf,14 +lre22_dev_nhuue,zul-zul,1 +lre22_dev_niack,ara-ayl,8 +lre22_dev_niari,ven-ven,7 +lre22_dev_nibme,ara-arq,9 +lre22_dev_nikby,tso-tso,10 +lre22_dev_nimex,ara-ayl,8 +lre22_dev_nivmv,xho-xho,11 +lre22_dev_nkebu,eng-ens,5 +lre22_dev_nkgml,eng-ens,10 +lre22_dev_nkofi,fra-ntf,11 +lre22_dev_nkrez,xho-xho,5 +lre22_dev_nkscn,tso-tso,5 +lre22_dev_nkwrs,ara-aeb,2 +lre22_dev_nkxcy,afr-afr,4 +lre22_dev_nlast,xho-xho,12 +lre22_dev_nlcun,eng-ens,0 +lre22_dev_nljyr,afr-afr,5 +lre22_dev_nlkdv,eng-iaf,12 +lre22_dev_nlpcs,ara-ayl,7 +lre22_dev_nlrcn,ara-ayl,4 +lre22_dev_nlxla,xho-xho,0 +lre22_dev_nmmij,ara-ayl,4 +lre22_dev_nmrkv,fra-ntf,12 +lre22_dev_nmufp,tso-tso,10 +lre22_dev_nnbmo,tso-tso,10 +lre22_dev_nnnpi,afr-afr,4 +lre22_dev_nnzok,tir-tir,5 +lre22_dev_noqch,fra-ntf,12 +lre22_dev_nownd,xho-xho,2 +lre22_dev_npabl,nbl-nbl,5 +lre22_dev_npjhu,afr-afr,6 +lre22_dev_nqbks,afr-afr,11 +lre22_dev_nqijo,orm-orm,7 +lre22_dev_nqljj,ara-arq,6 
+lre22_dev_nqvfr,tir-tir,7 +lre22_dev_nrtej,tir-tir,11 +lre22_dev_nshvj,nbl-nbl,7 +lre22_dev_nsmyy,tir-tir,12 +lre22_dev_nsqcm,fra-ntf,13 +lre22_dev_nstrj,nbl-nbl,9 +lre22_dev_nsvla,nbl-nbl,10 +lre22_dev_nthbx,eng-ens,0 +lre22_dev_nvwkf,ven-ven,0 +lre22_dev_nvwzy,tso-tso,11 +lre22_dev_nvyyg,orm-orm,7 +lre22_dev_nxdml,eng-ens,1 +lre22_dev_nxmxb,zul-zul,12 +lre22_dev_nxqpl,nbl-nbl,13 +lre22_dev_nxslf,fra-ntf,9 +lre22_dev_nyaof,nbl-nbl,5 +lre22_dev_nzeot,zul-zul,12 +lre22_dev_nzhhf,ara-ayl,7 +lre22_dev_nzpbh,fra-ntf,14 +lre22_dev_nzyjp,orm-orm,4 +lre22_dev_nzzyd,xho-xho,11 +lre22_dev_oaiij,ven-ven,7 +lre22_dev_oaimr,orm-orm,14 +lre22_dev_oatzl,fra-ntf,13 +lre22_dev_oaycx,ara-ayl,8 +lre22_dev_objwd,eng-ens,1 +lre22_dev_oboem,tir-tir,9 +lre22_dev_obzyj,xho-xho,5 +lre22_dev_occhn,fra-ntf,9 +lre22_dev_ocfcr,ven-ven,7 +lre22_dev_ochni,ven-ven,13 +lre22_dev_ociva,tir-tir,5 +lre22_dev_odofq,xho-xho,5 +lre22_dev_odtjr,eng-ens,11 +lre22_dev_oejjy,fra-ntf,4 +lre22_dev_offnw,afr-afr,8 +lre22_dev_ofgqs,ara-ayl,6 +lre22_dev_ofkvj,xho-xho,15 +lre22_dev_ofzhh,orm-orm,11 +lre22_dev_ogilp,afr-afr,6 +lre22_dev_oglxd,ara-ayl,4 +lre22_dev_ogoyt,tso-tso,8 +lre22_dev_ogpou,ven-ven,3 +lre22_dev_ohatz,eng-ens,10 +lre22_dev_ohlzs,nbl-nbl,15 +lre22_dev_ohpzj,tir-tir,4 +lre22_dev_ohzdt,ara-aeb,5 +lre22_dev_oicrh,eng-ens,9 +lre22_dev_oigem,orm-orm,14 +lre22_dev_ojbnw,ara-arq,4 +lre22_dev_ojebm,ven-ven,7 +lre22_dev_ojila,ara-arq,4 +lre22_dev_ojiso,fra-ntf,5 +lre22_dev_ojpdy,tso-tso,9 +lre22_dev_ojtki,tir-tir,11 +lre22_dev_ojxso,nbl-nbl,4 +lre22_dev_okdqa,fra-ntf,14 +lre22_dev_oktvp,ara-ayl,7 +lre22_dev_okvsg,zul-zul,10 +lre22_dev_okyah,tso-tso,11 +lre22_dev_olabw,ara-arq,4 +lre22_dev_omhry,tir-tir,4 +lre22_dev_omnrf,eng-iaf,13 +lre22_dev_omptm,ven-ven,6 +lre22_dev_omqfq,fra-ntf,4 +lre22_dev_onqdn,fra-ntf,13 +lre22_dev_onsyx,tso-tso,9 +lre22_dev_onvgj,tir-tir,6 +lre22_dev_onzha,zul-zul,10 +lre22_dev_ooptw,nbl-nbl,5 +lre22_dev_oowvo,eng-ens,11 +lre22_dev_ooyea,tso-tso,2 +lre22_dev_oozri,ven-ven,0 +lre22_dev_opazz,ara-ayl,1 +lre22_dev_opqkl,nbl-nbl,11 +lre22_dev_oqsva,ara-ayl,2 +lre22_dev_oquxw,nbl-nbl,15 +lre22_dev_orktv,afr-afr,5 +lre22_dev_ornjf,ara-ayl,6 +lre22_dev_ortbp,ara-arq,0 +lre22_dev_osauy,fra-ntf,12 +lre22_dev_osnch,afr-afr,1 +lre22_dev_otelo,eng-iaf,7 +lre22_dev_otewx,tso-tso,10 +lre22_dev_otnwj,eng-ens,3 +lre22_dev_ouecw,ara-aeb,10 +lre22_dev_ouzui,ara-arq,3 +lre22_dev_ovdtj,ara-ayl,6 +lre22_dev_ovjny,tso-tso,1 +lre22_dev_ovqwp,ara-ayl,7 +lre22_dev_ovvkn,afr-afr,11 +lre22_dev_ovvmi,tso-tso,2 +lre22_dev_owyeq,ara-arq,6 +lre22_dev_oxlrt,ara-aeb,10 +lre22_dev_oybst,zul-zul,9 +lre22_dev_oybua,nbl-nbl,2 +lre22_dev_oykjs,tso-tso,4 +lre22_dev_oyswm,ara-arq,8 +lre22_dev_oyxbj,ven-ven,8 +lre22_dev_oyxtq,eng-ens,11 +lre22_dev_oyyxh,ara-arq,8 +lre22_dev_ozbct,tir-tir,12 +lre22_dev_ozcvt,ara-aeb,10 +lre22_dev_ozjel,ara-arq,10 +lre22_dev_ozmuj,zul-zul,3 +lre22_dev_ozuvk,tir-tir,10 +lre22_dev_paguh,fra-ntf,1 +lre22_dev_paspj,tir-tir,6 +lre22_dev_pbmai,fra-ntf,6 +lre22_dev_pbpug,zul-zul,10 +lre22_dev_pbsbs,tso-tso,10 +lre22_dev_pbszl,tso-tso,1 +lre22_dev_pbxxf,eng-iaf,2 +lre22_dev_pcgvn,eng-iaf,3 +lre22_dev_pcmbn,eng-ens,1 +lre22_dev_pcqce,ara-arq,8 +lre22_dev_pdlnr,tso-tso,2 +lre22_dev_pdrus,orm-orm,1 +lre22_dev_pedyx,eng-iaf,12 +lre22_dev_pegyr,nbl-nbl,11 +lre22_dev_pesej,ara-arq,4 +lre22_dev_pevhh,tir-tir,12 +lre22_dev_peykl,xho-xho,13 +lre22_dev_pezwc,tso-tso,4 +lre22_dev_pfemh,eng-iaf,4 +lre22_dev_pfrfc,ven-ven,8 +lre22_dev_pfsoa,nbl-nbl,15 +lre22_dev_pgeoo,tso-tso,9 +lre22_dev_pgwei,orm-orm,2 
+lre22_dev_pgxyv,tso-tso,4 +lre22_dev_phofb,ara-ayl,8 +lre22_dev_phula,nbl-nbl,14 +lre22_dev_phwnf,tso-tso,9 +lre22_dev_pifyx,orm-orm,9 +lre22_dev_pilvp,tso-tso,11 +lre22_dev_pinzj,nbl-nbl,11 +lre22_dev_piocw,ara-aeb,8 +lre22_dev_pipas,zul-zul,13 +lre22_dev_pipgo,afr-afr,3 +lre22_dev_pitmn,ara-arq,10 +lre22_dev_pizdz,ara-aeb,2 +lre22_dev_pizlx,ara-ayl,6 +lre22_dev_pjatg,ven-ven,9 +lre22_dev_pjavt,orm-orm,11 +lre22_dev_pjcec,eng-iaf,12 +lre22_dev_pjdwy,afr-afr,1 +lre22_dev_pjlmw,ara-ayl,7 +lre22_dev_pjsqe,eng-ens,7 +lre22_dev_pkdij,ara-ayl,3 +lre22_dev_pkekq,ara-aeb,3 +lre22_dev_pkpst,eng-iaf,9 +lre22_dev_plhqb,nbl-nbl,13 +lre22_dev_plowv,nbl-nbl,5 +lre22_dev_plrjb,xho-xho,12 +lre22_dev_pmove,eng-iaf,4 +lre22_dev_pneax,eng-ens,11 +lre22_dev_pnexr,nbl-nbl,9 +lre22_dev_pngea,nbl-nbl,11 +lre22_dev_pnipe,eng-ens,9 +lre22_dev_pnmlr,ara-arq,5 +lre22_dev_pnsuk,xho-xho,2 +lre22_dev_pnuct,tir-tir,10 +lre22_dev_pocev,ara-arq,4 +lre22_dev_powkd,eng-ens,9 +lre22_dev_pprvm,ara-ayl,7 +lre22_dev_ppyle,ara-aeb,7 +lre22_dev_pqfda,fra-ntf,5 +lre22_dev_pqryo,afr-afr,4 +lre22_dev_prrzc,afr-afr,9 +lre22_dev_psjuf,afr-afr,13 +lre22_dev_psngm,zul-zul,13 +lre22_dev_psroz,fra-ntf,13 +lre22_dev_pssqo,orm-orm,10 +lre22_dev_psvlh,fra-ntf,13 +lre22_dev_pswld,tir-tir,10 +lre22_dev_ptcns,nbl-nbl,11 +lre22_dev_ptobm,afr-afr,6 +lre22_dev_ptowg,tir-tir,8 +lre22_dev_ptreu,xho-xho,15 +lre22_dev_ptwru,fra-ntf,14 +lre22_dev_ptyff,ara-ayl,1 +lre22_dev_ptygm,tir-tir,3 +lre22_dev_pudne,ara-arq,4 +lre22_dev_puelp,zul-zul,9 +lre22_dev_purej,nbl-nbl,9 +lre22_dev_puyvb,ara-ayl,3 +lre22_dev_pvrdh,ara-aeb,9 +lre22_dev_pvryr,eng-ens,11 +lre22_dev_pwets,tir-tir,9 +lre22_dev_pwgnk,tir-tir,10 +lre22_dev_pwhyy,tir-tir,11 +lre22_dev_pwkgs,zul-zul,2 +lre22_dev_pwtdp,eng-iaf,0 +lre22_dev_pxccc,ara-ayl,5 +lre22_dev_pxpdo,xho-xho,14 +lre22_dev_pxsot,xho-xho,14 +lre22_dev_pxuhy,ara-aeb,6 +lre22_dev_pybxn,eng-iaf,11 +lre22_dev_pyoft,eng-iaf,12 +lre22_dev_pyvql,eng-iaf,7 +lre22_dev_pzcnz,nbl-nbl,2 +lre22_dev_pzhrk,ara-aeb,4 +lre22_dev_qadjy,ven-ven,7 +lre22_dev_qaeek,ven-ven,7 +lre22_dev_qafse,eng-iaf,11 +lre22_dev_qahft,ven-ven,13 +lre22_dev_qakoa,zul-zul,9 +lre22_dev_qalhd,ara-ayl,2 +lre22_dev_qazjh,ven-ven,11 +lre22_dev_qbfkw,eng-iaf,6 +lre22_dev_qbgcd,fra-ntf,14 +lre22_dev_qbisr,ara-ayl,3 +lre22_dev_qcnbm,ven-ven,3 +lre22_dev_qdcbb,tir-tir,5 +lre22_dev_qdfgi,zul-zul,12 +lre22_dev_qdmbj,eng-ens,4 +lre22_dev_qdwtg,fra-ntf,11 +lre22_dev_qefvt,ara-ayl,7 +lre22_dev_qffki,orm-orm,13 +lre22_dev_qfplk,tir-tir,8 +lre22_dev_qgxdl,xho-xho,14 +lre22_dev_qhadd,afr-afr,2 +lre22_dev_qhgaf,ara-ayl,7 +lre22_dev_qhinf,tir-tir,6 +lre22_dev_qhkjz,ara-aeb,6 +lre22_dev_qhlwj,ara-arq,8 +lre22_dev_qiarf,ara-arq,4 +lre22_dev_qidwl,ara-arq,5 +lre22_dev_qivzc,orm-orm,12 +lre22_dev_qizyt,ara-ayl,2 +lre22_dev_qjeue,ara-arq,9 +lre22_dev_qjgxh,ara-arq,1 +lre22_dev_qkdhb,afr-afr,1 +lre22_dev_qkiqi,orm-orm,4 +lre22_dev_qkoth,tir-tir,5 +lre22_dev_qkucq,fra-ntf,3 +lre22_dev_qltea,nbl-nbl,2 +lre22_dev_qlube,ara-aeb,5 +lre22_dev_qmcji,nbl-nbl,15 +lre22_dev_qmpzc,nbl-nbl,11 +lre22_dev_qmsog,tir-tir,3 +lre22_dev_qoech,eng-iaf,7 +lre22_dev_qovfg,ara-arq,10 +lre22_dev_qozzv,tir-tir,2 +lre22_dev_qpasx,tir-tir,3 +lre22_dev_qpauj,ara-aeb,4 +lre22_dev_qpfch,orm-orm,6 +lre22_dev_qpvea,orm-orm,9 +lre22_dev_qrgka,ara-arq,8 +lre22_dev_qrqmm,ara-ayl,7 +lre22_dev_qsaol,xho-xho,14 +lre22_dev_qsgpx,ara-arq,10 +lre22_dev_qspeg,eng-ens,7 +lre22_dev_qsvbe,fra-ntf,3 +lre22_dev_qsxoh,fra-ntf,5 +lre22_dev_qtbnc,xho-xho,7 +lre22_dev_qthzi,afr-afr,12 +lre22_dev_qtmaw,fra-ntf,13 +lre22_dev_qtnqh,eng-iaf,13 
+lre22_dev_qtpsb,tso-tso,8 +lre22_dev_qtqpc,eng-iaf,12 +lre22_dev_qtwfv,eng-iaf,4 +lre22_dev_qvamq,fra-ntf,9 +lre22_dev_qveuq,tir-tir,9 +lre22_dev_qvffg,orm-orm,0 +lre22_dev_qvplf,xho-xho,6 +lre22_dev_qvqvi,ven-ven,7 +lre22_dev_qwhsh,afr-afr,7 +lre22_dev_qwiwm,eng-ens,9 +lre22_dev_qxbch,ara-aeb,9 +lre22_dev_qxlca,nbl-nbl,2 +lre22_dev_qxscb,afr-afr,2 +lre22_dev_qyoqn,fra-ntf,9 +lre22_dev_qyrgs,nbl-nbl,3 +lre22_dev_qytdl,fra-ntf,9 +lre22_dev_qyyeb,eng-iaf,12 +lre22_dev_qyzqb,tso-tso,8 +lre22_dev_qzayi,orm-orm,12 +lre22_dev_qzexr,eng-iaf,5 +lre22_dev_qzrfi,ara-arq,10 +lre22_dev_qztjh,orm-orm,3 +lre22_dev_qztze,eng-iaf,12 +lre22_dev_raent,eng-iaf,2 +lre22_dev_ragjh,orm-orm,14 +lre22_dev_ramzu,ara-ayl,6 +lre22_dev_ratmr,ven-ven,7 +lre22_dev_rawak,ara-arq,9 +lre22_dev_rbbne,ven-ven,7 +lre22_dev_rbcul,eng-iaf,10 +lre22_dev_rbsoy,eng-iaf,12 +lre22_dev_rbxqy,tso-tso,9 +lre22_dev_rcejf,xho-xho,7 +lre22_dev_rdbzt,zul-zul,7 +lre22_dev_rdhpu,ara-aeb,8 +lre22_dev_rdsew,ven-ven,2 +lre22_dev_rdtkf,ven-ven,11 +lre22_dev_reeba,ara-ayl,6 +lre22_dev_relip,eng-iaf,11 +lre22_dev_rfdoh,ara-aeb,9 +lre22_dev_rfkja,xho-xho,11 +lre22_dev_rflev,ven-ven,3 +lre22_dev_rfqcx,nbl-nbl,14 +lre22_dev_rfwuv,eng-ens,1 +lre22_dev_rgsil,fra-ntf,6 +lre22_dev_rhcuj,ara-aeb,8 +lre22_dev_rhdgz,eng-iaf,12 +lre22_dev_rhpmn,ven-ven,7 +lre22_dev_rhtoe,eng-iaf,11 +lre22_dev_rhyqq,ara-aeb,2 +lre22_dev_riltn,ara-aeb,10 +lre22_dev_rinti,xho-xho,12 +lre22_dev_rioxh,xho-xho,12 +lre22_dev_ripix,tir-tir,10 +lre22_dev_rjbji,ven-ven,10 +lre22_dev_rjqbz,eng-iaf,0 +lre22_dev_rkemd,tir-tir,8 +lre22_dev_rktzl,nbl-nbl,13 +lre22_dev_rkuni,xho-xho,15 +lre22_dev_rlsgd,fra-ntf,5 +lre22_dev_rlypa,afr-afr,7 +lre22_dev_rmeav,ven-ven,8 +lre22_dev_rmejy,fra-ntf,12 +lre22_dev_rmeuz,zul-zul,6 +lre22_dev_rmjsj,nbl-nbl,5 +lre22_dev_rmtxj,eng-iaf,13 +lre22_dev_rnpyc,ara-ayl,2 +lre22_dev_rnunw,orm-orm,9 +lre22_dev_rnvvw,tso-tso,9 +lre22_dev_roavh,fra-ntf,6 +lre22_dev_rodbi,xho-xho,15 +lre22_dev_roeph,xho-xho,13 +lre22_dev_rolun,ara-ayl,3 +lre22_dev_roydh,xho-xho,7 +lre22_dev_rpajy,ara-aeb,8 +lre22_dev_rpdsm,ara-ayl,5 +lre22_dev_rpfae,afr-afr,9 +lre22_dev_rpvyc,eng-iaf,9 +lre22_dev_rqxot,tso-tso,9 +lre22_dev_rumiv,ara-aeb,9 +lre22_dev_runhh,afr-afr,6 +lre22_dev_ruvpd,eng-iaf,4 +lre22_dev_rvpkd,fra-ntf,1 +lre22_dev_rvqxq,orm-orm,12 +lre22_dev_rvstc,ara-arq,7 +lre22_dev_rwbea,tir-tir,9 +lre22_dev_rweyk,nbl-nbl,2 +lre22_dev_rwnfb,eng-ens,8 +lre22_dev_rwrhn,afr-afr,11 +lre22_dev_rxhkp,ara-arq,3 +lre22_dev_rxixz,nbl-nbl,15 +lre22_dev_rxmft,zul-zul,7 +lre22_dev_ryknh,ara-ayl,5 +lre22_dev_rytyf,zul-zul,12 +lre22_dev_rywss,tso-tso,1 +lre22_dev_rzjrd,nbl-nbl,7 +lre22_dev_rzpyx,tso-tso,2 +lre22_dev_satbk,ven-ven,7 +lre22_dev_sbfhc,fra-ntf,6 +lre22_dev_sboxi,xho-xho,15 +lre22_dev_scxxn,eng-iaf,5 +lre22_dev_scyvp,ara-aeb,6 +lre22_dev_sdbou,tir-tir,10 +lre22_dev_sddua,tir-tir,11 +lre22_dev_seasj,afr-afr,7 +lre22_dev_sevcw,tir-tir,12 +lre22_dev_sfevx,tso-tso,4 +lre22_dev_sfqgm,fra-ntf,1 +lre22_dev_sgaza,ara-aeb,8 +lre22_dev_sgkrh,afr-afr,9 +lre22_dev_sgmjh,nbl-nbl,14 +lre22_dev_shafn,ven-ven,8 +lre22_dev_shaob,orm-orm,10 +lre22_dev_shnns,afr-afr,6 +lre22_dev_siprc,ven-ven,7 +lre22_dev_sisge,afr-afr,13 +lre22_dev_siuwu,ara-arq,10 +lre22_dev_sivik,fra-ntf,2 +lre22_dev_sjyoo,afr-afr,1 +lre22_dev_skacz,fra-ntf,13 +lre22_dev_skcai,orm-orm,12 +lre22_dev_skctw,nbl-nbl,0 +lre22_dev_skygk,afr-afr,13 +lre22_dev_slraf,ara-aeb,6 +lre22_dev_slrzl,eng-ens,11 +lre22_dev_sltzh,xho-xho,6 +lre22_dev_sluki,ven-ven,1 +lre22_dev_slyez,tso-tso,8 +lre22_dev_slzuh,xho-xho,15 +lre22_dev_smdsm,nbl-nbl,7 
+lre22_dev_smhae,ara-ayl,3 +lre22_dev_smxhe,ara-aeb,10 +lre22_dev_snayr,afr-afr,2 +lre22_dev_snbxs,eng-ens,8 +lre22_dev_sngol,tso-tso,9 +lre22_dev_snhun,fra-ntf,13 +lre22_dev_snkib,ven-ven,8 +lre22_dev_snqld,eng-iaf,2 +lre22_dev_sntvb,eng-ens,11 +lre22_dev_snzbl,tir-tir,12 +lre22_dev_sobid,afr-afr,3 +lre22_dev_soknx,orm-orm,15 +lre22_dev_spesw,ven-ven,13 +lre22_dev_sphuq,eng-iaf,12 +lre22_dev_spqcy,xho-xho,11 +lre22_dev_sqcyu,zul-zul,9 +lre22_dev_sqdkr,eng-iaf,13 +lre22_dev_sqfnt,ara-aeb,9 +lre22_dev_sqhrr,eng-ens,11 +lre22_dev_sqyiu,ara-ayl,4 +lre22_dev_srbwp,ara-aeb,10 +lre22_dev_srokn,afr-afr,6 +lre22_dev_srzck,ara-ayl,3 +lre22_dev_ssbei,tso-tso,10 +lre22_dev_ssfmz,eng-iaf,12 +lre22_dev_ssmgk,xho-xho,10 +lre22_dev_ssmsy,xho-xho,4 +lre22_dev_stgcb,afr-afr,10 +lre22_dev_stihb,afr-afr,0 +lre22_dev_stkav,ara-aeb,9 +lre22_dev_stkrw,xho-xho,3 +lre22_dev_sttnk,fra-ntf,8 +lre22_dev_stwkk,eng-iaf,12 +lre22_dev_stwrt,nbl-nbl,1 +lre22_dev_subio,afr-afr,1 +lre22_dev_sumjk,ara-arq,6 +lre22_dev_suocb,nbl-nbl,6 +lre22_dev_svcbx,tso-tso,9 +lre22_dev_svllg,fra-ntf,14 +lre22_dev_svvqs,afr-afr,3 +lre22_dev_svxyz,ara-ayl,1 +lre22_dev_swhlf,ara-aeb,10 +lre22_dev_swhnk,fra-ntf,12 +lre22_dev_swnrg,ven-ven,12 +lre22_dev_swofz,zul-zul,4 +lre22_dev_swuls,tso-tso,8 +lre22_dev_sxfkn,ara-aeb,2 +lre22_dev_sycoz,tir-tir,10 +lre22_dev_syoek,fra-ntf,5 +lre22_dev_sypnb,ven-ven,13 +lre22_dev_syvrt,eng-iaf,8 +lre22_dev_szmoc,ven-ven,6 +lre22_dev_szmwp,eng-ens,8 +lre22_dev_talec,ven-ven,11 +lre22_dev_tasfs,ven-ven,7 +lre22_dev_tbbrr,xho-xho,5 +lre22_dev_tbcun,ara-aeb,3 +lre22_dev_tbhnw,nbl-nbl,15 +lre22_dev_tblhf,ven-ven,12 +lre22_dev_tbozq,xho-xho,1 +lre22_dev_tcckd,ara-ayl,3 +lre22_dev_tcele,tso-tso,11 +lre22_dev_tciob,tso-tso,10 +lre22_dev_tcpxj,tir-tir,9 +lre22_dev_tdejo,tir-tir,6 +lre22_dev_tdfqo,tso-tso,0 +lre22_dev_tdhhf,zul-zul,10 +lre22_dev_tdjje,ven-ven,10 +lre22_dev_tdkrp,orm-orm,6 +lre22_dev_tebop,tso-tso,10 +lre22_dev_teeqm,ven-ven,6 +lre22_dev_tejsn,tir-tir,12 +lre22_dev_teptc,ara-arq,10 +lre22_dev_tetmt,orm-orm,9 +lre22_dev_tfkij,ara-aeb,2 +lre22_dev_tfnin,tir-tir,3 +lre22_dev_tfyqz,tir-tir,3 +lre22_dev_tgbui,ara-aeb,5 +lre22_dev_tgixi,xho-xho,13 +lre22_dev_tgmud,eng-iaf,6 +lre22_dev_tgult,eng-ens,2 +lre22_dev_thcjv,tso-tso,5 +lre22_dev_thzir,eng-ens,11 +lre22_dev_tisfm,fra-ntf,9 +lre22_dev_tixou,xho-xho,2 +lre22_dev_tiyuw,afr-afr,5 +lre22_dev_tjdcc,afr-afr,13 +lre22_dev_tjikt,zul-zul,12 +lre22_dev_tjpdw,ara-arq,8 +lre22_dev_tkadi,ven-ven,12 +lre22_dev_tkcbm,afr-afr,6 +lre22_dev_tkgfw,eng-ens,11 +lre22_dev_tkiks,ara-aeb,6 +lre22_dev_tlgzi,xho-xho,1 +lre22_dev_tlhlw,tir-tir,6 +lre22_dev_tloqn,afr-afr,6 +lre22_dev_tmcje,eng-ens,4 +lre22_dev_tmjpw,eng-iaf,2 +lre22_dev_tmxtu,ven-ven,2 +lre22_dev_tngwh,tir-tir,8 +lre22_dev_tnqdv,ara-aeb,9 +lre22_dev_tnqro,xho-xho,15 +lre22_dev_tnqzy,orm-orm,7 +lre22_dev_tnskm,xho-xho,12 +lre22_dev_tnvhc,ven-ven,12 +lre22_dev_tofhy,zul-zul,6 +lre22_dev_tohkd,zul-zul,9 +lre22_dev_tonqb,ven-ven,6 +lre22_dev_tpbib,tso-tso,1 +lre22_dev_tpejq,ara-arq,3 +lre22_dev_tpfir,eng-ens,11 +lre22_dev_tphgn,zul-zul,12 +lre22_dev_tpidd,ara-arq,6 +lre22_dev_tpkce,eng-ens,11 +lre22_dev_tpszi,orm-orm,15 +lre22_dev_tpwcn,eng-iaf,6 +lre22_dev_trdfy,ara-ayl,3 +lre22_dev_tsbms,ara-ayl,4 +lre22_dev_tslui,tso-tso,6 +lre22_dev_tsvvy,zul-zul,10 +lre22_dev_tsyey,xho-xho,10 +lre22_dev_ttlco,eng-iaf,12 +lre22_dev_tubpr,orm-orm,13 +lre22_dev_tugpl,eng-ens,9 +lre22_dev_tuoiq,tir-tir,4 +lre22_dev_tuxfx,zul-zul,3 +lre22_dev_tvahj,tir-tir,9 +lre22_dev_tvewc,eng-iaf,3 +lre22_dev_tvfvc,ara-ayl,8 
+lre22_dev_tvkod,xho-xho,5 +lre22_dev_tvkwe,zul-zul,9 +lre22_dev_tvopo,xho-xho,12 +lre22_dev_tvqui,eng-ens,7 +lre22_dev_tvsbw,ara-arq,6 +lre22_dev_tvxvk,ven-ven,8 +lre22_dev_twbkf,nbl-nbl,9 +lre22_dev_twfot,ara-arq,6 +lre22_dev_twkns,ara-ayl,4 +lre22_dev_twuvf,eng-ens,10 +lre22_dev_txahv,eng-ens,8 +lre22_dev_txcob,ara-aeb,6 +lre22_dev_txnvi,zul-zul,3 +lre22_dev_txurh,afr-afr,7 +lre22_dev_txzkl,ara-arq,5 +lre22_dev_tyfad,tso-tso,7 +lre22_dev_tyhwp,ara-aeb,8 +lre22_dev_tzism,tir-tir,12 +lre22_dev_tzsfj,tir-tir,12 +lre22_dev_tzwof,eng-iaf,9 +lre22_dev_uahzm,afr-afr,5 +lre22_dev_uajwt,tso-tso,7 +lre22_dev_uanlr,zul-zul,13 +lre22_dev_uaoju,zul-zul,8 +lre22_dev_uaryk,xho-xho,15 +lre22_dev_ubfaf,ven-ven,12 +lre22_dev_ucbje,ara-aeb,8 +lre22_dev_ucrpa,ara-arq,3 +lre22_dev_udtzx,eng-iaf,7 +lre22_dev_uduja,fra-ntf,6 +lre22_dev_udxpl,tso-tso,2 +lre22_dev_uesmx,eng-iaf,5 +lre22_dev_ufewk,eng-iaf,8 +lre22_dev_ugjxy,tir-tir,4 +lre22_dev_ugsxl,eng-ens,3 +lre22_dev_ugvov,tso-tso,8 +lre22_dev_uhmdw,tso-tso,10 +lre22_dev_uhqng,nbl-nbl,12 +lre22_dev_uhymw,tir-tir,8 +lre22_dev_uhzmr,eng-ens,2 +lre22_dev_uimtg,ara-ayl,4 +lre22_dev_uirdr,nbl-nbl,13 +lre22_dev_uiszj,ara-aeb,8 +lre22_dev_ujada,ara-ayl,9 +lre22_dev_ujmqw,ven-ven,4 +lre22_dev_ujswr,afr-afr,11 +lre22_dev_ujvve,xho-xho,10 +lre22_dev_ukfha,ara-ayl,6 +lre22_dev_ukkpr,eng-ens,10 +lre22_dev_ukpdg,fra-ntf,13 +lre22_dev_ukpoy,nbl-nbl,15 +lre22_dev_uktod,ara-ayl,4 +lre22_dev_uktvh,zul-zul,13 +lre22_dev_ukuwo,ara-ayl,5 +lre22_dev_ukynv,zul-zul,12 +lre22_dev_ulepv,ara-ayl,5 +lre22_dev_ulgtj,zul-zul,7 +lre22_dev_ulofk,eng-iaf,11 +lre22_dev_uluog,ara-arq,3 +lre22_dev_umbpy,zul-zul,13 +lre22_dev_umjzo,tso-tso,5 +lre22_dev_uncdb,ara-arq,9 +lre22_dev_unffr,ara-ayl,8 +lre22_dev_unpif,eng-ens,9 +lre22_dev_uoikj,eng-iaf,13 +lre22_dev_uopfp,nbl-nbl,7 +lre22_dev_upenl,eng-iaf,13 +lre22_dev_uphuw,xho-xho,11 +lre22_dev_upkbw,ara-ayl,4 +lre22_dev_uplen,xho-xho,9 +lre22_dev_upqod,orm-orm,6 +lre22_dev_upspe,afr-afr,12 +lre22_dev_uqnkk,tir-tir,12 +lre22_dev_uqvxc,eng-ens,0 +lre22_dev_urgqx,ara-ayl,8 +lre22_dev_urkgk,tir-tir,12 +lre22_dev_uscky,xho-xho,3 +lre22_dev_usiwx,tir-tir,9 +lre22_dev_usnzj,zul-zul,5 +lre22_dev_usopt,xho-xho,8 +lre22_dev_uswgv,nbl-nbl,11 +lre22_dev_uszcb,ara-arq,4 +lre22_dev_utahf,ara-ayl,7 +lre22_dev_utaxq,tso-tso,9 +lre22_dev_utcwb,afr-afr,10 +lre22_dev_uuhry,tir-tir,9 +lre22_dev_uuprr,eng-ens,7 +lre22_dev_uuvqh,zul-zul,2 +lre22_dev_uwcmh,orm-orm,4 +lre22_dev_uwiev,zul-zul,13 +lre22_dev_uwjzb,ven-ven,10 +lre22_dev_uwony,orm-orm,1 +lre22_dev_uwqeq,orm-orm,2 +lre22_dev_uwvfl,nbl-nbl,5 +lre22_dev_uxdjn,xho-xho,12 +lre22_dev_uxqte,zul-zul,13 +lre22_dev_uxryh,ven-ven,11 +lre22_dev_uyhzp,orm-orm,15 +lre22_dev_uyrjl,tso-tso,10 +lre22_dev_uyzcl,eng-ens,11 +lre22_dev_uzbqz,fra-ntf,4 +lre22_dev_uzoxq,ara-aeb,9 +lre22_dev_vabxl,nbl-nbl,11 +lre22_dev_vafyo,nbl-nbl,15 +lre22_dev_vascl,nbl-nbl,0 +lre22_dev_vauqx,ara-arq,10 +lre22_dev_vbscm,xho-xho,3 +lre22_dev_vbulh,xho-xho,12 +lre22_dev_vbwwp,xho-xho,15 +lre22_dev_vbznk,ara-arq,6 +lre22_dev_vcibu,nbl-nbl,9 +lre22_dev_vcjun,zul-zul,12 +lre22_dev_vckxt,xho-xho,7 +lre22_dev_vdkjy,fra-ntf,14 +lre22_dev_vdmyt,ara-ayl,0 +lre22_dev_vdoif,ven-ven,13 +lre22_dev_vdvjv,orm-orm,12 +lre22_dev_vebet,ara-aeb,1 +lre22_dev_velkr,ara-aeb,1 +lre22_dev_vgbmm,tir-tir,9 +lre22_dev_vgucw,nbl-nbl,7 +lre22_dev_vhiyb,afr-afr,9 +lre22_dev_vhoej,tir-tir,5 +lre22_dev_vhryd,orm-orm,13 +lre22_dev_vhzdh,tso-tso,10 +lre22_dev_viapx,tso-tso,3 +lre22_dev_vifdj,ara-ayl,4 +lre22_dev_vijbo,zul-zul,12 +lre22_dev_virnr,eng-ens,6 
+lre22_dev_vjhbd,orm-orm,6 +lre22_dev_vjoca,ara-aeb,10 +lre22_dev_vjtou,eng-ens,5 +lre22_dev_vjxpv,ara-aeb,10 +lre22_dev_vkmab,fra-ntf,2 +lre22_dev_vkrvz,tir-tir,8 +lre22_dev_vkwwf,tso-tso,9 +lre22_dev_vlbdk,zul-zul,6 +lre22_dev_vliie,orm-orm,9 +lre22_dev_vlrve,eng-iaf,2 +lre22_dev_vmaet,tir-tir,3 +lre22_dev_vmdhi,eng-ens,10 +lre22_dev_vmdjw,nbl-nbl,13 +lre22_dev_vmjut,fra-ntf,9 +lre22_dev_vmrrg,eng-ens,3 +lre22_dev_vnjxn,nbl-nbl,7 +lre22_dev_vnmxm,ven-ven,12 +lre22_dev_vnykj,zul-zul,10 +lre22_dev_vovab,zul-zul,11 +lre22_dev_vovvl,zul-zul,11 +lre22_dev_vpcey,tir-tir,6 +lre22_dev_vpodd,nbl-nbl,11 +lre22_dev_vptke,eng-ens,4 +lre22_dev_vpulr,xho-xho,15 +lre22_dev_vpuve,tir-tir,8 +lre22_dev_vqttr,eng-iaf,12 +lre22_dev_vqzae,eng-iaf,11 +lre22_dev_vrnsg,tso-tso,8 +lre22_dev_vshpc,ara-aeb,6 +lre22_dev_vslbh,ara-arq,9 +lre22_dev_vsmaz,tir-tir,5 +lre22_dev_vsnez,tso-tso,8 +lre22_dev_vsnjp,fra-ntf,14 +lre22_dev_vsocn,ven-ven,7 +lre22_dev_vsvom,afr-afr,8 +lre22_dev_vtnfc,tir-tir,4 +lre22_dev_vtnlb,eng-ens,4 +lre22_dev_vubwb,eng-ens,8 +lre22_dev_vufsn,ara-aeb,3 +lre22_dev_vuiqu,tir-tir,8 +lre22_dev_vumeq,xho-xho,0 +lre22_dev_vupse,ven-ven,6 +lre22_dev_vvauz,xho-xho,14 +lre22_dev_vvfze,eng-ens,11 +lre22_dev_vviyr,zul-zul,12 +lre22_dev_vvwiq,fra-ntf,5 +lre22_dev_vwnkj,zul-zul,5 +lre22_dev_vwoww,orm-orm,7 +lre22_dev_vwtne,afr-afr,5 +lre22_dev_vwxgt,ara-arq,10 +lre22_dev_vxabl,eng-ens,8 +lre22_dev_vxnsl,afr-afr,7 +lre22_dev_vxslj,tir-tir,10 +lre22_dev_vxsvc,tir-tir,11 +lre22_dev_vxuiz,ara-aeb,10 +lre22_dev_vzarl,ara-ayl,7 +lre22_dev_vzeew,ven-ven,6 +lre22_dev_vzjtc,ara-arq,0 +lre22_dev_vzkdb,tso-tso,10 +lre22_dev_vzvpq,ara-arq,9 +lre22_dev_waqyh,xho-xho,15 +lre22_dev_wawwu,xho-xho,14 +lre22_dev_wbgqi,tso-tso,11 +lre22_dev_wcctp,eng-ens,10 +lre22_dev_wdcer,afr-afr,3 +lre22_dev_wdeor,fra-ntf,14 +lre22_dev_wdfdd,eng-iaf,2 +lre22_dev_wdkvb,eng-ens,11 +lre22_dev_wdogx,ara-aeb,7 +lre22_dev_wdqdq,ara-arq,10 +lre22_dev_wdxwu,tir-tir,5 +lre22_dev_weaek,ara-arq,4 +lre22_dev_wefui,tso-tso,10 +lre22_dev_wehjh,tir-tir,10 +lre22_dev_weypz,nbl-nbl,12 +lre22_dev_wffdy,zul-zul,12 +lre22_dev_wffgq,tso-tso,8 +lre22_dev_wfvlh,ven-ven,8 +lre22_dev_wgago,eng-ens,5 +lre22_dev_wglzd,afr-afr,11 +lre22_dev_wgsbu,afr-afr,5 +lre22_dev_whdhw,nbl-nbl,7 +lre22_dev_whogu,eng-iaf,13 +lre22_dev_whpee,tso-tso,9 +lre22_dev_whqpd,ara-aeb,9 +lre22_dev_wikrr,ven-ven,11 +lre22_dev_witju,fra-ntf,11 +lre22_dev_wjcme,orm-orm,10 +lre22_dev_wkare,ara-arq,2 +lre22_dev_wkbfe,afr-afr,9 +lre22_dev_wkecn,xho-xho,13 +lre22_dev_wkhxo,afr-afr,9 +lre22_dev_wlgae,ara-arq,6 +lre22_dev_wlnls,eng-iaf,7 +lre22_dev_wlsxb,eng-ens,1 +lre22_dev_wlwuc,nbl-nbl,8 +lre22_dev_wnaqr,nbl-nbl,9 +lre22_dev_wndpq,fra-ntf,13 +lre22_dev_wnkdc,ara-ayl,2 +lre22_dev_wnknc,nbl-nbl,9 +lre22_dev_wnppz,orm-orm,15 +lre22_dev_wpzgm,afr-afr,13 +lre22_dev_wqhqj,ara-ayl,9 +lre22_dev_wqreb,afr-afr,11 +lre22_dev_wqrez,eng-ens,4 +lre22_dev_wqtsf,ara-arq,8 +lre22_dev_wqwtc,orm-orm,3 +lre22_dev_wrfwf,ven-ven,7 +lre22_dev_wrqqt,orm-orm,15 +lre22_dev_wrutf,afr-afr,7 +lre22_dev_wrvzk,nbl-nbl,1 +lre22_dev_wrxly,fra-ntf,13 +lre22_dev_wsbiw,ara-aeb,8 +lre22_dev_wshay,zul-zul,8 +lre22_dev_wsous,tso-tso,5 +lre22_dev_wszpj,ven-ven,7 +lre22_dev_wtksi,afr-afr,8 +lre22_dev_wugbw,xho-xho,6 +lre22_dev_wujfv,afr-afr,11 +lre22_dev_wuwek,xho-xho,12 +lre22_dev_wvhhk,fra-ntf,2 +lre22_dev_wvosz,nbl-nbl,3 +lre22_dev_wwagu,xho-xho,14 +lre22_dev_wwbuj,eng-iaf,2 +lre22_dev_wwgnr,afr-afr,10 +lre22_dev_wwjev,afr-afr,12 +lre22_dev_wwmsu,ara-arq,4 +lre22_dev_wwrmy,ven-ven,7 +lre22_dev_wwvhd,ara-arq,9 
+lre22_dev_wxdjv,ara-ayl,6 +lre22_dev_wygox,tir-tir,6 +lre22_dev_wyhuq,zul-zul,13 +lre22_dev_wzoir,xho-xho,15 +lre22_dev_wzvwa,orm-orm,6 +lre22_dev_xapvn,tso-tso,8 +lre22_dev_xarkl,eng-ens,5 +lre22_dev_xavhh,nbl-nbl,10 +lre22_dev_xazuy,orm-orm,3 +lre22_dev_xbnft,eng-iaf,0 +lre22_dev_xbqbc,fra-ntf,7 +lre22_dev_xbzfw,tir-tir,11 +lre22_dev_xccde,ara-arq,3 +lre22_dev_xcdty,zul-zul,8 +lre22_dev_xcjkb,ara-ayl,7 +lre22_dev_xcmty,ara-arq,10 +lre22_dev_xcsbc,tso-tso,1 +lre22_dev_xdkjb,nbl-nbl,11 +lre22_dev_xdknq,nbl-nbl,11 +lre22_dev_xdoik,eng-ens,10 +lre22_dev_xdtyd,nbl-nbl,4 +lre22_dev_xearl,eng-iaf,3 +lre22_dev_xedqa,nbl-nbl,11 +lre22_dev_xefnx,eng-ens,11 +lre22_dev_xeipr,tir-tir,11 +lre22_dev_xekhs,zul-zul,9 +lre22_dev_xelzr,ara-aeb,9 +lre22_dev_xenhb,ara-aeb,3 +lre22_dev_xfdsx,xho-xho,12 +lre22_dev_xfggl,xho-xho,9 +lre22_dev_xgspz,eng-iaf,13 +lre22_dev_xgwmu,tso-tso,8 +lre22_dev_xhbmk,orm-orm,15 +lre22_dev_xhdtl,orm-orm,3 +lre22_dev_xisjn,ara-arq,8 +lre22_dev_xitdz,nbl-nbl,10 +lre22_dev_xizbg,xho-xho,14 +lre22_dev_xjcph,xho-xho,10 +lre22_dev_xjcvd,zul-zul,7 +lre22_dev_xjlgm,ara-aeb,3 +lre22_dev_xjxzy,eng-ens,2 +lre22_dev_xkfsd,ven-ven,12 +lre22_dev_xkktj,eng-iaf,12 +lre22_dev_xkmmy,ara-aeb,10 +lre22_dev_xltgz,ara-ayl,5 +lre22_dev_xmbby,orm-orm,3 +lre22_dev_xmcmv,xho-xho,14 +lre22_dev_xngam,fra-ntf,14 +lre22_dev_xnsev,ara-ayl,8 +lre22_dev_xnwsq,ara-arq,8 +lre22_dev_xnwwh,zul-zul,13 +lre22_dev_xobeh,tir-tir,11 +lre22_dev_xolau,ven-ven,13 +lre22_dev_xoqtn,eng-iaf,10 +lre22_dev_xovpd,eng-iaf,10 +lre22_dev_xpaff,eng-ens,9 +lre22_dev_xpahm,ara-arq,4 +lre22_dev_xpcrs,tso-tso,5 +lre22_dev_xpdsg,eng-iaf,5 +lre22_dev_xpjqj,nbl-nbl,6 +lre22_dev_xqwtk,ara-arq,10 +lre22_dev_xrfge,ara-arq,8 +lre22_dev_xrhka,orm-orm,9 +lre22_dev_xrpup,zul-zul,8 +lre22_dev_xsbff,ara-aeb,9 +lre22_dev_xsffv,tso-tso,1 +lre22_dev_xstnu,eng-ens,5 +lre22_dev_xthfd,ara-aeb,8 +lre22_dev_xthzz,ven-ven,4 +lre22_dev_xtmgg,eng-iaf,13 +lre22_dev_xtyic,nbl-nbl,14 +lre22_dev_xucyl,eng-ens,7 +lre22_dev_xudii,ara-ayl,3 +lre22_dev_xugux,afr-afr,0 +lre22_dev_xuqnj,ara-ayl,4 +lre22_dev_xvaoh,nbl-nbl,9 +lre22_dev_xvclh,afr-afr,9 +lre22_dev_xveae,xho-xho,4 +lre22_dev_xxpqz,ara-arq,9 +lre22_dev_xxqad,tso-tso,10 +lre22_dev_xybed,tir-tir,9 +lre22_dev_xyrex,eng-ens,11 +lre22_dev_xzlas,eng-iaf,9 +lre22_dev_xztyr,orm-orm,9 +lre22_dev_yaxkb,zul-zul,12 +lre22_dev_ybcvu,xho-xho,13 +lre22_dev_ybjon,orm-orm,2 +lre22_dev_ybubm,ven-ven,5 +lre22_dev_ycarc,eng-ens,6 +lre22_dev_ychjj,orm-orm,2 +lre22_dev_ycnyc,tir-tir,7 +lre22_dev_ycsvt,afr-afr,12 +lre22_dev_ydaxa,nbl-nbl,8 +lre22_dev_ydrxu,nbl-nbl,1 +lre22_dev_yeekw,fra-ntf,13 +lre22_dev_yevan,tir-tir,11 +lre22_dev_yfaan,tir-tir,10 +lre22_dev_yfayx,afr-afr,6 +lre22_dev_yfpsd,fra-ntf,1 +lre22_dev_yfxkm,ven-ven,7 +lre22_dev_yguqk,ven-ven,3 +lre22_dev_yhrgj,afr-afr,8 +lre22_dev_yhzyq,ara-ayl,5 +lre22_dev_yiqui,eng-iaf,12 +lre22_dev_yjens,ara-ayl,7 +lre22_dev_yjkxx,eng-ens,8 +lre22_dev_yjypk,ara-ayl,9 +lre22_dev_ykchd,ven-ven,8 +lre22_dev_ykktl,xho-xho,0 +lre22_dev_ylhwh,orm-orm,9 +lre22_dev_ylnms,tso-tso,2 +lre22_dev_ylsdz,ven-ven,7 +lre22_dev_ymcmp,eng-iaf,8 +lre22_dev_ymfzx,tso-tso,7 +lre22_dev_ymizm,fra-ntf,0 +lre22_dev_ympvj,tir-tir,9 +lre22_dev_ymslh,tir-tir,12 +lre22_dev_ynavg,zul-zul,9 +lre22_dev_ynhlk,tir-tir,9 +lre22_dev_ynnkb,eng-ens,10 +lre22_dev_yogkc,fra-ntf,7 +lre22_dev_yokld,eng-ens,4 +lre22_dev_yokve,tir-tir,6 +lre22_dev_yomdz,ara-ayl,6 +lre22_dev_yomuu,xho-xho,12 +lre22_dev_yoobm,ara-ayl,8 +lre22_dev_yoocz,eng-ens,10 +lre22_dev_yopyf,eng-iaf,5 +lre22_dev_yoxoc,tir-tir,8 +lre22_dev_ypaem,afr-afr,5 
+lre22_dev_ypamp,afr-afr,7 +lre22_dev_ypjpq,tir-tir,8 +lre22_dev_yplba,ara-arq,9 +lre22_dev_ypnrh,fra-ntf,1 +lre22_dev_ypqfg,eng-ens,7 +lre22_dev_yrdsl,eng-ens,2 +lre22_dev_yrtkv,afr-afr,7 +lre22_dev_yrwrb,nbl-nbl,9 +lre22_dev_ysmlk,eng-ens,11 +lre22_dev_yspja,orm-orm,5 +lre22_dev_ytfnn,fra-ntf,14 +lre22_dev_yturp,ara-aeb,6 +lre22_dev_ytvbd,afr-afr,4 +lre22_dev_yuhvo,tso-tso,8 +lre22_dev_yundi,ara-arq,3 +lre22_dev_yvmnx,ara-arq,10 +lre22_dev_yvqud,xho-xho,15 +lre22_dev_yvxdd,ara-ayl,4 +lre22_dev_ywjtq,xho-xho,5 +lre22_dev_ywnza,fra-ntf,12 +lre22_dev_yxnno,tso-tso,10 +lre22_dev_yxoww,tir-tir,7 +lre22_dev_yxpgi,ara-arq,5 +lre22_dev_yxsta,eng-ens,7 +lre22_dev_yyltz,xho-xho,8 +lre22_dev_yyqqx,fra-ntf,12 +lre22_dev_yzloh,ara-ayl,7 +lre22_dev_zacdy,ara-ayl,3 +lre22_dev_zadkk,tir-tir,9 +lre22_dev_zalpc,afr-afr,6 +lre22_dev_zarod,orm-orm,8 +lre22_dev_zasvb,afr-afr,11 +lre22_dev_zazom,ara-arq,9 +lre22_dev_zbfqk,afr-afr,13 +lre22_dev_zbqew,tso-tso,2 +lre22_dev_zbrkn,eng-ens,7 +lre22_dev_zbubp,zul-zul,9 +lre22_dev_zbytc,ara-arq,8 +lre22_dev_zcfns,tir-tir,6 +lre22_dev_zcfzk,afr-afr,7 +lre22_dev_zcrgv,ara-arq,10 +lre22_dev_zdxdn,ara-ayl,7 +lre22_dev_zdydi,eng-ens,1 +lre22_dev_zebzq,ven-ven,4 +lre22_dev_zedlk,xho-xho,14 +lre22_dev_zeqpp,tir-tir,12 +lre22_dev_zfjbm,ara-arq,10 +lre22_dev_zfkne,nbl-nbl,13 +lre22_dev_zflnr,ven-ven,13 +lre22_dev_zfoyd,xho-xho,4 +lre22_dev_zgdyu,eng-iaf,8 +lre22_dev_zgmja,zul-zul,9 +lre22_dev_zgvfs,ara-arq,6 +lre22_dev_zhmud,orm-orm,14 +lre22_dev_zhoml,tso-tso,9 +lre22_dev_zijcb,xho-xho,10 +lre22_dev_ziktm,ara-aeb,10 +lre22_dev_zipxy,ara-arq,9 +lre22_dev_ziqxc,eng-iaf,1 +lre22_dev_zjhir,ven-ven,7 +lre22_dev_zjmqp,orm-orm,13 +lre22_dev_zjrrk,tso-tso,11 +lre22_dev_zjtwd,ara-aeb,3 +lre22_dev_zkfcf,xho-xho,6 +lre22_dev_zkftc,nbl-nbl,4 +lre22_dev_zkqei,ara-ayl,7 +lre22_dev_zkwqo,zul-zul,11 +lre22_dev_zlamn,nbl-nbl,6 +lre22_dev_zlbor,xho-xho,14 +lre22_dev_zloet,ven-ven,8 +lre22_dev_zlvhk,zul-zul,5 +lre22_dev_zlzqv,fra-ntf,12 +lre22_dev_zmobq,ara-ayl,7 +lre22_dev_zmuiv,zul-zul,9 +lre22_dev_znvqw,zul-zul,4 +lre22_dev_znzuu,tir-tir,0 +lre22_dev_zoava,eng-iaf,6 +lre22_dev_zodvu,tso-tso,0 +lre22_dev_zosdw,nbl-nbl,15 +lre22_dev_zpnvq,xho-xho,6 +lre22_dev_zqeby,eng-iaf,12 +lre22_dev_zqgdd,nbl-nbl,9 +lre22_dev_zqhaw,nbl-nbl,5 +lre22_dev_zqkau,orm-orm,8 +lre22_dev_zqkel,ara-ayl,9 +lre22_dev_zqlnd,ara-aeb,8 +lre22_dev_zrnpw,orm-orm,8 +lre22_dev_zrqvc,afr-afr,9 +lre22_dev_zrrgq,ven-ven,8 +lre22_dev_zryit,zul-zul,8 +lre22_dev_zsckt,zul-zul,4 +lre22_dev_zucqq,orm-orm,4 +lre22_dev_zusln,orm-orm,11 +lre22_dev_zuxzw,tir-tir,0 +lre22_dev_zvabs,tir-tir,11 +lre22_dev_zvlid,tso-tso,11 +lre22_dev_zvned,eng-iaf,5 +lre22_dev_zvtwr,xho-xho,11 +lre22_dev_zwmim,orm-orm,11 +lre22_dev_zwnsu,ara-arq,8 +lre22_dev_zwtxn,ara-arq,10 +lre22_dev_zxfcm,orm-orm,3 +lre22_dev_zxsgm,tir-tir,5 +lre22_dev_zybya,eng-iaf,10 +lre22_dev_zygak,zul-zul,1 +lre22_dev_zylqc,eng-ens,3 +lre22_dev_zyppc,fra-ntf,8 +lre22_dev_zywem,eng-ens,8 +lre22_dev_zzapx,ara-ayl,5 +lre22_dev_zzumc,ara-arq,2 +lre22_dev_zzvdl,fra-ntf,5 +lre22_dev_zzvjv,nbl-nbl,14 diff --git a/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/train_segments.csv b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/train_segments.csv new file mode 100644 index 00000000..4d50b6a5 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_0/train_segments.csv @@ -0,0 +1,2088 @@ +id,class_id,subclass_idx +lre22_dev_aayck,ara-aeb,12 +lre22_dev_aayto,eng-iaf,14 +lre22_dev_abaha,zul-zul,17 +lre22_dev_abetm,fra-ntf,15 +lre22_dev_abnwz,zul-zul,19 
+lre22_dev_abvjt,zul-zul,19 +lre22_dev_abwgm,ara-ayl,13 +lre22_dev_acepd,eng-iaf,19 +lre22_dev_acspt,eng-ens,12 +lre22_dev_aczdh,eng-ens,13 +lre22_dev_adkkm,tso-tso,19 +lre22_dev_adpus,tso-tso,13 +lre22_dev_adwju,ara-aeb,14 +lre22_dev_afnfn,afr-afr,20 +lre22_dev_afohq,ara-aeb,13 +lre22_dev_agnnp,afr-afr,17 +lre22_dev_agquw,fra-ntf,20 +lre22_dev_ahoow,ara-ayl,15 +lre22_dev_ahqxq,fra-ntf,22 +lre22_dev_aieqr,eng-iaf,17 +lre22_dev_ainix,eng-iaf,16 +lre22_dev_aiojl,fra-ntf,18 +lre22_dev_aiypg,nbl-nbl,17 +lre22_dev_ajcpi,orm-orm,22 +lre22_dev_ajeqv,ara-aeb,11 +lre22_dev_ajlqy,xho-xho,16 +lre22_dev_ajlyw,orm-orm,21 +lre22_dev_ajmrs,ara-aeb,11 +lre22_dev_ajzjc,eng-iaf,16 +lre22_dev_ajzyq,ara-ayl,14 +lre22_dev_akmfp,orm-orm,19 +lre22_dev_aleeu,ara-arq,14 +lre22_dev_aliba,ara-aeb,15 +lre22_dev_alkwi,eng-iaf,14 +lre22_dev_aluwk,nbl-nbl,16 +lre22_dev_alvdl,ara-arq,14 +lre22_dev_amrca,ara-aeb,11 +lre22_dev_aoanh,ara-ayl,15 +lre22_dev_aoeql,eng-ens,16 +lre22_dev_apfpk,eng-iaf,14 +lre22_dev_apufs,tir-tir,17 +lre22_dev_apvko,orm-orm,20 +lre22_dev_arefe,orm-orm,23 +lre22_dev_arvyp,ara-ayl,11 +lre22_dev_arwsc,fra-ntf,20 +lre22_dev_asqwa,ara-aeb,14 +lre22_dev_asrng,fra-ntf,18 +lre22_dev_aswjo,afr-afr,18 +lre22_dev_aulzk,ven-ven,21 +lre22_dev_aupcr,zul-zul,18 +lre22_dev_auqcy,eng-ens,18 +lre22_dev_auxdy,nbl-nbl,16 +lre22_dev_auycg,ara-ayl,11 +lre22_dev_aviiv,tso-tso,14 +lre22_dev_avrwo,tso-tso,19 +lre22_dev_avwim,ara-arq,13 +lre22_dev_avzdv,zul-zul,18 +lre22_dev_awtna,ara-arq,13 +lre22_dev_awxbj,orm-orm,23 +lre22_dev_axejc,fra-ntf,17 +lre22_dev_axtso,eng-ens,16 +lre22_dev_axwoo,ara-aeb,15 +lre22_dev_axyma,ara-arq,15 +lre22_dev_aycai,ven-ven,17 +lre22_dev_ayfjz,orm-orm,20 +lre22_dev_aylrz,eng-iaf,16 +lre22_dev_aynwz,tso-tso,18 +lre22_dev_aypyt,ara-aeb,11 +lre22_dev_ayszn,zul-zul,18 +lre22_dev_ayvge,ara-aeb,11 +lre22_dev_ayvmo,afr-afr,23 +lre22_dev_ayzdz,xho-xho,20 +lre22_dev_azbmt,xho-xho,19 +lre22_dev_azjsr,tir-tir,19 +lre22_dev_azkdh,nbl-nbl,20 +lre22_dev_azwrd,fra-ntf,15 +lre22_dev_badwe,ara-aeb,13 +lre22_dev_baiaf,zul-zul,17 +lre22_dev_baiwb,ara-aeb,13 +lre22_dev_baxuo,zul-zul,18 +lre22_dev_bbbtf,eng-ens,18 +lre22_dev_bbdws,ara-ayl,12 +lre22_dev_bbitq,eng-ens,16 +lre22_dev_bbnvu,ara-arq,13 +lre22_dev_bbunq,eng-iaf,14 +lre22_dev_bcinm,ara-aeb,14 +lre22_dev_bcrhs,zul-zul,17 +lre22_dev_bcwpu,ara-aeb,13 +lre22_dev_bcxdq,fra-ntf,21 +lre22_dev_bdgbr,ara-aeb,12 +lre22_dev_bdgrw,orm-orm,17 +lre22_dev_bdiml,ara-aeb,11 +lre22_dev_bdyue,xho-xho,21 +lre22_dev_bdzsj,tir-tir,13 +lre22_dev_beanp,tso-tso,12 +lre22_dev_beigo,ara-aeb,14 +lre22_dev_belhi,orm-orm,23 +lre22_dev_bfoej,ven-ven,20 +lre22_dev_bfznf,ara-ayl,11 +lre22_dev_bgeiq,ven-ven,15 +lre22_dev_bgeyp,ara-aeb,11 +lre22_dev_bgomt,afr-afr,14 +lre22_dev_bgrfd,nbl-nbl,19 +lre22_dev_bgwlu,tir-tir,17 +lre22_dev_bifkp,nbl-nbl,18 +lre22_dev_bipvh,nbl-nbl,17 +lre22_dev_biuyu,eng-ens,12 +lre22_dev_bixnf,ara-ayl,11 +lre22_dev_bjhdf,tso-tso,17 +lre22_dev_bjsmm,ara-ayl,10 +lre22_dev_bkhqg,eng-ens,17 +lre22_dev_bkpah,ven-ven,14 +lre22_dev_blaco,afr-afr,17 +lre22_dev_bleum,xho-xho,18 +lre22_dev_bnhvt,nbl-nbl,16 +lre22_dev_bowyn,ara-arq,14 +lre22_dev_bpeqb,xho-xho,21 +lre22_dev_bpgqs,tir-tir,13 +lre22_dev_bpzpv,afr-afr,16 +lre22_dev_bqenu,eng-ens,12 +lre22_dev_bqfxw,zul-zul,14 +lre22_dev_bqowg,tir-tir,19 +lre22_dev_bqxyq,tir-tir,19 +lre22_dev_brjud,xho-xho,21 +lre22_dev_bruwl,xho-xho,16 +lre22_dev_brzld,fra-ntf,20 +lre22_dev_bsgqz,eng-ens,13 +lre22_dev_bsocl,eng-ens,12 +lre22_dev_bszou,ara-arq,13 +lre22_dev_btapz,zul-zul,15 +lre22_dev_btjlk,ara-aeb,14 
+lre22_dev_btkry,xho-xho,19 +lre22_dev_btyeu,ara-ayl,15 +lre22_dev_bvnuu,fra-ntf,19 +lre22_dev_bvqag,eng-iaf,20 +lre22_dev_bvvho,eng-ens,16 +lre22_dev_bvwaj,tir-tir,14 +lre22_dev_bvymi,eng-ens,15 +lre22_dev_bwgmj,eng-iaf,20 +lre22_dev_bwqpz,ara-arq,14 +lre22_dev_bwyrh,ara-aeb,12 +lre22_dev_bxkrj,ven-ven,18 +lre22_dev_bxkti,afr-afr,20 +lre22_dev_bxzms,nbl-nbl,17 +lre22_dev_bygrw,tso-tso,18 +lre22_dev_byjqr,ven-ven,18 +lre22_dev_bylkl,eng-iaf,16 +lre22_dev_bzmkn,fra-ntf,22 +lre22_dev_bzntz,ara-arq,13 +lre22_dev_bzwkf,eng-iaf,19 +lre22_dev_caijh,ven-ven,18 +lre22_dev_canou,tir-tir,19 +lre22_dev_caqxh,afr-afr,20 +lre22_dev_cayuc,eng-ens,12 +lre22_dev_cbruy,xho-xho,23 +lre22_dev_cbyyw,ara-arq,14 +lre22_dev_cbzbe,afr-afr,22 +lre22_dev_cclfh,ara-arq,15 +lre22_dev_ccovd,ara-arq,11 +lre22_dev_ccpns,eng-ens,17 +lre22_dev_ccsjt,eng-iaf,16 +lre22_dev_ccsql,fra-ntf,21 +lre22_dev_ccugm,eng-ens,18 +lre22_dev_ccyfn,afr-afr,23 +lre22_dev_cdmgw,tir-tir,16 +lre22_dev_cdshg,eng-iaf,17 +lre22_dev_ceccy,orm-orm,20 +lre22_dev_cecwt,fra-ntf,22 +lre22_dev_cegvk,ara-arq,11 +lre22_dev_cferi,zul-zul,15 +lre22_dev_cfojx,ara-arq,11 +lre22_dev_cfzoe,tir-tir,20 +lre22_dev_cgfna,zul-zul,18 +lre22_dev_cggzh,ara-ayl,13 +lre22_dev_cgims,tir-tir,20 +lre22_dev_cgixe,tir-tir,19 +lre22_dev_cgjov,zul-zul,14 +lre22_dev_chhio,ara-aeb,14 +lre22_dev_chnvd,tir-tir,13 +lre22_dev_chpww,nbl-nbl,21 +lre22_dev_churq,ara-ayl,13 +lre22_dev_cifqp,zul-zul,17 +lre22_dev_cijnx,xho-xho,22 +lre22_dev_ciozp,nbl-nbl,16 +lre22_dev_citpi,ara-aeb,12 +lre22_dev_cjrav,tir-tir,15 +lre22_dev_cksrw,ara-aeb,14 +lre22_dev_cktce,tir-tir,17 +lre22_dev_ckzhf,nbl-nbl,20 +lre22_dev_cleyn,ara-aeb,11 +lre22_dev_clhmt,fra-ntf,19 +lre22_dev_clrjd,orm-orm,21 +lre22_dev_clssx,eng-iaf,14 +lre22_dev_cluxm,ara-ayl,13 +lre22_dev_clzwe,ara-aeb,14 +lre22_dev_cminq,ara-aeb,11 +lre22_dev_cmmap,afr-afr,23 +lre22_dev_cmssr,orm-orm,20 +lre22_dev_cmufu,tso-tso,16 +lre22_dev_cnapz,orm-orm,19 +lre22_dev_cndba,tso-tso,12 +lre22_dev_cnkjh,tso-tso,15 +lre22_dev_cnvfe,orm-orm,18 +lre22_dev_cobbz,ara-arq,12 +lre22_dev_coppu,nbl-nbl,21 +lre22_dev_coqoj,eng-ens,17 +lre22_dev_cotun,ven-ven,16 +lre22_dev_cowrt,xho-xho,19 +lre22_dev_cppma,afr-afr,20 +lre22_dev_cpqkz,ara-arq,14 +lre22_dev_cpraw,afr-afr,17 +lre22_dev_cpsrb,fra-ntf,20 +lre22_dev_cpuax,zul-zul,16 +lre22_dev_cpudb,nbl-nbl,16 +lre22_dev_cqqds,afr-afr,22 +lre22_dev_cquib,ven-ven,21 +lre22_dev_cqwxe,nbl-nbl,16 +lre22_dev_cqyad,eng-iaf,15 +lre22_dev_crkut,eng-ens,17 +lre22_dev_crozj,fra-ntf,17 +lre22_dev_crrro,orm-orm,16 +lre22_dev_csavn,ara-aeb,15 +lre22_dev_cschy,afr-afr,16 +lre22_dev_csegr,tso-tso,14 +lre22_dev_csgvq,fra-ntf,17 +lre22_dev_csltj,ara-aeb,14 +lre22_dev_csmtr,ara-ayl,14 +lre22_dev_csqxl,ven-ven,20 +lre22_dev_ctjqw,nbl-nbl,16 +lre22_dev_ctxxt,nbl-nbl,17 +lre22_dev_cuaoy,ara-aeb,13 +lre22_dev_cudpj,ara-arq,13 +lre22_dev_cuhdf,afr-afr,21 +lre22_dev_cuoju,ven-ven,21 +lre22_dev_cupti,nbl-nbl,21 +lre22_dev_cusej,ara-aeb,14 +lre22_dev_cvfle,tir-tir,14 +lre22_dev_cvnqu,eng-ens,14 +lre22_dev_cvvjc,zul-zul,18 +lre22_dev_cvwht,fra-ntf,18 +lre22_dev_cvwtu,fra-ntf,21 +lre22_dev_cwlvk,tso-tso,16 +lre22_dev_cwnky,xho-xho,17 +lre22_dev_cxdlr,afr-afr,14 +lre22_dev_cxfii,ara-arq,13 +lre22_dev_cxpzt,zul-zul,16 +lre22_dev_cxqri,fra-ntf,21 +lre22_dev_cyaug,xho-xho,22 +lre22_dev_czdbd,fra-ntf,15 +lre22_dev_czvoy,ven-ven,16 +lre22_dev_czzrm,afr-afr,17 +lre22_dev_dahzr,ven-ven,17 +lre22_dev_dapny,ven-ven,17 +lre22_dev_dapug,nbl-nbl,19 +lre22_dev_dcbnz,xho-xho,16 +lre22_dev_dciaf,nbl-nbl,22 +lre22_dev_dcljn,afr-afr,19 
+lre22_dev_dcmrn,afr-afr,20 +lre22_dev_dcobq,xho-xho,16 +lre22_dev_dcohp,tir-tir,16 +lre22_dev_dcsep,tso-tso,12 +lre22_dev_dctlw,ara-arq,12 +lre22_dev_dctvv,ara-arq,12 +lre22_dev_dcyoy,eng-iaf,17 +lre22_dev_ddgeb,xho-xho,23 +lre22_dev_ddsab,eng-ens,18 +lre22_dev_ddtpk,eng-ens,18 +lre22_dev_debjr,xho-xho,16 +lre22_dev_defkv,eng-ens,15 +lre22_dev_dejub,ara-arq,11 +lre22_dev_delok,eng-ens,14 +lre22_dev_dezlg,nbl-nbl,17 +lre22_dev_dffbj,fra-ntf,21 +lre22_dev_dfkox,xho-xho,19 +lre22_dev_dfpcn,ara-ayl,13 +lre22_dev_dfqgl,afr-afr,18 +lre22_dev_dfras,eng-iaf,19 +lre22_dev_dftpm,eng-iaf,20 +lre22_dev_dfvta,tso-tso,17 +lre22_dev_dgarp,eng-ens,13 +lre22_dev_dgntq,zul-zul,17 +lre22_dev_dgssb,tir-tir,19 +lre22_dev_dgvtc,xho-xho,23 +lre22_dev_dhdvp,ara-ayl,10 +lre22_dev_dhmbl,fra-ntf,22 +lre22_dev_diiry,orm-orm,16 +lre22_dev_disrs,afr-afr,16 +lre22_dev_ditsk,xho-xho,21 +lre22_dev_djbbz,ara-arq,14 +lre22_dev_djevu,tso-tso,16 +lre22_dev_djlaf,tir-tir,20 +lre22_dev_djoim,zul-zul,15 +lre22_dev_djvvp,zul-zul,17 +lre22_dev_djwyo,ven-ven,18 +lre22_dev_dkbfm,ara-ayl,12 +lre22_dev_dkpcy,ara-aeb,12 +lre22_dev_dlxzj,orm-orm,19 +lre22_dev_dmnjo,ven-ven,14 +lre22_dev_dmtsm,zul-zul,16 +lre22_dev_dnaql,orm-orm,23 +lre22_dev_dnkpf,ara-aeb,15 +lre22_dev_dnscr,tso-tso,12 +lre22_dev_dnygt,eng-ens,15 +lre22_dev_dobre,xho-xho,19 +lre22_dev_dohlp,xho-xho,23 +lre22_dev_doioo,orm-orm,19 +lre22_dev_donaq,ara-aeb,13 +lre22_dev_dooht,ara-arq,11 +lre22_dev_dpmbt,zul-zul,14 +lre22_dev_dptyy,xho-xho,17 +lre22_dev_dqmud,eng-iaf,15 +lre22_dev_dqmxb,xho-xho,20 +lre22_dev_dqopt,eng-ens,14 +lre22_dev_dqpgr,ara-aeb,14 +lre22_dev_drkux,eng-ens,14 +lre22_dev_dsfha,ven-ven,18 +lre22_dev_dsftc,tso-tso,16 +lre22_dev_dskaq,ven-ven,15 +lre22_dev_dtdmp,zul-zul,18 +lre22_dev_dtdux,afr-afr,14 +lre22_dev_dtyki,ara-arq,11 +lre22_dev_durlr,orm-orm,18 +lre22_dev_dutdz,tso-tso,12 +lre22_dev_dvbol,ara-ayl,15 +lre22_dev_dwesk,nbl-nbl,22 +lre22_dev_dwtjw,ven-ven,14 +lre22_dev_dxckb,tso-tso,12 +lre22_dev_dxizq,eng-iaf,14 +lre22_dev_dxtnq,fra-ntf,18 +lre22_dev_dxvib,zul-zul,14 +lre22_dev_dyago,eng-iaf,16 +lre22_dev_dyipl,eng-iaf,18 +lre22_dev_dyqlo,ara-arq,13 +lre22_dev_dyvml,eng-iaf,15 +lre22_dev_dzkui,tso-tso,12 +lre22_dev_dzqta,ven-ven,20 +lre22_dev_dzxio,eng-ens,18 +lre22_dev_eachn,tir-tir,16 +lre22_dev_eapvu,eng-iaf,20 +lre22_dev_ebfdv,ara-ayl,10 +lre22_dev_ebgbd,eng-ens,17 +lre22_dev_eblhy,eng-iaf,20 +lre22_dev_ebtrq,ara-aeb,13 +lre22_dev_ebymv,tir-tir,14 +lre22_dev_ebzhg,nbl-nbl,21 +lre22_dev_ecbwo,ven-ven,21 +lre22_dev_ecllm,fra-ntf,21 +lre22_dev_eclpf,ven-ven,16 +lre22_dev_ecmhd,ara-aeb,14 +lre22_dev_ecnqi,eng-ens,14 +lre22_dev_ecpdc,ara-ayl,10 +lre22_dev_ecslx,afr-afr,22 +lre22_dev_ecuyo,xho-xho,23 +lre22_dev_edgur,tso-tso,16 +lre22_dev_edjtb,nbl-nbl,22 +lre22_dev_edsls,tso-tso,16 +lre22_dev_edssc,orm-orm,23 +lre22_dev_edvab,zul-zul,19 +lre22_dev_eehzu,zul-zul,18 +lre22_dev_eekci,afr-afr,15 +lre22_dev_eekcw,zul-zul,17 +lre22_dev_efihg,nbl-nbl,16 +lre22_dev_efsxw,tso-tso,16 +lre22_dev_efxjv,ara-aeb,14 +lre22_dev_efymf,ara-aeb,14 +lre22_dev_ehcvr,tir-tir,19 +lre22_dev_ehehw,xho-xho,20 +lre22_dev_ehewh,eng-ens,18 +lre22_dev_ehvyp,zul-zul,14 +lre22_dev_eifqv,zul-zul,19 +lre22_dev_eifxu,ara-ayl,10 +lre22_dev_ejcvy,fra-ntf,18 +lre22_dev_ejeek,eng-ens,16 +lre22_dev_ejfyn,fra-ntf,22 +lre22_dev_ejjqg,tso-tso,12 +lre22_dev_ejtox,ven-ven,19 +lre22_dev_ejwch,fra-ntf,21 +lre22_dev_ejzhx,xho-xho,17 +lre22_dev_ekbkm,afr-afr,21 +lre22_dev_ekzhk,ara-ayl,10 +lre22_dev_elanj,tso-tso,18 +lre22_dev_elvvn,tir-tir,16 +lre22_dev_emadg,xho-xho,22 
+lre22_dev_emkzr,afr-afr,21 +lre22_dev_emmck,ara-arq,15 +lre22_dev_enwfu,afr-afr,15 +lre22_dev_eodro,ara-arq,15 +lre22_dev_eoisu,ven-ven,18 +lre22_dev_eomzr,xho-xho,23 +lre22_dev_eorva,xho-xho,21 +lre22_dev_epbwh,nbl-nbl,17 +lre22_dev_epeou,xho-xho,20 +lre22_dev_epifq,nbl-nbl,22 +lre22_dev_epqqo,ara-ayl,14 +lre22_dev_epsld,tso-tso,12 +lre22_dev_epsza,ara-ayl,12 +lre22_dev_eqmgm,ara-aeb,12 +lre22_dev_eqrhr,afr-afr,22 +lre22_dev_eqvan,ara-ayl,13 +lre22_dev_ersgd,orm-orm,22 +lre22_dev_erxig,zul-zul,15 +lre22_dev_esbrw,fra-ntf,19 +lre22_dev_esuug,nbl-nbl,20 +lre22_dev_etczk,tir-tir,14 +lre22_dev_etelz,fra-ntf,21 +lre22_dev_ettsh,fra-ntf,20 +lre22_dev_etuwp,ven-ven,19 +lre22_dev_eubgy,fra-ntf,18 +lre22_dev_euewj,orm-orm,18 +lre22_dev_euzyb,ara-aeb,13 +lre22_dev_ewatn,zul-zul,18 +lre22_dev_ewehs,orm-orm,17 +lre22_dev_ewexz,fra-ntf,18 +lre22_dev_ewgop,tir-tir,20 +lre22_dev_ewmgd,fra-ntf,21 +lre22_dev_ewzma,orm-orm,18 +lre22_dev_expvn,xho-xho,17 +lre22_dev_eyoqu,tir-tir,16 +lre22_dev_eyylz,nbl-nbl,16 +lre22_dev_eyzqu,tir-tir,18 +lre22_dev_ezdty,afr-afr,18 +lre22_dev_ezgcl,ara-aeb,13 +lre22_dev_eznzd,zul-zul,19 +lre22_dev_ezzwj,eng-iaf,18 +lre22_dev_facyr,zul-zul,18 +lre22_dev_faejb,tso-tso,16 +lre22_dev_famjw,orm-orm,18 +lre22_dev_favzh,ara-arq,11 +lre22_dev_fbsre,orm-orm,23 +lre22_dev_fbtkl,fra-ntf,22 +lre22_dev_fbvxh,ara-ayl,14 +lre22_dev_fbyhp,nbl-nbl,20 +lre22_dev_fbysf,nbl-nbl,17 +lre22_dev_fcckx,ara-arq,12 +lre22_dev_fczba,eng-iaf,17 +lre22_dev_fdouw,eng-ens,14 +lre22_dev_fdtmf,tso-tso,13 +lre22_dev_fdtnc,fra-ntf,20 +lre22_dev_fdwme,afr-afr,19 +lre22_dev_fdyhr,eng-ens,18 +lre22_dev_feanh,fra-ntf,22 +lre22_dev_femmc,ara-arq,12 +lre22_dev_fevab,orm-orm,19 +lre22_dev_fexsi,orm-orm,17 +lre22_dev_fflai,ara-aeb,14 +lre22_dev_fgblw,tso-tso,14 +lre22_dev_fglhf,nbl-nbl,22 +lre22_dev_fhucm,ara-ayl,14 +lre22_dev_fhzwp,nbl-nbl,17 +lre22_dev_fifon,eng-iaf,14 +lre22_dev_fipff,orm-orm,19 +lre22_dev_fipyx,zul-zul,14 +lre22_dev_firtn,zul-zul,18 +lre22_dev_fjdqb,nbl-nbl,16 +lre22_dev_fjdxl,tir-tir,14 +lre22_dev_fjocp,ara-ayl,12 +lre22_dev_fjudb,ara-aeb,15 +lre22_dev_fkbjz,afr-afr,22 +lre22_dev_fkwaq,afr-afr,19 +lre22_dev_flbgp,afr-afr,16 +lre22_dev_flgxs,tir-tir,13 +lre22_dev_fljfm,tir-tir,19 +lre22_dev_fmauu,tso-tso,18 +lre22_dev_fmbvf,fra-ntf,19 +lre22_dev_fmhfa,ara-arq,12 +lre22_dev_fmije,ara-ayl,13 +lre22_dev_fnafq,tir-tir,20 +lre22_dev_fofmo,eng-ens,15 +lre22_dev_foikm,tir-tir,16 +lre22_dev_fosfi,eng-iaf,19 +lre22_dev_fotti,eng-ens,13 +lre22_dev_fozzx,zul-zul,15 +lre22_dev_fpehr,ara-aeb,12 +lre22_dev_fpiig,orm-orm,21 +lre22_dev_fqfag,ara-ayl,16 +lre22_dev_fqogo,tir-tir,13 +lre22_dev_frdqe,ara-arq,11 +lre22_dev_fremq,afr-afr,22 +lre22_dev_frjdx,zul-zul,18 +lre22_dev_fruha,ara-ayl,12 +lre22_dev_frxmu,eng-iaf,18 +lre22_dev_fsbeo,tso-tso,13 +lre22_dev_fsijy,fra-ntf,22 +lre22_dev_fsjwh,nbl-nbl,18 +lre22_dev_fspmb,tso-tso,19 +lre22_dev_ftbak,tir-tir,13 +lre22_dev_ftxuo,eng-iaf,20 +lre22_dev_fupee,ara-aeb,13 +lre22_dev_fupla,ara-aeb,11 +lre22_dev_fvmdq,fra-ntf,22 +lre22_dev_fvmjb,fra-ntf,20 +lre22_dev_fvubo,fra-ntf,22 +lre22_dev_fvwze,afr-afr,23 +lre22_dev_fvxxt,ara-arq,13 +lre22_dev_fwcye,ven-ven,21 +lre22_dev_fwkwv,orm-orm,18 +lre22_dev_fxezd,orm-orm,17 +lre22_dev_fxuir,nbl-nbl,19 +lre22_dev_fzgcm,zul-zul,14 +lre22_dev_fzncb,nbl-nbl,16 +lre22_dev_gaezu,ara-aeb,11 +lre22_dev_gawox,ara-aeb,13 +lre22_dev_gbcfq,zul-zul,14 +lre22_dev_gbdkv,orm-orm,17 +lre22_dev_gbevf,eng-iaf,20 +lre22_dev_gchke,ara-aeb,12 +lre22_dev_gcncr,ara-arq,13 +lre22_dev_gdeqd,ara-ayl,14 +lre22_dev_gdncj,eng-iaf,14 
+lre22_dev_gdobt,ven-ven,21 +lre22_dev_geeoy,xho-xho,22 +lre22_dev_geraa,afr-afr,20 +lre22_dev_gfigd,nbl-nbl,16 +lre22_dev_gfjzm,ara-ayl,12 +lre22_dev_gftlv,tir-tir,20 +lre22_dev_ggaux,xho-xho,16 +lre22_dev_ggbgc,zul-zul,15 +lre22_dev_gghhn,zul-zul,18 +lre22_dev_ggrwj,eng-iaf,17 +lre22_dev_ghdur,eng-ens,15 +lre22_dev_ghgbo,ara-ayl,14 +lre22_dev_ghhop,nbl-nbl,20 +lre22_dev_ghnwg,ara-ayl,14 +lre22_dev_ghpmd,ara-ayl,14 +lre22_dev_ghqbh,orm-orm,19 +lre22_dev_gihvo,eng-ens,16 +lre22_dev_giueq,tso-tso,19 +lre22_dev_giuix,ara-aeb,15 +lre22_dev_gjaqj,eng-iaf,20 +lre22_dev_gjgcw,xho-xho,18 +lre22_dev_gjirh,eng-iaf,16 +lre22_dev_gjvwy,nbl-nbl,22 +lre22_dev_gkeql,eng-iaf,16 +lre22_dev_gkhas,tso-tso,16 +lre22_dev_glmyp,nbl-nbl,16 +lre22_dev_glqft,eng-ens,18 +lre22_dev_glsnb,afr-afr,17 +lre22_dev_gmfcb,eng-iaf,16 +lre22_dev_gmlwo,afr-afr,16 +lre22_dev_gmpjq,tso-tso,12 +lre22_dev_gmrvk,ara-aeb,14 +lre22_dev_gmryq,ara-ayl,13 +lre22_dev_gmsds,eng-ens,16 +lre22_dev_gmztl,xho-xho,16 +lre22_dev_gnbyu,eng-iaf,15 +lre22_dev_gntym,zul-zul,17 +lre22_dev_gocpa,tso-tso,15 +lre22_dev_gpyxs,orm-orm,17 +lre22_dev_grgvb,afr-afr,16 +lre22_dev_grspj,orm-orm,19 +lre22_dev_grvjm,xho-xho,19 +lre22_dev_gsidj,eng-ens,18 +lre22_dev_gslzy,afr-afr,22 +lre22_dev_gtwcl,tir-tir,14 +lre22_dev_gulky,orm-orm,21 +lre22_dev_gvlhy,tir-tir,20 +lre22_dev_gvljx,tso-tso,15 +lre22_dev_gvmma,tso-tso,13 +lre22_dev_gvtvb,afr-afr,23 +lre22_dev_gweym,xho-xho,19 +lre22_dev_gwljh,ara-aeb,11 +lre22_dev_gwxtn,ara-ayl,14 +lre22_dev_gxdpw,fra-ntf,16 +lre22_dev_gxext,afr-afr,15 +lre22_dev_gxkqq,nbl-nbl,19 +lre22_dev_gxkxo,xho-xho,21 +lre22_dev_gxnkr,xho-xho,18 +lre22_dev_gxxbk,fra-ntf,21 +lre22_dev_gydvv,afr-afr,20 +lre22_dev_gytkt,ara-arq,12 +lre22_dev_gzmvp,afr-afr,18 +lre22_dev_gzoou,ven-ven,19 +lre22_dev_gzvza,tir-tir,15 +lre22_dev_gzwee,eng-iaf,17 +lre22_dev_haewp,tir-tir,19 +lre22_dev_haokb,fra-ntf,19 +lre22_dev_hazis,nbl-nbl,20 +lre22_dev_hbbbc,eng-ens,16 +lre22_dev_hblqa,nbl-nbl,17 +lre22_dev_hbmfy,zul-zul,15 +lre22_dev_hbndl,zul-zul,17 +lre22_dev_hcgfc,eng-ens,13 +lre22_dev_hcjnx,orm-orm,17 +lre22_dev_hcont,tir-tir,17 +lre22_dev_hcvik,tso-tso,13 +lre22_dev_hczom,zul-zul,19 +lre22_dev_hdaca,xho-xho,19 +lre22_dev_hdijt,fra-ntf,15 +lre22_dev_hdkyr,afr-afr,18 +lre22_dev_hdnoq,orm-orm,23 +lre22_dev_hdtlb,eng-iaf,16 +lre22_dev_hever,nbl-nbl,18 +lre22_dev_hfirj,nbl-nbl,17 +lre22_dev_hgbxp,xho-xho,21 +lre22_dev_hgcax,xho-xho,19 +lre22_dev_hgkwa,tso-tso,13 +lre22_dev_hgljd,ara-arq,15 +lre22_dev_hgvrh,nbl-nbl,21 +lre22_dev_hhovn,eng-iaf,16 +lre22_dev_hhpzm,fra-ntf,22 +lre22_dev_hhuab,ven-ven,20 +lre22_dev_hicev,ven-ven,18 +lre22_dev_hickz,ara-arq,12 +lre22_dev_hilii,orm-orm,23 +lre22_dev_hjenx,eng-iaf,19 +lre22_dev_hjiui,orm-orm,18 +lre22_dev_hkfts,eng-ens,18 +lre22_dev_hkhvl,zul-zul,19 +lre22_dev_hkobh,xho-xho,17 +lre22_dev_hkvay,ara-arq,13 +lre22_dev_hkvtj,orm-orm,21 +lre22_dev_hlevc,fra-ntf,17 +lre22_dev_hliut,ara-aeb,14 +lre22_dev_hlntc,zul-zul,18 +lre22_dev_hlprm,zul-zul,18 +lre22_dev_hmeav,ven-ven,17 +lre22_dev_hnelt,tir-tir,15 +lre22_dev_hniiy,ara-arq,15 +lre22_dev_hoepv,ara-aeb,13 +lre22_dev_hofkm,orm-orm,19 +lre22_dev_hoilz,tir-tir,19 +lre22_dev_hookr,ara-aeb,13 +lre22_dev_hpbhl,tir-tir,16 +lre22_dev_hpbzf,ara-aeb,11 +lre22_dev_hpizl,eng-ens,15 +lre22_dev_hplhi,ara-ayl,13 +lre22_dev_hplrq,xho-xho,20 +lre22_dev_hqdva,ven-ven,21 +lre22_dev_hqnus,xho-xho,16 +lre22_dev_hqoiz,orm-orm,18 +lre22_dev_hrerz,eng-ens,14 +lre22_dev_hrgjq,tir-tir,19 +lre22_dev_hrrhr,zul-zul,17 +lre22_dev_hsfbi,ara-ayl,14 +lre22_dev_hsjlg,tir-tir,17 
+lre22_dev_hskug,afr-afr,16 +lre22_dev_hszzt,tso-tso,19 +lre22_dev_htgrl,tso-tso,18 +lre22_dev_htxah,zul-zul,17 +lre22_dev_htxrs,xho-xho,23 +lre22_dev_hudwz,nbl-nbl,17 +lre22_dev_huuqj,fra-ntf,18 +lre22_dev_hvsds,afr-afr,21 +lre22_dev_hwbhz,orm-orm,23 +lre22_dev_hwbvs,tso-tso,13 +lre22_dev_hwdlb,tso-tso,19 +lre22_dev_hwyki,eng-iaf,16 +lre22_dev_hxcmj,eng-iaf,20 +lre22_dev_hxdly,ara-arq,11 +lre22_dev_hyeqm,xho-xho,19 +lre22_dev_hyofm,ara-arq,12 +lre22_dev_hyogg,ara-arq,13 +lre22_dev_hyouu,tso-tso,13 +lre22_dev_hzfpc,fra-ntf,16 +lre22_dev_hzkjt,ara-aeb,12 +lre22_dev_hzrgv,fra-ntf,20 +lre22_dev_hzuus,tir-tir,19 +lre22_dev_hzzbp,xho-xho,19 +lre22_dev_iautt,afr-afr,20 +lre22_dev_ibdnu,tir-tir,13 +lre22_dev_ibuww,ara-aeb,13 +lre22_dev_icbuo,ven-ven,21 +lre22_dev_icqmr,tso-tso,14 +lre22_dev_ictwj,tir-tir,14 +lre22_dev_ifumz,ven-ven,14 +lre22_dev_igcgi,tso-tso,19 +lre22_dev_igder,tir-tir,19 +lre22_dev_igexm,xho-xho,21 +lre22_dev_igfxi,fra-ntf,20 +lre22_dev_igoxr,afr-afr,15 +lre22_dev_igxyt,ven-ven,21 +lre22_dev_ihqtn,ara-aeb,11 +lre22_dev_ihxfl,tir-tir,13 +lre22_dev_ihyrb,nbl-nbl,18 +lre22_dev_iifuu,tir-tir,15 +lre22_dev_iiien,xho-xho,20 +lre22_dev_ijccu,eng-iaf,16 +lre22_dev_ijrun,afr-afr,18 +lre22_dev_ijwlx,ara-arq,14 +lre22_dev_ijydw,xho-xho,21 +lre22_dev_ikdjt,xho-xho,23 +lre22_dev_iklbv,ara-arq,13 +lre22_dev_ikyai,fra-ntf,18 +lre22_dev_ildmr,orm-orm,21 +lre22_dev_ilebo,orm-orm,19 +lre22_dev_ilptc,eng-ens,18 +lre22_dev_ilsku,fra-ntf,16 +lre22_dev_ilyti,ara-arq,11 +lre22_dev_imnqh,zul-zul,17 +lre22_dev_imxdr,eng-ens,16 +lre22_dev_indww,fra-ntf,19 +lre22_dev_iokar,eng-iaf,15 +lre22_dev_iomtu,eng-iaf,15 +lre22_dev_ioobz,tir-tir,14 +lre22_dev_iosom,zul-zul,17 +lre22_dev_iowyd,ara-arq,14 +lre22_dev_iphzy,nbl-nbl,18 +lre22_dev_ipmrc,nbl-nbl,16 +lre22_dev_ipomi,ara-aeb,12 +lre22_dev_ipour,afr-afr,15 +lre22_dev_ippjq,ara-ayl,16 +lre22_dev_ipvjc,ara-aeb,13 +lre22_dev_iqfdc,ven-ven,19 +lre22_dev_iqppw,tso-tso,15 +lre22_dev_iqtde,tso-tso,14 +lre22_dev_irlee,eng-iaf,14 +lre22_dev_irxuq,ara-aeb,14 +lre22_dev_isjzo,ara-arq,14 +lre22_dev_isnwz,ara-ayl,14 +lre22_dev_isqvk,afr-afr,15 +lre22_dev_isqww,orm-orm,19 +lre22_dev_istdz,tir-tir,18 +lre22_dev_iszhe,fra-ntf,20 +lre22_dev_itblz,ven-ven,18 +lre22_dev_itfez,ara-arq,13 +lre22_dev_itjqm,zul-zul,18 +lre22_dev_itnap,nbl-nbl,21 +lre22_dev_itrms,xho-xho,21 +lre22_dev_itroi,fra-ntf,17 +lre22_dev_ittds,zul-zul,16 +lre22_dev_iuknz,tso-tso,16 +lre22_dev_iumnm,ara-ayl,15 +lre22_dev_iunul,afr-afr,23 +lre22_dev_iverq,ven-ven,16 +lre22_dev_ivwzd,ara-ayl,14 +lre22_dev_ivzjf,tso-tso,12 +lre22_dev_iwbta,nbl-nbl,16 +lre22_dev_iwdeh,orm-orm,21 +lre22_dev_iwgel,ara-aeb,11 +lre22_dev_ixbhj,ara-aeb,11 +lre22_dev_ixbnl,fra-ntf,16 +lre22_dev_ixcef,ven-ven,20 +lre22_dev_ixfdf,orm-orm,18 +lre22_dev_ixjey,orm-orm,19 +lre22_dev_ixlve,tir-tir,17 +lre22_dev_ixutu,ara-ayl,12 +lre22_dev_ixxoj,xho-xho,23 +lre22_dev_ixyko,afr-afr,22 +lre22_dev_iylls,eng-iaf,19 +lre22_dev_izegw,orm-orm,23 +lre22_dev_izglb,ara-ayl,13 +lre22_dev_iziar,ara-arq,13 +lre22_dev_jadvz,afr-afr,18 +lre22_dev_jajtw,ara-aeb,14 +lre22_dev_janvu,tso-tso,16 +lre22_dev_japrb,xho-xho,21 +lre22_dev_jarvz,ara-aeb,12 +lre22_dev_jazcn,tso-tso,13 +lre22_dev_jbfxj,tso-tso,12 +lre22_dev_jbnfg,fra-ntf,15 +lre22_dev_jbwgd,afr-afr,20 +lre22_dev_jceug,tso-tso,15 +lre22_dev_jcqtd,eng-ens,14 +lre22_dev_jcxry,ven-ven,20 +lre22_dev_jdbli,tir-tir,20 +lre22_dev_jegmb,orm-orm,18 +lre22_dev_jegqj,ara-ayl,12 +lre22_dev_jenns,xho-xho,22 +lre22_dev_jfarf,ven-ven,14 +lre22_dev_jfcve,zul-zul,17 +lre22_dev_jfgyq,xho-xho,23 
+lre22_dev_jftnz,afr-afr,14 +lre22_dev_jftsj,afr-afr,22 +lre22_dev_jgnid,nbl-nbl,16 +lre22_dev_jgsju,eng-ens,13 +lre22_dev_jifal,orm-orm,19 +lre22_dev_jihsd,orm-orm,21 +lre22_dev_jihwf,ara-ayl,11 +lre22_dev_jiptp,eng-iaf,15 +lre22_dev_jizij,tir-tir,14 +lre22_dev_jjpzg,orm-orm,23 +lre22_dev_jkezw,fra-ntf,18 +lre22_dev_jkmux,fra-ntf,20 +lre22_dev_jkpnt,orm-orm,22 +lre22_dev_jlkfj,eng-ens,18 +lre22_dev_jlmtf,ven-ven,19 +lre22_dev_jlrfm,ara-arq,12 +lre22_dev_jmojg,orm-orm,19 +lre22_dev_jmrcv,ara-aeb,13 +lre22_dev_jmsxc,eng-iaf,16 +lre22_dev_jnjpw,tir-tir,14 +lre22_dev_jnzvu,ara-aeb,14 +lre22_dev_jocyh,xho-xho,19 +lre22_dev_joezr,tso-tso,16 +lre22_dev_jofqy,ara-arq,11 +lre22_dev_jpbyf,eng-ens,15 +lre22_dev_jppuy,ara-arq,13 +lre22_dev_jptts,ara-aeb,12 +lre22_dev_jqdyx,fra-ntf,22 +lre22_dev_jqjbq,zul-zul,17 +lre22_dev_jqpnb,ven-ven,21 +lre22_dev_jqqin,zul-zul,17 +lre22_dev_jqzkq,ara-ayl,13 +lre22_dev_jrroq,orm-orm,21 +lre22_dev_jruru,eng-ens,16 +lre22_dev_jskbr,ara-arq,11 +lre22_dev_jskdd,nbl-nbl,19 +lre22_dev_jslnc,eng-ens,12 +lre22_dev_jsmat,orm-orm,17 +lre22_dev_jsmdw,ara-aeb,11 +lre22_dev_jsvaz,afr-afr,19 +lre22_dev_jsxcy,afr-afr,21 +lre22_dev_jszgk,eng-iaf,19 +lre22_dev_jthui,ven-ven,20 +lre22_dev_jtpvz,ven-ven,17 +lre22_dev_jtwdi,ven-ven,14 +lre22_dev_jtwfh,ven-ven,18 +lre22_dev_juwid,tir-tir,20 +lre22_dev_jvdww,fra-ntf,21 +lre22_dev_jweyx,tir-tir,19 +lre22_dev_jwuto,afr-afr,19 +lre22_dev_jwwgs,afr-afr,19 +lre22_dev_jxhxf,nbl-nbl,17 +lre22_dev_jxtxk,orm-orm,20 +lre22_dev_jxzvy,eng-ens,15 +lre22_dev_jyjlm,nbl-nbl,19 +lre22_dev_jynvf,ara-ayl,13 +lre22_dev_jyzmh,nbl-nbl,19 +lre22_dev_jzivf,eng-ens,14 +lre22_dev_jzpns,tso-tso,14 +lre22_dev_kadwu,fra-ntf,18 +lre22_dev_kbnbi,tir-tir,13 +lre22_dev_kbqbd,fra-ntf,16 +lre22_dev_kbscm,tso-tso,15 +lre22_dev_kbxko,ara-aeb,12 +lre22_dev_kcegv,tso-tso,15 +lre22_dev_kcibo,afr-afr,17 +lre22_dev_kcmky,ara-ayl,14 +lre22_dev_kctrd,nbl-nbl,22 +lre22_dev_kcvbf,fra-ntf,16 +lre22_dev_kdbqy,zul-zul,15 +lre22_dev_kdgpz,ara-arq,14 +lre22_dev_kdhgq,nbl-nbl,22 +lre22_dev_kdvtu,eng-iaf,16 +lre22_dev_kdyhm,tso-tso,12 +lre22_dev_keeyz,zul-zul,18 +lre22_dev_kejvy,ven-ven,18 +lre22_dev_kerpr,ven-ven,21 +lre22_dev_keweh,ara-aeb,13 +lre22_dev_keysx,orm-orm,23 +lre22_dev_kezyv,ara-ayl,13 +lre22_dev_kgbiq,ven-ven,18 +lre22_dev_kgovz,tso-tso,15 +lre22_dev_kgxka,eng-ens,16 +lre22_dev_khkcx,fra-ntf,20 +lre22_dev_khobl,orm-orm,19 +lre22_dev_khttn,afr-afr,17 +lre22_dev_khvss,tir-tir,15 +lre22_dev_kiezl,tso-tso,16 +lre22_dev_kihlw,eng-ens,14 +lre22_dev_kipuq,ara-arq,14 +lre22_dev_kiqcx,tir-tir,16 +lre22_dev_kjiks,xho-xho,19 +lre22_dev_kjmpa,zul-zul,18 +lre22_dev_kjocf,eng-iaf,16 +lre22_dev_kkbur,ven-ven,16 +lre22_dev_kksdi,xho-xho,22 +lre22_dev_kkytv,ara-aeb,11 +lre22_dev_kmkgx,nbl-nbl,17 +lre22_dev_kmpkm,zul-zul,19 +lre22_dev_kmyzy,ara-ayl,13 +lre22_dev_knfsj,afr-afr,15 +lre22_dev_knyuq,orm-orm,19 +lre22_dev_koacp,orm-orm,19 +lre22_dev_koket,eng-ens,18 +lre22_dev_kovdn,zul-zul,15 +lre22_dev_kowqf,ven-ven,19 +lre22_dev_kozfr,nbl-nbl,21 +lre22_dev_kpmyz,orm-orm,19 +lre22_dev_kqfdc,eng-ens,17 +lre22_dev_kqumw,fra-ntf,22 +lre22_dev_kqwdi,nbl-nbl,16 +lre22_dev_krczb,ven-ven,19 +lre22_dev_kremz,nbl-nbl,16 +lre22_dev_ksruw,ven-ven,18 +lre22_dev_kszdw,eng-iaf,20 +lre22_dev_ktgvi,ara-arq,11 +lre22_dev_ktjax,fra-ntf,20 +lre22_dev_ktlvc,orm-orm,19 +lre22_dev_kvqgp,afr-afr,21 +lre22_dev_kvyoz,afr-afr,20 +lre22_dev_kvzim,afr-afr,14 +lre22_dev_kvzwc,eng-iaf,14 +lre22_dev_kwcwa,ara-arq,14 +lre22_dev_kwomo,zul-zul,19 +lre22_dev_kwxau,xho-xho,18 +lre22_dev_kxawf,tir-tir,19 
+lre22_dev_kxjhn,ara-aeb,11 +lre22_dev_kxklh,tir-tir,19 +lre22_dev_kxlgg,tir-tir,16 +lre22_dev_kyqbp,fra-ntf,21 +lre22_dev_kyzio,ven-ven,20 +lre22_dev_kzcgh,ara-ayl,13 +lre22_dev_kzeyf,ven-ven,18 +lre22_dev_kzfwf,fra-ntf,19 +lre22_dev_kzjuz,orm-orm,21 +lre22_dev_kzjwx,ara-ayl,11 +lre22_dev_lamjl,tso-tso,17 +lre22_dev_laowh,xho-xho,16 +lre22_dev_larex,ara-ayl,11 +lre22_dev_laycs,tso-tso,12 +lre22_dev_lbxfn,eng-iaf,20 +lre22_dev_lcrog,zul-zul,18 +lre22_dev_ldczz,xho-xho,17 +lre22_dev_ldkgv,ara-aeb,13 +lre22_dev_ldkst,fra-ntf,20 +lre22_dev_ldkwr,orm-orm,22 +lre22_dev_lenxf,ven-ven,14 +lre22_dev_lfbey,ara-ayl,12 +lre22_dev_lfmml,fra-ntf,18 +lre22_dev_lfmxu,ven-ven,18 +lre22_dev_lfqfj,afr-afr,17 +lre22_dev_lgetu,ara-aeb,14 +lre22_dev_lgleu,ara-ayl,11 +lre22_dev_lgoat,eng-iaf,16 +lre22_dev_lhgaj,tso-tso,15 +lre22_dev_lhqyw,nbl-nbl,17 +lre22_dev_lhrmr,eng-iaf,17 +lre22_dev_lhtsd,tir-tir,19 +lre22_dev_lhydp,fra-ntf,22 +lre22_dev_livbf,tir-tir,15 +lre22_dev_ljdrg,ara-arq,13 +lre22_dev_ljniw,tso-tso,16 +lre22_dev_ljpmq,tso-tso,12 +lre22_dev_lkjon,tso-tso,15 +lre22_dev_lkszp,nbl-nbl,19 +lre22_dev_llbim,ara-ayl,15 +lre22_dev_llkkt,fra-ntf,15 +lre22_dev_llvcc,orm-orm,22 +lre22_dev_lmbug,ara-arq,12 +lre22_dev_lmmmw,nbl-nbl,19 +lre22_dev_lmsek,ven-ven,16 +lre22_dev_lmudp,ara-ayl,10 +lre22_dev_lmzmv,eng-iaf,19 +lre22_dev_lnlae,ara-arq,14 +lre22_dev_lnlvt,zul-zul,17 +lre22_dev_lnppu,ara-ayl,13 +lre22_dev_lnpyc,tso-tso,19 +lre22_dev_lolkv,xho-xho,19 +lre22_dev_lorcx,nbl-nbl,20 +lre22_dev_lparq,xho-xho,16 +lre22_dev_lqlft,ara-arq,11 +lre22_dev_lqlyq,ara-arq,12 +lre22_dev_lqoeu,tso-tso,14 +lre22_dev_lqueh,ara-ayl,11 +lre22_dev_lquzk,ara-arq,12 +lre22_dev_lqvav,zul-zul,18 +lre22_dev_lrgpy,eng-iaf,16 +lre22_dev_lrjbn,ven-ven,21 +lre22_dev_lrtad,ara-arq,14 +lre22_dev_lrtxd,ara-aeb,11 +lre22_dev_lrvkn,ven-ven,16 +lre22_dev_lrzwy,ara-ayl,13 +lre22_dev_lsefk,ara-arq,13 +lre22_dev_ltmmt,orm-orm,22 +lre22_dev_lutgh,ara-aeb,15 +lre22_dev_lvhmd,tso-tso,14 +lre22_dev_lvqim,ara-aeb,14 +lre22_dev_lvuuo,fra-ntf,17 +lre22_dev_lvzri,ven-ven,16 +lre22_dev_lweml,ara-arq,14 +lre22_dev_lwstj,eng-iaf,16 +lre22_dev_lwzdj,afr-afr,18 +lre22_dev_lxdsk,eng-ens,16 +lre22_dev_lxlcr,ara-aeb,13 +lre22_dev_lxshv,eng-iaf,20 +lre22_dev_lxxvv,eng-ens,16 +lre22_dev_lyfhc,ven-ven,18 +lre22_dev_lyikp,zul-zul,19 +lre22_dev_lyjix,tso-tso,14 +lre22_dev_lyxyh,eng-iaf,19 +lre22_dev_lyzxd,tir-tir,17 +lre22_dev_lzguf,orm-orm,21 +lre22_dev_lzpmk,tir-tir,16 +lre22_dev_lzugv,xho-xho,19 +lre22_dev_maeeb,tir-tir,15 +lre22_dev_maemn,zul-zul,16 +lre22_dev_manpw,orm-orm,19 +lre22_dev_mavli,ara-aeb,12 +lre22_dev_mbywd,orm-orm,19 +lre22_dev_mcath,nbl-nbl,22 +lre22_dev_mcjtw,xho-xho,16 +lre22_dev_mcndd,ven-ven,15 +lre22_dev_mcxqb,tir-tir,13 +lre22_dev_mdlia,fra-ntf,16 +lre22_dev_mdxsp,eng-ens,18 +lre22_dev_menex,eng-iaf,16 +lre22_dev_merfk,orm-orm,21 +lre22_dev_mfipk,zul-zul,16 +lre22_dev_mfuqh,ara-arq,14 +lre22_dev_mgcvo,xho-xho,19 +lre22_dev_mggbx,zul-zul,18 +lre22_dev_mgghl,tso-tso,12 +lre22_dev_mgwqd,ara-arq,14 +lre22_dev_mhswt,ara-ayl,15 +lre22_dev_mhwmt,tso-tso,16 +lre22_dev_miayn,ara-aeb,12 +lre22_dev_miley,tso-tso,16 +lre22_dev_mjfmb,nbl-nbl,21 +lre22_dev_mkbyx,tir-tir,19 +lre22_dev_mlbzi,xho-xho,23 +lre22_dev_mlduq,xho-xho,16 +lre22_dev_mljnp,ara-arq,14 +lre22_dev_mljpb,orm-orm,22 +lre22_dev_mlrsm,xho-xho,17 +lre22_dev_mlwzr,eng-ens,13 +lre22_dev_mlyeo,ven-ven,15 +lre22_dev_mmaed,ara-ayl,14 +lre22_dev_mmbns,eng-ens,12 +lre22_dev_mneyt,xho-xho,17 +lre22_dev_mnhsk,ven-ven,14 +lre22_dev_mnnvk,eng-ens,15 +lre22_dev_mnswo,tso-tso,16 
+lre22_dev_mntdk,eng-ens,18 +lre22_dev_mogwl,orm-orm,22 +lre22_dev_mpbun,nbl-nbl,21 +lre22_dev_mpmuf,ara-aeb,14 +lre22_dev_mpoet,nbl-nbl,16 +lre22_dev_mptyi,afr-afr,18 +lre22_dev_mpzxy,orm-orm,18 +lre22_dev_mqxni,ara-arq,11 +lre22_dev_mqzga,tso-tso,19 +lre22_dev_mrgdh,xho-xho,17 +lre22_dev_mrgko,afr-afr,18 +lre22_dev_mrksc,tir-tir,19 +lre22_dev_mrogp,eng-iaf,15 +lre22_dev_mscwd,fra-ntf,16 +lre22_dev_mshco,ara-ayl,12 +lre22_dev_msptn,ara-ayl,16 +lre22_dev_msslk,ara-aeb,14 +lre22_dev_mtaus,fra-ntf,19 +lre22_dev_mtpgl,tso-tso,13 +lre22_dev_mttly,tir-tir,19 +lre22_dev_mubqn,fra-ntf,15 +lre22_dev_muskv,tso-tso,12 +lre22_dev_muzkp,ara-arq,14 +lre22_dev_mvdus,ven-ven,19 +lre22_dev_mvngl,xho-xho,19 +lre22_dev_mvrpq,tso-tso,12 +lre22_dev_mvtcj,afr-afr,22 +lre22_dev_mwhsu,xho-xho,21 +lre22_dev_mwkyp,nbl-nbl,20 +lre22_dev_mxcey,ara-ayl,12 +lre22_dev_mxcub,ara-aeb,12 +lre22_dev_myekh,ara-aeb,11 +lre22_dev_mzxhf,zul-zul,17 +lre22_dev_mzyru,ara-arq,12 +lre22_dev_nakax,eng-iaf,15 +lre22_dev_naymc,ara-ayl,13 +lre22_dev_nbgid,orm-orm,19 +lre22_dev_nbmnl,xho-xho,16 +lre22_dev_ncffi,zul-zul,14 +lre22_dev_ncjtj,fra-ntf,22 +lre22_dev_ncpix,ara-ayl,11 +lre22_dev_nctqc,xho-xho,16 +lre22_dev_ndkuo,orm-orm,20 +lre22_dev_ndqfw,nbl-nbl,17 +lre22_dev_nedes,ven-ven,15 +lre22_dev_neomw,zul-zul,18 +lre22_dev_neziz,tir-tir,19 +lre22_dev_nfcvg,eng-iaf,17 +lre22_dev_nfdfc,afr-afr,17 +lre22_dev_ngijv,xho-xho,21 +lre22_dev_ngrxk,ara-ayl,13 +lre22_dev_ngzja,ara-aeb,13 +lre22_dev_nhaub,tso-tso,13 +lre22_dev_nhkro,xho-xho,23 +lre22_dev_nhlvt,ara-arq,14 +lre22_dev_nhlxm,eng-ens,14 +lre22_dev_nhyjy,afr-afr,17 +lre22_dev_nifei,zul-zul,19 +lre22_dev_nikpx,ven-ven,18 +lre22_dev_njceq,afr-afr,18 +lre22_dev_njmlt,eng-ens,17 +lre22_dev_njqfj,orm-orm,18 +lre22_dev_nkdje,eng-iaf,19 +lre22_dev_nkkqo,nbl-nbl,22 +lre22_dev_nknrw,orm-orm,21 +lre22_dev_nkogd,fra-ntf,19 +lre22_dev_nksfc,tir-tir,19 +lre22_dev_nkwmm,orm-orm,22 +lre22_dev_nmhdg,ara-ayl,10 +lre22_dev_nmoux,ven-ven,20 +lre22_dev_nmrsq,ven-ven,21 +lre22_dev_nnbhc,fra-ntf,20 +lre22_dev_nnbpy,tir-tir,18 +lre22_dev_nnpwd,ara-aeb,13 +lre22_dev_nodin,ara-ayl,14 +lre22_dev_nogji,nbl-nbl,20 +lre22_dev_nonvr,afr-afr,15 +lre22_dev_notcl,eng-iaf,19 +lre22_dev_noufn,ara-aeb,11 +lre22_dev_noveb,ara-ayl,11 +lre22_dev_npajm,nbl-nbl,19 +lre22_dev_npehj,ara-ayl,14 +lre22_dev_nqdaj,tso-tso,12 +lre22_dev_nqkon,xho-xho,18 +lre22_dev_nqlhw,ara-aeb,13 +lre22_dev_nraqr,eng-ens,14 +lre22_dev_nrino,tso-tso,14 +lre22_dev_nrzgt,xho-xho,16 +lre22_dev_nscrg,orm-orm,18 +lre22_dev_nstgp,orm-orm,23 +lre22_dev_ntgqz,afr-afr,23 +lre22_dev_nthzr,eng-iaf,18 +lre22_dev_ntwzb,afr-afr,16 +lre22_dev_nudwv,eng-ens,14 +lre22_dev_nuerz,eng-iaf,18 +lre22_dev_nujfy,xho-xho,21 +lre22_dev_nurlx,eng-ens,13 +lre22_dev_nvakd,zul-zul,17 +lre22_dev_nvgkj,eng-ens,17 +lre22_dev_nvhvv,fra-ntf,20 +lre22_dev_nwbnz,ara-arq,14 +lre22_dev_nwjed,nbl-nbl,19 +lre22_dev_nwrto,ara-aeb,11 +lre22_dev_nwunl,zul-zul,14 +lre22_dev_nwvyy,tir-tir,19 +lre22_dev_nxwlo,nbl-nbl,17 +lre22_dev_nxxzy,zul-zul,16 +lre22_dev_nxzpp,nbl-nbl,20 +lre22_dev_nyhwg,ara-arq,14 +lre22_dev_nykvr,eng-ens,17 +lre22_dev_nyvkc,tir-tir,15 +lre22_dev_nyyui,ara-arq,11 +lre22_dev_nzbfh,zul-zul,19 +lre22_dev_nzxsk,xho-xho,21 +lre22_dev_oasrh,ara-arq,11 +lre22_dev_oavaf,xho-xho,21 +lre22_dev_obfrf,orm-orm,20 +lre22_dev_obocn,ara-arq,14 +lre22_dev_obumo,eng-ens,15 +lre22_dev_ocbuj,eng-ens,12 +lre22_dev_ocbxu,nbl-nbl,21 +lre22_dev_ocdvw,ara-ayl,13 +lre22_dev_ocdzj,xho-xho,19 +lre22_dev_ocveq,fra-ntf,22 +lre22_dev_odest,ara-ayl,11 +lre22_dev_odjlq,ven-ven,18 
+lre22_dev_odpoq,ara-ayl,12 +lre22_dev_odrcm,fra-ntf,21 +lre22_dev_oeavx,ara-arq,12 +lre22_dev_oefoy,ara-aeb,12 +lre22_dev_oefqy,ven-ven,16 +lre22_dev_oehxk,ara-ayl,12 +lre22_dev_oeqbo,ara-aeb,14 +lre22_dev_oeqjq,fra-ntf,20 +lre22_dev_ofdgy,ara-ayl,15 +lre22_dev_ofgkq,fra-ntf,21 +lre22_dev_ofpva,ara-arq,11 +lre22_dev_ofufy,eng-iaf,17 +lre22_dev_ogglz,ara-aeb,13 +lre22_dev_oggtr,nbl-nbl,19 +lre22_dev_ogpxk,ara-aeb,11 +lre22_dev_ogsay,tso-tso,19 +lre22_dev_ogtvj,zul-zul,19 +lre22_dev_ohqwz,ara-arq,13 +lre22_dev_ohuxo,afr-afr,20 +lre22_dev_ohweb,ven-ven,16 +lre22_dev_ohzpg,fra-ntf,21 +lre22_dev_oijcy,xho-xho,19 +lre22_dev_oijgv,tir-tir,16 +lre22_dev_oikqj,eng-iaf,17 +lre22_dev_oinvl,ven-ven,15 +lre22_dev_oiofr,fra-ntf,19 +lre22_dev_oipks,eng-ens,17 +lre22_dev_ojzos,ara-arq,14 +lre22_dev_okbnu,ara-ayl,10 +lre22_dev_okpcp,eng-iaf,18 +lre22_dev_okwpq,tso-tso,16 +lre22_dev_oleie,ara-arq,12 +lre22_dev_oljep,ven-ven,21 +lre22_dev_oljsa,fra-ntf,16 +lre22_dev_olkup,nbl-nbl,16 +lre22_dev_olqbh,ara-ayl,14 +lre22_dev_omjqo,ara-aeb,14 +lre22_dev_omwiy,ara-ayl,12 +lre22_dev_omxnk,ara-arq,13 +lre22_dev_onqke,eng-iaf,16 +lre22_dev_onzje,tir-tir,13 +lre22_dev_ooktw,afr-afr,18 +lre22_dev_oosff,ara-aeb,12 +lre22_dev_ootbi,xho-xho,21 +lre22_dev_opciz,orm-orm,23 +lre22_dev_opgny,xho-xho,19 +lre22_dev_opifd,ara-arq,12 +lre22_dev_oporo,eng-iaf,19 +lre22_dev_opryj,nbl-nbl,16 +lre22_dev_opuzh,eng-ens,12 +lre22_dev_oqbaw,ven-ven,18 +lre22_dev_oqeuj,tir-tir,14 +lre22_dev_oqmhb,xho-xho,21 +lre22_dev_oqmrs,ara-arq,14 +lre22_dev_oqqwq,tso-tso,12 +lre22_dev_oquaq,xho-xho,17 +lre22_dev_oriap,fra-ntf,20 +lre22_dev_orsjj,tir-tir,20 +lre22_dev_orvna,fra-ntf,21 +lre22_dev_oskoe,orm-orm,20 +lre22_dev_otlyk,nbl-nbl,18 +lre22_dev_oujnj,nbl-nbl,17 +lre22_dev_oumka,ven-ven,14 +lre22_dev_ouqsx,ara-arq,13 +lre22_dev_outyl,zul-zul,16 +lre22_dev_owlwt,ara-ayl,14 +lre22_dev_owvfd,orm-orm,18 +lre22_dev_oxizc,tir-tir,15 +lre22_dev_oxpht,eng-ens,18 +lre22_dev_oxqlz,afr-afr,15 +lre22_dev_oydiw,nbl-nbl,16 +lre22_dev_oyfcl,fra-ntf,22 +lre22_dev_oyhba,eng-ens,18 +lre22_dev_oyiif,afr-afr,17 +lre22_dev_oyslg,afr-afr,21 +lre22_dev_ozfpi,tir-tir,15 +lre22_dev_ozlww,ven-ven,19 +lre22_dev_paxnc,eng-ens,17 +lre22_dev_pbbgx,eng-iaf,14 +lre22_dev_pcfmw,nbl-nbl,21 +lre22_dev_pclpc,fra-ntf,15 +lre22_dev_pcmmj,afr-afr,16 +lre22_dev_pcsqz,tso-tso,18 +lre22_dev_pdcfm,ara-ayl,10 +lre22_dev_pdtuf,eng-ens,18 +lre22_dev_pdzuj,zul-zul,17 +lre22_dev_pehfu,fra-ntf,15 +lre22_dev_pewpj,orm-orm,22 +lre22_dev_pexjz,orm-orm,17 +lre22_dev_pfioj,eng-iaf,15 +lre22_dev_pfkcf,eng-iaf,16 +lre22_dev_pfknl,ara-arq,14 +lre22_dev_pfucv,ara-ayl,12 +lre22_dev_pfyha,fra-ntf,21 +lre22_dev_pgavf,ara-ayl,13 +lre22_dev_phket,nbl-nbl,22 +lre22_dev_piabk,afr-afr,19 +lre22_dev_picvg,orm-orm,17 +lre22_dev_piina,eng-ens,14 +lre22_dev_pjahm,afr-afr,20 +lre22_dev_pjcso,nbl-nbl,17 +lre22_dev_pjggp,ven-ven,16 +lre22_dev_pjohw,xho-xho,19 +lre22_dev_pkpxo,ara-ayl,11 +lre22_dev_pktgk,nbl-nbl,22 +lre22_dev_plojq,eng-ens,12 +lre22_dev_pmayg,ven-ven,21 +lre22_dev_pmjyi,xho-xho,20 +lre22_dev_pmkcp,nbl-nbl,20 +lre22_dev_pnfhk,fra-ntf,18 +lre22_dev_pnust,nbl-nbl,20 +lre22_dev_pnwey,eng-iaf,15 +lre22_dev_pnwti,ara-aeb,13 +lre22_dev_pohmm,afr-afr,14 +lre22_dev_pojvr,nbl-nbl,22 +lre22_dev_poxsw,ara-aeb,13 +lre22_dev_ppjvq,tir-tir,16 +lre22_dev_ppkfc,fra-ntf,19 +lre22_dev_ppmnu,tso-tso,12 +lre22_dev_ppzno,tso-tso,12 +lre22_dev_pqksl,afr-afr,14 +lre22_dev_pqnvh,zul-zul,19 +lre22_dev_prcus,tso-tso,15 +lre22_dev_prhoh,tir-tir,19 +lre22_dev_prkth,ara-arq,12 +lre22_dev_prnhd,xho-xho,18 
+lre22_dev_psjma,fra-ntf,18 +lre22_dev_psldq,tir-tir,19 +lre22_dev_psnvo,afr-afr,15 +lre22_dev_psnzj,zul-zul,19 +lre22_dev_pudqr,eng-ens,17 +lre22_dev_pufnl,orm-orm,19 +lre22_dev_pusxa,nbl-nbl,22 +lre22_dev_pvsqi,ara-arq,11 +lre22_dev_pvteg,fra-ntf,17 +lre22_dev_pvvay,tir-tir,14 +lre22_dev_pvxcv,ara-aeb,15 +lre22_dev_pvygc,ara-aeb,11 +lre22_dev_pwcxu,tir-tir,13 +lre22_dev_pwhdm,nbl-nbl,17 +lre22_dev_pwnkz,ven-ven,20 +lre22_dev_pwrqe,ara-aeb,14 +lre22_dev_pxbhi,afr-afr,16 +lre22_dev_pxeyk,zul-zul,18 +lre22_dev_pxkzd,ara-arq,14 +lre22_dev_pydgm,afr-afr,19 +lre22_dev_pyiju,ven-ven,20 +lre22_dev_pzhrc,tso-tso,13 +lre22_dev_pzkea,ven-ven,14 +lre22_dev_pzqka,ara-arq,11 +lre22_dev_pzuis,ara-arq,13 +lre22_dev_qabac,ven-ven,19 +lre22_dev_qahym,ara-ayl,11 +lre22_dev_qaxfr,xho-xho,17 +lre22_dev_qazyc,ara-ayl,14 +lre22_dev_qbcoz,nbl-nbl,22 +lre22_dev_qcavr,eng-iaf,20 +lre22_dev_qcbkh,fra-ntf,18 +lre22_dev_qcbtt,afr-afr,18 +lre22_dev_qclly,xho-xho,22 +lre22_dev_qcqdt,eng-iaf,18 +lre22_dev_qdqzp,zul-zul,17 +lre22_dev_qdwut,eng-ens,16 +lre22_dev_qehxr,afr-afr,22 +lre22_dev_qeqah,tir-tir,16 +lre22_dev_qeyjd,afr-afr,17 +lre22_dev_qfprv,ara-ayl,13 +lre22_dev_qfqhi,ara-ayl,15 +lre22_dev_qgoge,tso-tso,13 +lre22_dev_qgrlb,eng-iaf,16 +lre22_dev_qgrsu,zul-zul,14 +lre22_dev_qheor,xho-xho,23 +lre22_dev_qhfdz,tso-tso,14 +lre22_dev_qhlol,ven-ven,21 +lre22_dev_qhnfr,zul-zul,15 +lre22_dev_qhvuq,tso-tso,14 +lre22_dev_qibby,afr-afr,23 +lre22_dev_qicen,orm-orm,16 +lre22_dev_qiehd,eng-iaf,14 +lre22_dev_qjbfh,eng-iaf,15 +lre22_dev_qjdln,afr-afr,19 +lre22_dev_qjmro,ara-ayl,11 +lre22_dev_qkgor,zul-zul,16 +lre22_dev_qlgvf,ara-aeb,12 +lre22_dev_qlpjn,eng-iaf,16 +lre22_dev_qmoop,nbl-nbl,16 +lre22_dev_qmqhy,afr-afr,20 +lre22_dev_qmreh,ara-ayl,10 +lre22_dev_qmucf,ven-ven,18 +lre22_dev_qmvnu,fra-ntf,15 +lre22_dev_qmzke,ara-ayl,13 +lre22_dev_qmzxw,orm-orm,21 +lre22_dev_qnams,ven-ven,20 +lre22_dev_qnefv,xho-xho,23 +lre22_dev_qodht,zul-zul,19 +lre22_dev_qoqtk,eng-ens,16 +lre22_dev_qotto,fra-ntf,18 +lre22_dev_qoudd,tso-tso,18 +lre22_dev_qpego,ara-ayl,14 +lre22_dev_qphcb,fra-ntf,22 +lre22_dev_qqkiv,ara-arq,13 +lre22_dev_qqmeu,eng-ens,17 +lre22_dev_qqudk,orm-orm,21 +lre22_dev_qqvdr,orm-orm,23 +lre22_dev_qrbmq,ara-arq,12 +lre22_dev_qrfvx,fra-ntf,22 +lre22_dev_qrsqg,zul-zul,19 +lre22_dev_qrylo,eng-ens,18 +lre22_dev_qsbdh,nbl-nbl,16 +lre22_dev_qsqzo,afr-afr,14 +lre22_dev_qsudg,nbl-nbl,22 +lre22_dev_qszwt,fra-ntf,21 +lre22_dev_qtcmx,nbl-nbl,21 +lre22_dev_qtfpf,zul-zul,16 +lre22_dev_qtkhk,afr-afr,22 +lre22_dev_qtydg,afr-afr,22 +lre22_dev_qujmp,zul-zul,19 +lre22_dev_qulse,eng-ens,17 +lre22_dev_qutbz,eng-ens,18 +lre22_dev_quvqg,ara-aeb,13 +lre22_dev_qvpjs,eng-iaf,19 +lre22_dev_qvtdy,tso-tso,12 +lre22_dev_qvzol,orm-orm,19 +lre22_dev_qwvgm,ara-ayl,13 +lre22_dev_qwzxt,zul-zul,19 +lre22_dev_qxigw,tir-tir,19 +lre22_dev_qxkuu,tso-tso,13 +lre22_dev_qxtss,afr-afr,15 +lre22_dev_qxvbe,nbl-nbl,17 +lre22_dev_qxysh,afr-afr,22 +lre22_dev_qyfba,zul-zul,14 +lre22_dev_qyfov,fra-ntf,19 +lre22_dev_qyjgj,afr-afr,22 +lre22_dev_qyuwy,ara-aeb,15 +lre22_dev_qzfdr,nbl-nbl,18 +lre22_dev_qzldb,eng-iaf,19 +lre22_dev_ranrd,nbl-nbl,22 +lre22_dev_raurj,eng-ens,12 +lre22_dev_rbntq,ara-arq,11 +lre22_dev_rbssw,ara-aeb,11 +lre22_dev_rbwgx,ara-ayl,16 +lre22_dev_rcooi,fra-ntf,18 +lre22_dev_rcyom,ara-ayl,11 +lre22_dev_rdcns,zul-zul,18 +lre22_dev_rdrhv,ara-arq,11 +lre22_dev_rdyxn,eng-iaf,19 +lre22_dev_repec,tir-tir,19 +lre22_dev_rgbby,tso-tso,19 +lre22_dev_rgdvt,fra-ntf,20 +lre22_dev_rguqm,tso-tso,14 +lre22_dev_rgwjy,afr-afr,19 +lre22_dev_rijeq,orm-orm,19 
+lre22_dev_rincv,tir-tir,16 +lre22_dev_rindo,zul-zul,17 +lre22_dev_rirhy,ara-arq,11 +lre22_dev_rjikw,fra-ntf,20 +lre22_dev_rjsik,tso-tso,16 +lre22_dev_rjvvj,tso-tso,19 +lre22_dev_rksid,nbl-nbl,22 +lre22_dev_rkycg,ven-ven,21 +lre22_dev_rlamm,zul-zul,15 +lre22_dev_rllya,tso-tso,15 +lre22_dev_rlzrk,eng-ens,14 +lre22_dev_rmxbg,tir-tir,14 +lre22_dev_rnrsy,tir-tir,19 +lre22_dev_rokej,xho-xho,17 +lre22_dev_rooaf,fra-ntf,17 +lre22_dev_rorob,ven-ven,15 +lre22_dev_rowwe,nbl-nbl,17 +lre22_dev_rqcuw,ara-ayl,11 +lre22_dev_rqdte,ara-ayl,10 +lre22_dev_rqpau,tso-tso,15 +lre22_dev_rquba,ven-ven,19 +lre22_dev_rrbgv,afr-afr,20 +lre22_dev_rsvjn,fra-ntf,16 +lre22_dev_rsynm,tir-tir,19 +lre22_dev_rtezn,tir-tir,19 +lre22_dev_rtkum,orm-orm,21 +lre22_dev_rturg,zul-zul,17 +lre22_dev_runwu,tir-tir,16 +lre22_dev_rvbmf,tso-tso,12 +lre22_dev_rvfls,tso-tso,16 +lre22_dev_rvhxb,ara-aeb,11 +lre22_dev_rvufk,orm-orm,20 +lre22_dev_rvzbo,ara-ayl,14 +lre22_dev_rwhfu,xho-xho,16 +lre22_dev_rwhiz,ara-ayl,10 +lre22_dev_rwimz,ven-ven,16 +lre22_dev_rwish,eng-ens,16 +lre22_dev_rwpzp,xho-xho,19 +lre22_dev_rwqlq,tir-tir,19 +lre22_dev_rwsnw,afr-afr,15 +lre22_dev_rwzwb,tso-tso,19 +lre22_dev_rxcjq,ara-arq,13 +lre22_dev_rxcka,ara-arq,14 +lre22_dev_rxgxu,tir-tir,19 +lre22_dev_rxqxn,nbl-nbl,20 +lre22_dev_rxwip,ara-ayl,10 +lre22_dev_rycca,ven-ven,14 +lre22_dev_rydpu,eng-ens,17 +lre22_dev_ryksb,ven-ven,14 +lre22_dev_rysmu,afr-afr,23 +lre22_dev_rzisy,ara-aeb,13 +lre22_dev_rzpus,ara-arq,15 +lre22_dev_rzqyn,ara-ayl,11 +lre22_dev_rzzca,orm-orm,21 +lre22_dev_sazdy,tso-tso,15 +lre22_dev_sbkip,afr-afr,14 +lre22_dev_sbyek,ara-arq,11 +lre22_dev_scjzn,xho-xho,21 +lre22_dev_scobo,ven-ven,17 +lre22_dev_scqui,orm-orm,16 +lre22_dev_sdccf,ara-arq,14 +lre22_dev_sdcty,tso-tso,19 +lre22_dev_sdebh,ara-ayl,12 +lre22_dev_sedif,orm-orm,21 +lre22_dev_sedug,xho-xho,18 +lre22_dev_seynu,tso-tso,13 +lre22_dev_seyxt,ara-aeb,13 +lre22_dev_sezun,ara-aeb,14 +lre22_dev_sfeyl,ara-aeb,12 +lre22_dev_sfnux,afr-afr,18 +lre22_dev_sfqnk,zul-zul,15 +lre22_dev_sftvb,ara-ayl,11 +lre22_dev_sfwkd,ven-ven,17 +lre22_dev_shgbp,fra-ntf,22 +lre22_dev_shikk,tir-tir,19 +lre22_dev_shpve,afr-afr,21 +lre22_dev_sidjm,ara-ayl,10 +lre22_dev_sihvc,orm-orm,17 +lre22_dev_siiaw,ven-ven,16 +lre22_dev_sinfr,xho-xho,19 +lre22_dev_sipnk,eng-iaf,16 +lre22_dev_sjbcr,tir-tir,19 +lre22_dev_sjdzp,eng-iaf,16 +lre22_dev_sjmsx,ven-ven,19 +lre22_dev_sjsnf,afr-afr,16 +lre22_dev_sjwmd,tir-tir,19 +lre22_dev_sjxce,nbl-nbl,16 +lre22_dev_sjzcc,eng-ens,13 +lre22_dev_sjzsv,fra-ntf,22 +lre22_dev_skegk,afr-afr,18 +lre22_dev_skpib,ven-ven,14 +lre22_dev_slgub,orm-orm,18 +lre22_dev_slryu,nbl-nbl,17 +lre22_dev_slupt,ara-ayl,13 +lre22_dev_smfbl,ara-aeb,14 +lre22_dev_smfon,xho-xho,20 +lre22_dev_smvms,afr-afr,18 +lre22_dev_snegl,xho-xho,18 +lre22_dev_snvvg,tso-tso,14 +lre22_dev_sobpf,orm-orm,19 +lre22_dev_soely,eng-iaf,14 +lre22_dev_sorzd,tir-tir,19 +lre22_dev_spixz,nbl-nbl,18 +lre22_dev_spjcl,fra-ntf,17 +lre22_dev_spzra,tso-tso,17 +lre22_dev_sqaei,xho-xho,23 +lre22_dev_sqime,ven-ven,14 +lre22_dev_srgaw,eng-iaf,15 +lre22_dev_srnhq,ven-ven,16 +lre22_dev_srsng,orm-orm,21 +lre22_dev_srysc,nbl-nbl,17 +lre22_dev_srzgk,eng-ens,16 +lre22_dev_srzsi,ara-aeb,14 +lre22_dev_ssjtt,nbl-nbl,16 +lre22_dev_stajf,xho-xho,21 +lre22_dev_sttfd,ara-aeb,15 +lre22_dev_suevr,ara-aeb,15 +lre22_dev_sumum,afr-afr,18 +lre22_dev_svukm,fra-ntf,20 +lre22_dev_swkzf,tir-tir,17 +lre22_dev_sxqmv,ara-aeb,11 +lre22_dev_sxvuf,ara-aeb,11 +lre22_dev_sydqt,eng-ens,18 +lre22_dev_syooe,eng-ens,14 +lre22_dev_szpip,tir-tir,17 +lre22_dev_szsgp,fra-ntf,19 
+lre22_dev_szzuj,ara-ayl,11 +lre22_dev_tabof,orm-orm,19 +lre22_dev_tavcw,ven-ven,19 +lre22_dev_tbjal,xho-xho,22 +lre22_dev_tbxzb,fra-ntf,21 +lre22_dev_tdalr,nbl-nbl,18 +lre22_dev_tdfzf,eng-iaf,17 +lre22_dev_tdlyk,tir-tir,15 +lre22_dev_tefms,fra-ntf,15 +lre22_dev_telgo,xho-xho,19 +lre22_dev_teric,eng-ens,14 +lre22_dev_tfcgx,orm-orm,21 +lre22_dev_tgiid,xho-xho,19 +lre22_dev_tgoea,ara-ayl,13 +lre22_dev_tgrrk,eng-iaf,18 +lre22_dev_tgtyv,tso-tso,12 +lre22_dev_tgzex,tso-tso,12 +lre22_dev_thone,nbl-nbl,17 +lre22_dev_thpnk,afr-afr,18 +lre22_dev_thwls,ven-ven,17 +lre22_dev_tibov,tir-tir,14 +lre22_dev_tidld,tso-tso,16 +lre22_dev_tiezu,eng-ens,17 +lre22_dev_tioqa,nbl-nbl,16 +lre22_dev_tiuym,zul-zul,15 +lre22_dev_tjivp,afr-afr,22 +lre22_dev_tjltd,orm-orm,20 +lre22_dev_tkcqj,ara-aeb,12 +lre22_dev_tkpij,tir-tir,19 +lre22_dev_tkpwp,orm-orm,19 +lre22_dev_tkyuh,tso-tso,12 +lre22_dev_tlkrm,zul-zul,19 +lre22_dev_tlspo,zul-zul,18 +lre22_dev_tmdvx,zul-zul,17 +lre22_dev_tmynp,afr-afr,20 +lre22_dev_tntmu,xho-xho,22 +lre22_dev_tnwok,orm-orm,21 +lre22_dev_toccu,eng-iaf,16 +lre22_dev_tofur,tir-tir,14 +lre22_dev_tokhl,ven-ven,21 +lre22_dev_tonkq,zul-zul,15 +lre22_dev_topxu,zul-zul,14 +lre22_dev_touna,ara-arq,15 +lre22_dev_towvr,tso-tso,12 +lre22_dev_tpasn,tir-tir,15 +lre22_dev_tpmen,ara-ayl,10 +lre22_dev_tpuws,tir-tir,19 +lre22_dev_tqbqi,xho-xho,17 +lre22_dev_tqtfo,tso-tso,17 +lre22_dev_traqh,fra-ntf,21 +lre22_dev_trdfp,ara-ayl,15 +lre22_dev_trdml,xho-xho,23 +lre22_dev_trmpg,nbl-nbl,19 +lre22_dev_tsdyg,tso-tso,19 +lre22_dev_tsvmo,ara-ayl,11 +lre22_dev_ttcul,afr-afr,19 +lre22_dev_ttrfr,ara-arq,12 +lre22_dev_tuhrp,ven-ven,14 +lre22_dev_twaba,afr-afr,15 +lre22_dev_twcnd,tir-tir,13 +lre22_dev_twtog,ven-ven,15 +lre22_dev_twvne,tir-tir,19 +lre22_dev_txcqg,orm-orm,19 +lre22_dev_txjsy,eng-ens,18 +lre22_dev_txmpu,afr-afr,19 +lre22_dev_txqde,eng-iaf,16 +lre22_dev_tyaup,eng-ens,17 +lre22_dev_tyaym,afr-afr,17 +lre22_dev_tybrl,nbl-nbl,16 +lre22_dev_tyduc,eng-ens,17 +lre22_dev_tyhsa,fra-ntf,21 +lre22_dev_tyigo,ara-ayl,11 +lre22_dev_tykte,zul-zul,18 +lre22_dev_tymil,tir-tir,16 +lre22_dev_tyofb,ven-ven,20 +lre22_dev_tysph,fra-ntf,16 +lre22_dev_tzamn,ara-aeb,11 +lre22_dev_tzrpp,ven-ven,15 +lre22_dev_tzukm,ara-aeb,12 +lre22_dev_uabum,xho-xho,19 +lre22_dev_uankd,nbl-nbl,18 +lre22_dev_uazyk,ara-ayl,14 +lre22_dev_ubdfa,eng-iaf,15 +lre22_dev_ubugi,orm-orm,22 +lre22_dev_ucetp,ven-ven,21 +lre22_dev_ucsxt,eng-ens,12 +lre22_dev_uczke,zul-zul,14 +lre22_dev_udldh,ara-arq,11 +lre22_dev_uejdk,orm-orm,17 +lre22_dev_uekog,zul-zul,17 +lre22_dev_uemql,xho-xho,16 +lre22_dev_ueovt,eng-ens,14 +lre22_dev_uesao,zul-zul,19 +lre22_dev_ueyxm,ara-ayl,13 +lre22_dev_ufafi,tir-tir,17 +lre22_dev_ufaig,tso-tso,12 +lre22_dev_uffpc,ara-arq,14 +lre22_dev_ufrmg,ven-ven,20 +lre22_dev_ugieb,ara-aeb,12 +lre22_dev_ugoiy,ara-ayl,10 +lre22_dev_ugzkq,ara-aeb,12 +lre22_dev_uhdrj,xho-xho,18 +lre22_dev_uhjdn,ara-ayl,16 +lre22_dev_uhkcq,ara-ayl,11 +lre22_dev_uhrjo,ara-aeb,13 +lre22_dev_uhrow,afr-afr,16 +lre22_dev_uikqm,ara-arq,12 +lre22_dev_uitct,eng-ens,13 +lre22_dev_uitqu,ara-ayl,12 +lre22_dev_ujiby,eng-ens,18 +lre22_dev_ujmtl,orm-orm,22 +lre22_dev_ukdpu,ven-ven,17 +lre22_dev_ukfpb,xho-xho,19 +lre22_dev_ukklw,fra-ntf,22 +lre22_dev_ukwjy,xho-xho,17 +lre22_dev_uljbx,fra-ntf,20 +lre22_dev_uljgh,tir-tir,13 +lre22_dev_uljvo,fra-ntf,21 +lre22_dev_undfd,orm-orm,20 +lre22_dev_unmiu,ara-arq,14 +lre22_dev_updar,nbl-nbl,17 +lre22_dev_uprkv,eng-iaf,16 +lre22_dev_urkok,ara-ayl,11 +lre22_dev_urolj,orm-orm,22 +lre22_dev_uscpv,eng-ens,14 +lre22_dev_ushtk,fra-ntf,20 
+lre22_dev_usiey,ven-ven,19 +lre22_dev_usitw,ara-arq,14 +lre22_dev_utkxp,nbl-nbl,19 +lre22_dev_utnvo,tir-tir,16 +lre22_dev_utyjg,tso-tso,18 +lre22_dev_uuwaa,ara-arq,12 +lre22_dev_uuxla,eng-iaf,15 +lre22_dev_uuzuj,ara-arq,14 +lre22_dev_uvcxs,eng-ens,12 +lre22_dev_uveah,ven-ven,17 +lre22_dev_uvfqy,ara-arq,13 +lre22_dev_uvnhb,fra-ntf,20 +lre22_dev_uvqbm,afr-afr,19 +lre22_dev_uvsus,zul-zul,15 +lre22_dev_uvyev,fra-ntf,20 +lre22_dev_uwicd,tso-tso,12 +lre22_dev_uwnlz,zul-zul,18 +lre22_dev_uwwyj,afr-afr,20 +lre22_dev_uwyxc,eng-iaf,17 +lre22_dev_uxjzh,xho-xho,21 +lre22_dev_uxpyg,tso-tso,15 +lre22_dev_uxrxr,tso-tso,12 +lre22_dev_uyciz,eng-ens,14 +lre22_dev_uycza,xho-xho,17 +lre22_dev_uyvyb,eng-ens,17 +lre22_dev_uziar,zul-zul,15 +lre22_dev_uzlxd,fra-ntf,22 +lre22_dev_uznjr,tir-tir,13 +lre22_dev_vagda,ara-ayl,12 +lre22_dev_vanjm,ven-ven,18 +lre22_dev_vaqia,tir-tir,19 +lre22_dev_vasjz,ara-arq,11 +lre22_dev_vcexs,tir-tir,17 +lre22_dev_vchpm,fra-ntf,21 +lre22_dev_vctsa,nbl-nbl,19 +lre22_dev_vcxit,ven-ven,15 +lre22_dev_vcyqv,xho-xho,19 +lre22_dev_vdjlh,afr-afr,22 +lre22_dev_vdogx,ven-ven,15 +lre22_dev_veutb,eng-ens,16 +lre22_dev_vezrd,tso-tso,12 +lre22_dev_vfbfg,tso-tso,12 +lre22_dev_vffqd,orm-orm,21 +lre22_dev_vfhum,afr-afr,16 +lre22_dev_vfjtw,ara-arq,11 +lre22_dev_vfnjb,eng-ens,15 +lre22_dev_vgbbh,ara-arq,13 +lre22_dev_vgcao,eng-iaf,20 +lre22_dev_vgpnk,xho-xho,19 +lre22_dev_vityk,zul-zul,18 +lre22_dev_vjeuy,tir-tir,19 +lre22_dev_vjltt,zul-zul,17 +lre22_dev_vjqrm,tir-tir,13 +lre22_dev_vjvbs,tso-tso,18 +lre22_dev_vlcbq,tso-tso,16 +lre22_dev_vlnlb,tso-tso,13 +lre22_dev_vlscu,ara-ayl,15 +lre22_dev_vlwhz,fra-ntf,22 +lre22_dev_vlyeh,tso-tso,16 +lre22_dev_vmnps,zul-zul,14 +lre22_dev_vmqxk,tso-tso,18 +lre22_dev_vmrez,ven-ven,18 +lre22_dev_vmsnh,ara-aeb,11 +lre22_dev_vmuti,ara-aeb,14 +lre22_dev_vncre,afr-afr,22 +lre22_dev_vnkqv,afr-afr,15 +lre22_dev_vnmlt,zul-zul,18 +lre22_dev_vpkra,ara-ayl,11 +lre22_dev_vpoit,ara-arq,14 +lre22_dev_vpruu,orm-orm,23 +lre22_dev_vptiv,tir-tir,18 +lre22_dev_vqhcn,tso-tso,16 +lre22_dev_vqura,tir-tir,16 +lre22_dev_vrqfs,xho-xho,23 +lre22_dev_vrvtr,zul-zul,15 +lre22_dev_vrxvj,fra-ntf,17 +lre22_dev_vsbay,eng-iaf,19 +lre22_dev_vsbvi,fra-ntf,19 +lre22_dev_vslkb,eng-ens,12 +lre22_dev_vsrdg,tso-tso,12 +lre22_dev_vsrnz,zul-zul,14 +lre22_dev_vsryb,nbl-nbl,19 +lre22_dev_vtlab,zul-zul,19 +lre22_dev_vtrff,eng-iaf,17 +lre22_dev_vtztf,ara-aeb,11 +lre22_dev_vucth,eng-ens,14 +lre22_dev_vucug,orm-orm,21 +lre22_dev_vufuu,eng-ens,18 +lre22_dev_vujbs,zul-zul,19 +lre22_dev_vuufm,afr-afr,19 +lre22_dev_vvgdf,eng-ens,18 +lre22_dev_vvlcx,ara-aeb,12 +lre22_dev_vvvho,tir-tir,18 +lre22_dev_vwait,eng-iaf,14 +lre22_dev_vwdcw,ara-arq,14 +lre22_dev_vwyzq,ara-arq,14 +lre22_dev_vwzon,eng-ens,12 +lre22_dev_vxhoc,ara-aeb,11 +lre22_dev_vxkgz,ven-ven,18 +lre22_dev_vxlgl,tir-tir,18 +lre22_dev_vxsqt,eng-ens,15 +lre22_dev_vyqsd,nbl-nbl,17 +lre22_dev_vzcai,zul-zul,19 +lre22_dev_vzgoj,eng-iaf,14 +lre22_dev_vzlon,zul-zul,16 +lre22_dev_vznrg,nbl-nbl,16 +lre22_dev_vzqme,xho-xho,19 +lre22_dev_wabqx,ven-ven,18 +lre22_dev_wafdh,fra-ntf,21 +lre22_dev_wagmt,eng-iaf,18 +lre22_dev_waocz,ven-ven,20 +lre22_dev_wavrh,zul-zul,16 +lre22_dev_wawqg,ara-ayl,13 +lre22_dev_waznj,nbl-nbl,22 +lre22_dev_wbepu,fra-ntf,19 +lre22_dev_wbygw,eng-ens,16 +lre22_dev_wccgz,tso-tso,17 +lre22_dev_wcpwx,tir-tir,18 +lre22_dev_wczkn,eng-iaf,17 +lre22_dev_wdfmt,tir-tir,17 +lre22_dev_wdgbh,ara-arq,12 +lre22_dev_wdind,tso-tso,19 +lre22_dev_wdkit,nbl-nbl,16 +lre22_dev_wdmpt,eng-ens,17 +lre22_dev_wdpya,nbl-nbl,16 +lre22_dev_wdrxo,orm-orm,21 
+lre22_dev_wdyiy,ara-ayl,13 +lre22_dev_weccy,afr-afr,15 +lre22_dev_wfmco,ara-arq,14 +lre22_dev_wfnon,nbl-nbl,17 +lre22_dev_wgdui,eng-iaf,14 +lre22_dev_wgkmr,eng-iaf,17 +lre22_dev_wgnex,tir-tir,19 +lre22_dev_wgucy,eng-iaf,18 +lre22_dev_wgwdn,eng-iaf,17 +lre22_dev_whqhx,eng-iaf,15 +lre22_dev_whxwv,eng-ens,14 +lre22_dev_witnq,fra-ntf,17 +lre22_dev_wixzu,tso-tso,16 +lre22_dev_wjhbw,eng-iaf,16 +lre22_dev_wjist,orm-orm,16 +lre22_dev_wjnhh,zul-zul,19 +lre22_dev_wjnyo,ven-ven,20 +lre22_dev_wjtnm,orm-orm,19 +lre22_dev_wjzhz,ara-aeb,13 +lre22_dev_wkacx,eng-iaf,15 +lre22_dev_wkqey,fra-ntf,16 +lre22_dev_wldli,zul-zul,14 +lre22_dev_wlnst,nbl-nbl,16 +lre22_dev_wltvq,zul-zul,17 +lre22_dev_wlwhq,orm-orm,19 +lre22_dev_wmdan,xho-xho,21 +lre22_dev_wmfce,nbl-nbl,20 +lre22_dev_wmigl,ven-ven,20 +lre22_dev_wmwmc,eng-iaf,19 +lre22_dev_wmypk,xho-xho,19 +lre22_dev_wmzpv,eng-ens,17 +lre22_dev_wnjpz,ven-ven,19 +lre22_dev_wnmkt,orm-orm,23 +lre22_dev_wnpep,nbl-nbl,16 +lre22_dev_wnqhz,nbl-nbl,16 +lre22_dev_wnxpz,ven-ven,15 +lre22_dev_wnxrw,ven-ven,18 +lre22_dev_woawg,ven-ven,18 +lre22_dev_wobzv,eng-ens,14 +lre22_dev_wocbv,tso-tso,18 +lre22_dev_woerb,fra-ntf,21 +lre22_dev_wojrt,orm-orm,19 +lre22_dev_wosus,tir-tir,17 +lre22_dev_wozuc,xho-xho,19 +lre22_dev_wqcyu,tso-tso,15 +lre22_dev_wqfuv,eng-ens,17 +lre22_dev_wqhag,zul-zul,19 +lre22_dev_wqmsd,tir-tir,13 +lre22_dev_wqthl,ara-aeb,12 +lre22_dev_wqtvm,eng-ens,15 +lre22_dev_wrmnw,zul-zul,18 +lre22_dev_wrtec,zul-zul,17 +lre22_dev_wrvls,zul-zul,14 +lre22_dev_wscfs,nbl-nbl,16 +lre22_dev_wssqw,eng-ens,15 +lre22_dev_wtbdf,tir-tir,14 +lre22_dev_wtcpe,ara-aeb,11 +lre22_dev_wthrk,orm-orm,18 +lre22_dev_wtofd,eng-iaf,20 +lre22_dev_wtuol,tso-tso,18 +lre22_dev_wuqez,ara-aeb,11 +lre22_dev_wuquc,tir-tir,18 +lre22_dev_wvlde,tso-tso,13 +lre22_dev_wwbmg,ara-aeb,11 +lre22_dev_wwduf,fra-ntf,18 +lre22_dev_wwvuw,ara-arq,13 +lre22_dev_wxaev,orm-orm,17 +lre22_dev_wycsj,ven-ven,18 +lre22_dev_wypwj,ara-ayl,10 +lre22_dev_wytpq,fra-ntf,17 +lre22_dev_wzhqk,xho-xho,22 +lre22_dev_wzpmq,eng-ens,12 +lre22_dev_wztdj,zul-zul,19 +lre22_dev_wzxgv,ven-ven,18 +lre22_dev_xacjk,fra-ntf,18 +lre22_dev_xaevp,tir-tir,14 +lre22_dev_xaldr,eng-iaf,14 +lre22_dev_xapdy,ara-aeb,12 +lre22_dev_xaurw,nbl-nbl,16 +lre22_dev_xawdd,tir-tir,20 +lre22_dev_xbcpb,ara-arq,12 +lre22_dev_xbfrs,ven-ven,17 +lre22_dev_xbqsr,nbl-nbl,22 +lre22_dev_xbvcc,nbl-nbl,17 +lre22_dev_xbvqw,orm-orm,23 +lre22_dev_xcame,xho-xho,16 +lre22_dev_xcrnp,ara-aeb,13 +lre22_dev_xcswu,ven-ven,18 +lre22_dev_xcuok,orm-orm,21 +lre22_dev_xcvkj,tso-tso,16 +lre22_dev_xdtdp,fra-ntf,17 +lre22_dev_xdyea,ara-ayl,10 +lre22_dev_xerqi,fra-ntf,17 +lre22_dev_xetdb,eng-ens,14 +lre22_dev_xfecy,nbl-nbl,16 +lre22_dev_xfgcu,eng-iaf,19 +lre22_dev_xfing,tir-tir,20 +lre22_dev_xgaig,ara-aeb,15 +lre22_dev_xgoyq,eng-ens,18 +lre22_dev_xhdtx,eng-iaf,14 +lre22_dev_xhvkx,orm-orm,19 +lre22_dev_xiblr,tir-tir,17 +lre22_dev_xifty,ara-aeb,12 +lre22_dev_xigtx,ara-arq,14 +lre22_dev_xijus,tso-tso,14 +lre22_dev_xipox,xho-xho,20 +lre22_dev_xittq,ara-aeb,13 +lre22_dev_xjpwq,ara-ayl,15 +lre22_dev_xjrla,afr-afr,20 +lre22_dev_xkdof,ara-ayl,13 +lre22_dev_xkiba,eng-ens,18 +lre22_dev_xlcxh,fra-ntf,18 +lre22_dev_xlsxb,tso-tso,16 +lre22_dev_xmhpj,ven-ven,20 +lre22_dev_xnqct,ara-arq,11 +lre22_dev_xoayi,eng-ens,13 +lre22_dev_xohps,ara-arq,11 +lre22_dev_xokpn,zul-zul,18 +lre22_dev_xonym,eng-ens,14 +lre22_dev_xozod,afr-afr,14 +lre22_dev_xpenp,ara-arq,11 +lre22_dev_xpnti,ara-aeb,11 +lre22_dev_xpqyr,orm-orm,22 +lre22_dev_xpswt,orm-orm,23 +lre22_dev_xpumn,ven-ven,14 +lre22_dev_xpvcf,orm-orm,20 
+lre22_dev_xqhoa,ara-ayl,13 +lre22_dev_xqnpt,orm-orm,22 +lre22_dev_xqooi,xho-xho,20 +lre22_dev_xqupu,fra-ntf,21 +lre22_dev_xresy,eng-iaf,17 +lre22_dev_xrouj,ara-ayl,16 +lre22_dev_xsnxu,ara-aeb,12 +lre22_dev_xtaof,ara-ayl,13 +lre22_dev_xtbxk,orm-orm,20 +lre22_dev_xtgak,nbl-nbl,20 +lre22_dev_xuauh,ara-aeb,13 +lre22_dev_xubei,eng-iaf,17 +lre22_dev_xubol,ara-aeb,11 +lre22_dev_xuieb,orm-orm,19 +lre22_dev_xunxs,ara-ayl,14 +lre22_dev_xutjo,nbl-nbl,20 +lre22_dev_xvbos,afr-afr,22 +lre22_dev_xvcfn,eng-ens,16 +lre22_dev_xvgqo,eng-ens,12 +lre22_dev_xwemk,zul-zul,18 +lre22_dev_xwsyq,ara-ayl,14 +lre22_dev_xxdbg,tso-tso,18 +lre22_dev_xyoua,fra-ntf,22 +lre22_dev_xzoej,ara-aeb,13 +lre22_dev_xzrdl,ara-arq,13 +lre22_dev_xztsz,tso-tso,16 +lre22_dev_xzxbd,zul-zul,15 +lre22_dev_yagvv,tso-tso,13 +lre22_dev_ybqju,tso-tso,13 +lre22_dev_ybrji,ara-arq,11 +lre22_dev_ybsmy,ven-ven,21 +lre22_dev_ycbaf,ara-aeb,14 +lre22_dev_ychsm,ven-ven,14 +lre22_dev_ycrlj,xho-xho,17 +lre22_dev_ycuhc,orm-orm,21 +lre22_dev_ydhqc,ara-arq,13 +lre22_dev_ydmnb,nbl-nbl,17 +lre22_dev_yduem,xho-xho,21 +lre22_dev_yemzu,ara-aeb,11 +lre22_dev_yeoyx,eng-ens,18 +lre22_dev_yersp,ara-ayl,13 +lre22_dev_yeshv,eng-iaf,17 +lre22_dev_yexec,ven-ven,20 +lre22_dev_yeyna,ara-ayl,14 +lre22_dev_yfxmd,ara-arq,14 +lre22_dev_yfzah,ara-arq,14 +lre22_dev_ygkvo,ara-arq,11 +lre22_dev_yhgvr,ara-arq,15 +lre22_dev_yhwin,ara-arq,12 +lre22_dev_yirig,ara-ayl,16 +lre22_dev_yixgu,xho-xho,16 +lre22_dev_yjbfl,xho-xho,19 +lre22_dev_yjodc,eng-ens,14 +lre22_dev_yjoht,ara-aeb,12 +lre22_dev_yjqkb,ara-arq,14 +lre22_dev_yjrkq,ara-arq,15 +lre22_dev_yjrng,afr-afr,16 +lre22_dev_ykpzq,afr-afr,21 +lre22_dev_yktop,eng-iaf,20 +lre22_dev_ylfah,zul-zul,15 +lre22_dev_ylgex,tso-tso,14 +lre22_dev_ylkds,nbl-nbl,17 +lre22_dev_ylvyc,xho-xho,20 +lre22_dev_ylzic,eng-iaf,20 +lre22_dev_ymoon,afr-afr,17 +lre22_dev_yncqr,ara-arq,13 +lre22_dev_ynjtn,ven-ven,18 +lre22_dev_ynmzy,tso-tso,16 +lre22_dev_ynozi,fra-ntf,21 +lre22_dev_yntec,orm-orm,19 +lre22_dev_ynurl,tso-tso,14 +lre22_dev_ypdtt,ara-aeb,11 +lre22_dev_yprom,tso-tso,13 +lre22_dev_yptsk,xho-xho,23 +lre22_dev_ypyft,eng-iaf,14 +lre22_dev_yqhwt,orm-orm,23 +lre22_dev_yqtxe,eng-iaf,19 +lre22_dev_yquja,ara-ayl,10 +lre22_dev_yqxhl,eng-ens,14 +lre22_dev_yqyby,nbl-nbl,18 +lre22_dev_yqzua,fra-ntf,16 +lre22_dev_yrfxo,ven-ven,21 +lre22_dev_yrgzf,ara-aeb,13 +lre22_dev_yruqe,tso-tso,17 +lre22_dev_yrwgb,zul-zul,18 +lre22_dev_yrxsi,orm-orm,21 +lre22_dev_ysdkl,tso-tso,15 +lre22_dev_ytgav,xho-xho,16 +lre22_dev_ytoet,ara-arq,14 +lre22_dev_yuabg,eng-ens,16 +lre22_dev_yundm,tso-tso,14 +lre22_dev_yuvux,ara-ayl,13 +lre22_dev_yvdcv,fra-ntf,21 +lre22_dev_yvoli,orm-orm,23 +lre22_dev_yweox,orm-orm,21 +lre22_dev_ywgoc,eng-iaf,19 +lre22_dev_ywoyx,ven-ven,18 +lre22_dev_ywxql,zul-zul,19 +lre22_dev_yxkyl,eng-iaf,15 +lre22_dev_yxtmn,ara-aeb,14 +lre22_dev_yycsn,ara-ayl,12 +lre22_dev_yyswd,eng-iaf,16 +lre22_dev_yyugr,ven-ven,21 +lre22_dev_yzitu,orm-orm,20 +lre22_dev_yzwmi,eng-ens,16 +lre22_dev_yzzww,zul-zul,17 +lre22_dev_zabub,ara-ayl,16 +lre22_dev_zabuv,eng-iaf,14 +lre22_dev_zacuc,zul-zul,19 +lre22_dev_zavru,zul-zul,19 +lre22_dev_zbfgy,ara-arq,12 +lre22_dev_zbjez,nbl-nbl,17 +lre22_dev_zbtpo,ven-ven,18 +lre22_dev_zbzip,tso-tso,19 +lre22_dev_zcevz,nbl-nbl,16 +lre22_dev_zcnsv,afr-afr,21 +lre22_dev_zcqkl,eng-iaf,20 +lre22_dev_zczer,ven-ven,14 +lre22_dev_zdcdt,nbl-nbl,18 +lre22_dev_zddua,xho-xho,19 +lre22_dev_zdvsh,ara-arq,14 +lre22_dev_zdwxx,ara-ayl,14 +lre22_dev_zdyxi,tir-tir,14 +lre22_dev_zetju,eng-iaf,17 +lre22_dev_zfsek,ara-arq,11 +lre22_dev_zfvfa,eng-ens,18 
+lre22_dev_zggiu,zul-zul,19 +lre22_dev_zgndz,tso-tso,14 +lre22_dev_zgxth,eng-ens,16 +lre22_dev_zhlxa,ara-ayl,14 +lre22_dev_zhnsb,ara-ayl,15 +lre22_dev_zhsmo,ara-aeb,13 +lre22_dev_zhvbf,xho-xho,18 +lre22_dev_zhzrh,eng-iaf,15 +lre22_dev_ziigd,orm-orm,21 +lre22_dev_zilud,tir-tir,19 +lre22_dev_zjivp,zul-zul,19 +lre22_dev_zjleg,zul-zul,19 +lre22_dev_zjquq,orm-orm,16 +lre22_dev_zkgjo,nbl-nbl,22 +lre22_dev_zkhes,fra-ntf,16 +lre22_dev_zkioq,ara-aeb,12 +lre22_dev_zkwaw,afr-afr,21 +lre22_dev_zlapc,ara-ayl,13 +lre22_dev_zlntm,zul-zul,19 +lre22_dev_zmmyn,xho-xho,23 +lre22_dev_zmxld,ven-ven,17 +lre22_dev_znhcf,ven-ven,21 +lre22_dev_znwsk,afr-afr,22 +lre22_dev_znxvg,eng-ens,18 +lre22_dev_znycz,ara-aeb,13 +lre22_dev_zoayx,zul-zul,18 +lre22_dev_zogte,nbl-nbl,16 +lre22_dev_zoldl,ara-aeb,12 +lre22_dev_zoqzl,eng-ens,17 +lre22_dev_zorfv,eng-iaf,16 +lre22_dev_zoseh,ara-arq,12 +lre22_dev_zpotb,xho-xho,16 +lre22_dev_zptbg,tir-tir,14 +lre22_dev_zqjzi,ara-aeb,11 +lre22_dev_zqljj,ara-aeb,14 +lre22_dev_zqlri,orm-orm,18 +lre22_dev_zqoif,zul-zul,19 +lre22_dev_zqorv,ara-aeb,12 +lre22_dev_zqwgs,fra-ntf,18 +lre22_dev_zrhbt,tir-tir,19 +lre22_dev_zrqar,ara-aeb,13 +lre22_dev_zrqec,eng-iaf,17 +lre22_dev_ztdrx,fra-ntf,15 +lre22_dev_ztdwr,orm-orm,17 +lre22_dev_zthiv,ara-arq,15 +lre22_dev_ztknh,xho-xho,18 +lre22_dev_ztlcq,ara-aeb,13 +lre22_dev_ztufj,fra-ntf,19 +lre22_dev_zubjl,fra-ntf,20 +lre22_dev_zunuw,tso-tso,17 +lre22_dev_zutul,tir-tir,13 +lre22_dev_zutvv,eng-ens,12 +lre22_dev_zuugc,eng-iaf,17 +lre22_dev_zuvqx,eng-iaf,14 +lre22_dev_zvthu,orm-orm,20 +lre22_dev_zvvov,ara-aeb,11 +lre22_dev_zvyuh,ara-arq,14 +lre22_dev_zwfqq,eng-iaf,17 +lre22_dev_zwosr,xho-xho,16 +lre22_dev_zwvhw,tso-tso,12 +lre22_dev_zxihz,ven-ven,14 +lre22_dev_zydma,eng-ens,12 +lre22_dev_zyqlz,zul-zul,19 +lre22_dev_zyyie,orm-orm,23 +lre22_dev_zyywo,eng-iaf,14 +lre22_dev_zzyze,ara-ayl,12 diff --git a/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/test_segments.csv b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/test_segments.csv new file mode 100644 index 00000000..4d50b6a5 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/test_segments.csv @@ -0,0 +1,2088 @@ +id,class_id,subclass_idx +lre22_dev_aayck,ara-aeb,12 +lre22_dev_aayto,eng-iaf,14 +lre22_dev_abaha,zul-zul,17 +lre22_dev_abetm,fra-ntf,15 +lre22_dev_abnwz,zul-zul,19 +lre22_dev_abvjt,zul-zul,19 +lre22_dev_abwgm,ara-ayl,13 +lre22_dev_acepd,eng-iaf,19 +lre22_dev_acspt,eng-ens,12 +lre22_dev_aczdh,eng-ens,13 +lre22_dev_adkkm,tso-tso,19 +lre22_dev_adpus,tso-tso,13 +lre22_dev_adwju,ara-aeb,14 +lre22_dev_afnfn,afr-afr,20 +lre22_dev_afohq,ara-aeb,13 +lre22_dev_agnnp,afr-afr,17 +lre22_dev_agquw,fra-ntf,20 +lre22_dev_ahoow,ara-ayl,15 +lre22_dev_ahqxq,fra-ntf,22 +lre22_dev_aieqr,eng-iaf,17 +lre22_dev_ainix,eng-iaf,16 +lre22_dev_aiojl,fra-ntf,18 +lre22_dev_aiypg,nbl-nbl,17 +lre22_dev_ajcpi,orm-orm,22 +lre22_dev_ajeqv,ara-aeb,11 +lre22_dev_ajlqy,xho-xho,16 +lre22_dev_ajlyw,orm-orm,21 +lre22_dev_ajmrs,ara-aeb,11 +lre22_dev_ajzjc,eng-iaf,16 +lre22_dev_ajzyq,ara-ayl,14 +lre22_dev_akmfp,orm-orm,19 +lre22_dev_aleeu,ara-arq,14 +lre22_dev_aliba,ara-aeb,15 +lre22_dev_alkwi,eng-iaf,14 +lre22_dev_aluwk,nbl-nbl,16 +lre22_dev_alvdl,ara-arq,14 +lre22_dev_amrca,ara-aeb,11 +lre22_dev_aoanh,ara-ayl,15 +lre22_dev_aoeql,eng-ens,16 +lre22_dev_apfpk,eng-iaf,14 +lre22_dev_apufs,tir-tir,17 +lre22_dev_apvko,orm-orm,20 +lre22_dev_arefe,orm-orm,23 +lre22_dev_arvyp,ara-ayl,11 +lre22_dev_arwsc,fra-ntf,20 +lre22_dev_asqwa,ara-aeb,14 +lre22_dev_asrng,fra-ntf,18 +lre22_dev_aswjo,afr-afr,18 +lre22_dev_aulzk,ven-ven,21 
+lre22_dev_aupcr,zul-zul,18 +lre22_dev_auqcy,eng-ens,18 +lre22_dev_auxdy,nbl-nbl,16 +lre22_dev_auycg,ara-ayl,11 +lre22_dev_aviiv,tso-tso,14 +lre22_dev_avrwo,tso-tso,19 +lre22_dev_avwim,ara-arq,13 +lre22_dev_avzdv,zul-zul,18 +lre22_dev_awtna,ara-arq,13 +lre22_dev_awxbj,orm-orm,23 +lre22_dev_axejc,fra-ntf,17 +lre22_dev_axtso,eng-ens,16 +lre22_dev_axwoo,ara-aeb,15 +lre22_dev_axyma,ara-arq,15 +lre22_dev_aycai,ven-ven,17 +lre22_dev_ayfjz,orm-orm,20 +lre22_dev_aylrz,eng-iaf,16 +lre22_dev_aynwz,tso-tso,18 +lre22_dev_aypyt,ara-aeb,11 +lre22_dev_ayszn,zul-zul,18 +lre22_dev_ayvge,ara-aeb,11 +lre22_dev_ayvmo,afr-afr,23 +lre22_dev_ayzdz,xho-xho,20 +lre22_dev_azbmt,xho-xho,19 +lre22_dev_azjsr,tir-tir,19 +lre22_dev_azkdh,nbl-nbl,20 +lre22_dev_azwrd,fra-ntf,15 +lre22_dev_badwe,ara-aeb,13 +lre22_dev_baiaf,zul-zul,17 +lre22_dev_baiwb,ara-aeb,13 +lre22_dev_baxuo,zul-zul,18 +lre22_dev_bbbtf,eng-ens,18 +lre22_dev_bbdws,ara-ayl,12 +lre22_dev_bbitq,eng-ens,16 +lre22_dev_bbnvu,ara-arq,13 +lre22_dev_bbunq,eng-iaf,14 +lre22_dev_bcinm,ara-aeb,14 +lre22_dev_bcrhs,zul-zul,17 +lre22_dev_bcwpu,ara-aeb,13 +lre22_dev_bcxdq,fra-ntf,21 +lre22_dev_bdgbr,ara-aeb,12 +lre22_dev_bdgrw,orm-orm,17 +lre22_dev_bdiml,ara-aeb,11 +lre22_dev_bdyue,xho-xho,21 +lre22_dev_bdzsj,tir-tir,13 +lre22_dev_beanp,tso-tso,12 +lre22_dev_beigo,ara-aeb,14 +lre22_dev_belhi,orm-orm,23 +lre22_dev_bfoej,ven-ven,20 +lre22_dev_bfznf,ara-ayl,11 +lre22_dev_bgeiq,ven-ven,15 +lre22_dev_bgeyp,ara-aeb,11 +lre22_dev_bgomt,afr-afr,14 +lre22_dev_bgrfd,nbl-nbl,19 +lre22_dev_bgwlu,tir-tir,17 +lre22_dev_bifkp,nbl-nbl,18 +lre22_dev_bipvh,nbl-nbl,17 +lre22_dev_biuyu,eng-ens,12 +lre22_dev_bixnf,ara-ayl,11 +lre22_dev_bjhdf,tso-tso,17 +lre22_dev_bjsmm,ara-ayl,10 +lre22_dev_bkhqg,eng-ens,17 +lre22_dev_bkpah,ven-ven,14 +lre22_dev_blaco,afr-afr,17 +lre22_dev_bleum,xho-xho,18 +lre22_dev_bnhvt,nbl-nbl,16 +lre22_dev_bowyn,ara-arq,14 +lre22_dev_bpeqb,xho-xho,21 +lre22_dev_bpgqs,tir-tir,13 +lre22_dev_bpzpv,afr-afr,16 +lre22_dev_bqenu,eng-ens,12 +lre22_dev_bqfxw,zul-zul,14 +lre22_dev_bqowg,tir-tir,19 +lre22_dev_bqxyq,tir-tir,19 +lre22_dev_brjud,xho-xho,21 +lre22_dev_bruwl,xho-xho,16 +lre22_dev_brzld,fra-ntf,20 +lre22_dev_bsgqz,eng-ens,13 +lre22_dev_bsocl,eng-ens,12 +lre22_dev_bszou,ara-arq,13 +lre22_dev_btapz,zul-zul,15 +lre22_dev_btjlk,ara-aeb,14 +lre22_dev_btkry,xho-xho,19 +lre22_dev_btyeu,ara-ayl,15 +lre22_dev_bvnuu,fra-ntf,19 +lre22_dev_bvqag,eng-iaf,20 +lre22_dev_bvvho,eng-ens,16 +lre22_dev_bvwaj,tir-tir,14 +lre22_dev_bvymi,eng-ens,15 +lre22_dev_bwgmj,eng-iaf,20 +lre22_dev_bwqpz,ara-arq,14 +lre22_dev_bwyrh,ara-aeb,12 +lre22_dev_bxkrj,ven-ven,18 +lre22_dev_bxkti,afr-afr,20 +lre22_dev_bxzms,nbl-nbl,17 +lre22_dev_bygrw,tso-tso,18 +lre22_dev_byjqr,ven-ven,18 +lre22_dev_bylkl,eng-iaf,16 +lre22_dev_bzmkn,fra-ntf,22 +lre22_dev_bzntz,ara-arq,13 +lre22_dev_bzwkf,eng-iaf,19 +lre22_dev_caijh,ven-ven,18 +lre22_dev_canou,tir-tir,19 +lre22_dev_caqxh,afr-afr,20 +lre22_dev_cayuc,eng-ens,12 +lre22_dev_cbruy,xho-xho,23 +lre22_dev_cbyyw,ara-arq,14 +lre22_dev_cbzbe,afr-afr,22 +lre22_dev_cclfh,ara-arq,15 +lre22_dev_ccovd,ara-arq,11 +lre22_dev_ccpns,eng-ens,17 +lre22_dev_ccsjt,eng-iaf,16 +lre22_dev_ccsql,fra-ntf,21 +lre22_dev_ccugm,eng-ens,18 +lre22_dev_ccyfn,afr-afr,23 +lre22_dev_cdmgw,tir-tir,16 +lre22_dev_cdshg,eng-iaf,17 +lre22_dev_ceccy,orm-orm,20 +lre22_dev_cecwt,fra-ntf,22 +lre22_dev_cegvk,ara-arq,11 +lre22_dev_cferi,zul-zul,15 +lre22_dev_cfojx,ara-arq,11 +lre22_dev_cfzoe,tir-tir,20 +lre22_dev_cgfna,zul-zul,18 +lre22_dev_cggzh,ara-ayl,13 +lre22_dev_cgims,tir-tir,20 
+lre22_dev_cgixe,tir-tir,19 +lre22_dev_cgjov,zul-zul,14 +lre22_dev_chhio,ara-aeb,14 +lre22_dev_chnvd,tir-tir,13 +lre22_dev_chpww,nbl-nbl,21 +lre22_dev_churq,ara-ayl,13 +lre22_dev_cifqp,zul-zul,17 +lre22_dev_cijnx,xho-xho,22 +lre22_dev_ciozp,nbl-nbl,16 +lre22_dev_citpi,ara-aeb,12 +lre22_dev_cjrav,tir-tir,15 +lre22_dev_cksrw,ara-aeb,14 +lre22_dev_cktce,tir-tir,17 +lre22_dev_ckzhf,nbl-nbl,20 +lre22_dev_cleyn,ara-aeb,11 +lre22_dev_clhmt,fra-ntf,19 +lre22_dev_clrjd,orm-orm,21 +lre22_dev_clssx,eng-iaf,14 +lre22_dev_cluxm,ara-ayl,13 +lre22_dev_clzwe,ara-aeb,14 +lre22_dev_cminq,ara-aeb,11 +lre22_dev_cmmap,afr-afr,23 +lre22_dev_cmssr,orm-orm,20 +lre22_dev_cmufu,tso-tso,16 +lre22_dev_cnapz,orm-orm,19 +lre22_dev_cndba,tso-tso,12 +lre22_dev_cnkjh,tso-tso,15 +lre22_dev_cnvfe,orm-orm,18 +lre22_dev_cobbz,ara-arq,12 +lre22_dev_coppu,nbl-nbl,21 +lre22_dev_coqoj,eng-ens,17 +lre22_dev_cotun,ven-ven,16 +lre22_dev_cowrt,xho-xho,19 +lre22_dev_cppma,afr-afr,20 +lre22_dev_cpqkz,ara-arq,14 +lre22_dev_cpraw,afr-afr,17 +lre22_dev_cpsrb,fra-ntf,20 +lre22_dev_cpuax,zul-zul,16 +lre22_dev_cpudb,nbl-nbl,16 +lre22_dev_cqqds,afr-afr,22 +lre22_dev_cquib,ven-ven,21 +lre22_dev_cqwxe,nbl-nbl,16 +lre22_dev_cqyad,eng-iaf,15 +lre22_dev_crkut,eng-ens,17 +lre22_dev_crozj,fra-ntf,17 +lre22_dev_crrro,orm-orm,16 +lre22_dev_csavn,ara-aeb,15 +lre22_dev_cschy,afr-afr,16 +lre22_dev_csegr,tso-tso,14 +lre22_dev_csgvq,fra-ntf,17 +lre22_dev_csltj,ara-aeb,14 +lre22_dev_csmtr,ara-ayl,14 +lre22_dev_csqxl,ven-ven,20 +lre22_dev_ctjqw,nbl-nbl,16 +lre22_dev_ctxxt,nbl-nbl,17 +lre22_dev_cuaoy,ara-aeb,13 +lre22_dev_cudpj,ara-arq,13 +lre22_dev_cuhdf,afr-afr,21 +lre22_dev_cuoju,ven-ven,21 +lre22_dev_cupti,nbl-nbl,21 +lre22_dev_cusej,ara-aeb,14 +lre22_dev_cvfle,tir-tir,14 +lre22_dev_cvnqu,eng-ens,14 +lre22_dev_cvvjc,zul-zul,18 +lre22_dev_cvwht,fra-ntf,18 +lre22_dev_cvwtu,fra-ntf,21 +lre22_dev_cwlvk,tso-tso,16 +lre22_dev_cwnky,xho-xho,17 +lre22_dev_cxdlr,afr-afr,14 +lre22_dev_cxfii,ara-arq,13 +lre22_dev_cxpzt,zul-zul,16 +lre22_dev_cxqri,fra-ntf,21 +lre22_dev_cyaug,xho-xho,22 +lre22_dev_czdbd,fra-ntf,15 +lre22_dev_czvoy,ven-ven,16 +lre22_dev_czzrm,afr-afr,17 +lre22_dev_dahzr,ven-ven,17 +lre22_dev_dapny,ven-ven,17 +lre22_dev_dapug,nbl-nbl,19 +lre22_dev_dcbnz,xho-xho,16 +lre22_dev_dciaf,nbl-nbl,22 +lre22_dev_dcljn,afr-afr,19 +lre22_dev_dcmrn,afr-afr,20 +lre22_dev_dcobq,xho-xho,16 +lre22_dev_dcohp,tir-tir,16 +lre22_dev_dcsep,tso-tso,12 +lre22_dev_dctlw,ara-arq,12 +lre22_dev_dctvv,ara-arq,12 +lre22_dev_dcyoy,eng-iaf,17 +lre22_dev_ddgeb,xho-xho,23 +lre22_dev_ddsab,eng-ens,18 +lre22_dev_ddtpk,eng-ens,18 +lre22_dev_debjr,xho-xho,16 +lre22_dev_defkv,eng-ens,15 +lre22_dev_dejub,ara-arq,11 +lre22_dev_delok,eng-ens,14 +lre22_dev_dezlg,nbl-nbl,17 +lre22_dev_dffbj,fra-ntf,21 +lre22_dev_dfkox,xho-xho,19 +lre22_dev_dfpcn,ara-ayl,13 +lre22_dev_dfqgl,afr-afr,18 +lre22_dev_dfras,eng-iaf,19 +lre22_dev_dftpm,eng-iaf,20 +lre22_dev_dfvta,tso-tso,17 +lre22_dev_dgarp,eng-ens,13 +lre22_dev_dgntq,zul-zul,17 +lre22_dev_dgssb,tir-tir,19 +lre22_dev_dgvtc,xho-xho,23 +lre22_dev_dhdvp,ara-ayl,10 +lre22_dev_dhmbl,fra-ntf,22 +lre22_dev_diiry,orm-orm,16 +lre22_dev_disrs,afr-afr,16 +lre22_dev_ditsk,xho-xho,21 +lre22_dev_djbbz,ara-arq,14 +lre22_dev_djevu,tso-tso,16 +lre22_dev_djlaf,tir-tir,20 +lre22_dev_djoim,zul-zul,15 +lre22_dev_djvvp,zul-zul,17 +lre22_dev_djwyo,ven-ven,18 +lre22_dev_dkbfm,ara-ayl,12 +lre22_dev_dkpcy,ara-aeb,12 +lre22_dev_dlxzj,orm-orm,19 +lre22_dev_dmnjo,ven-ven,14 +lre22_dev_dmtsm,zul-zul,16 +lre22_dev_dnaql,orm-orm,23 +lre22_dev_dnkpf,ara-aeb,15 
+lre22_dev_dnscr,tso-tso,12 +lre22_dev_dnygt,eng-ens,15 +lre22_dev_dobre,xho-xho,19 +lre22_dev_dohlp,xho-xho,23 +lre22_dev_doioo,orm-orm,19 +lre22_dev_donaq,ara-aeb,13 +lre22_dev_dooht,ara-arq,11 +lre22_dev_dpmbt,zul-zul,14 +lre22_dev_dptyy,xho-xho,17 +lre22_dev_dqmud,eng-iaf,15 +lre22_dev_dqmxb,xho-xho,20 +lre22_dev_dqopt,eng-ens,14 +lre22_dev_dqpgr,ara-aeb,14 +lre22_dev_drkux,eng-ens,14 +lre22_dev_dsfha,ven-ven,18 +lre22_dev_dsftc,tso-tso,16 +lre22_dev_dskaq,ven-ven,15 +lre22_dev_dtdmp,zul-zul,18 +lre22_dev_dtdux,afr-afr,14 +lre22_dev_dtyki,ara-arq,11 +lre22_dev_durlr,orm-orm,18 +lre22_dev_dutdz,tso-tso,12 +lre22_dev_dvbol,ara-ayl,15 +lre22_dev_dwesk,nbl-nbl,22 +lre22_dev_dwtjw,ven-ven,14 +lre22_dev_dxckb,tso-tso,12 +lre22_dev_dxizq,eng-iaf,14 +lre22_dev_dxtnq,fra-ntf,18 +lre22_dev_dxvib,zul-zul,14 +lre22_dev_dyago,eng-iaf,16 +lre22_dev_dyipl,eng-iaf,18 +lre22_dev_dyqlo,ara-arq,13 +lre22_dev_dyvml,eng-iaf,15 +lre22_dev_dzkui,tso-tso,12 +lre22_dev_dzqta,ven-ven,20 +lre22_dev_dzxio,eng-ens,18 +lre22_dev_eachn,tir-tir,16 +lre22_dev_eapvu,eng-iaf,20 +lre22_dev_ebfdv,ara-ayl,10 +lre22_dev_ebgbd,eng-ens,17 +lre22_dev_eblhy,eng-iaf,20 +lre22_dev_ebtrq,ara-aeb,13 +lre22_dev_ebymv,tir-tir,14 +lre22_dev_ebzhg,nbl-nbl,21 +lre22_dev_ecbwo,ven-ven,21 +lre22_dev_ecllm,fra-ntf,21 +lre22_dev_eclpf,ven-ven,16 +lre22_dev_ecmhd,ara-aeb,14 +lre22_dev_ecnqi,eng-ens,14 +lre22_dev_ecpdc,ara-ayl,10 +lre22_dev_ecslx,afr-afr,22 +lre22_dev_ecuyo,xho-xho,23 +lre22_dev_edgur,tso-tso,16 +lre22_dev_edjtb,nbl-nbl,22 +lre22_dev_edsls,tso-tso,16 +lre22_dev_edssc,orm-orm,23 +lre22_dev_edvab,zul-zul,19 +lre22_dev_eehzu,zul-zul,18 +lre22_dev_eekci,afr-afr,15 +lre22_dev_eekcw,zul-zul,17 +lre22_dev_efihg,nbl-nbl,16 +lre22_dev_efsxw,tso-tso,16 +lre22_dev_efxjv,ara-aeb,14 +lre22_dev_efymf,ara-aeb,14 +lre22_dev_ehcvr,tir-tir,19 +lre22_dev_ehehw,xho-xho,20 +lre22_dev_ehewh,eng-ens,18 +lre22_dev_ehvyp,zul-zul,14 +lre22_dev_eifqv,zul-zul,19 +lre22_dev_eifxu,ara-ayl,10 +lre22_dev_ejcvy,fra-ntf,18 +lre22_dev_ejeek,eng-ens,16 +lre22_dev_ejfyn,fra-ntf,22 +lre22_dev_ejjqg,tso-tso,12 +lre22_dev_ejtox,ven-ven,19 +lre22_dev_ejwch,fra-ntf,21 +lre22_dev_ejzhx,xho-xho,17 +lre22_dev_ekbkm,afr-afr,21 +lre22_dev_ekzhk,ara-ayl,10 +lre22_dev_elanj,tso-tso,18 +lre22_dev_elvvn,tir-tir,16 +lre22_dev_emadg,xho-xho,22 +lre22_dev_emkzr,afr-afr,21 +lre22_dev_emmck,ara-arq,15 +lre22_dev_enwfu,afr-afr,15 +lre22_dev_eodro,ara-arq,15 +lre22_dev_eoisu,ven-ven,18 +lre22_dev_eomzr,xho-xho,23 +lre22_dev_eorva,xho-xho,21 +lre22_dev_epbwh,nbl-nbl,17 +lre22_dev_epeou,xho-xho,20 +lre22_dev_epifq,nbl-nbl,22 +lre22_dev_epqqo,ara-ayl,14 +lre22_dev_epsld,tso-tso,12 +lre22_dev_epsza,ara-ayl,12 +lre22_dev_eqmgm,ara-aeb,12 +lre22_dev_eqrhr,afr-afr,22 +lre22_dev_eqvan,ara-ayl,13 +lre22_dev_ersgd,orm-orm,22 +lre22_dev_erxig,zul-zul,15 +lre22_dev_esbrw,fra-ntf,19 +lre22_dev_esuug,nbl-nbl,20 +lre22_dev_etczk,tir-tir,14 +lre22_dev_etelz,fra-ntf,21 +lre22_dev_ettsh,fra-ntf,20 +lre22_dev_etuwp,ven-ven,19 +lre22_dev_eubgy,fra-ntf,18 +lre22_dev_euewj,orm-orm,18 +lre22_dev_euzyb,ara-aeb,13 +lre22_dev_ewatn,zul-zul,18 +lre22_dev_ewehs,orm-orm,17 +lre22_dev_ewexz,fra-ntf,18 +lre22_dev_ewgop,tir-tir,20 +lre22_dev_ewmgd,fra-ntf,21 +lre22_dev_ewzma,orm-orm,18 +lre22_dev_expvn,xho-xho,17 +lre22_dev_eyoqu,tir-tir,16 +lre22_dev_eyylz,nbl-nbl,16 +lre22_dev_eyzqu,tir-tir,18 +lre22_dev_ezdty,afr-afr,18 +lre22_dev_ezgcl,ara-aeb,13 +lre22_dev_eznzd,zul-zul,19 +lre22_dev_ezzwj,eng-iaf,18 +lre22_dev_facyr,zul-zul,18 +lre22_dev_faejb,tso-tso,16 +lre22_dev_famjw,orm-orm,18 
+lre22_dev_favzh,ara-arq,11 +lre22_dev_fbsre,orm-orm,23 +lre22_dev_fbtkl,fra-ntf,22 +lre22_dev_fbvxh,ara-ayl,14 +lre22_dev_fbyhp,nbl-nbl,20 +lre22_dev_fbysf,nbl-nbl,17 +lre22_dev_fcckx,ara-arq,12 +lre22_dev_fczba,eng-iaf,17 +lre22_dev_fdouw,eng-ens,14 +lre22_dev_fdtmf,tso-tso,13 +lre22_dev_fdtnc,fra-ntf,20 +lre22_dev_fdwme,afr-afr,19 +lre22_dev_fdyhr,eng-ens,18 +lre22_dev_feanh,fra-ntf,22 +lre22_dev_femmc,ara-arq,12 +lre22_dev_fevab,orm-orm,19 +lre22_dev_fexsi,orm-orm,17 +lre22_dev_fflai,ara-aeb,14 +lre22_dev_fgblw,tso-tso,14 +lre22_dev_fglhf,nbl-nbl,22 +lre22_dev_fhucm,ara-ayl,14 +lre22_dev_fhzwp,nbl-nbl,17 +lre22_dev_fifon,eng-iaf,14 +lre22_dev_fipff,orm-orm,19 +lre22_dev_fipyx,zul-zul,14 +lre22_dev_firtn,zul-zul,18 +lre22_dev_fjdqb,nbl-nbl,16 +lre22_dev_fjdxl,tir-tir,14 +lre22_dev_fjocp,ara-ayl,12 +lre22_dev_fjudb,ara-aeb,15 +lre22_dev_fkbjz,afr-afr,22 +lre22_dev_fkwaq,afr-afr,19 +lre22_dev_flbgp,afr-afr,16 +lre22_dev_flgxs,tir-tir,13 +lre22_dev_fljfm,tir-tir,19 +lre22_dev_fmauu,tso-tso,18 +lre22_dev_fmbvf,fra-ntf,19 +lre22_dev_fmhfa,ara-arq,12 +lre22_dev_fmije,ara-ayl,13 +lre22_dev_fnafq,tir-tir,20 +lre22_dev_fofmo,eng-ens,15 +lre22_dev_foikm,tir-tir,16 +lre22_dev_fosfi,eng-iaf,19 +lre22_dev_fotti,eng-ens,13 +lre22_dev_fozzx,zul-zul,15 +lre22_dev_fpehr,ara-aeb,12 +lre22_dev_fpiig,orm-orm,21 +lre22_dev_fqfag,ara-ayl,16 +lre22_dev_fqogo,tir-tir,13 +lre22_dev_frdqe,ara-arq,11 +lre22_dev_fremq,afr-afr,22 +lre22_dev_frjdx,zul-zul,18 +lre22_dev_fruha,ara-ayl,12 +lre22_dev_frxmu,eng-iaf,18 +lre22_dev_fsbeo,tso-tso,13 +lre22_dev_fsijy,fra-ntf,22 +lre22_dev_fsjwh,nbl-nbl,18 +lre22_dev_fspmb,tso-tso,19 +lre22_dev_ftbak,tir-tir,13 +lre22_dev_ftxuo,eng-iaf,20 +lre22_dev_fupee,ara-aeb,13 +lre22_dev_fupla,ara-aeb,11 +lre22_dev_fvmdq,fra-ntf,22 +lre22_dev_fvmjb,fra-ntf,20 +lre22_dev_fvubo,fra-ntf,22 +lre22_dev_fvwze,afr-afr,23 +lre22_dev_fvxxt,ara-arq,13 +lre22_dev_fwcye,ven-ven,21 +lre22_dev_fwkwv,orm-orm,18 +lre22_dev_fxezd,orm-orm,17 +lre22_dev_fxuir,nbl-nbl,19 +lre22_dev_fzgcm,zul-zul,14 +lre22_dev_fzncb,nbl-nbl,16 +lre22_dev_gaezu,ara-aeb,11 +lre22_dev_gawox,ara-aeb,13 +lre22_dev_gbcfq,zul-zul,14 +lre22_dev_gbdkv,orm-orm,17 +lre22_dev_gbevf,eng-iaf,20 +lre22_dev_gchke,ara-aeb,12 +lre22_dev_gcncr,ara-arq,13 +lre22_dev_gdeqd,ara-ayl,14 +lre22_dev_gdncj,eng-iaf,14 +lre22_dev_gdobt,ven-ven,21 +lre22_dev_geeoy,xho-xho,22 +lre22_dev_geraa,afr-afr,20 +lre22_dev_gfigd,nbl-nbl,16 +lre22_dev_gfjzm,ara-ayl,12 +lre22_dev_gftlv,tir-tir,20 +lre22_dev_ggaux,xho-xho,16 +lre22_dev_ggbgc,zul-zul,15 +lre22_dev_gghhn,zul-zul,18 +lre22_dev_ggrwj,eng-iaf,17 +lre22_dev_ghdur,eng-ens,15 +lre22_dev_ghgbo,ara-ayl,14 +lre22_dev_ghhop,nbl-nbl,20 +lre22_dev_ghnwg,ara-ayl,14 +lre22_dev_ghpmd,ara-ayl,14 +lre22_dev_ghqbh,orm-orm,19 +lre22_dev_gihvo,eng-ens,16 +lre22_dev_giueq,tso-tso,19 +lre22_dev_giuix,ara-aeb,15 +lre22_dev_gjaqj,eng-iaf,20 +lre22_dev_gjgcw,xho-xho,18 +lre22_dev_gjirh,eng-iaf,16 +lre22_dev_gjvwy,nbl-nbl,22 +lre22_dev_gkeql,eng-iaf,16 +lre22_dev_gkhas,tso-tso,16 +lre22_dev_glmyp,nbl-nbl,16 +lre22_dev_glqft,eng-ens,18 +lre22_dev_glsnb,afr-afr,17 +lre22_dev_gmfcb,eng-iaf,16 +lre22_dev_gmlwo,afr-afr,16 +lre22_dev_gmpjq,tso-tso,12 +lre22_dev_gmrvk,ara-aeb,14 +lre22_dev_gmryq,ara-ayl,13 +lre22_dev_gmsds,eng-ens,16 +lre22_dev_gmztl,xho-xho,16 +lre22_dev_gnbyu,eng-iaf,15 +lre22_dev_gntym,zul-zul,17 +lre22_dev_gocpa,tso-tso,15 +lre22_dev_gpyxs,orm-orm,17 +lre22_dev_grgvb,afr-afr,16 +lre22_dev_grspj,orm-orm,19 +lre22_dev_grvjm,xho-xho,19 +lre22_dev_gsidj,eng-ens,18 +lre22_dev_gslzy,afr-afr,22 
+lre22_dev_gtwcl,tir-tir,14 +lre22_dev_gulky,orm-orm,21 +lre22_dev_gvlhy,tir-tir,20 +lre22_dev_gvljx,tso-tso,15 +lre22_dev_gvmma,tso-tso,13 +lre22_dev_gvtvb,afr-afr,23 +lre22_dev_gweym,xho-xho,19 +lre22_dev_gwljh,ara-aeb,11 +lre22_dev_gwxtn,ara-ayl,14 +lre22_dev_gxdpw,fra-ntf,16 +lre22_dev_gxext,afr-afr,15 +lre22_dev_gxkqq,nbl-nbl,19 +lre22_dev_gxkxo,xho-xho,21 +lre22_dev_gxnkr,xho-xho,18 +lre22_dev_gxxbk,fra-ntf,21 +lre22_dev_gydvv,afr-afr,20 +lre22_dev_gytkt,ara-arq,12 +lre22_dev_gzmvp,afr-afr,18 +lre22_dev_gzoou,ven-ven,19 +lre22_dev_gzvza,tir-tir,15 +lre22_dev_gzwee,eng-iaf,17 +lre22_dev_haewp,tir-tir,19 +lre22_dev_haokb,fra-ntf,19 +lre22_dev_hazis,nbl-nbl,20 +lre22_dev_hbbbc,eng-ens,16 +lre22_dev_hblqa,nbl-nbl,17 +lre22_dev_hbmfy,zul-zul,15 +lre22_dev_hbndl,zul-zul,17 +lre22_dev_hcgfc,eng-ens,13 +lre22_dev_hcjnx,orm-orm,17 +lre22_dev_hcont,tir-tir,17 +lre22_dev_hcvik,tso-tso,13 +lre22_dev_hczom,zul-zul,19 +lre22_dev_hdaca,xho-xho,19 +lre22_dev_hdijt,fra-ntf,15 +lre22_dev_hdkyr,afr-afr,18 +lre22_dev_hdnoq,orm-orm,23 +lre22_dev_hdtlb,eng-iaf,16 +lre22_dev_hever,nbl-nbl,18 +lre22_dev_hfirj,nbl-nbl,17 +lre22_dev_hgbxp,xho-xho,21 +lre22_dev_hgcax,xho-xho,19 +lre22_dev_hgkwa,tso-tso,13 +lre22_dev_hgljd,ara-arq,15 +lre22_dev_hgvrh,nbl-nbl,21 +lre22_dev_hhovn,eng-iaf,16 +lre22_dev_hhpzm,fra-ntf,22 +lre22_dev_hhuab,ven-ven,20 +lre22_dev_hicev,ven-ven,18 +lre22_dev_hickz,ara-arq,12 +lre22_dev_hilii,orm-orm,23 +lre22_dev_hjenx,eng-iaf,19 +lre22_dev_hjiui,orm-orm,18 +lre22_dev_hkfts,eng-ens,18 +lre22_dev_hkhvl,zul-zul,19 +lre22_dev_hkobh,xho-xho,17 +lre22_dev_hkvay,ara-arq,13 +lre22_dev_hkvtj,orm-orm,21 +lre22_dev_hlevc,fra-ntf,17 +lre22_dev_hliut,ara-aeb,14 +lre22_dev_hlntc,zul-zul,18 +lre22_dev_hlprm,zul-zul,18 +lre22_dev_hmeav,ven-ven,17 +lre22_dev_hnelt,tir-tir,15 +lre22_dev_hniiy,ara-arq,15 +lre22_dev_hoepv,ara-aeb,13 +lre22_dev_hofkm,orm-orm,19 +lre22_dev_hoilz,tir-tir,19 +lre22_dev_hookr,ara-aeb,13 +lre22_dev_hpbhl,tir-tir,16 +lre22_dev_hpbzf,ara-aeb,11 +lre22_dev_hpizl,eng-ens,15 +lre22_dev_hplhi,ara-ayl,13 +lre22_dev_hplrq,xho-xho,20 +lre22_dev_hqdva,ven-ven,21 +lre22_dev_hqnus,xho-xho,16 +lre22_dev_hqoiz,orm-orm,18 +lre22_dev_hrerz,eng-ens,14 +lre22_dev_hrgjq,tir-tir,19 +lre22_dev_hrrhr,zul-zul,17 +lre22_dev_hsfbi,ara-ayl,14 +lre22_dev_hsjlg,tir-tir,17 +lre22_dev_hskug,afr-afr,16 +lre22_dev_hszzt,tso-tso,19 +lre22_dev_htgrl,tso-tso,18 +lre22_dev_htxah,zul-zul,17 +lre22_dev_htxrs,xho-xho,23 +lre22_dev_hudwz,nbl-nbl,17 +lre22_dev_huuqj,fra-ntf,18 +lre22_dev_hvsds,afr-afr,21 +lre22_dev_hwbhz,orm-orm,23 +lre22_dev_hwbvs,tso-tso,13 +lre22_dev_hwdlb,tso-tso,19 +lre22_dev_hwyki,eng-iaf,16 +lre22_dev_hxcmj,eng-iaf,20 +lre22_dev_hxdly,ara-arq,11 +lre22_dev_hyeqm,xho-xho,19 +lre22_dev_hyofm,ara-arq,12 +lre22_dev_hyogg,ara-arq,13 +lre22_dev_hyouu,tso-tso,13 +lre22_dev_hzfpc,fra-ntf,16 +lre22_dev_hzkjt,ara-aeb,12 +lre22_dev_hzrgv,fra-ntf,20 +lre22_dev_hzuus,tir-tir,19 +lre22_dev_hzzbp,xho-xho,19 +lre22_dev_iautt,afr-afr,20 +lre22_dev_ibdnu,tir-tir,13 +lre22_dev_ibuww,ara-aeb,13 +lre22_dev_icbuo,ven-ven,21 +lre22_dev_icqmr,tso-tso,14 +lre22_dev_ictwj,tir-tir,14 +lre22_dev_ifumz,ven-ven,14 +lre22_dev_igcgi,tso-tso,19 +lre22_dev_igder,tir-tir,19 +lre22_dev_igexm,xho-xho,21 +lre22_dev_igfxi,fra-ntf,20 +lre22_dev_igoxr,afr-afr,15 +lre22_dev_igxyt,ven-ven,21 +lre22_dev_ihqtn,ara-aeb,11 +lre22_dev_ihxfl,tir-tir,13 +lre22_dev_ihyrb,nbl-nbl,18 +lre22_dev_iifuu,tir-tir,15 +lre22_dev_iiien,xho-xho,20 +lre22_dev_ijccu,eng-iaf,16 +lre22_dev_ijrun,afr-afr,18 +lre22_dev_ijwlx,ara-arq,14 
+lre22_dev_ijydw,xho-xho,21 +lre22_dev_ikdjt,xho-xho,23 +lre22_dev_iklbv,ara-arq,13 +lre22_dev_ikyai,fra-ntf,18 +lre22_dev_ildmr,orm-orm,21 +lre22_dev_ilebo,orm-orm,19 +lre22_dev_ilptc,eng-ens,18 +lre22_dev_ilsku,fra-ntf,16 +lre22_dev_ilyti,ara-arq,11 +lre22_dev_imnqh,zul-zul,17 +lre22_dev_imxdr,eng-ens,16 +lre22_dev_indww,fra-ntf,19 +lre22_dev_iokar,eng-iaf,15 +lre22_dev_iomtu,eng-iaf,15 +lre22_dev_ioobz,tir-tir,14 +lre22_dev_iosom,zul-zul,17 +lre22_dev_iowyd,ara-arq,14 +lre22_dev_iphzy,nbl-nbl,18 +lre22_dev_ipmrc,nbl-nbl,16 +lre22_dev_ipomi,ara-aeb,12 +lre22_dev_ipour,afr-afr,15 +lre22_dev_ippjq,ara-ayl,16 +lre22_dev_ipvjc,ara-aeb,13 +lre22_dev_iqfdc,ven-ven,19 +lre22_dev_iqppw,tso-tso,15 +lre22_dev_iqtde,tso-tso,14 +lre22_dev_irlee,eng-iaf,14 +lre22_dev_irxuq,ara-aeb,14 +lre22_dev_isjzo,ara-arq,14 +lre22_dev_isnwz,ara-ayl,14 +lre22_dev_isqvk,afr-afr,15 +lre22_dev_isqww,orm-orm,19 +lre22_dev_istdz,tir-tir,18 +lre22_dev_iszhe,fra-ntf,20 +lre22_dev_itblz,ven-ven,18 +lre22_dev_itfez,ara-arq,13 +lre22_dev_itjqm,zul-zul,18 +lre22_dev_itnap,nbl-nbl,21 +lre22_dev_itrms,xho-xho,21 +lre22_dev_itroi,fra-ntf,17 +lre22_dev_ittds,zul-zul,16 +lre22_dev_iuknz,tso-tso,16 +lre22_dev_iumnm,ara-ayl,15 +lre22_dev_iunul,afr-afr,23 +lre22_dev_iverq,ven-ven,16 +lre22_dev_ivwzd,ara-ayl,14 +lre22_dev_ivzjf,tso-tso,12 +lre22_dev_iwbta,nbl-nbl,16 +lre22_dev_iwdeh,orm-orm,21 +lre22_dev_iwgel,ara-aeb,11 +lre22_dev_ixbhj,ara-aeb,11 +lre22_dev_ixbnl,fra-ntf,16 +lre22_dev_ixcef,ven-ven,20 +lre22_dev_ixfdf,orm-orm,18 +lre22_dev_ixjey,orm-orm,19 +lre22_dev_ixlve,tir-tir,17 +lre22_dev_ixutu,ara-ayl,12 +lre22_dev_ixxoj,xho-xho,23 +lre22_dev_ixyko,afr-afr,22 +lre22_dev_iylls,eng-iaf,19 +lre22_dev_izegw,orm-orm,23 +lre22_dev_izglb,ara-ayl,13 +lre22_dev_iziar,ara-arq,13 +lre22_dev_jadvz,afr-afr,18 +lre22_dev_jajtw,ara-aeb,14 +lre22_dev_janvu,tso-tso,16 +lre22_dev_japrb,xho-xho,21 +lre22_dev_jarvz,ara-aeb,12 +lre22_dev_jazcn,tso-tso,13 +lre22_dev_jbfxj,tso-tso,12 +lre22_dev_jbnfg,fra-ntf,15 +lre22_dev_jbwgd,afr-afr,20 +lre22_dev_jceug,tso-tso,15 +lre22_dev_jcqtd,eng-ens,14 +lre22_dev_jcxry,ven-ven,20 +lre22_dev_jdbli,tir-tir,20 +lre22_dev_jegmb,orm-orm,18 +lre22_dev_jegqj,ara-ayl,12 +lre22_dev_jenns,xho-xho,22 +lre22_dev_jfarf,ven-ven,14 +lre22_dev_jfcve,zul-zul,17 +lre22_dev_jfgyq,xho-xho,23 +lre22_dev_jftnz,afr-afr,14 +lre22_dev_jftsj,afr-afr,22 +lre22_dev_jgnid,nbl-nbl,16 +lre22_dev_jgsju,eng-ens,13 +lre22_dev_jifal,orm-orm,19 +lre22_dev_jihsd,orm-orm,21 +lre22_dev_jihwf,ara-ayl,11 +lre22_dev_jiptp,eng-iaf,15 +lre22_dev_jizij,tir-tir,14 +lre22_dev_jjpzg,orm-orm,23 +lre22_dev_jkezw,fra-ntf,18 +lre22_dev_jkmux,fra-ntf,20 +lre22_dev_jkpnt,orm-orm,22 +lre22_dev_jlkfj,eng-ens,18 +lre22_dev_jlmtf,ven-ven,19 +lre22_dev_jlrfm,ara-arq,12 +lre22_dev_jmojg,orm-orm,19 +lre22_dev_jmrcv,ara-aeb,13 +lre22_dev_jmsxc,eng-iaf,16 +lre22_dev_jnjpw,tir-tir,14 +lre22_dev_jnzvu,ara-aeb,14 +lre22_dev_jocyh,xho-xho,19 +lre22_dev_joezr,tso-tso,16 +lre22_dev_jofqy,ara-arq,11 +lre22_dev_jpbyf,eng-ens,15 +lre22_dev_jppuy,ara-arq,13 +lre22_dev_jptts,ara-aeb,12 +lre22_dev_jqdyx,fra-ntf,22 +lre22_dev_jqjbq,zul-zul,17 +lre22_dev_jqpnb,ven-ven,21 +lre22_dev_jqqin,zul-zul,17 +lre22_dev_jqzkq,ara-ayl,13 +lre22_dev_jrroq,orm-orm,21 +lre22_dev_jruru,eng-ens,16 +lre22_dev_jskbr,ara-arq,11 +lre22_dev_jskdd,nbl-nbl,19 +lre22_dev_jslnc,eng-ens,12 +lre22_dev_jsmat,orm-orm,17 +lre22_dev_jsmdw,ara-aeb,11 +lre22_dev_jsvaz,afr-afr,19 +lre22_dev_jsxcy,afr-afr,21 +lre22_dev_jszgk,eng-iaf,19 +lre22_dev_jthui,ven-ven,20 +lre22_dev_jtpvz,ven-ven,17 
+lre22_dev_jtwdi,ven-ven,14 +lre22_dev_jtwfh,ven-ven,18 +lre22_dev_juwid,tir-tir,20 +lre22_dev_jvdww,fra-ntf,21 +lre22_dev_jweyx,tir-tir,19 +lre22_dev_jwuto,afr-afr,19 +lre22_dev_jwwgs,afr-afr,19 +lre22_dev_jxhxf,nbl-nbl,17 +lre22_dev_jxtxk,orm-orm,20 +lre22_dev_jxzvy,eng-ens,15 +lre22_dev_jyjlm,nbl-nbl,19 +lre22_dev_jynvf,ara-ayl,13 +lre22_dev_jyzmh,nbl-nbl,19 +lre22_dev_jzivf,eng-ens,14 +lre22_dev_jzpns,tso-tso,14 +lre22_dev_kadwu,fra-ntf,18 +lre22_dev_kbnbi,tir-tir,13 +lre22_dev_kbqbd,fra-ntf,16 +lre22_dev_kbscm,tso-tso,15 +lre22_dev_kbxko,ara-aeb,12 +lre22_dev_kcegv,tso-tso,15 +lre22_dev_kcibo,afr-afr,17 +lre22_dev_kcmky,ara-ayl,14 +lre22_dev_kctrd,nbl-nbl,22 +lre22_dev_kcvbf,fra-ntf,16 +lre22_dev_kdbqy,zul-zul,15 +lre22_dev_kdgpz,ara-arq,14 +lre22_dev_kdhgq,nbl-nbl,22 +lre22_dev_kdvtu,eng-iaf,16 +lre22_dev_kdyhm,tso-tso,12 +lre22_dev_keeyz,zul-zul,18 +lre22_dev_kejvy,ven-ven,18 +lre22_dev_kerpr,ven-ven,21 +lre22_dev_keweh,ara-aeb,13 +lre22_dev_keysx,orm-orm,23 +lre22_dev_kezyv,ara-ayl,13 +lre22_dev_kgbiq,ven-ven,18 +lre22_dev_kgovz,tso-tso,15 +lre22_dev_kgxka,eng-ens,16 +lre22_dev_khkcx,fra-ntf,20 +lre22_dev_khobl,orm-orm,19 +lre22_dev_khttn,afr-afr,17 +lre22_dev_khvss,tir-tir,15 +lre22_dev_kiezl,tso-tso,16 +lre22_dev_kihlw,eng-ens,14 +lre22_dev_kipuq,ara-arq,14 +lre22_dev_kiqcx,tir-tir,16 +lre22_dev_kjiks,xho-xho,19 +lre22_dev_kjmpa,zul-zul,18 +lre22_dev_kjocf,eng-iaf,16 +lre22_dev_kkbur,ven-ven,16 +lre22_dev_kksdi,xho-xho,22 +lre22_dev_kkytv,ara-aeb,11 +lre22_dev_kmkgx,nbl-nbl,17 +lre22_dev_kmpkm,zul-zul,19 +lre22_dev_kmyzy,ara-ayl,13 +lre22_dev_knfsj,afr-afr,15 +lre22_dev_knyuq,orm-orm,19 +lre22_dev_koacp,orm-orm,19 +lre22_dev_koket,eng-ens,18 +lre22_dev_kovdn,zul-zul,15 +lre22_dev_kowqf,ven-ven,19 +lre22_dev_kozfr,nbl-nbl,21 +lre22_dev_kpmyz,orm-orm,19 +lre22_dev_kqfdc,eng-ens,17 +lre22_dev_kqumw,fra-ntf,22 +lre22_dev_kqwdi,nbl-nbl,16 +lre22_dev_krczb,ven-ven,19 +lre22_dev_kremz,nbl-nbl,16 +lre22_dev_ksruw,ven-ven,18 +lre22_dev_kszdw,eng-iaf,20 +lre22_dev_ktgvi,ara-arq,11 +lre22_dev_ktjax,fra-ntf,20 +lre22_dev_ktlvc,orm-orm,19 +lre22_dev_kvqgp,afr-afr,21 +lre22_dev_kvyoz,afr-afr,20 +lre22_dev_kvzim,afr-afr,14 +lre22_dev_kvzwc,eng-iaf,14 +lre22_dev_kwcwa,ara-arq,14 +lre22_dev_kwomo,zul-zul,19 +lre22_dev_kwxau,xho-xho,18 +lre22_dev_kxawf,tir-tir,19 +lre22_dev_kxjhn,ara-aeb,11 +lre22_dev_kxklh,tir-tir,19 +lre22_dev_kxlgg,tir-tir,16 +lre22_dev_kyqbp,fra-ntf,21 +lre22_dev_kyzio,ven-ven,20 +lre22_dev_kzcgh,ara-ayl,13 +lre22_dev_kzeyf,ven-ven,18 +lre22_dev_kzfwf,fra-ntf,19 +lre22_dev_kzjuz,orm-orm,21 +lre22_dev_kzjwx,ara-ayl,11 +lre22_dev_lamjl,tso-tso,17 +lre22_dev_laowh,xho-xho,16 +lre22_dev_larex,ara-ayl,11 +lre22_dev_laycs,tso-tso,12 +lre22_dev_lbxfn,eng-iaf,20 +lre22_dev_lcrog,zul-zul,18 +lre22_dev_ldczz,xho-xho,17 +lre22_dev_ldkgv,ara-aeb,13 +lre22_dev_ldkst,fra-ntf,20 +lre22_dev_ldkwr,orm-orm,22 +lre22_dev_lenxf,ven-ven,14 +lre22_dev_lfbey,ara-ayl,12 +lre22_dev_lfmml,fra-ntf,18 +lre22_dev_lfmxu,ven-ven,18 +lre22_dev_lfqfj,afr-afr,17 +lre22_dev_lgetu,ara-aeb,14 +lre22_dev_lgleu,ara-ayl,11 +lre22_dev_lgoat,eng-iaf,16 +lre22_dev_lhgaj,tso-tso,15 +lre22_dev_lhqyw,nbl-nbl,17 +lre22_dev_lhrmr,eng-iaf,17 +lre22_dev_lhtsd,tir-tir,19 +lre22_dev_lhydp,fra-ntf,22 +lre22_dev_livbf,tir-tir,15 +lre22_dev_ljdrg,ara-arq,13 +lre22_dev_ljniw,tso-tso,16 +lre22_dev_ljpmq,tso-tso,12 +lre22_dev_lkjon,tso-tso,15 +lre22_dev_lkszp,nbl-nbl,19 +lre22_dev_llbim,ara-ayl,15 +lre22_dev_llkkt,fra-ntf,15 +lre22_dev_llvcc,orm-orm,22 +lre22_dev_lmbug,ara-arq,12 +lre22_dev_lmmmw,nbl-nbl,19 
+lre22_dev_lmsek,ven-ven,16 +lre22_dev_lmudp,ara-ayl,10 +lre22_dev_lmzmv,eng-iaf,19 +lre22_dev_lnlae,ara-arq,14 +lre22_dev_lnlvt,zul-zul,17 +lre22_dev_lnppu,ara-ayl,13 +lre22_dev_lnpyc,tso-tso,19 +lre22_dev_lolkv,xho-xho,19 +lre22_dev_lorcx,nbl-nbl,20 +lre22_dev_lparq,xho-xho,16 +lre22_dev_lqlft,ara-arq,11 +lre22_dev_lqlyq,ara-arq,12 +lre22_dev_lqoeu,tso-tso,14 +lre22_dev_lqueh,ara-ayl,11 +lre22_dev_lquzk,ara-arq,12 +lre22_dev_lqvav,zul-zul,18 +lre22_dev_lrgpy,eng-iaf,16 +lre22_dev_lrjbn,ven-ven,21 +lre22_dev_lrtad,ara-arq,14 +lre22_dev_lrtxd,ara-aeb,11 +lre22_dev_lrvkn,ven-ven,16 +lre22_dev_lrzwy,ara-ayl,13 +lre22_dev_lsefk,ara-arq,13 +lre22_dev_ltmmt,orm-orm,22 +lre22_dev_lutgh,ara-aeb,15 +lre22_dev_lvhmd,tso-tso,14 +lre22_dev_lvqim,ara-aeb,14 +lre22_dev_lvuuo,fra-ntf,17 +lre22_dev_lvzri,ven-ven,16 +lre22_dev_lweml,ara-arq,14 +lre22_dev_lwstj,eng-iaf,16 +lre22_dev_lwzdj,afr-afr,18 +lre22_dev_lxdsk,eng-ens,16 +lre22_dev_lxlcr,ara-aeb,13 +lre22_dev_lxshv,eng-iaf,20 +lre22_dev_lxxvv,eng-ens,16 +lre22_dev_lyfhc,ven-ven,18 +lre22_dev_lyikp,zul-zul,19 +lre22_dev_lyjix,tso-tso,14 +lre22_dev_lyxyh,eng-iaf,19 +lre22_dev_lyzxd,tir-tir,17 +lre22_dev_lzguf,orm-orm,21 +lre22_dev_lzpmk,tir-tir,16 +lre22_dev_lzugv,xho-xho,19 +lre22_dev_maeeb,tir-tir,15 +lre22_dev_maemn,zul-zul,16 +lre22_dev_manpw,orm-orm,19 +lre22_dev_mavli,ara-aeb,12 +lre22_dev_mbywd,orm-orm,19 +lre22_dev_mcath,nbl-nbl,22 +lre22_dev_mcjtw,xho-xho,16 +lre22_dev_mcndd,ven-ven,15 +lre22_dev_mcxqb,tir-tir,13 +lre22_dev_mdlia,fra-ntf,16 +lre22_dev_mdxsp,eng-ens,18 +lre22_dev_menex,eng-iaf,16 +lre22_dev_merfk,orm-orm,21 +lre22_dev_mfipk,zul-zul,16 +lre22_dev_mfuqh,ara-arq,14 +lre22_dev_mgcvo,xho-xho,19 +lre22_dev_mggbx,zul-zul,18 +lre22_dev_mgghl,tso-tso,12 +lre22_dev_mgwqd,ara-arq,14 +lre22_dev_mhswt,ara-ayl,15 +lre22_dev_mhwmt,tso-tso,16 +lre22_dev_miayn,ara-aeb,12 +lre22_dev_miley,tso-tso,16 +lre22_dev_mjfmb,nbl-nbl,21 +lre22_dev_mkbyx,tir-tir,19 +lre22_dev_mlbzi,xho-xho,23 +lre22_dev_mlduq,xho-xho,16 +lre22_dev_mljnp,ara-arq,14 +lre22_dev_mljpb,orm-orm,22 +lre22_dev_mlrsm,xho-xho,17 +lre22_dev_mlwzr,eng-ens,13 +lre22_dev_mlyeo,ven-ven,15 +lre22_dev_mmaed,ara-ayl,14 +lre22_dev_mmbns,eng-ens,12 +lre22_dev_mneyt,xho-xho,17 +lre22_dev_mnhsk,ven-ven,14 +lre22_dev_mnnvk,eng-ens,15 +lre22_dev_mnswo,tso-tso,16 +lre22_dev_mntdk,eng-ens,18 +lre22_dev_mogwl,orm-orm,22 +lre22_dev_mpbun,nbl-nbl,21 +lre22_dev_mpmuf,ara-aeb,14 +lre22_dev_mpoet,nbl-nbl,16 +lre22_dev_mptyi,afr-afr,18 +lre22_dev_mpzxy,orm-orm,18 +lre22_dev_mqxni,ara-arq,11 +lre22_dev_mqzga,tso-tso,19 +lre22_dev_mrgdh,xho-xho,17 +lre22_dev_mrgko,afr-afr,18 +lre22_dev_mrksc,tir-tir,19 +lre22_dev_mrogp,eng-iaf,15 +lre22_dev_mscwd,fra-ntf,16 +lre22_dev_mshco,ara-ayl,12 +lre22_dev_msptn,ara-ayl,16 +lre22_dev_msslk,ara-aeb,14 +lre22_dev_mtaus,fra-ntf,19 +lre22_dev_mtpgl,tso-tso,13 +lre22_dev_mttly,tir-tir,19 +lre22_dev_mubqn,fra-ntf,15 +lre22_dev_muskv,tso-tso,12 +lre22_dev_muzkp,ara-arq,14 +lre22_dev_mvdus,ven-ven,19 +lre22_dev_mvngl,xho-xho,19 +lre22_dev_mvrpq,tso-tso,12 +lre22_dev_mvtcj,afr-afr,22 +lre22_dev_mwhsu,xho-xho,21 +lre22_dev_mwkyp,nbl-nbl,20 +lre22_dev_mxcey,ara-ayl,12 +lre22_dev_mxcub,ara-aeb,12 +lre22_dev_myekh,ara-aeb,11 +lre22_dev_mzxhf,zul-zul,17 +lre22_dev_mzyru,ara-arq,12 +lre22_dev_nakax,eng-iaf,15 +lre22_dev_naymc,ara-ayl,13 +lre22_dev_nbgid,orm-orm,19 +lre22_dev_nbmnl,xho-xho,16 +lre22_dev_ncffi,zul-zul,14 +lre22_dev_ncjtj,fra-ntf,22 +lre22_dev_ncpix,ara-ayl,11 +lre22_dev_nctqc,xho-xho,16 +lre22_dev_ndkuo,orm-orm,20 +lre22_dev_ndqfw,nbl-nbl,17 
+lre22_dev_nedes,ven-ven,15 +lre22_dev_neomw,zul-zul,18 +lre22_dev_neziz,tir-tir,19 +lre22_dev_nfcvg,eng-iaf,17 +lre22_dev_nfdfc,afr-afr,17 +lre22_dev_ngijv,xho-xho,21 +lre22_dev_ngrxk,ara-ayl,13 +lre22_dev_ngzja,ara-aeb,13 +lre22_dev_nhaub,tso-tso,13 +lre22_dev_nhkro,xho-xho,23 +lre22_dev_nhlvt,ara-arq,14 +lre22_dev_nhlxm,eng-ens,14 +lre22_dev_nhyjy,afr-afr,17 +lre22_dev_nifei,zul-zul,19 +lre22_dev_nikpx,ven-ven,18 +lre22_dev_njceq,afr-afr,18 +lre22_dev_njmlt,eng-ens,17 +lre22_dev_njqfj,orm-orm,18 +lre22_dev_nkdje,eng-iaf,19 +lre22_dev_nkkqo,nbl-nbl,22 +lre22_dev_nknrw,orm-orm,21 +lre22_dev_nkogd,fra-ntf,19 +lre22_dev_nksfc,tir-tir,19 +lre22_dev_nkwmm,orm-orm,22 +lre22_dev_nmhdg,ara-ayl,10 +lre22_dev_nmoux,ven-ven,20 +lre22_dev_nmrsq,ven-ven,21 +lre22_dev_nnbhc,fra-ntf,20 +lre22_dev_nnbpy,tir-tir,18 +lre22_dev_nnpwd,ara-aeb,13 +lre22_dev_nodin,ara-ayl,14 +lre22_dev_nogji,nbl-nbl,20 +lre22_dev_nonvr,afr-afr,15 +lre22_dev_notcl,eng-iaf,19 +lre22_dev_noufn,ara-aeb,11 +lre22_dev_noveb,ara-ayl,11 +lre22_dev_npajm,nbl-nbl,19 +lre22_dev_npehj,ara-ayl,14 +lre22_dev_nqdaj,tso-tso,12 +lre22_dev_nqkon,xho-xho,18 +lre22_dev_nqlhw,ara-aeb,13 +lre22_dev_nraqr,eng-ens,14 +lre22_dev_nrino,tso-tso,14 +lre22_dev_nrzgt,xho-xho,16 +lre22_dev_nscrg,orm-orm,18 +lre22_dev_nstgp,orm-orm,23 +lre22_dev_ntgqz,afr-afr,23 +lre22_dev_nthzr,eng-iaf,18 +lre22_dev_ntwzb,afr-afr,16 +lre22_dev_nudwv,eng-ens,14 +lre22_dev_nuerz,eng-iaf,18 +lre22_dev_nujfy,xho-xho,21 +lre22_dev_nurlx,eng-ens,13 +lre22_dev_nvakd,zul-zul,17 +lre22_dev_nvgkj,eng-ens,17 +lre22_dev_nvhvv,fra-ntf,20 +lre22_dev_nwbnz,ara-arq,14 +lre22_dev_nwjed,nbl-nbl,19 +lre22_dev_nwrto,ara-aeb,11 +lre22_dev_nwunl,zul-zul,14 +lre22_dev_nwvyy,tir-tir,19 +lre22_dev_nxwlo,nbl-nbl,17 +lre22_dev_nxxzy,zul-zul,16 +lre22_dev_nxzpp,nbl-nbl,20 +lre22_dev_nyhwg,ara-arq,14 +lre22_dev_nykvr,eng-ens,17 +lre22_dev_nyvkc,tir-tir,15 +lre22_dev_nyyui,ara-arq,11 +lre22_dev_nzbfh,zul-zul,19 +lre22_dev_nzxsk,xho-xho,21 +lre22_dev_oasrh,ara-arq,11 +lre22_dev_oavaf,xho-xho,21 +lre22_dev_obfrf,orm-orm,20 +lre22_dev_obocn,ara-arq,14 +lre22_dev_obumo,eng-ens,15 +lre22_dev_ocbuj,eng-ens,12 +lre22_dev_ocbxu,nbl-nbl,21 +lre22_dev_ocdvw,ara-ayl,13 +lre22_dev_ocdzj,xho-xho,19 +lre22_dev_ocveq,fra-ntf,22 +lre22_dev_odest,ara-ayl,11 +lre22_dev_odjlq,ven-ven,18 +lre22_dev_odpoq,ara-ayl,12 +lre22_dev_odrcm,fra-ntf,21 +lre22_dev_oeavx,ara-arq,12 +lre22_dev_oefoy,ara-aeb,12 +lre22_dev_oefqy,ven-ven,16 +lre22_dev_oehxk,ara-ayl,12 +lre22_dev_oeqbo,ara-aeb,14 +lre22_dev_oeqjq,fra-ntf,20 +lre22_dev_ofdgy,ara-ayl,15 +lre22_dev_ofgkq,fra-ntf,21 +lre22_dev_ofpva,ara-arq,11 +lre22_dev_ofufy,eng-iaf,17 +lre22_dev_ogglz,ara-aeb,13 +lre22_dev_oggtr,nbl-nbl,19 +lre22_dev_ogpxk,ara-aeb,11 +lre22_dev_ogsay,tso-tso,19 +lre22_dev_ogtvj,zul-zul,19 +lre22_dev_ohqwz,ara-arq,13 +lre22_dev_ohuxo,afr-afr,20 +lre22_dev_ohweb,ven-ven,16 +lre22_dev_ohzpg,fra-ntf,21 +lre22_dev_oijcy,xho-xho,19 +lre22_dev_oijgv,tir-tir,16 +lre22_dev_oikqj,eng-iaf,17 +lre22_dev_oinvl,ven-ven,15 +lre22_dev_oiofr,fra-ntf,19 +lre22_dev_oipks,eng-ens,17 +lre22_dev_ojzos,ara-arq,14 +lre22_dev_okbnu,ara-ayl,10 +lre22_dev_okpcp,eng-iaf,18 +lre22_dev_okwpq,tso-tso,16 +lre22_dev_oleie,ara-arq,12 +lre22_dev_oljep,ven-ven,21 +lre22_dev_oljsa,fra-ntf,16 +lre22_dev_olkup,nbl-nbl,16 +lre22_dev_olqbh,ara-ayl,14 +lre22_dev_omjqo,ara-aeb,14 +lre22_dev_omwiy,ara-ayl,12 +lre22_dev_omxnk,ara-arq,13 +lre22_dev_onqke,eng-iaf,16 +lre22_dev_onzje,tir-tir,13 +lre22_dev_ooktw,afr-afr,18 +lre22_dev_oosff,ara-aeb,12 +lre22_dev_ootbi,xho-xho,21 
+lre22_dev_opciz,orm-orm,23 +lre22_dev_opgny,xho-xho,19 +lre22_dev_opifd,ara-arq,12 +lre22_dev_oporo,eng-iaf,19 +lre22_dev_opryj,nbl-nbl,16 +lre22_dev_opuzh,eng-ens,12 +lre22_dev_oqbaw,ven-ven,18 +lre22_dev_oqeuj,tir-tir,14 +lre22_dev_oqmhb,xho-xho,21 +lre22_dev_oqmrs,ara-arq,14 +lre22_dev_oqqwq,tso-tso,12 +lre22_dev_oquaq,xho-xho,17 +lre22_dev_oriap,fra-ntf,20 +lre22_dev_orsjj,tir-tir,20 +lre22_dev_orvna,fra-ntf,21 +lre22_dev_oskoe,orm-orm,20 +lre22_dev_otlyk,nbl-nbl,18 +lre22_dev_oujnj,nbl-nbl,17 +lre22_dev_oumka,ven-ven,14 +lre22_dev_ouqsx,ara-arq,13 +lre22_dev_outyl,zul-zul,16 +lre22_dev_owlwt,ara-ayl,14 +lre22_dev_owvfd,orm-orm,18 +lre22_dev_oxizc,tir-tir,15 +lre22_dev_oxpht,eng-ens,18 +lre22_dev_oxqlz,afr-afr,15 +lre22_dev_oydiw,nbl-nbl,16 +lre22_dev_oyfcl,fra-ntf,22 +lre22_dev_oyhba,eng-ens,18 +lre22_dev_oyiif,afr-afr,17 +lre22_dev_oyslg,afr-afr,21 +lre22_dev_ozfpi,tir-tir,15 +lre22_dev_ozlww,ven-ven,19 +lre22_dev_paxnc,eng-ens,17 +lre22_dev_pbbgx,eng-iaf,14 +lre22_dev_pcfmw,nbl-nbl,21 +lre22_dev_pclpc,fra-ntf,15 +lre22_dev_pcmmj,afr-afr,16 +lre22_dev_pcsqz,tso-tso,18 +lre22_dev_pdcfm,ara-ayl,10 +lre22_dev_pdtuf,eng-ens,18 +lre22_dev_pdzuj,zul-zul,17 +lre22_dev_pehfu,fra-ntf,15 +lre22_dev_pewpj,orm-orm,22 +lre22_dev_pexjz,orm-orm,17 +lre22_dev_pfioj,eng-iaf,15 +lre22_dev_pfkcf,eng-iaf,16 +lre22_dev_pfknl,ara-arq,14 +lre22_dev_pfucv,ara-ayl,12 +lre22_dev_pfyha,fra-ntf,21 +lre22_dev_pgavf,ara-ayl,13 +lre22_dev_phket,nbl-nbl,22 +lre22_dev_piabk,afr-afr,19 +lre22_dev_picvg,orm-orm,17 +lre22_dev_piina,eng-ens,14 +lre22_dev_pjahm,afr-afr,20 +lre22_dev_pjcso,nbl-nbl,17 +lre22_dev_pjggp,ven-ven,16 +lre22_dev_pjohw,xho-xho,19 +lre22_dev_pkpxo,ara-ayl,11 +lre22_dev_pktgk,nbl-nbl,22 +lre22_dev_plojq,eng-ens,12 +lre22_dev_pmayg,ven-ven,21 +lre22_dev_pmjyi,xho-xho,20 +lre22_dev_pmkcp,nbl-nbl,20 +lre22_dev_pnfhk,fra-ntf,18 +lre22_dev_pnust,nbl-nbl,20 +lre22_dev_pnwey,eng-iaf,15 +lre22_dev_pnwti,ara-aeb,13 +lre22_dev_pohmm,afr-afr,14 +lre22_dev_pojvr,nbl-nbl,22 +lre22_dev_poxsw,ara-aeb,13 +lre22_dev_ppjvq,tir-tir,16 +lre22_dev_ppkfc,fra-ntf,19 +lre22_dev_ppmnu,tso-tso,12 +lre22_dev_ppzno,tso-tso,12 +lre22_dev_pqksl,afr-afr,14 +lre22_dev_pqnvh,zul-zul,19 +lre22_dev_prcus,tso-tso,15 +lre22_dev_prhoh,tir-tir,19 +lre22_dev_prkth,ara-arq,12 +lre22_dev_prnhd,xho-xho,18 +lre22_dev_psjma,fra-ntf,18 +lre22_dev_psldq,tir-tir,19 +lre22_dev_psnvo,afr-afr,15 +lre22_dev_psnzj,zul-zul,19 +lre22_dev_pudqr,eng-ens,17 +lre22_dev_pufnl,orm-orm,19 +lre22_dev_pusxa,nbl-nbl,22 +lre22_dev_pvsqi,ara-arq,11 +lre22_dev_pvteg,fra-ntf,17 +lre22_dev_pvvay,tir-tir,14 +lre22_dev_pvxcv,ara-aeb,15 +lre22_dev_pvygc,ara-aeb,11 +lre22_dev_pwcxu,tir-tir,13 +lre22_dev_pwhdm,nbl-nbl,17 +lre22_dev_pwnkz,ven-ven,20 +lre22_dev_pwrqe,ara-aeb,14 +lre22_dev_pxbhi,afr-afr,16 +lre22_dev_pxeyk,zul-zul,18 +lre22_dev_pxkzd,ara-arq,14 +lre22_dev_pydgm,afr-afr,19 +lre22_dev_pyiju,ven-ven,20 +lre22_dev_pzhrc,tso-tso,13 +lre22_dev_pzkea,ven-ven,14 +lre22_dev_pzqka,ara-arq,11 +lre22_dev_pzuis,ara-arq,13 +lre22_dev_qabac,ven-ven,19 +lre22_dev_qahym,ara-ayl,11 +lre22_dev_qaxfr,xho-xho,17 +lre22_dev_qazyc,ara-ayl,14 +lre22_dev_qbcoz,nbl-nbl,22 +lre22_dev_qcavr,eng-iaf,20 +lre22_dev_qcbkh,fra-ntf,18 +lre22_dev_qcbtt,afr-afr,18 +lre22_dev_qclly,xho-xho,22 +lre22_dev_qcqdt,eng-iaf,18 +lre22_dev_qdqzp,zul-zul,17 +lre22_dev_qdwut,eng-ens,16 +lre22_dev_qehxr,afr-afr,22 +lre22_dev_qeqah,tir-tir,16 +lre22_dev_qeyjd,afr-afr,17 +lre22_dev_qfprv,ara-ayl,13 +lre22_dev_qfqhi,ara-ayl,15 +lre22_dev_qgoge,tso-tso,13 +lre22_dev_qgrlb,eng-iaf,16 
+lre22_dev_qgrsu,zul-zul,14 +lre22_dev_qheor,xho-xho,23 +lre22_dev_qhfdz,tso-tso,14 +lre22_dev_qhlol,ven-ven,21 +lre22_dev_qhnfr,zul-zul,15 +lre22_dev_qhvuq,tso-tso,14 +lre22_dev_qibby,afr-afr,23 +lre22_dev_qicen,orm-orm,16 +lre22_dev_qiehd,eng-iaf,14 +lre22_dev_qjbfh,eng-iaf,15 +lre22_dev_qjdln,afr-afr,19 +lre22_dev_qjmro,ara-ayl,11 +lre22_dev_qkgor,zul-zul,16 +lre22_dev_qlgvf,ara-aeb,12 +lre22_dev_qlpjn,eng-iaf,16 +lre22_dev_qmoop,nbl-nbl,16 +lre22_dev_qmqhy,afr-afr,20 +lre22_dev_qmreh,ara-ayl,10 +lre22_dev_qmucf,ven-ven,18 +lre22_dev_qmvnu,fra-ntf,15 +lre22_dev_qmzke,ara-ayl,13 +lre22_dev_qmzxw,orm-orm,21 +lre22_dev_qnams,ven-ven,20 +lre22_dev_qnefv,xho-xho,23 +lre22_dev_qodht,zul-zul,19 +lre22_dev_qoqtk,eng-ens,16 +lre22_dev_qotto,fra-ntf,18 +lre22_dev_qoudd,tso-tso,18 +lre22_dev_qpego,ara-ayl,14 +lre22_dev_qphcb,fra-ntf,22 +lre22_dev_qqkiv,ara-arq,13 +lre22_dev_qqmeu,eng-ens,17 +lre22_dev_qqudk,orm-orm,21 +lre22_dev_qqvdr,orm-orm,23 +lre22_dev_qrbmq,ara-arq,12 +lre22_dev_qrfvx,fra-ntf,22 +lre22_dev_qrsqg,zul-zul,19 +lre22_dev_qrylo,eng-ens,18 +lre22_dev_qsbdh,nbl-nbl,16 +lre22_dev_qsqzo,afr-afr,14 +lre22_dev_qsudg,nbl-nbl,22 +lre22_dev_qszwt,fra-ntf,21 +lre22_dev_qtcmx,nbl-nbl,21 +lre22_dev_qtfpf,zul-zul,16 +lre22_dev_qtkhk,afr-afr,22 +lre22_dev_qtydg,afr-afr,22 +lre22_dev_qujmp,zul-zul,19 +lre22_dev_qulse,eng-ens,17 +lre22_dev_qutbz,eng-ens,18 +lre22_dev_quvqg,ara-aeb,13 +lre22_dev_qvpjs,eng-iaf,19 +lre22_dev_qvtdy,tso-tso,12 +lre22_dev_qvzol,orm-orm,19 +lre22_dev_qwvgm,ara-ayl,13 +lre22_dev_qwzxt,zul-zul,19 +lre22_dev_qxigw,tir-tir,19 +lre22_dev_qxkuu,tso-tso,13 +lre22_dev_qxtss,afr-afr,15 +lre22_dev_qxvbe,nbl-nbl,17 +lre22_dev_qxysh,afr-afr,22 +lre22_dev_qyfba,zul-zul,14 +lre22_dev_qyfov,fra-ntf,19 +lre22_dev_qyjgj,afr-afr,22 +lre22_dev_qyuwy,ara-aeb,15 +lre22_dev_qzfdr,nbl-nbl,18 +lre22_dev_qzldb,eng-iaf,19 +lre22_dev_ranrd,nbl-nbl,22 +lre22_dev_raurj,eng-ens,12 +lre22_dev_rbntq,ara-arq,11 +lre22_dev_rbssw,ara-aeb,11 +lre22_dev_rbwgx,ara-ayl,16 +lre22_dev_rcooi,fra-ntf,18 +lre22_dev_rcyom,ara-ayl,11 +lre22_dev_rdcns,zul-zul,18 +lre22_dev_rdrhv,ara-arq,11 +lre22_dev_rdyxn,eng-iaf,19 +lre22_dev_repec,tir-tir,19 +lre22_dev_rgbby,tso-tso,19 +lre22_dev_rgdvt,fra-ntf,20 +lre22_dev_rguqm,tso-tso,14 +lre22_dev_rgwjy,afr-afr,19 +lre22_dev_rijeq,orm-orm,19 +lre22_dev_rincv,tir-tir,16 +lre22_dev_rindo,zul-zul,17 +lre22_dev_rirhy,ara-arq,11 +lre22_dev_rjikw,fra-ntf,20 +lre22_dev_rjsik,tso-tso,16 +lre22_dev_rjvvj,tso-tso,19 +lre22_dev_rksid,nbl-nbl,22 +lre22_dev_rkycg,ven-ven,21 +lre22_dev_rlamm,zul-zul,15 +lre22_dev_rllya,tso-tso,15 +lre22_dev_rlzrk,eng-ens,14 +lre22_dev_rmxbg,tir-tir,14 +lre22_dev_rnrsy,tir-tir,19 +lre22_dev_rokej,xho-xho,17 +lre22_dev_rooaf,fra-ntf,17 +lre22_dev_rorob,ven-ven,15 +lre22_dev_rowwe,nbl-nbl,17 +lre22_dev_rqcuw,ara-ayl,11 +lre22_dev_rqdte,ara-ayl,10 +lre22_dev_rqpau,tso-tso,15 +lre22_dev_rquba,ven-ven,19 +lre22_dev_rrbgv,afr-afr,20 +lre22_dev_rsvjn,fra-ntf,16 +lre22_dev_rsynm,tir-tir,19 +lre22_dev_rtezn,tir-tir,19 +lre22_dev_rtkum,orm-orm,21 +lre22_dev_rturg,zul-zul,17 +lre22_dev_runwu,tir-tir,16 +lre22_dev_rvbmf,tso-tso,12 +lre22_dev_rvfls,tso-tso,16 +lre22_dev_rvhxb,ara-aeb,11 +lre22_dev_rvufk,orm-orm,20 +lre22_dev_rvzbo,ara-ayl,14 +lre22_dev_rwhfu,xho-xho,16 +lre22_dev_rwhiz,ara-ayl,10 +lre22_dev_rwimz,ven-ven,16 +lre22_dev_rwish,eng-ens,16 +lre22_dev_rwpzp,xho-xho,19 +lre22_dev_rwqlq,tir-tir,19 +lre22_dev_rwsnw,afr-afr,15 +lre22_dev_rwzwb,tso-tso,19 +lre22_dev_rxcjq,ara-arq,13 +lre22_dev_rxcka,ara-arq,14 +lre22_dev_rxgxu,tir-tir,19 
+lre22_dev_rxqxn,nbl-nbl,20 +lre22_dev_rxwip,ara-ayl,10 +lre22_dev_rycca,ven-ven,14 +lre22_dev_rydpu,eng-ens,17 +lre22_dev_ryksb,ven-ven,14 +lre22_dev_rysmu,afr-afr,23 +lre22_dev_rzisy,ara-aeb,13 +lre22_dev_rzpus,ara-arq,15 +lre22_dev_rzqyn,ara-ayl,11 +lre22_dev_rzzca,orm-orm,21 +lre22_dev_sazdy,tso-tso,15 +lre22_dev_sbkip,afr-afr,14 +lre22_dev_sbyek,ara-arq,11 +lre22_dev_scjzn,xho-xho,21 +lre22_dev_scobo,ven-ven,17 +lre22_dev_scqui,orm-orm,16 +lre22_dev_sdccf,ara-arq,14 +lre22_dev_sdcty,tso-tso,19 +lre22_dev_sdebh,ara-ayl,12 +lre22_dev_sedif,orm-orm,21 +lre22_dev_sedug,xho-xho,18 +lre22_dev_seynu,tso-tso,13 +lre22_dev_seyxt,ara-aeb,13 +lre22_dev_sezun,ara-aeb,14 +lre22_dev_sfeyl,ara-aeb,12 +lre22_dev_sfnux,afr-afr,18 +lre22_dev_sfqnk,zul-zul,15 +lre22_dev_sftvb,ara-ayl,11 +lre22_dev_sfwkd,ven-ven,17 +lre22_dev_shgbp,fra-ntf,22 +lre22_dev_shikk,tir-tir,19 +lre22_dev_shpve,afr-afr,21 +lre22_dev_sidjm,ara-ayl,10 +lre22_dev_sihvc,orm-orm,17 +lre22_dev_siiaw,ven-ven,16 +lre22_dev_sinfr,xho-xho,19 +lre22_dev_sipnk,eng-iaf,16 +lre22_dev_sjbcr,tir-tir,19 +lre22_dev_sjdzp,eng-iaf,16 +lre22_dev_sjmsx,ven-ven,19 +lre22_dev_sjsnf,afr-afr,16 +lre22_dev_sjwmd,tir-tir,19 +lre22_dev_sjxce,nbl-nbl,16 +lre22_dev_sjzcc,eng-ens,13 +lre22_dev_sjzsv,fra-ntf,22 +lre22_dev_skegk,afr-afr,18 +lre22_dev_skpib,ven-ven,14 +lre22_dev_slgub,orm-orm,18 +lre22_dev_slryu,nbl-nbl,17 +lre22_dev_slupt,ara-ayl,13 +lre22_dev_smfbl,ara-aeb,14 +lre22_dev_smfon,xho-xho,20 +lre22_dev_smvms,afr-afr,18 +lre22_dev_snegl,xho-xho,18 +lre22_dev_snvvg,tso-tso,14 +lre22_dev_sobpf,orm-orm,19 +lre22_dev_soely,eng-iaf,14 +lre22_dev_sorzd,tir-tir,19 +lre22_dev_spixz,nbl-nbl,18 +lre22_dev_spjcl,fra-ntf,17 +lre22_dev_spzra,tso-tso,17 +lre22_dev_sqaei,xho-xho,23 +lre22_dev_sqime,ven-ven,14 +lre22_dev_srgaw,eng-iaf,15 +lre22_dev_srnhq,ven-ven,16 +lre22_dev_srsng,orm-orm,21 +lre22_dev_srysc,nbl-nbl,17 +lre22_dev_srzgk,eng-ens,16 +lre22_dev_srzsi,ara-aeb,14 +lre22_dev_ssjtt,nbl-nbl,16 +lre22_dev_stajf,xho-xho,21 +lre22_dev_sttfd,ara-aeb,15 +lre22_dev_suevr,ara-aeb,15 +lre22_dev_sumum,afr-afr,18 +lre22_dev_svukm,fra-ntf,20 +lre22_dev_swkzf,tir-tir,17 +lre22_dev_sxqmv,ara-aeb,11 +lre22_dev_sxvuf,ara-aeb,11 +lre22_dev_sydqt,eng-ens,18 +lre22_dev_syooe,eng-ens,14 +lre22_dev_szpip,tir-tir,17 +lre22_dev_szsgp,fra-ntf,19 +lre22_dev_szzuj,ara-ayl,11 +lre22_dev_tabof,orm-orm,19 +lre22_dev_tavcw,ven-ven,19 +lre22_dev_tbjal,xho-xho,22 +lre22_dev_tbxzb,fra-ntf,21 +lre22_dev_tdalr,nbl-nbl,18 +lre22_dev_tdfzf,eng-iaf,17 +lre22_dev_tdlyk,tir-tir,15 +lre22_dev_tefms,fra-ntf,15 +lre22_dev_telgo,xho-xho,19 +lre22_dev_teric,eng-ens,14 +lre22_dev_tfcgx,orm-orm,21 +lre22_dev_tgiid,xho-xho,19 +lre22_dev_tgoea,ara-ayl,13 +lre22_dev_tgrrk,eng-iaf,18 +lre22_dev_tgtyv,tso-tso,12 +lre22_dev_tgzex,tso-tso,12 +lre22_dev_thone,nbl-nbl,17 +lre22_dev_thpnk,afr-afr,18 +lre22_dev_thwls,ven-ven,17 +lre22_dev_tibov,tir-tir,14 +lre22_dev_tidld,tso-tso,16 +lre22_dev_tiezu,eng-ens,17 +lre22_dev_tioqa,nbl-nbl,16 +lre22_dev_tiuym,zul-zul,15 +lre22_dev_tjivp,afr-afr,22 +lre22_dev_tjltd,orm-orm,20 +lre22_dev_tkcqj,ara-aeb,12 +lre22_dev_tkpij,tir-tir,19 +lre22_dev_tkpwp,orm-orm,19 +lre22_dev_tkyuh,tso-tso,12 +lre22_dev_tlkrm,zul-zul,19 +lre22_dev_tlspo,zul-zul,18 +lre22_dev_tmdvx,zul-zul,17 +lre22_dev_tmynp,afr-afr,20 +lre22_dev_tntmu,xho-xho,22 +lre22_dev_tnwok,orm-orm,21 +lre22_dev_toccu,eng-iaf,16 +lre22_dev_tofur,tir-tir,14 +lre22_dev_tokhl,ven-ven,21 +lre22_dev_tonkq,zul-zul,15 +lre22_dev_topxu,zul-zul,14 +lre22_dev_touna,ara-arq,15 +lre22_dev_towvr,tso-tso,12 
+lre22_dev_tpasn,tir-tir,15 +lre22_dev_tpmen,ara-ayl,10 +lre22_dev_tpuws,tir-tir,19 +lre22_dev_tqbqi,xho-xho,17 +lre22_dev_tqtfo,tso-tso,17 +lre22_dev_traqh,fra-ntf,21 +lre22_dev_trdfp,ara-ayl,15 +lre22_dev_trdml,xho-xho,23 +lre22_dev_trmpg,nbl-nbl,19 +lre22_dev_tsdyg,tso-tso,19 +lre22_dev_tsvmo,ara-ayl,11 +lre22_dev_ttcul,afr-afr,19 +lre22_dev_ttrfr,ara-arq,12 +lre22_dev_tuhrp,ven-ven,14 +lre22_dev_twaba,afr-afr,15 +lre22_dev_twcnd,tir-tir,13 +lre22_dev_twtog,ven-ven,15 +lre22_dev_twvne,tir-tir,19 +lre22_dev_txcqg,orm-orm,19 +lre22_dev_txjsy,eng-ens,18 +lre22_dev_txmpu,afr-afr,19 +lre22_dev_txqde,eng-iaf,16 +lre22_dev_tyaup,eng-ens,17 +lre22_dev_tyaym,afr-afr,17 +lre22_dev_tybrl,nbl-nbl,16 +lre22_dev_tyduc,eng-ens,17 +lre22_dev_tyhsa,fra-ntf,21 +lre22_dev_tyigo,ara-ayl,11 +lre22_dev_tykte,zul-zul,18 +lre22_dev_tymil,tir-tir,16 +lre22_dev_tyofb,ven-ven,20 +lre22_dev_tysph,fra-ntf,16 +lre22_dev_tzamn,ara-aeb,11 +lre22_dev_tzrpp,ven-ven,15 +lre22_dev_tzukm,ara-aeb,12 +lre22_dev_uabum,xho-xho,19 +lre22_dev_uankd,nbl-nbl,18 +lre22_dev_uazyk,ara-ayl,14 +lre22_dev_ubdfa,eng-iaf,15 +lre22_dev_ubugi,orm-orm,22 +lre22_dev_ucetp,ven-ven,21 +lre22_dev_ucsxt,eng-ens,12 +lre22_dev_uczke,zul-zul,14 +lre22_dev_udldh,ara-arq,11 +lre22_dev_uejdk,orm-orm,17 +lre22_dev_uekog,zul-zul,17 +lre22_dev_uemql,xho-xho,16 +lre22_dev_ueovt,eng-ens,14 +lre22_dev_uesao,zul-zul,19 +lre22_dev_ueyxm,ara-ayl,13 +lre22_dev_ufafi,tir-tir,17 +lre22_dev_ufaig,tso-tso,12 +lre22_dev_uffpc,ara-arq,14 +lre22_dev_ufrmg,ven-ven,20 +lre22_dev_ugieb,ara-aeb,12 +lre22_dev_ugoiy,ara-ayl,10 +lre22_dev_ugzkq,ara-aeb,12 +lre22_dev_uhdrj,xho-xho,18 +lre22_dev_uhjdn,ara-ayl,16 +lre22_dev_uhkcq,ara-ayl,11 +lre22_dev_uhrjo,ara-aeb,13 +lre22_dev_uhrow,afr-afr,16 +lre22_dev_uikqm,ara-arq,12 +lre22_dev_uitct,eng-ens,13 +lre22_dev_uitqu,ara-ayl,12 +lre22_dev_ujiby,eng-ens,18 +lre22_dev_ujmtl,orm-orm,22 +lre22_dev_ukdpu,ven-ven,17 +lre22_dev_ukfpb,xho-xho,19 +lre22_dev_ukklw,fra-ntf,22 +lre22_dev_ukwjy,xho-xho,17 +lre22_dev_uljbx,fra-ntf,20 +lre22_dev_uljgh,tir-tir,13 +lre22_dev_uljvo,fra-ntf,21 +lre22_dev_undfd,orm-orm,20 +lre22_dev_unmiu,ara-arq,14 +lre22_dev_updar,nbl-nbl,17 +lre22_dev_uprkv,eng-iaf,16 +lre22_dev_urkok,ara-ayl,11 +lre22_dev_urolj,orm-orm,22 +lre22_dev_uscpv,eng-ens,14 +lre22_dev_ushtk,fra-ntf,20 +lre22_dev_usiey,ven-ven,19 +lre22_dev_usitw,ara-arq,14 +lre22_dev_utkxp,nbl-nbl,19 +lre22_dev_utnvo,tir-tir,16 +lre22_dev_utyjg,tso-tso,18 +lre22_dev_uuwaa,ara-arq,12 +lre22_dev_uuxla,eng-iaf,15 +lre22_dev_uuzuj,ara-arq,14 +lre22_dev_uvcxs,eng-ens,12 +lre22_dev_uveah,ven-ven,17 +lre22_dev_uvfqy,ara-arq,13 +lre22_dev_uvnhb,fra-ntf,20 +lre22_dev_uvqbm,afr-afr,19 +lre22_dev_uvsus,zul-zul,15 +lre22_dev_uvyev,fra-ntf,20 +lre22_dev_uwicd,tso-tso,12 +lre22_dev_uwnlz,zul-zul,18 +lre22_dev_uwwyj,afr-afr,20 +lre22_dev_uwyxc,eng-iaf,17 +lre22_dev_uxjzh,xho-xho,21 +lre22_dev_uxpyg,tso-tso,15 +lre22_dev_uxrxr,tso-tso,12 +lre22_dev_uyciz,eng-ens,14 +lre22_dev_uycza,xho-xho,17 +lre22_dev_uyvyb,eng-ens,17 +lre22_dev_uziar,zul-zul,15 +lre22_dev_uzlxd,fra-ntf,22 +lre22_dev_uznjr,tir-tir,13 +lre22_dev_vagda,ara-ayl,12 +lre22_dev_vanjm,ven-ven,18 +lre22_dev_vaqia,tir-tir,19 +lre22_dev_vasjz,ara-arq,11 +lre22_dev_vcexs,tir-tir,17 +lre22_dev_vchpm,fra-ntf,21 +lre22_dev_vctsa,nbl-nbl,19 +lre22_dev_vcxit,ven-ven,15 +lre22_dev_vcyqv,xho-xho,19 +lre22_dev_vdjlh,afr-afr,22 +lre22_dev_vdogx,ven-ven,15 +lre22_dev_veutb,eng-ens,16 +lre22_dev_vezrd,tso-tso,12 +lre22_dev_vfbfg,tso-tso,12 +lre22_dev_vffqd,orm-orm,21 +lre22_dev_vfhum,afr-afr,16 
+lre22_dev_vfjtw,ara-arq,11 +lre22_dev_vfnjb,eng-ens,15 +lre22_dev_vgbbh,ara-arq,13 +lre22_dev_vgcao,eng-iaf,20 +lre22_dev_vgpnk,xho-xho,19 +lre22_dev_vityk,zul-zul,18 +lre22_dev_vjeuy,tir-tir,19 +lre22_dev_vjltt,zul-zul,17 +lre22_dev_vjqrm,tir-tir,13 +lre22_dev_vjvbs,tso-tso,18 +lre22_dev_vlcbq,tso-tso,16 +lre22_dev_vlnlb,tso-tso,13 +lre22_dev_vlscu,ara-ayl,15 +lre22_dev_vlwhz,fra-ntf,22 +lre22_dev_vlyeh,tso-tso,16 +lre22_dev_vmnps,zul-zul,14 +lre22_dev_vmqxk,tso-tso,18 +lre22_dev_vmrez,ven-ven,18 +lre22_dev_vmsnh,ara-aeb,11 +lre22_dev_vmuti,ara-aeb,14 +lre22_dev_vncre,afr-afr,22 +lre22_dev_vnkqv,afr-afr,15 +lre22_dev_vnmlt,zul-zul,18 +lre22_dev_vpkra,ara-ayl,11 +lre22_dev_vpoit,ara-arq,14 +lre22_dev_vpruu,orm-orm,23 +lre22_dev_vptiv,tir-tir,18 +lre22_dev_vqhcn,tso-tso,16 +lre22_dev_vqura,tir-tir,16 +lre22_dev_vrqfs,xho-xho,23 +lre22_dev_vrvtr,zul-zul,15 +lre22_dev_vrxvj,fra-ntf,17 +lre22_dev_vsbay,eng-iaf,19 +lre22_dev_vsbvi,fra-ntf,19 +lre22_dev_vslkb,eng-ens,12 +lre22_dev_vsrdg,tso-tso,12 +lre22_dev_vsrnz,zul-zul,14 +lre22_dev_vsryb,nbl-nbl,19 +lre22_dev_vtlab,zul-zul,19 +lre22_dev_vtrff,eng-iaf,17 +lre22_dev_vtztf,ara-aeb,11 +lre22_dev_vucth,eng-ens,14 +lre22_dev_vucug,orm-orm,21 +lre22_dev_vufuu,eng-ens,18 +lre22_dev_vujbs,zul-zul,19 +lre22_dev_vuufm,afr-afr,19 +lre22_dev_vvgdf,eng-ens,18 +lre22_dev_vvlcx,ara-aeb,12 +lre22_dev_vvvho,tir-tir,18 +lre22_dev_vwait,eng-iaf,14 +lre22_dev_vwdcw,ara-arq,14 +lre22_dev_vwyzq,ara-arq,14 +lre22_dev_vwzon,eng-ens,12 +lre22_dev_vxhoc,ara-aeb,11 +lre22_dev_vxkgz,ven-ven,18 +lre22_dev_vxlgl,tir-tir,18 +lre22_dev_vxsqt,eng-ens,15 +lre22_dev_vyqsd,nbl-nbl,17 +lre22_dev_vzcai,zul-zul,19 +lre22_dev_vzgoj,eng-iaf,14 +lre22_dev_vzlon,zul-zul,16 +lre22_dev_vznrg,nbl-nbl,16 +lre22_dev_vzqme,xho-xho,19 +lre22_dev_wabqx,ven-ven,18 +lre22_dev_wafdh,fra-ntf,21 +lre22_dev_wagmt,eng-iaf,18 +lre22_dev_waocz,ven-ven,20 +lre22_dev_wavrh,zul-zul,16 +lre22_dev_wawqg,ara-ayl,13 +lre22_dev_waznj,nbl-nbl,22 +lre22_dev_wbepu,fra-ntf,19 +lre22_dev_wbygw,eng-ens,16 +lre22_dev_wccgz,tso-tso,17 +lre22_dev_wcpwx,tir-tir,18 +lre22_dev_wczkn,eng-iaf,17 +lre22_dev_wdfmt,tir-tir,17 +lre22_dev_wdgbh,ara-arq,12 +lre22_dev_wdind,tso-tso,19 +lre22_dev_wdkit,nbl-nbl,16 +lre22_dev_wdmpt,eng-ens,17 +lre22_dev_wdpya,nbl-nbl,16 +lre22_dev_wdrxo,orm-orm,21 +lre22_dev_wdyiy,ara-ayl,13 +lre22_dev_weccy,afr-afr,15 +lre22_dev_wfmco,ara-arq,14 +lre22_dev_wfnon,nbl-nbl,17 +lre22_dev_wgdui,eng-iaf,14 +lre22_dev_wgkmr,eng-iaf,17 +lre22_dev_wgnex,tir-tir,19 +lre22_dev_wgucy,eng-iaf,18 +lre22_dev_wgwdn,eng-iaf,17 +lre22_dev_whqhx,eng-iaf,15 +lre22_dev_whxwv,eng-ens,14 +lre22_dev_witnq,fra-ntf,17 +lre22_dev_wixzu,tso-tso,16 +lre22_dev_wjhbw,eng-iaf,16 +lre22_dev_wjist,orm-orm,16 +lre22_dev_wjnhh,zul-zul,19 +lre22_dev_wjnyo,ven-ven,20 +lre22_dev_wjtnm,orm-orm,19 +lre22_dev_wjzhz,ara-aeb,13 +lre22_dev_wkacx,eng-iaf,15 +lre22_dev_wkqey,fra-ntf,16 +lre22_dev_wldli,zul-zul,14 +lre22_dev_wlnst,nbl-nbl,16 +lre22_dev_wltvq,zul-zul,17 +lre22_dev_wlwhq,orm-orm,19 +lre22_dev_wmdan,xho-xho,21 +lre22_dev_wmfce,nbl-nbl,20 +lre22_dev_wmigl,ven-ven,20 +lre22_dev_wmwmc,eng-iaf,19 +lre22_dev_wmypk,xho-xho,19 +lre22_dev_wmzpv,eng-ens,17 +lre22_dev_wnjpz,ven-ven,19 +lre22_dev_wnmkt,orm-orm,23 +lre22_dev_wnpep,nbl-nbl,16 +lre22_dev_wnqhz,nbl-nbl,16 +lre22_dev_wnxpz,ven-ven,15 +lre22_dev_wnxrw,ven-ven,18 +lre22_dev_woawg,ven-ven,18 +lre22_dev_wobzv,eng-ens,14 +lre22_dev_wocbv,tso-tso,18 +lre22_dev_woerb,fra-ntf,21 +lre22_dev_wojrt,orm-orm,19 +lre22_dev_wosus,tir-tir,17 +lre22_dev_wozuc,xho-xho,19 
+lre22_dev_wqcyu,tso-tso,15 +lre22_dev_wqfuv,eng-ens,17 +lre22_dev_wqhag,zul-zul,19 +lre22_dev_wqmsd,tir-tir,13 +lre22_dev_wqthl,ara-aeb,12 +lre22_dev_wqtvm,eng-ens,15 +lre22_dev_wrmnw,zul-zul,18 +lre22_dev_wrtec,zul-zul,17 +lre22_dev_wrvls,zul-zul,14 +lre22_dev_wscfs,nbl-nbl,16 +lre22_dev_wssqw,eng-ens,15 +lre22_dev_wtbdf,tir-tir,14 +lre22_dev_wtcpe,ara-aeb,11 +lre22_dev_wthrk,orm-orm,18 +lre22_dev_wtofd,eng-iaf,20 +lre22_dev_wtuol,tso-tso,18 +lre22_dev_wuqez,ara-aeb,11 +lre22_dev_wuquc,tir-tir,18 +lre22_dev_wvlde,tso-tso,13 +lre22_dev_wwbmg,ara-aeb,11 +lre22_dev_wwduf,fra-ntf,18 +lre22_dev_wwvuw,ara-arq,13 +lre22_dev_wxaev,orm-orm,17 +lre22_dev_wycsj,ven-ven,18 +lre22_dev_wypwj,ara-ayl,10 +lre22_dev_wytpq,fra-ntf,17 +lre22_dev_wzhqk,xho-xho,22 +lre22_dev_wzpmq,eng-ens,12 +lre22_dev_wztdj,zul-zul,19 +lre22_dev_wzxgv,ven-ven,18 +lre22_dev_xacjk,fra-ntf,18 +lre22_dev_xaevp,tir-tir,14 +lre22_dev_xaldr,eng-iaf,14 +lre22_dev_xapdy,ara-aeb,12 +lre22_dev_xaurw,nbl-nbl,16 +lre22_dev_xawdd,tir-tir,20 +lre22_dev_xbcpb,ara-arq,12 +lre22_dev_xbfrs,ven-ven,17 +lre22_dev_xbqsr,nbl-nbl,22 +lre22_dev_xbvcc,nbl-nbl,17 +lre22_dev_xbvqw,orm-orm,23 +lre22_dev_xcame,xho-xho,16 +lre22_dev_xcrnp,ara-aeb,13 +lre22_dev_xcswu,ven-ven,18 +lre22_dev_xcuok,orm-orm,21 +lre22_dev_xcvkj,tso-tso,16 +lre22_dev_xdtdp,fra-ntf,17 +lre22_dev_xdyea,ara-ayl,10 +lre22_dev_xerqi,fra-ntf,17 +lre22_dev_xetdb,eng-ens,14 +lre22_dev_xfecy,nbl-nbl,16 +lre22_dev_xfgcu,eng-iaf,19 +lre22_dev_xfing,tir-tir,20 +lre22_dev_xgaig,ara-aeb,15 +lre22_dev_xgoyq,eng-ens,18 +lre22_dev_xhdtx,eng-iaf,14 +lre22_dev_xhvkx,orm-orm,19 +lre22_dev_xiblr,tir-tir,17 +lre22_dev_xifty,ara-aeb,12 +lre22_dev_xigtx,ara-arq,14 +lre22_dev_xijus,tso-tso,14 +lre22_dev_xipox,xho-xho,20 +lre22_dev_xittq,ara-aeb,13 +lre22_dev_xjpwq,ara-ayl,15 +lre22_dev_xjrla,afr-afr,20 +lre22_dev_xkdof,ara-ayl,13 +lre22_dev_xkiba,eng-ens,18 +lre22_dev_xlcxh,fra-ntf,18 +lre22_dev_xlsxb,tso-tso,16 +lre22_dev_xmhpj,ven-ven,20 +lre22_dev_xnqct,ara-arq,11 +lre22_dev_xoayi,eng-ens,13 +lre22_dev_xohps,ara-arq,11 +lre22_dev_xokpn,zul-zul,18 +lre22_dev_xonym,eng-ens,14 +lre22_dev_xozod,afr-afr,14 +lre22_dev_xpenp,ara-arq,11 +lre22_dev_xpnti,ara-aeb,11 +lre22_dev_xpqyr,orm-orm,22 +lre22_dev_xpswt,orm-orm,23 +lre22_dev_xpumn,ven-ven,14 +lre22_dev_xpvcf,orm-orm,20 +lre22_dev_xqhoa,ara-ayl,13 +lre22_dev_xqnpt,orm-orm,22 +lre22_dev_xqooi,xho-xho,20 +lre22_dev_xqupu,fra-ntf,21 +lre22_dev_xresy,eng-iaf,17 +lre22_dev_xrouj,ara-ayl,16 +lre22_dev_xsnxu,ara-aeb,12 +lre22_dev_xtaof,ara-ayl,13 +lre22_dev_xtbxk,orm-orm,20 +lre22_dev_xtgak,nbl-nbl,20 +lre22_dev_xuauh,ara-aeb,13 +lre22_dev_xubei,eng-iaf,17 +lre22_dev_xubol,ara-aeb,11 +lre22_dev_xuieb,orm-orm,19 +lre22_dev_xunxs,ara-ayl,14 +lre22_dev_xutjo,nbl-nbl,20 +lre22_dev_xvbos,afr-afr,22 +lre22_dev_xvcfn,eng-ens,16 +lre22_dev_xvgqo,eng-ens,12 +lre22_dev_xwemk,zul-zul,18 +lre22_dev_xwsyq,ara-ayl,14 +lre22_dev_xxdbg,tso-tso,18 +lre22_dev_xyoua,fra-ntf,22 +lre22_dev_xzoej,ara-aeb,13 +lre22_dev_xzrdl,ara-arq,13 +lre22_dev_xztsz,tso-tso,16 +lre22_dev_xzxbd,zul-zul,15 +lre22_dev_yagvv,tso-tso,13 +lre22_dev_ybqju,tso-tso,13 +lre22_dev_ybrji,ara-arq,11 +lre22_dev_ybsmy,ven-ven,21 +lre22_dev_ycbaf,ara-aeb,14 +lre22_dev_ychsm,ven-ven,14 +lre22_dev_ycrlj,xho-xho,17 +lre22_dev_ycuhc,orm-orm,21 +lre22_dev_ydhqc,ara-arq,13 +lre22_dev_ydmnb,nbl-nbl,17 +lre22_dev_yduem,xho-xho,21 +lre22_dev_yemzu,ara-aeb,11 +lre22_dev_yeoyx,eng-ens,18 +lre22_dev_yersp,ara-ayl,13 +lre22_dev_yeshv,eng-iaf,17 +lre22_dev_yexec,ven-ven,20 +lre22_dev_yeyna,ara-ayl,14 
+lre22_dev_yfxmd,ara-arq,14 +lre22_dev_yfzah,ara-arq,14 +lre22_dev_ygkvo,ara-arq,11 +lre22_dev_yhgvr,ara-arq,15 +lre22_dev_yhwin,ara-arq,12 +lre22_dev_yirig,ara-ayl,16 +lre22_dev_yixgu,xho-xho,16 +lre22_dev_yjbfl,xho-xho,19 +lre22_dev_yjodc,eng-ens,14 +lre22_dev_yjoht,ara-aeb,12 +lre22_dev_yjqkb,ara-arq,14 +lre22_dev_yjrkq,ara-arq,15 +lre22_dev_yjrng,afr-afr,16 +lre22_dev_ykpzq,afr-afr,21 +lre22_dev_yktop,eng-iaf,20 +lre22_dev_ylfah,zul-zul,15 +lre22_dev_ylgex,tso-tso,14 +lre22_dev_ylkds,nbl-nbl,17 +lre22_dev_ylvyc,xho-xho,20 +lre22_dev_ylzic,eng-iaf,20 +lre22_dev_ymoon,afr-afr,17 +lre22_dev_yncqr,ara-arq,13 +lre22_dev_ynjtn,ven-ven,18 +lre22_dev_ynmzy,tso-tso,16 +lre22_dev_ynozi,fra-ntf,21 +lre22_dev_yntec,orm-orm,19 +lre22_dev_ynurl,tso-tso,14 +lre22_dev_ypdtt,ara-aeb,11 +lre22_dev_yprom,tso-tso,13 +lre22_dev_yptsk,xho-xho,23 +lre22_dev_ypyft,eng-iaf,14 +lre22_dev_yqhwt,orm-orm,23 +lre22_dev_yqtxe,eng-iaf,19 +lre22_dev_yquja,ara-ayl,10 +lre22_dev_yqxhl,eng-ens,14 +lre22_dev_yqyby,nbl-nbl,18 +lre22_dev_yqzua,fra-ntf,16 +lre22_dev_yrfxo,ven-ven,21 +lre22_dev_yrgzf,ara-aeb,13 +lre22_dev_yruqe,tso-tso,17 +lre22_dev_yrwgb,zul-zul,18 +lre22_dev_yrxsi,orm-orm,21 +lre22_dev_ysdkl,tso-tso,15 +lre22_dev_ytgav,xho-xho,16 +lre22_dev_ytoet,ara-arq,14 +lre22_dev_yuabg,eng-ens,16 +lre22_dev_yundm,tso-tso,14 +lre22_dev_yuvux,ara-ayl,13 +lre22_dev_yvdcv,fra-ntf,21 +lre22_dev_yvoli,orm-orm,23 +lre22_dev_yweox,orm-orm,21 +lre22_dev_ywgoc,eng-iaf,19 +lre22_dev_ywoyx,ven-ven,18 +lre22_dev_ywxql,zul-zul,19 +lre22_dev_yxkyl,eng-iaf,15 +lre22_dev_yxtmn,ara-aeb,14 +lre22_dev_yycsn,ara-ayl,12 +lre22_dev_yyswd,eng-iaf,16 +lre22_dev_yyugr,ven-ven,21 +lre22_dev_yzitu,orm-orm,20 +lre22_dev_yzwmi,eng-ens,16 +lre22_dev_yzzww,zul-zul,17 +lre22_dev_zabub,ara-ayl,16 +lre22_dev_zabuv,eng-iaf,14 +lre22_dev_zacuc,zul-zul,19 +lre22_dev_zavru,zul-zul,19 +lre22_dev_zbfgy,ara-arq,12 +lre22_dev_zbjez,nbl-nbl,17 +lre22_dev_zbtpo,ven-ven,18 +lre22_dev_zbzip,tso-tso,19 +lre22_dev_zcevz,nbl-nbl,16 +lre22_dev_zcnsv,afr-afr,21 +lre22_dev_zcqkl,eng-iaf,20 +lre22_dev_zczer,ven-ven,14 +lre22_dev_zdcdt,nbl-nbl,18 +lre22_dev_zddua,xho-xho,19 +lre22_dev_zdvsh,ara-arq,14 +lre22_dev_zdwxx,ara-ayl,14 +lre22_dev_zdyxi,tir-tir,14 +lre22_dev_zetju,eng-iaf,17 +lre22_dev_zfsek,ara-arq,11 +lre22_dev_zfvfa,eng-ens,18 +lre22_dev_zggiu,zul-zul,19 +lre22_dev_zgndz,tso-tso,14 +lre22_dev_zgxth,eng-ens,16 +lre22_dev_zhlxa,ara-ayl,14 +lre22_dev_zhnsb,ara-ayl,15 +lre22_dev_zhsmo,ara-aeb,13 +lre22_dev_zhvbf,xho-xho,18 +lre22_dev_zhzrh,eng-iaf,15 +lre22_dev_ziigd,orm-orm,21 +lre22_dev_zilud,tir-tir,19 +lre22_dev_zjivp,zul-zul,19 +lre22_dev_zjleg,zul-zul,19 +lre22_dev_zjquq,orm-orm,16 +lre22_dev_zkgjo,nbl-nbl,22 +lre22_dev_zkhes,fra-ntf,16 +lre22_dev_zkioq,ara-aeb,12 +lre22_dev_zkwaw,afr-afr,21 +lre22_dev_zlapc,ara-ayl,13 +lre22_dev_zlntm,zul-zul,19 +lre22_dev_zmmyn,xho-xho,23 +lre22_dev_zmxld,ven-ven,17 +lre22_dev_znhcf,ven-ven,21 +lre22_dev_znwsk,afr-afr,22 +lre22_dev_znxvg,eng-ens,18 +lre22_dev_znycz,ara-aeb,13 +lre22_dev_zoayx,zul-zul,18 +lre22_dev_zogte,nbl-nbl,16 +lre22_dev_zoldl,ara-aeb,12 +lre22_dev_zoqzl,eng-ens,17 +lre22_dev_zorfv,eng-iaf,16 +lre22_dev_zoseh,ara-arq,12 +lre22_dev_zpotb,xho-xho,16 +lre22_dev_zptbg,tir-tir,14 +lre22_dev_zqjzi,ara-aeb,11 +lre22_dev_zqljj,ara-aeb,14 +lre22_dev_zqlri,orm-orm,18 +lre22_dev_zqoif,zul-zul,19 +lre22_dev_zqorv,ara-aeb,12 +lre22_dev_zqwgs,fra-ntf,18 +lre22_dev_zrhbt,tir-tir,19 +lre22_dev_zrqar,ara-aeb,13 +lre22_dev_zrqec,eng-iaf,17 +lre22_dev_ztdrx,fra-ntf,15 +lre22_dev_ztdwr,orm-orm,17 
+lre22_dev_zthiv,ara-arq,15 +lre22_dev_ztknh,xho-xho,18 +lre22_dev_ztlcq,ara-aeb,13 +lre22_dev_ztufj,fra-ntf,19 +lre22_dev_zubjl,fra-ntf,20 +lre22_dev_zunuw,tso-tso,17 +lre22_dev_zutul,tir-tir,13 +lre22_dev_zutvv,eng-ens,12 +lre22_dev_zuugc,eng-iaf,17 +lre22_dev_zuvqx,eng-iaf,14 +lre22_dev_zvthu,orm-orm,20 +lre22_dev_zvvov,ara-aeb,11 +lre22_dev_zvyuh,ara-arq,14 +lre22_dev_zwfqq,eng-iaf,17 +lre22_dev_zwosr,xho-xho,16 +lre22_dev_zwvhw,tso-tso,12 +lre22_dev_zxihz,ven-ven,14 +lre22_dev_zydma,eng-ens,12 +lre22_dev_zyqlz,zul-zul,19 +lre22_dev_zyyie,orm-orm,23 +lre22_dev_zyywo,eng-iaf,14 +lre22_dev_zzyze,ara-ayl,12 diff --git a/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/train_segments.csv b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/train_segments.csv new file mode 100644 index 00000000..6518f24e --- /dev/null +++ b/egs/lre22/fixed.v1.8k/resources/dev_splits/fold_1/train_segments.csv @@ -0,0 +1,2114 @@ +id,class_id,subclass_idx +lre22_dev_aadaq,afr-afr,5 +lre22_dev_aaxdt,xho-xho,14 +lre22_dev_abujj,xho-xho,15 +lre22_dev_acgiu,zul-zul,6 +lre22_dev_acnyv,ven-ven,7 +lre22_dev_adbku,ara-ayl,4 +lre22_dev_ademr,orm-orm,3 +lre22_dev_adgoy,xho-xho,4 +lre22_dev_adnpi,eng-ens,1 +lre22_dev_adqaa,ven-ven,10 +lre22_dev_adwzf,zul-zul,2 +lre22_dev_aeiuj,afr-afr,4 +lre22_dev_afhui,eng-ens,4 +lre22_dev_afuav,nbl-nbl,15 +lre22_dev_afvvg,ven-ven,10 +lre22_dev_afxjf,eng-iaf,10 +lre22_dev_agmwb,ara-aeb,10 +lre22_dev_agnik,eng-ens,3 +lre22_dev_ahcja,orm-orm,14 +lre22_dev_ahobp,afr-afr,13 +lre22_dev_ahupk,eng-ens,11 +lre22_dev_aicjg,xho-xho,12 +lre22_dev_aikrz,eng-ens,9 +lre22_dev_ailwo,orm-orm,7 +lre22_dev_aiqhl,tir-tir,10 +lre22_dev_aiuwf,ara-ayl,5 +lre22_dev_aizyr,ara-arq,0 +lre22_dev_ajbui,zul-zul,12 +lre22_dev_ajigk,ara-aeb,10 +lre22_dev_ajuwq,ara-ayl,3 +lre22_dev_akbly,nbl-nbl,3 +lre22_dev_akhwr,xho-xho,6 +lre22_dev_aksxd,nbl-nbl,6 +lre22_dev_aktcg,afr-afr,1 +lre22_dev_aktzw,eng-ens,11 +lre22_dev_akulq,orm-orm,14 +lre22_dev_alcie,orm-orm,11 +lre22_dev_alunz,xho-xho,6 +lre22_dev_amaec,tir-tir,10 +lre22_dev_amnvo,ara-arq,6 +lre22_dev_amxrk,zul-zul,9 +lre22_dev_anmuv,tso-tso,11 +lre22_dev_aomcz,ara-aeb,7 +lre22_dev_aooht,fra-ntf,11 +lre22_dev_aprbe,ara-arq,3 +lre22_dev_apxxx,orm-orm,12 +lre22_dev_aqdwu,ven-ven,6 +lre22_dev_aqejl,xho-xho,5 +lre22_dev_aqnyy,tso-tso,5 +lre22_dev_arjuc,afr-afr,5 +lre22_dev_arrkp,tir-tir,1 +lre22_dev_atdgp,zul-zul,13 +lre22_dev_atoxn,eng-ens,10 +lre22_dev_audls,afr-afr,6 +lre22_dev_auilj,ven-ven,11 +lre22_dev_auqgt,eng-iaf,3 +lre22_dev_autlo,zul-zul,7 +lre22_dev_avait,zul-zul,3 +lre22_dev_avvik,nbl-nbl,14 +lre22_dev_awgem,ara-ayl,3 +lre22_dev_awgnb,fra-ntf,14 +lre22_dev_awvym,ara-ayl,9 +lre22_dev_axhbz,tir-tir,12 +lre22_dev_axici,tir-tir,8 +lre22_dev_axtpv,xho-xho,6 +lre22_dev_aygsz,ara-aeb,4 +lre22_dev_ayiif,ven-ven,7 +lre22_dev_azqvo,zul-zul,3 +lre22_dev_basml,eng-ens,11 +lre22_dev_bawje,tir-tir,6 +lre22_dev_bbana,zul-zul,7 +lre22_dev_bbtpz,ven-ven,5 +lre22_dev_bcbrw,eng-iaf,2 +lre22_dev_bchvx,zul-zul,9 +lre22_dev_bcllp,afr-afr,13 +lre22_dev_bcsmi,fra-ntf,6 +lre22_dev_bdqaw,ven-ven,6 +lre22_dev_bdwle,ara-arq,6 +lre22_dev_behbh,ara-ayl,4 +lre22_dev_bexda,ara-arq,6 +lre22_dev_bfbyn,ara-aeb,9 +lre22_dev_bfjgx,ara-ayl,7 +lre22_dev_bgbjo,nbl-nbl,1 +lre22_dev_bgebs,ara-ayl,5 +lre22_dev_bgnod,fra-ntf,3 +lre22_dev_bhezb,ara-ayl,7 +lre22_dev_bhyuy,afr-afr,13 +lre22_dev_bidge,tir-tir,12 +lre22_dev_bimnd,eng-ens,7 +lre22_dev_biyaj,ara-ayl,5 +lre22_dev_bjsgu,afr-afr,10 +lre22_dev_blmfp,eng-iaf,5 +lre22_dev_blohd,ven-ven,4 +lre22_dev_bmebz,ara-arq,4 +lre22_dev_bmjuo,ara-aeb,6 
+lre22_dev_bmkrm,fra-ntf,10 +lre22_dev_bmzym,zul-zul,5 +lre22_dev_bnfuu,orm-orm,13 +lre22_dev_bnilb,zul-zul,8 +lre22_dev_bnxna,eng-ens,1 +lre22_dev_boikl,orm-orm,7 +lre22_dev_boisz,ven-ven,2 +lre22_dev_boqxy,zul-zul,13 +lre22_dev_bpqhd,tso-tso,2 +lre22_dev_briiw,ara-aeb,8 +lre22_dev_brohj,fra-ntf,1 +lre22_dev_brqdv,nbl-nbl,3 +lre22_dev_brwcj,afr-afr,6 +lre22_dev_bsclv,orm-orm,8 +lre22_dev_bsdbb,ara-arq,4 +lre22_dev_bstjt,nbl-nbl,10 +lre22_dev_btbke,ara-aeb,0 +lre22_dev_btcfj,ven-ven,12 +lre22_dev_btomw,ven-ven,6 +lre22_dev_btpvy,afr-afr,1 +lre22_dev_btrtb,ara-arq,4 +lre22_dev_btruf,zul-zul,8 +lre22_dev_btsll,ara-ayl,7 +lre22_dev_butrw,ara-ayl,6 +lre22_dev_buwrj,ara-ayl,2 +lre22_dev_bvlhb,fra-ntf,8 +lre22_dev_bvmql,xho-xho,10 +lre22_dev_bvnsc,tir-tir,10 +lre22_dev_bwrej,ven-ven,9 +lre22_dev_bxial,eng-ens,2 +lre22_dev_bxnbf,fra-ntf,9 +lre22_dev_bybim,afr-afr,6 +lre22_dev_byegp,orm-orm,15 +lre22_dev_byngq,ven-ven,9 +lre22_dev_byytf,fra-ntf,6 +lre22_dev_bzies,tso-tso,3 +lre22_dev_bzipd,afr-afr,7 +lre22_dev_cacop,nbl-nbl,5 +lre22_dev_caent,afr-afr,12 +lre22_dev_capsb,ven-ven,0 +lre22_dev_cawbw,orm-orm,12 +lre22_dev_cblep,ven-ven,3 +lre22_dev_cblig,fra-ntf,6 +lre22_dev_ccexy,ven-ven,7 +lre22_dev_ccsye,ara-aeb,8 +lre22_dev_cctyt,eng-iaf,11 +lre22_dev_ccuie,eng-ens,7 +lre22_dev_ccvzf,eng-iaf,1 +lre22_dev_cdlkq,tso-tso,8 +lre22_dev_cdtiu,ara-ayl,9 +lre22_dev_cemyb,tir-tir,12 +lre22_dev_ceprg,eng-iaf,9 +lre22_dev_ceqow,nbl-nbl,15 +lre22_dev_cfdsu,fra-ntf,7 +lre22_dev_cfhbm,ven-ven,3 +lre22_dev_cfsew,afr-afr,12 +lre22_dev_cgges,eng-iaf,11 +lre22_dev_cgjnr,eng-iaf,10 +lre22_dev_cgotg,eng-ens,11 +lre22_dev_cgovb,nbl-nbl,15 +lre22_dev_cgssg,tir-tir,7 +lre22_dev_chhsl,tir-tir,7 +lre22_dev_chjuh,nbl-nbl,9 +lre22_dev_chpoe,nbl-nbl,11 +lre22_dev_chtgu,ara-aeb,10 +lre22_dev_chtlt,eng-iaf,10 +lre22_dev_cigir,eng-ens,9 +lre22_dev_ciyeh,ara-ayl,2 +lre22_dev_cjswm,orm-orm,12 +lre22_dev_cjtdl,ven-ven,13 +lre22_dev_ckzie,ara-aeb,10 +lre22_dev_cldfc,ara-ayl,8 +lre22_dev_clxqz,ara-arq,9 +lre22_dev_cmahj,afr-afr,13 +lre22_dev_cmqxm,tir-tir,6 +lre22_dev_cmrdt,afr-afr,5 +lre22_dev_cmvpq,ara-ayl,2 +lre22_dev_cnbfw,eng-iaf,5 +lre22_dev_cnbvd,afr-afr,9 +lre22_dev_cnomp,orm-orm,15 +lre22_dev_cnrvj,xho-xho,11 +lre22_dev_cnszu,ara-ayl,4 +lre22_dev_cnudd,xho-xho,14 +lre22_dev_cnuoi,orm-orm,14 +lre22_dev_cnxjs,orm-orm,8 +lre22_dev_coarm,xho-xho,4 +lre22_dev_cocyn,zul-zul,6 +lre22_dev_colxc,zul-zul,13 +lre22_dev_cosfn,ara-aeb,10 +lre22_dev_cosgu,ara-ayl,7 +lre22_dev_cpjab,ara-aeb,10 +lre22_dev_cpple,tso-tso,6 +lre22_dev_cqhjy,ara-ayl,3 +lre22_dev_cqkmy,ara-aeb,10 +lre22_dev_cqukb,tso-tso,9 +lre22_dev_cqusc,orm-orm,6 +lre22_dev_cqyzf,fra-ntf,13 +lre22_dev_crcwu,xho-xho,12 +lre22_dev_crqjz,nbl-nbl,10 +lre22_dev_crtpm,ara-arq,5 +lre22_dev_crucu,tir-tir,6 +lre22_dev_crvby,eng-iaf,12 +lre22_dev_crvoh,eng-ens,7 +lre22_dev_csjxv,ara-arq,3 +lre22_dev_ctfiv,ara-aeb,5 +lre22_dev_ctgpr,ven-ven,12 +lre22_dev_ctlrz,tir-tir,8 +lre22_dev_ctzhm,zul-zul,6 +lre22_dev_cudew,ven-ven,8 +lre22_dev_cusin,ara-arq,10 +lre22_dev_cvaad,eng-iaf,5 +lre22_dev_cvedm,zul-zul,12 +lre22_dev_cvgfx,eng-iaf,8 +lre22_dev_cvujh,ara-ayl,2 +lre22_dev_cweil,ara-aeb,10 +lre22_dev_cweuh,eng-ens,7 +lre22_dev_cwiro,afr-afr,6 +lre22_dev_cwtby,ara-arq,7 +lre22_dev_cxggy,afr-afr,4 +lre22_dev_cxnqr,tso-tso,7 +lre22_dev_cxpan,nbl-nbl,14 +lre22_dev_cxsxl,ara-aeb,10 +lre22_dev_cxyti,tso-tso,8 +lre22_dev_cypcg,zul-zul,12 +lre22_dev_czcmz,zul-zul,10 +lre22_dev_czdzw,orm-orm,7 +lre22_dev_czppj,zul-zul,10 +lre22_dev_czxff,zul-zul,9 +lre22_dev_czxld,fra-ntf,9 
+lre22_dev_dajnt,zul-zul,12 +lre22_dev_dbcxi,orm-orm,9 +lre22_dev_dbdbv,tso-tso,8 +lre22_dev_dbdwv,orm-orm,5 +lre22_dev_dbgof,nbl-nbl,15 +lre22_dev_dblhh,eng-iaf,0 +lre22_dev_dbljb,xho-xho,12 +lre22_dev_dcibg,eng-iaf,4 +lre22_dev_dcobk,ara-arq,8 +lre22_dev_dcvcu,afr-afr,4 +lre22_dev_dcvyc,fra-ntf,14 +lre22_dev_ddfeo,ara-ayl,5 +lre22_dev_ddhaq,zul-zul,10 +lre22_dev_ddhes,afr-afr,6 +lre22_dev_ddsds,afr-afr,12 +lre22_dev_ddxvn,ven-ven,5 +lre22_dev_dfdrs,ven-ven,7 +lre22_dev_dfifl,ara-ayl,9 +lre22_dev_dfjek,ven-ven,4 +lre22_dev_dflco,zul-zul,12 +lre22_dev_dftta,tso-tso,6 +lre22_dev_dfxnq,eng-ens,11 +lre22_dev_dgjdi,orm-orm,8 +lre22_dev_dgqwo,tir-tir,9 +lre22_dev_dhapq,ara-aeb,8 +lre22_dev_dhdfk,eng-ens,8 +lre22_dev_dhfjj,ara-arq,4 +lre22_dev_dhlxh,ara-aeb,4 +lre22_dev_dhnne,eng-ens,10 +lre22_dev_dhtlz,eng-ens,6 +lre22_dev_diarz,ara-ayl,2 +lre22_dev_diggg,tir-tir,9 +lre22_dev_diqtw,ara-aeb,8 +lre22_dev_dixuw,orm-orm,9 +lre22_dev_diypf,orm-orm,13 +lre22_dev_djzsk,nbl-nbl,13 +lre22_dev_dksey,nbl-nbl,11 +lre22_dev_dlzwh,fra-ntf,12 +lre22_dev_dmdpv,eng-ens,2 +lre22_dev_dmeea,orm-orm,14 +lre22_dev_dmhdv,xho-xho,10 +lre22_dev_dmics,fra-ntf,14 +lre22_dev_dmiiu,ara-aeb,6 +lre22_dev_dmjxr,xho-xho,10 +lre22_dev_dmzxn,afr-afr,4 +lre22_dev_dngtw,ara-ayl,3 +lre22_dev_dnjdq,eng-ens,7 +lre22_dev_dnprz,zul-zul,12 +lre22_dev_dobdj,fra-ntf,0 +lre22_dev_dobwk,orm-orm,8 +lre22_dev_donqm,ara-arq,3 +lre22_dev_dpbyt,tso-tso,6 +lre22_dev_dpfns,ara-aeb,4 +lre22_dev_dpjjp,fra-ntf,7 +lre22_dev_dpomx,eng-iaf,5 +lre22_dev_dpwhs,eng-ens,8 +lre22_dev_dpygj,eng-iaf,8 +lre22_dev_dqzex,xho-xho,3 +lre22_dev_drcqx,eng-iaf,7 +lre22_dev_drfhb,ara-aeb,10 +lre22_dev_drfte,ara-arq,8 +lre22_dev_driks,eng-ens,11 +lre22_dev_drofs,fra-ntf,1 +lre22_dev_dslxl,ara-ayl,7 +lre22_dev_dsmwd,ven-ven,13 +lre22_dev_dsyyk,tir-tir,9 +lre22_dev_dthcb,zul-zul,12 +lre22_dev_dtumd,fra-ntf,5 +lre22_dev_dtwmj,afr-afr,7 +lre22_dev_duegm,tso-tso,9 +lre22_dev_dvirs,afr-afr,6 +lre22_dev_dvtzf,eng-iaf,7 +lre22_dev_dwcfi,ven-ven,7 +lre22_dev_dwfle,fra-ntf,7 +lre22_dev_dwgsv,tir-tir,6 +lre22_dev_dwlay,ara-arq,3 +lre22_dev_dwnit,xho-xho,15 +lre22_dev_dwvoh,tso-tso,6 +lre22_dev_dxgpq,afr-afr,12 +lre22_dev_dxhpf,ara-ayl,9 +lre22_dev_dxlhq,ara-arq,5 +lre22_dev_dxrcj,zul-zul,5 +lre22_dev_dywox,tir-tir,9 +lre22_dev_dzjrv,eng-iaf,8 +lre22_dev_dzsql,tso-tso,6 +lre22_dev_dzxkv,orm-orm,13 +lre22_dev_eabne,xho-xho,2 +lre22_dev_eacdl,fra-ntf,14 +lre22_dev_eaupg,eng-iaf,11 +lre22_dev_eawug,eng-iaf,6 +lre22_dev_ebbgx,nbl-nbl,15 +lre22_dev_ecber,afr-afr,10 +lre22_dev_ecdgv,ara-arq,5 +lre22_dev_ecneb,afr-afr,6 +lre22_dev_ecxrr,tir-tir,9 +lre22_dev_edldw,tir-tir,10 +lre22_dev_edofc,afr-afr,6 +lre22_dev_edvaf,xho-xho,13 +lre22_dev_edydw,eng-ens,5 +lre22_dev_eejtn,zul-zul,4 +lre22_dev_eekzc,fra-ntf,4 +lre22_dev_eenhx,eng-iaf,9 +lre22_dev_efcgi,fra-ntf,0 +lre22_dev_efdoz,ven-ven,8 +lre22_dev_efioy,tso-tso,9 +lre22_dev_efiwx,eng-ens,9 +lre22_dev_efrlw,ven-ven,2 +lre22_dev_eghmh,eng-ens,11 +lre22_dev_ehhyu,nbl-nbl,10 +lre22_dev_eiomi,ven-ven,12 +lre22_dev_eisiy,orm-orm,8 +lre22_dev_ejaiq,ara-aeb,1 +lre22_dev_ejkmr,eng-iaf,5 +lre22_dev_ejthv,ven-ven,12 +lre22_dev_ejtyd,fra-ntf,14 +lre22_dev_ekfzq,ara-ayl,6 +lre22_dev_ekgjp,zul-zul,3 +lre22_dev_ekixu,nbl-nbl,2 +lre22_dev_ekjxx,ara-arq,6 +lre22_dev_ekvxc,eng-iaf,4 +lre22_dev_eldrg,orm-orm,11 +lre22_dev_elitc,ara-arq,3 +lre22_dev_emdtf,xho-xho,0 +lre22_dev_emhqx,tir-tir,4 +lre22_dev_emxnm,afr-afr,4 +lre22_dev_emzaa,xho-xho,3 +lre22_dev_engqe,xho-xho,15 +lre22_dev_ennjl,tso-tso,10 +lre22_dev_eokyg,nbl-nbl,2 +lre22_dev_epkwr,tir-tir,5 
+lre22_dev_epojj,tir-tir,7 +lre22_dev_epsdk,nbl-nbl,12 +lre22_dev_epsfl,xho-xho,14 +lre22_dev_epuno,eng-ens,1 +lre22_dev_epylu,eng-iaf,10 +lre22_dev_ereen,ara-arq,10 +lre22_dev_eriaf,eng-ens,4 +lre22_dev_ermqx,ara-arq,2 +lre22_dev_escob,fra-ntf,9 +lre22_dev_esjsk,ara-ayl,7 +lre22_dev_esqti,xho-xho,9 +lre22_dev_etaln,zul-zul,12 +lre22_dev_etarn,nbl-nbl,6 +lre22_dev_etndu,ven-ven,13 +lre22_dev_etpdc,afr-afr,3 +lre22_dev_etsam,zul-zul,7 +lre22_dev_etwge,eng-ens,6 +lre22_dev_etxyc,orm-orm,12 +lre22_dev_eumsq,zul-zul,10 +lre22_dev_eusfl,orm-orm,8 +lre22_dev_eutkk,tso-tso,0 +lre22_dev_euxuy,orm-orm,13 +lre22_dev_evaon,ara-aeb,4 +lre22_dev_evkaz,eng-iaf,8 +lre22_dev_evret,fra-ntf,8 +lre22_dev_evvep,tso-tso,9 +lre22_dev_evvvd,tir-tir,10 +lre22_dev_ewems,ven-ven,7 +lre22_dev_ewijw,orm-orm,11 +lre22_dev_ewqpv,eng-iaf,6 +lre22_dev_ewywf,nbl-nbl,10 +lre22_dev_exaia,afr-afr,3 +lre22_dev_exbum,afr-afr,4 +lre22_dev_exhhd,ara-aeb,5 +lre22_dev_exkkf,afr-afr,3 +lre22_dev_extrh,zul-zul,6 +lre22_dev_exzyo,xho-xho,15 +lre22_dev_eyrzt,ara-ayl,1 +lre22_dev_eysdu,zul-zul,4 +lre22_dev_eyshz,xho-xho,1 +lre22_dev_eyuyq,ara-ayl,7 +lre22_dev_ezsyu,ven-ven,3 +lre22_dev_faahr,afr-afr,9 +lre22_dev_fabli,ven-ven,6 +lre22_dev_fatah,zul-zul,12 +lre22_dev_fccpw,orm-orm,12 +lre22_dev_fcpbu,xho-xho,8 +lre22_dev_fcqbx,tso-tso,3 +lre22_dev_fcwnw,fra-ntf,8 +lre22_dev_fdgia,orm-orm,10 +lre22_dev_febnk,eng-ens,5 +lre22_dev_fedau,eng-iaf,5 +lre22_dev_fehxn,xho-xho,8 +lre22_dev_fejsd,ven-ven,8 +lre22_dev_feqjc,eng-iaf,12 +lre22_dev_fesss,nbl-nbl,15 +lre22_dev_feuww,fra-ntf,8 +lre22_dev_fevex,zul-zul,2 +lre22_dev_ffban,ara-arq,6 +lre22_dev_ffefw,orm-orm,13 +lre22_dev_ffsps,fra-ntf,8 +lre22_dev_ffwid,tso-tso,11 +lre22_dev_fgbtr,nbl-nbl,15 +lre22_dev_fgmbr,ara-arq,6 +lre22_dev_fgmxd,eng-ens,9 +lre22_dev_fgnfs,tir-tir,12 +lre22_dev_fgrze,eng-ens,11 +lre22_dev_fhlhy,ara-aeb,7 +lre22_dev_fihvr,eng-iaf,7 +lre22_dev_fiizm,xho-xho,14 +lre22_dev_fiksd,fra-ntf,12 +lre22_dev_fitjt,tso-tso,6 +lre22_dev_fiuun,eng-ens,7 +lre22_dev_fjdul,ara-ayl,3 +lre22_dev_fjgrh,ven-ven,8 +lre22_dev_fkaqj,nbl-nbl,13 +lre22_dev_flfgv,ara-aeb,9 +lre22_dev_flirl,fra-ntf,13 +lre22_dev_fljab,fra-ntf,14 +lre22_dev_flnzm,tir-tir,11 +lre22_dev_flsmp,orm-orm,15 +lre22_dev_fmjvq,ven-ven,2 +lre22_dev_fmmxd,afr-afr,4 +lre22_dev_fnglh,afr-afr,13 +lre22_dev_fnsax,xho-xho,6 +lre22_dev_fojyn,eng-ens,5 +lre22_dev_foqgk,ven-ven,2 +lre22_dev_fovba,ara-arq,4 +lre22_dev_fozyj,ara-arq,2 +lre22_dev_fpavw,ara-aeb,8 +lre22_dev_fptba,eng-ens,3 +lre22_dev_fqdfc,tso-tso,11 +lre22_dev_fqdhm,eng-iaf,8 +lre22_dev_fqfet,nbl-nbl,7 +lre22_dev_fqgty,fra-ntf,4 +lre22_dev_fqgyd,zul-zul,10 +lre22_dev_fqvup,tso-tso,2 +lre22_dev_frviu,ara-aeb,10 +lre22_dev_frwfk,nbl-nbl,9 +lre22_dev_fsygm,eng-iaf,5 +lre22_dev_ftfjv,orm-orm,11 +lre22_dev_ftjvg,afr-afr,12 +lre22_dev_ftmnu,ara-aeb,10 +lre22_dev_ftrcl,eng-ens,3 +lre22_dev_ftygz,eng-ens,8 +lre22_dev_fughv,eng-iaf,3 +lre22_dev_fuhuk,ara-ayl,5 +lre22_dev_fusyr,ven-ven,13 +lre22_dev_futhm,zul-zul,5 +lre22_dev_fvbzh,ara-ayl,7 +lre22_dev_fvecf,ven-ven,9 +lre22_dev_fvktn,fra-ntf,8 +lre22_dev_fvpts,orm-orm,6 +lre22_dev_fvsmm,eng-iaf,12 +lre22_dev_fvvgc,ara-arq,5 +lre22_dev_fwvzh,zul-zul,2 +lre22_dev_fwwsy,xho-xho,5 +lre22_dev_fxggn,fra-ntf,1 +lre22_dev_fxqfi,orm-orm,10 +lre22_dev_fxuqw,ara-ayl,3 +lre22_dev_fxwfc,eng-iaf,12 +lre22_dev_fymdc,tso-tso,4 +lre22_dev_fywir,tso-tso,10 +lre22_dev_fzjzu,xho-xho,14 +lre22_dev_fzpeh,ara-aeb,10 +lre22_dev_fztdi,tir-tir,9 +lre22_dev_gcced,ven-ven,6 +lre22_dev_gchqj,zul-zul,10 +lre22_dev_gctmk,xho-xho,12 
+lre22_dev_gcupw,ven-ven,7 +lre22_dev_gdfdn,tir-tir,5 +lre22_dev_gdlpg,tir-tir,3 +lre22_dev_gdrwq,fra-ntf,14 +lre22_dev_gdvjh,afr-afr,5 +lre22_dev_gdvtc,eng-iaf,13 +lre22_dev_gdxck,orm-orm,4 +lre22_dev_gecgq,afr-afr,12 +lre22_dev_gevbs,nbl-nbl,13 +lre22_dev_gfqxw,tir-tir,11 +lre22_dev_gfujh,eng-ens,8 +lre22_dev_gfwqx,fra-ntf,10 +lre22_dev_ggchj,tir-tir,10 +lre22_dev_ggeie,ara-arq,8 +lre22_dev_ggqob,ara-aeb,9 +lre22_dev_ghllb,eng-ens,8 +lre22_dev_ghlqh,afr-afr,12 +lre22_dev_ghmuk,afr-afr,13 +lre22_dev_ghskg,tso-tso,4 +lre22_dev_ghwmw,ara-arq,2 +lre22_dev_giijn,ven-ven,6 +lre22_dev_gised,xho-xho,9 +lre22_dev_gisrt,tir-tir,9 +lre22_dev_gjptx,nbl-nbl,4 +lre22_dev_gjvkc,ara-arq,7 +lre22_dev_gjxkc,eng-iaf,13 +lre22_dev_gkywh,ara-aeb,7 +lre22_dev_glhtl,eng-iaf,3 +lre22_dev_glulw,ara-aeb,8 +lre22_dev_gmpja,nbl-nbl,3 +lre22_dev_gmpjm,nbl-nbl,12 +lre22_dev_gnkvz,eng-iaf,13 +lre22_dev_gnmcz,nbl-nbl,4 +lre22_dev_goggr,afr-afr,5 +lre22_dev_goqov,ara-aeb,8 +lre22_dev_gpzgq,tso-tso,9 +lre22_dev_gpzuz,fra-ntf,5 +lre22_dev_gqpul,ara-arq,10 +lre22_dev_gratu,tir-tir,7 +lre22_dev_grewx,afr-afr,9 +lre22_dev_grizt,eng-ens,2 +lre22_dev_grsam,afr-afr,11 +lre22_dev_grsyr,zul-zul,1 +lre22_dev_grxus,nbl-nbl,15 +lre22_dev_gsanj,ven-ven,13 +lre22_dev_gsbwz,nbl-nbl,9 +lre22_dev_gtwjj,tso-tso,4 +lre22_dev_gtxwq,orm-orm,12 +lre22_dev_gubts,ara-ayl,0 +lre22_dev_gvawh,xho-xho,11 +lre22_dev_gvfsb,ara-aeb,10 +lre22_dev_gvhgg,afr-afr,9 +lre22_dev_gvnaj,fra-ntf,8 +lre22_dev_gvysc,ara-aeb,10 +lre22_dev_gwfkz,xho-xho,2 +lre22_dev_gwnqp,xho-xho,7 +lre22_dev_gwumi,tso-tso,3 +lre22_dev_gwvcw,xho-xho,11 +lre22_dev_gwwxz,eng-iaf,1 +lre22_dev_gwzrc,eng-ens,11 +lre22_dev_gxtlx,fra-ntf,13 +lre22_dev_gxygl,tso-tso,9 +lre22_dev_gycld,orm-orm,4 +lre22_dev_gzakl,nbl-nbl,15 +lre22_dev_gzrgo,ara-arq,9 +lre22_dev_hbkul,orm-orm,6 +lre22_dev_hbodn,eng-ens,10 +lre22_dev_hbwgy,ara-arq,6 +lre22_dev_hbwyc,nbl-nbl,5 +lre22_dev_hczek,fra-ntf,7 +lre22_dev_hdpsb,nbl-nbl,6 +lre22_dev_hdvsb,ara-aeb,8 +lre22_dev_hetsy,xho-xho,10 +lre22_dev_hfgrm,ven-ven,12 +lre22_dev_hfurz,afr-afr,13 +lre22_dev_hfwyw,nbl-nbl,11 +lre22_dev_hgdqx,tso-tso,3 +lre22_dev_hgwdk,eng-ens,8 +lre22_dev_hgxqf,eng-iaf,8 +lre22_dev_hgyuk,ven-ven,11 +lre22_dev_hhetm,fra-ntf,14 +lre22_dev_hhjki,ara-arq,8 +lre22_dev_hhvtc,ara-arq,10 +lre22_dev_hhxqv,tso-tso,5 +lre22_dev_hiisb,nbl-nbl,15 +lre22_dev_hioxp,tso-tso,3 +lre22_dev_hjqaf,ara-aeb,9 +lre22_dev_hjqid,orm-orm,6 +lre22_dev_hjzwc,eng-iaf,3 +lre22_dev_hkdzu,ara-arq,9 +lre22_dev_hlatl,eng-iaf,12 +lre22_dev_hlywv,nbl-nbl,2 +lre22_dev_hlzxa,ven-ven,7 +lre22_dev_hmvzg,ara-ayl,3 +lre22_dev_hnjgb,eng-ens,9 +lre22_dev_hntdv,eng-ens,11 +lre22_dev_hoish,tir-tir,2 +lre22_dev_hokbg,ara-ayl,6 +lre22_dev_hondp,eng-iaf,8 +lre22_dev_hpbve,tir-tir,11 +lre22_dev_hpdvc,fra-ntf,8 +lre22_dev_hpgst,orm-orm,5 +lre22_dev_hqbjb,xho-xho,5 +lre22_dev_hqdev,tso-tso,2 +lre22_dev_hqidg,tir-tir,1 +lre22_dev_hqids,afr-afr,9 +lre22_dev_hqltr,tir-tir,4 +lre22_dev_hqqhq,eng-ens,11 +lre22_dev_hrmcg,zul-zul,13 +lre22_dev_hrrcp,afr-afr,8 +lre22_dev_hstgi,xho-xho,9 +lre22_dev_hsvpq,ara-ayl,9 +lre22_dev_hswsy,ara-aeb,4 +lre22_dev_htcgm,eng-iaf,6 +lre22_dev_htedo,xho-xho,13 +lre22_dev_hthkx,eng-iaf,7 +lre22_dev_htohd,afr-afr,6 +lre22_dev_htxik,fra-ntf,0 +lre22_dev_huqbr,xho-xho,10 +lre22_dev_hvdom,afr-afr,8 +lre22_dev_hvkoa,afr-afr,13 +lre22_dev_hvnkg,tir-tir,9 +lre22_dev_hvocp,nbl-nbl,12 +lre22_dev_hvqzj,zul-zul,12 +lre22_dev_hvwph,afr-afr,3 +lre22_dev_hwaqg,zul-zul,8 +lre22_dev_hwgvu,ara-aeb,6 +lre22_dev_hwhlz,ven-ven,11 +lre22_dev_hwkes,fra-ntf,12 +lre22_dev_hwvna,eng-ens,2 
+lre22_dev_hxfim,eng-iaf,12 +lre22_dev_hxmdw,afr-afr,10 +lre22_dev_hxrnp,zul-zul,6 +lre22_dev_hxvie,tir-tir,9 +lre22_dev_hxvju,zul-zul,3 +lre22_dev_hxzxm,zul-zul,6 +lre22_dev_hybef,nbl-nbl,14 +lre22_dev_hyfok,eng-ens,2 +lre22_dev_hyscv,ara-arq,4 +lre22_dev_hyzod,eng-iaf,6 +lre22_dev_hzdpb,tso-tso,7 +lre22_dev_hzjwn,ara-aeb,5 +lre22_dev_hzljv,tir-tir,8 +lre22_dev_hzomy,tso-tso,9 +lre22_dev_iaaar,tso-tso,9 +lre22_dev_iaimu,afr-afr,13 +lre22_dev_iakmg,orm-orm,15 +lre22_dev_iarxv,ara-aeb,9 +lre22_dev_iaywv,ara-ayl,6 +lre22_dev_ibcne,eng-ens,11 +lre22_dev_ibeth,zul-zul,2 +lre22_dev_ibwbi,tir-tir,9 +lre22_dev_ibyqr,tso-tso,7 +lre22_dev_iccwp,eng-iaf,6 +lre22_dev_ichmi,afr-afr,12 +lre22_dev_idjrt,zul-zul,8 +lre22_dev_iegng,afr-afr,8 +lre22_dev_iezrr,ara-ayl,7 +lre22_dev_ifaib,ara-ayl,5 +lre22_dev_ifhil,tso-tso,9 +lre22_dev_ifptd,ven-ven,12 +lre22_dev_ifriu,ara-aeb,6 +lre22_dev_ignvp,zul-zul,13 +lre22_dev_igxzy,eng-iaf,12 +lre22_dev_ihdva,fra-ntf,10 +lre22_dev_iiydv,eng-iaf,5 +lre22_dev_ijoyg,ara-ayl,9 +lre22_dev_ikghg,eng-iaf,7 +lre22_dev_ikijv,ven-ven,2 +lre22_dev_ilawb,ara-aeb,8 +lre22_dev_ilgnm,orm-orm,6 +lre22_dev_ilqhp,orm-orm,13 +lre22_dev_imrsx,tso-tso,8 +lre22_dev_inrfz,ara-arq,1 +lre22_dev_inrlw,eng-ens,1 +lre22_dev_inttm,tso-tso,8 +lre22_dev_iorip,ven-ven,13 +lre22_dev_ioryq,ara-aeb,8 +lre22_dev_iosse,afr-afr,1 +lre22_dev_ipahz,tir-tir,12 +lre22_dev_ipaup,tir-tir,10 +lre22_dev_ipllz,tir-tir,12 +lre22_dev_iprih,ara-aeb,4 +lre22_dev_iqkpj,tir-tir,6 +lre22_dev_iqowb,ara-aeb,0 +lre22_dev_iqzfp,orm-orm,15 +lre22_dev_irhue,tso-tso,8 +lre22_dev_irkvo,orm-orm,15 +lre22_dev_irnie,ara-aeb,8 +lre22_dev_irnxg,zul-zul,9 +lre22_dev_irsgt,ven-ven,2 +lre22_dev_isavf,nbl-nbl,0 +lre22_dev_isfpd,nbl-nbl,11 +lre22_dev_iskfd,ara-arq,4 +lre22_dev_isndz,ara-arq,6 +lre22_dev_istwz,nbl-nbl,15 +lre22_dev_isxpy,orm-orm,5 +lre22_dev_iszkk,tir-tir,9 +lre22_dev_itdot,ara-ayl,9 +lre22_dev_itfgh,eng-iaf,9 +lre22_dev_itlqd,tir-tir,12 +lre22_dev_itmbo,ara-aeb,10 +lre22_dev_itznp,ara-aeb,3 +lre22_dev_iucwv,zul-zul,5 +lre22_dev_iuowb,ara-aeb,8 +lre22_dev_iupes,zul-zul,4 +lre22_dev_iurgk,fra-ntf,4 +lre22_dev_ivcpr,nbl-nbl,12 +lre22_dev_ivrwa,ven-ven,3 +lre22_dev_ivvlb,afr-afr,11 +lre22_dev_ivwhm,tir-tir,6 +lre22_dev_iwoya,ara-aeb,4 +lre22_dev_iwpvu,orm-orm,5 +lre22_dev_ixpuq,ara-ayl,5 +lre22_dev_ixpyb,tso-tso,11 +lre22_dev_iyfiz,eng-iaf,5 +lre22_dev_iylyu,xho-xho,12 +lre22_dev_iyuli,zul-zul,13 +lre22_dev_iyupt,orm-orm,5 +lre22_dev_iyxjf,zul-zul,12 +lre22_dev_iyzgz,tso-tso,10 +lre22_dev_izepb,ara-arq,4 +lre22_dev_izkix,ven-ven,10 +lre22_dev_izknz,ven-ven,12 +lre22_dev_jadfl,ara-arq,9 +lre22_dev_jafja,zul-zul,9 +lre22_dev_jamvn,ven-ven,1 +lre22_dev_jbach,eng-iaf,2 +lre22_dev_jbqcq,ara-aeb,6 +lre22_dev_jcxgo,afr-afr,6 +lre22_dev_jddrh,fra-ntf,13 +lre22_dev_jdjpg,tir-tir,12 +lre22_dev_jdtrb,eng-iaf,11 +lre22_dev_jdwjj,zul-zul,7 +lre22_dev_jdzqw,tir-tir,3 +lre22_dev_jeaev,nbl-nbl,8 +lre22_dev_jeobs,ara-aeb,9 +lre22_dev_jesxq,eng-ens,10 +lre22_dev_jgcla,ara-arq,2 +lre22_dev_jggxv,fra-ntf,3 +lre22_dev_jgntz,orm-orm,5 +lre22_dev_jhcao,ven-ven,7 +lre22_dev_jhgik,eng-ens,11 +lre22_dev_jhpkj,ara-arq,4 +lre22_dev_jhuof,orm-orm,15 +lre22_dev_jignq,ara-ayl,9 +lre22_dev_jjffc,ven-ven,13 +lre22_dev_jjkfe,eng-ens,9 +lre22_dev_jjqxi,ara-aeb,8 +lre22_dev_jjrgq,eng-iaf,4 +lre22_dev_jkacy,tso-tso,3 +lre22_dev_jkmin,orm-orm,15 +lre22_dev_jkobe,xho-xho,7 +lre22_dev_jkosd,zul-zul,10 +lre22_dev_jkovc,tso-tso,3 +lre22_dev_jktcq,zul-zul,7 +lre22_dev_jlodp,eng-ens,9 +lre22_dev_jmbjo,nbl-nbl,9 +lre22_dev_jmccw,ara-arq,3 +lre22_dev_jminj,fra-ntf,5 
+lre22_dev_jmmyw,afr-afr,3 +lre22_dev_jobae,fra-ntf,13 +lre22_dev_jobsv,nbl-nbl,14 +lre22_dev_jobxi,ara-arq,5 +lre22_dev_joghi,ara-arq,6 +lre22_dev_johkj,xho-xho,7 +lre22_dev_jolqw,ara-ayl,5 +lre22_dev_jplye,fra-ntf,11 +lre22_dev_jpsmt,ara-arq,9 +lre22_dev_jqdnf,eng-iaf,13 +lre22_dev_jqqpg,orm-orm,5 +lre22_dev_jqqrs,nbl-nbl,11 +lre22_dev_jrmnp,tir-tir,9 +lre22_dev_jsahe,fra-ntf,12 +lre22_dev_jsciw,eng-ens,5 +lre22_dev_jsisu,eng-iaf,4 +lre22_dev_jstjq,zul-zul,4 +lre22_dev_jsxuw,eng-iaf,8 +lre22_dev_jtaxh,ven-ven,4 +lre22_dev_jtgjo,ara-arq,9 +lre22_dev_jtxor,orm-orm,3 +lre22_dev_junyj,orm-orm,5 +lre22_dev_juykt,ara-ayl,7 +lre22_dev_jvqzf,fra-ntf,9 +lre22_dev_jvvxl,afr-afr,7 +lre22_dev_jvxpt,nbl-nbl,1 +lre22_dev_jwfeb,eng-iaf,4 +lre22_dev_jwmmp,eng-ens,3 +lre22_dev_jwyiq,tso-tso,10 +lre22_dev_jxcmp,ara-aeb,10 +lre22_dev_jxfsy,ara-ayl,9 +lre22_dev_jxjar,tso-tso,10 +lre22_dev_jylrr,ara-aeb,9 +lre22_dev_jzciw,orm-orm,5 +lre22_dev_jzcyt,tso-tso,5 +lre22_dev_jzhpf,tso-tso,4 +lre22_dev_jzidh,afr-afr,11 +lre22_dev_jznzw,eng-iaf,6 +lre22_dev_jzoqd,afr-afr,7 +lre22_dev_jzwnu,ven-ven,11 +lre22_dev_kaoyk,afr-afr,6 +lre22_dev_kasoe,zul-zul,12 +lre22_dev_kaygq,eng-ens,9 +lre22_dev_kayqh,fra-ntf,8 +lre22_dev_kbpcw,eng-iaf,3 +lre22_dev_kbtrx,orm-orm,10 +lre22_dev_kcebk,ven-ven,7 +lre22_dev_kdbil,orm-orm,15 +lre22_dev_kddhf,ara-arq,10 +lre22_dev_kdeij,ara-ayl,3 +lre22_dev_kdiak,zul-zul,12 +lre22_dev_kedwl,nbl-nbl,12 +lre22_dev_keouf,fra-ntf,9 +lre22_dev_keozw,ara-aeb,10 +lre22_dev_kervm,eng-ens,7 +lre22_dev_kflpm,xho-xho,1 +lre22_dev_kfqpd,ara-arq,8 +lre22_dev_kgaqj,ara-aeb,8 +lre22_dev_kghnx,fra-ntf,3 +lre22_dev_kgoze,zul-zul,4 +lre22_dev_kgrxe,fra-ntf,9 +lre22_dev_kgsdu,ara-arq,5 +lre22_dev_kheef,xho-xho,15 +lre22_dev_khgyl,xho-xho,8 +lre22_dev_khsgr,tso-tso,7 +lre22_dev_khxvm,nbl-nbl,9 +lre22_dev_kijjo,ara-aeb,3 +lre22_dev_kiush,xho-xho,2 +lre22_dev_kiyso,ara-arq,1 +lre22_dev_kjewo,ven-ven,6 +lre22_dev_kjgkg,ara-ayl,5 +lre22_dev_kjksh,ven-ven,3 +lre22_dev_kjomd,afr-afr,4 +lre22_dev_kjrcy,afr-afr,11 +lre22_dev_kkauw,fra-ntf,10 +lre22_dev_kkiew,orm-orm,15 +lre22_dev_kkyyu,zul-zul,8 +lre22_dev_klafc,ara-ayl,4 +lre22_dev_klalo,eng-ens,5 +lre22_dev_kliip,afr-afr,1 +lre22_dev_klkxg,tso-tso,8 +lre22_dev_klqwc,ara-arq,7 +lre22_dev_kmbgg,tir-tir,12 +lre22_dev_kmgoo,tir-tir,8 +lre22_dev_kmnko,zul-zul,3 +lre22_dev_kmtyc,ara-aeb,8 +lre22_dev_kmxqj,xho-xho,8 +lre22_dev_kmzdw,fra-ntf,3 +lre22_dev_knxsi,ara-arq,9 +lre22_dev_kofob,orm-orm,7 +lre22_dev_kokfk,fra-ntf,14 +lre22_dev_kokir,nbl-nbl,12 +lre22_dev_kooxu,ara-arq,9 +lre22_dev_korip,tso-tso,7 +lre22_dev_kpbnd,zul-zul,4 +lre22_dev_kpnyf,eng-iaf,3 +lre22_dev_kpwts,ara-ayl,8 +lre22_dev_kpxne,orm-orm,6 +lre22_dev_kpzbl,ven-ven,12 +lre22_dev_kqact,zul-zul,0 +lre22_dev_kqfbl,eng-iaf,12 +lre22_dev_kqfsm,zul-zul,5 +lre22_dev_kqfyp,ara-arq,1 +lre22_dev_kqkqj,ara-ayl,7 +lre22_dev_kqvwr,xho-xho,13 +lre22_dev_kragl,zul-zul,13 +lre22_dev_krbdn,xho-xho,14 +lre22_dev_ksake,ara-aeb,8 +lre22_dev_ksoly,nbl-nbl,11 +lre22_dev_kttyt,orm-orm,5 +lre22_dev_kttzq,tso-tso,9 +lre22_dev_ktwaf,zul-zul,3 +lre22_dev_ktwqf,ven-ven,6 +lre22_dev_ktxef,zul-zul,0 +lre22_dev_ktztb,orm-orm,12 +lre22_dev_kufkm,nbl-nbl,15 +lre22_dev_kuqsu,afr-afr,9 +lre22_dev_kuyka,tir-tir,4 +lre22_dev_kvcpn,ara-ayl,3 +lre22_dev_kvghz,eng-iaf,10 +lre22_dev_kvswv,ven-ven,11 +lre22_dev_kxkos,orm-orm,10 +lre22_dev_kxkzg,ara-ayl,9 +lre22_dev_kxqef,ven-ven,12 +lre22_dev_kyjpf,ven-ven,7 +lre22_dev_kynap,ara-ayl,9 +lre22_dev_kyptg,ven-ven,8 +lre22_dev_kytyr,nbl-nbl,11 +lre22_dev_kywmf,orm-orm,4 +lre22_dev_kzibn,zul-zul,3 
+lre22_dev_kzqxx,fra-ntf,1 +lre22_dev_lacgv,tso-tso,7 +lre22_dev_lagpe,tso-tso,6 +lre22_dev_lanuu,tso-tso,9 +lre22_dev_lapag,afr-afr,6 +lre22_dev_larnq,zul-zul,4 +lre22_dev_lbbvq,xho-xho,8 +lre22_dev_lbfca,ara-arq,8 +lre22_dev_lbhoj,orm-orm,11 +lre22_dev_lbiin,ara-ayl,4 +lre22_dev_lcdyj,ara-arq,9 +lre22_dev_ldasz,fra-ntf,9 +lre22_dev_ldbur,tso-tso,1 +lre22_dev_lddhs,orm-orm,12 +lre22_dev_ldedw,ara-aeb,5 +lre22_dev_ldmbr,ara-ayl,5 +lre22_dev_ldmqc,tir-tir,7 +lre22_dev_leadw,eng-iaf,3 +lre22_dev_leaqq,tso-tso,10 +lre22_dev_ledsh,afr-afr,11 +lre22_dev_leovk,afr-afr,6 +lre22_dev_lexlh,ara-aeb,2 +lre22_dev_lfilk,eng-ens,10 +lre22_dev_lfyll,zul-zul,10 +lre22_dev_lgada,zul-zul,6 +lre22_dev_lgcjy,afr-afr,9 +lre22_dev_lgfri,ara-aeb,5 +lre22_dev_lgkbt,xho-xho,4 +lre22_dev_lhbjq,ara-arq,0 +lre22_dev_lhemi,xho-xho,9 +lre22_dev_lhfne,ara-arq,6 +lre22_dev_lhmtg,ara-arq,9 +lre22_dev_lieso,ara-aeb,8 +lre22_dev_likcy,afr-afr,13 +lre22_dev_lipyu,zul-zul,12 +lre22_dev_lisum,ven-ven,4 +lre22_dev_ljevp,ara-ayl,3 +lre22_dev_ljijh,orm-orm,3 +lre22_dev_ljylg,nbl-nbl,13 +lre22_dev_lkfig,ara-ayl,2 +lre22_dev_lklnc,ara-arq,3 +lre22_dev_lkopy,tir-tir,9 +lre22_dev_lllwi,eng-iaf,5 +lre22_dev_llstb,nbl-nbl,10 +lre22_dev_lmeax,eng-iaf,10 +lre22_dev_lmkui,ara-arq,7 +lre22_dev_lmrbp,tir-tir,9 +lre22_dev_lnejh,eng-ens,10 +lre22_dev_lnttv,ven-ven,10 +lre22_dev_loxqz,eng-iaf,8 +lre22_dev_loybq,ara-aeb,10 +lre22_dev_lpadb,fra-ntf,4 +lre22_dev_lpahk,nbl-nbl,11 +lre22_dev_lphgs,tir-tir,7 +lre22_dev_lphoa,eng-ens,2 +lre22_dev_lpkie,eng-iaf,5 +lre22_dev_lpkpc,zul-zul,6 +lre22_dev_lptpx,eng-iaf,4 +lre22_dev_lqwcv,xho-xho,13 +lre22_dev_lrgwx,orm-orm,10 +lre22_dev_lruoj,orm-orm,2 +lre22_dev_lrwee,fra-ntf,10 +lre22_dev_lsess,ven-ven,1 +lre22_dev_lsycj,tir-tir,9 +lre22_dev_ltaoe,eng-ens,8 +lre22_dev_ltish,ara-aeb,5 +lre22_dev_ltqeb,eng-ens,8 +lre22_dev_ltzfg,ven-ven,10 +lre22_dev_luuhd,ara-arq,2 +lre22_dev_lvejl,zul-zul,11 +lre22_dev_lvgsm,tir-tir,10 +lre22_dev_lvwle,xho-xho,7 +lre22_dev_lvxea,tir-tir,8 +lre22_dev_lwsmk,eng-ens,10 +lre22_dev_lwzhq,ara-ayl,3 +lre22_dev_lxbdd,ara-ayl,8 +lre22_dev_lxdgx,nbl-nbl,1 +lre22_dev_lxjij,ara-ayl,7 +lre22_dev_lxldm,tso-tso,8 +lre22_dev_lxmsa,zul-zul,11 +lre22_dev_lxugv,zul-zul,13 +lre22_dev_lxwig,tso-tso,4 +lre22_dev_lyigi,xho-xho,4 +lre22_dev_lymzv,ara-arq,6 +lre22_dev_lyuls,ara-arq,4 +lre22_dev_lyyzw,ara-ayl,5 +lre22_dev_lzhrm,ara-arq,8 +lre22_dev_lzjgb,xho-xho,12 +lre22_dev_lzrpe,xho-xho,8 +lre22_dev_lzvmq,fra-ntf,13 +lre22_dev_maagy,ven-ven,6 +lre22_dev_mabmx,ara-arq,4 +lre22_dev_macre,zul-zul,7 +lre22_dev_maggb,nbl-nbl,7 +lre22_dev_margf,ara-ayl,6 +lre22_dev_maydg,eng-iaf,4 +lre22_dev_mbsgm,zul-zul,7 +lre22_dev_mbttd,fra-ntf,14 +lre22_dev_mcebh,tso-tso,8 +lre22_dev_mcfve,ara-ayl,3 +lre22_dev_mclrc,zul-zul,12 +lre22_dev_mcvgl,ara-ayl,5 +lre22_dev_mdgok,ara-aeb,5 +lre22_dev_mdilb,ven-ven,3 +lre22_dev_mdzqr,nbl-nbl,11 +lre22_dev_mehfu,ara-arq,3 +lre22_dev_meiyg,eng-ens,11 +lre22_dev_merbq,orm-orm,9 +lre22_dev_mfoys,afr-afr,8 +lre22_dev_mgpfx,xho-xho,8 +lre22_dev_mgtzj,zul-zul,12 +lre22_dev_mgxxc,ven-ven,11 +lre22_dev_mhldj,nbl-nbl,14 +lre22_dev_mhvio,eng-iaf,6 +lre22_dev_mhxgi,tir-tir,9 +lre22_dev_miegc,fra-ntf,6 +lre22_dev_miwyu,ara-aeb,8 +lre22_dev_mjocm,ara-aeb,2 +lre22_dev_mjqij,orm-orm,12 +lre22_dev_mjxgy,afr-afr,8 +lre22_dev_mkeyt,tir-tir,12 +lre22_dev_mklub,ven-ven,4 +lre22_dev_mknzf,ara-aeb,10 +lre22_dev_mlhes,ara-arq,9 +lre22_dev_mlhse,tso-tso,3 +lre22_dev_mlhtc,orm-orm,8 +lre22_dev_mlpuq,ven-ven,10 +lre22_dev_mluow,orm-orm,2 +lre22_dev_mmwtu,ara-arq,4 +lre22_dev_mmwzf,tso-tso,7 
+lre22_dev_mnjdq,tir-tir,10 +lre22_dev_mnkfe,nbl-nbl,4 +lre22_dev_mnmcm,ara-arq,3 +lre22_dev_mocss,xho-xho,9 +lre22_dev_mohxo,zul-zul,12 +lre22_dev_mojui,fra-ntf,1 +lre22_dev_mojvy,xho-xho,7 +lre22_dev_molqa,fra-ntf,14 +lre22_dev_mopiq,nbl-nbl,14 +lre22_dev_moqto,tir-tir,12 +lre22_dev_morri,ara-aeb,8 +lre22_dev_mpxyg,eng-ens,4 +lre22_dev_mqiap,xho-xho,14 +lre22_dev_mqxep,ara-ayl,2 +lre22_dev_mrcoe,ara-ayl,7 +lre22_dev_mriiq,tso-tso,4 +lre22_dev_mryoy,eng-ens,11 +lre22_dev_mryzh,ara-arq,4 +lre22_dev_msadm,ven-ven,2 +lre22_dev_msghz,nbl-nbl,11 +lre22_dev_mtpfp,ara-aeb,9 +lre22_dev_mtqft,orm-orm,14 +lre22_dev_mtzvt,ara-aeb,10 +lre22_dev_munim,xho-xho,15 +lre22_dev_murhb,nbl-nbl,1 +lre22_dev_mvbra,xho-xho,4 +lre22_dev_mvhza,afr-afr,13 +lre22_dev_mviud,xho-xho,12 +lre22_dev_mvxjk,afr-afr,9 +lre22_dev_mwnkm,orm-orm,8 +lre22_dev_mwoml,xho-xho,9 +lre22_dev_mxhup,eng-ens,8 +lre22_dev_mykuh,ara-ayl,5 +lre22_dev_myqfn,eng-iaf,4 +lre22_dev_mywmj,ven-ven,9 +lre22_dev_mzbrr,ara-arq,10 +lre22_dev_mzsiq,afr-afr,9 +lre22_dev_mztms,eng-ens,3 +lre22_dev_mzuxc,ara-arq,9 +lre22_dev_nbdbe,ara-ayl,7 +lre22_dev_nbjqz,ara-aeb,9 +lre22_dev_nbyhp,afr-afr,3 +lre22_dev_ncnyb,ven-ven,8 +lre22_dev_ncocl,nbl-nbl,6 +lre22_dev_ndecq,ara-ayl,8 +lre22_dev_ndjsl,nbl-nbl,6 +lre22_dev_nelsk,orm-orm,0 +lre22_dev_nenly,eng-iaf,11 +lre22_dev_neqkb,ven-ven,2 +lre22_dev_nfjid,orm-orm,12 +lre22_dev_nfkqr,orm-orm,8 +lre22_dev_nfoas,orm-orm,15 +lre22_dev_ngjbm,eng-ens,10 +lre22_dev_ngmbz,eng-iaf,9 +lre22_dev_ngnua,fra-ntf,10 +lre22_dev_nguuu,fra-ntf,13 +lre22_dev_ngyse,ven-ven,7 +lre22_dev_nhfso,fra-ntf,14 +lre22_dev_nhuue,zul-zul,1 +lre22_dev_niack,ara-ayl,8 +lre22_dev_niari,ven-ven,7 +lre22_dev_nibme,ara-arq,9 +lre22_dev_nikby,tso-tso,10 +lre22_dev_nimex,ara-ayl,8 +lre22_dev_nivmv,xho-xho,11 +lre22_dev_nkebu,eng-ens,5 +lre22_dev_nkgml,eng-ens,10 +lre22_dev_nkofi,fra-ntf,11 +lre22_dev_nkrez,xho-xho,5 +lre22_dev_nkscn,tso-tso,5 +lre22_dev_nkwrs,ara-aeb,2 +lre22_dev_nkxcy,afr-afr,4 +lre22_dev_nlast,xho-xho,12 +lre22_dev_nlcun,eng-ens,0 +lre22_dev_nljyr,afr-afr,5 +lre22_dev_nlkdv,eng-iaf,12 +lre22_dev_nlpcs,ara-ayl,7 +lre22_dev_nlrcn,ara-ayl,4 +lre22_dev_nlxla,xho-xho,0 +lre22_dev_nmmij,ara-ayl,4 +lre22_dev_nmrkv,fra-ntf,12 +lre22_dev_nmufp,tso-tso,10 +lre22_dev_nnbmo,tso-tso,10 +lre22_dev_nnnpi,afr-afr,4 +lre22_dev_nnzok,tir-tir,5 +lre22_dev_noqch,fra-ntf,12 +lre22_dev_nownd,xho-xho,2 +lre22_dev_npabl,nbl-nbl,5 +lre22_dev_npjhu,afr-afr,6 +lre22_dev_nqbks,afr-afr,11 +lre22_dev_nqijo,orm-orm,7 +lre22_dev_nqljj,ara-arq,6 +lre22_dev_nqvfr,tir-tir,7 +lre22_dev_nrtej,tir-tir,11 +lre22_dev_nshvj,nbl-nbl,7 +lre22_dev_nsmyy,tir-tir,12 +lre22_dev_nsqcm,fra-ntf,13 +lre22_dev_nstrj,nbl-nbl,9 +lre22_dev_nsvla,nbl-nbl,10 +lre22_dev_nthbx,eng-ens,0 +lre22_dev_nvwkf,ven-ven,0 +lre22_dev_nvwzy,tso-tso,11 +lre22_dev_nvyyg,orm-orm,7 +lre22_dev_nxdml,eng-ens,1 +lre22_dev_nxmxb,zul-zul,12 +lre22_dev_nxqpl,nbl-nbl,13 +lre22_dev_nxslf,fra-ntf,9 +lre22_dev_nyaof,nbl-nbl,5 +lre22_dev_nzeot,zul-zul,12 +lre22_dev_nzhhf,ara-ayl,7 +lre22_dev_nzpbh,fra-ntf,14 +lre22_dev_nzyjp,orm-orm,4 +lre22_dev_nzzyd,xho-xho,11 +lre22_dev_oaiij,ven-ven,7 +lre22_dev_oaimr,orm-orm,14 +lre22_dev_oatzl,fra-ntf,13 +lre22_dev_oaycx,ara-ayl,8 +lre22_dev_objwd,eng-ens,1 +lre22_dev_oboem,tir-tir,9 +lre22_dev_obzyj,xho-xho,5 +lre22_dev_occhn,fra-ntf,9 +lre22_dev_ocfcr,ven-ven,7 +lre22_dev_ochni,ven-ven,13 +lre22_dev_ociva,tir-tir,5 +lre22_dev_odofq,xho-xho,5 +lre22_dev_odtjr,eng-ens,11 +lre22_dev_oejjy,fra-ntf,4 +lre22_dev_offnw,afr-afr,8 +lre22_dev_ofgqs,ara-ayl,6 
+lre22_dev_ofkvj,xho-xho,15 +lre22_dev_ofzhh,orm-orm,11 +lre22_dev_ogilp,afr-afr,6 +lre22_dev_oglxd,ara-ayl,4 +lre22_dev_ogoyt,tso-tso,8 +lre22_dev_ogpou,ven-ven,3 +lre22_dev_ohatz,eng-ens,10 +lre22_dev_ohlzs,nbl-nbl,15 +lre22_dev_ohpzj,tir-tir,4 +lre22_dev_ohzdt,ara-aeb,5 +lre22_dev_oicrh,eng-ens,9 +lre22_dev_oigem,orm-orm,14 +lre22_dev_ojbnw,ara-arq,4 +lre22_dev_ojebm,ven-ven,7 +lre22_dev_ojila,ara-arq,4 +lre22_dev_ojiso,fra-ntf,5 +lre22_dev_ojpdy,tso-tso,9 +lre22_dev_ojtki,tir-tir,11 +lre22_dev_ojxso,nbl-nbl,4 +lre22_dev_okdqa,fra-ntf,14 +lre22_dev_oktvp,ara-ayl,7 +lre22_dev_okvsg,zul-zul,10 +lre22_dev_okyah,tso-tso,11 +lre22_dev_olabw,ara-arq,4 +lre22_dev_omhry,tir-tir,4 +lre22_dev_omnrf,eng-iaf,13 +lre22_dev_omptm,ven-ven,6 +lre22_dev_omqfq,fra-ntf,4 +lre22_dev_onqdn,fra-ntf,13 +lre22_dev_onsyx,tso-tso,9 +lre22_dev_onvgj,tir-tir,6 +lre22_dev_onzha,zul-zul,10 +lre22_dev_ooptw,nbl-nbl,5 +lre22_dev_oowvo,eng-ens,11 +lre22_dev_ooyea,tso-tso,2 +lre22_dev_oozri,ven-ven,0 +lre22_dev_opazz,ara-ayl,1 +lre22_dev_opqkl,nbl-nbl,11 +lre22_dev_oqsva,ara-ayl,2 +lre22_dev_oquxw,nbl-nbl,15 +lre22_dev_orktv,afr-afr,5 +lre22_dev_ornjf,ara-ayl,6 +lre22_dev_ortbp,ara-arq,0 +lre22_dev_osauy,fra-ntf,12 +lre22_dev_osnch,afr-afr,1 +lre22_dev_otelo,eng-iaf,7 +lre22_dev_otewx,tso-tso,10 +lre22_dev_otnwj,eng-ens,3 +lre22_dev_ouecw,ara-aeb,10 +lre22_dev_ouzui,ara-arq,3 +lre22_dev_ovdtj,ara-ayl,6 +lre22_dev_ovjny,tso-tso,1 +lre22_dev_ovqwp,ara-ayl,7 +lre22_dev_ovvkn,afr-afr,11 +lre22_dev_ovvmi,tso-tso,2 +lre22_dev_owyeq,ara-arq,6 +lre22_dev_oxlrt,ara-aeb,10 +lre22_dev_oybst,zul-zul,9 +lre22_dev_oybua,nbl-nbl,2 +lre22_dev_oykjs,tso-tso,4 +lre22_dev_oyswm,ara-arq,8 +lre22_dev_oyxbj,ven-ven,8 +lre22_dev_oyxtq,eng-ens,11 +lre22_dev_oyyxh,ara-arq,8 +lre22_dev_ozbct,tir-tir,12 +lre22_dev_ozcvt,ara-aeb,10 +lre22_dev_ozjel,ara-arq,10 +lre22_dev_ozmuj,zul-zul,3 +lre22_dev_ozuvk,tir-tir,10 +lre22_dev_paguh,fra-ntf,1 +lre22_dev_paspj,tir-tir,6 +lre22_dev_pbmai,fra-ntf,6 +lre22_dev_pbpug,zul-zul,10 +lre22_dev_pbsbs,tso-tso,10 +lre22_dev_pbszl,tso-tso,1 +lre22_dev_pbxxf,eng-iaf,2 +lre22_dev_pcgvn,eng-iaf,3 +lre22_dev_pcmbn,eng-ens,1 +lre22_dev_pcqce,ara-arq,8 +lre22_dev_pdlnr,tso-tso,2 +lre22_dev_pdrus,orm-orm,1 +lre22_dev_pedyx,eng-iaf,12 +lre22_dev_pegyr,nbl-nbl,11 +lre22_dev_pesej,ara-arq,4 +lre22_dev_pevhh,tir-tir,12 +lre22_dev_peykl,xho-xho,13 +lre22_dev_pezwc,tso-tso,4 +lre22_dev_pfemh,eng-iaf,4 +lre22_dev_pfrfc,ven-ven,8 +lre22_dev_pfsoa,nbl-nbl,15 +lre22_dev_pgeoo,tso-tso,9 +lre22_dev_pgwei,orm-orm,2 +lre22_dev_pgxyv,tso-tso,4 +lre22_dev_phofb,ara-ayl,8 +lre22_dev_phula,nbl-nbl,14 +lre22_dev_phwnf,tso-tso,9 +lre22_dev_pifyx,orm-orm,9 +lre22_dev_pilvp,tso-tso,11 +lre22_dev_pinzj,nbl-nbl,11 +lre22_dev_piocw,ara-aeb,8 +lre22_dev_pipas,zul-zul,13 +lre22_dev_pipgo,afr-afr,3 +lre22_dev_pitmn,ara-arq,10 +lre22_dev_pizdz,ara-aeb,2 +lre22_dev_pizlx,ara-ayl,6 +lre22_dev_pjatg,ven-ven,9 +lre22_dev_pjavt,orm-orm,11 +lre22_dev_pjcec,eng-iaf,12 +lre22_dev_pjdwy,afr-afr,1 +lre22_dev_pjlmw,ara-ayl,7 +lre22_dev_pjsqe,eng-ens,7 +lre22_dev_pkdij,ara-ayl,3 +lre22_dev_pkekq,ara-aeb,3 +lre22_dev_pkpst,eng-iaf,9 +lre22_dev_plhqb,nbl-nbl,13 +lre22_dev_plowv,nbl-nbl,5 +lre22_dev_plrjb,xho-xho,12 +lre22_dev_pmove,eng-iaf,4 +lre22_dev_pneax,eng-ens,11 +lre22_dev_pnexr,nbl-nbl,9 +lre22_dev_pngea,nbl-nbl,11 +lre22_dev_pnipe,eng-ens,9 +lre22_dev_pnmlr,ara-arq,5 +lre22_dev_pnsuk,xho-xho,2 +lre22_dev_pnuct,tir-tir,10 +lre22_dev_pocev,ara-arq,4 +lre22_dev_powkd,eng-ens,9 +lre22_dev_pprvm,ara-ayl,7 +lre22_dev_ppyle,ara-aeb,7 +lre22_dev_pqfda,fra-ntf,5 
+lre22_dev_pqryo,afr-afr,4 +lre22_dev_prrzc,afr-afr,9 +lre22_dev_psjuf,afr-afr,13 +lre22_dev_psngm,zul-zul,13 +lre22_dev_psroz,fra-ntf,13 +lre22_dev_pssqo,orm-orm,10 +lre22_dev_psvlh,fra-ntf,13 +lre22_dev_pswld,tir-tir,10 +lre22_dev_ptcns,nbl-nbl,11 +lre22_dev_ptobm,afr-afr,6 +lre22_dev_ptowg,tir-tir,8 +lre22_dev_ptreu,xho-xho,15 +lre22_dev_ptwru,fra-ntf,14 +lre22_dev_ptyff,ara-ayl,1 +lre22_dev_ptygm,tir-tir,3 +lre22_dev_pudne,ara-arq,4 +lre22_dev_puelp,zul-zul,9 +lre22_dev_purej,nbl-nbl,9 +lre22_dev_puyvb,ara-ayl,3 +lre22_dev_pvrdh,ara-aeb,9 +lre22_dev_pvryr,eng-ens,11 +lre22_dev_pwets,tir-tir,9 +lre22_dev_pwgnk,tir-tir,10 +lre22_dev_pwhyy,tir-tir,11 +lre22_dev_pwkgs,zul-zul,2 +lre22_dev_pwtdp,eng-iaf,0 +lre22_dev_pxccc,ara-ayl,5 +lre22_dev_pxpdo,xho-xho,14 +lre22_dev_pxsot,xho-xho,14 +lre22_dev_pxuhy,ara-aeb,6 +lre22_dev_pybxn,eng-iaf,11 +lre22_dev_pyoft,eng-iaf,12 +lre22_dev_pyvql,eng-iaf,7 +lre22_dev_pzcnz,nbl-nbl,2 +lre22_dev_pzhrk,ara-aeb,4 +lre22_dev_qadjy,ven-ven,7 +lre22_dev_qaeek,ven-ven,7 +lre22_dev_qafse,eng-iaf,11 +lre22_dev_qahft,ven-ven,13 +lre22_dev_qakoa,zul-zul,9 +lre22_dev_qalhd,ara-ayl,2 +lre22_dev_qazjh,ven-ven,11 +lre22_dev_qbfkw,eng-iaf,6 +lre22_dev_qbgcd,fra-ntf,14 +lre22_dev_qbisr,ara-ayl,3 +lre22_dev_qcnbm,ven-ven,3 +lre22_dev_qdcbb,tir-tir,5 +lre22_dev_qdfgi,zul-zul,12 +lre22_dev_qdmbj,eng-ens,4 +lre22_dev_qdwtg,fra-ntf,11 +lre22_dev_qefvt,ara-ayl,7 +lre22_dev_qffki,orm-orm,13 +lre22_dev_qfplk,tir-tir,8 +lre22_dev_qgxdl,xho-xho,14 +lre22_dev_qhadd,afr-afr,2 +lre22_dev_qhgaf,ara-ayl,7 +lre22_dev_qhinf,tir-tir,6 +lre22_dev_qhkjz,ara-aeb,6 +lre22_dev_qhlwj,ara-arq,8 +lre22_dev_qiarf,ara-arq,4 +lre22_dev_qidwl,ara-arq,5 +lre22_dev_qivzc,orm-orm,12 +lre22_dev_qizyt,ara-ayl,2 +lre22_dev_qjeue,ara-arq,9 +lre22_dev_qjgxh,ara-arq,1 +lre22_dev_qkdhb,afr-afr,1 +lre22_dev_qkiqi,orm-orm,4 +lre22_dev_qkoth,tir-tir,5 +lre22_dev_qkucq,fra-ntf,3 +lre22_dev_qltea,nbl-nbl,2 +lre22_dev_qlube,ara-aeb,5 +lre22_dev_qmcji,nbl-nbl,15 +lre22_dev_qmpzc,nbl-nbl,11 +lre22_dev_qmsog,tir-tir,3 +lre22_dev_qoech,eng-iaf,7 +lre22_dev_qovfg,ara-arq,10 +lre22_dev_qozzv,tir-tir,2 +lre22_dev_qpasx,tir-tir,3 +lre22_dev_qpauj,ara-aeb,4 +lre22_dev_qpfch,orm-orm,6 +lre22_dev_qpvea,orm-orm,9 +lre22_dev_qrgka,ara-arq,8 +lre22_dev_qrqmm,ara-ayl,7 +lre22_dev_qsaol,xho-xho,14 +lre22_dev_qsgpx,ara-arq,10 +lre22_dev_qspeg,eng-ens,7 +lre22_dev_qsvbe,fra-ntf,3 +lre22_dev_qsxoh,fra-ntf,5 +lre22_dev_qtbnc,xho-xho,7 +lre22_dev_qthzi,afr-afr,12 +lre22_dev_qtmaw,fra-ntf,13 +lre22_dev_qtnqh,eng-iaf,13 +lre22_dev_qtpsb,tso-tso,8 +lre22_dev_qtqpc,eng-iaf,12 +lre22_dev_qtwfv,eng-iaf,4 +lre22_dev_qvamq,fra-ntf,9 +lre22_dev_qveuq,tir-tir,9 +lre22_dev_qvffg,orm-orm,0 +lre22_dev_qvplf,xho-xho,6 +lre22_dev_qvqvi,ven-ven,7 +lre22_dev_qwhsh,afr-afr,7 +lre22_dev_qwiwm,eng-ens,9 +lre22_dev_qxbch,ara-aeb,9 +lre22_dev_qxlca,nbl-nbl,2 +lre22_dev_qxscb,afr-afr,2 +lre22_dev_qyoqn,fra-ntf,9 +lre22_dev_qyrgs,nbl-nbl,3 +lre22_dev_qytdl,fra-ntf,9 +lre22_dev_qyyeb,eng-iaf,12 +lre22_dev_qyzqb,tso-tso,8 +lre22_dev_qzayi,orm-orm,12 +lre22_dev_qzexr,eng-iaf,5 +lre22_dev_qzrfi,ara-arq,10 +lre22_dev_qztjh,orm-orm,3 +lre22_dev_qztze,eng-iaf,12 +lre22_dev_raent,eng-iaf,2 +lre22_dev_ragjh,orm-orm,14 +lre22_dev_ramzu,ara-ayl,6 +lre22_dev_ratmr,ven-ven,7 +lre22_dev_rawak,ara-arq,9 +lre22_dev_rbbne,ven-ven,7 +lre22_dev_rbcul,eng-iaf,10 +lre22_dev_rbsoy,eng-iaf,12 +lre22_dev_rbxqy,tso-tso,9 +lre22_dev_rcejf,xho-xho,7 +lre22_dev_rdbzt,zul-zul,7 +lre22_dev_rdhpu,ara-aeb,8 +lre22_dev_rdsew,ven-ven,2 +lre22_dev_rdtkf,ven-ven,11 +lre22_dev_reeba,ara-ayl,6 
+lre22_dev_relip,eng-iaf,11 +lre22_dev_rfdoh,ara-aeb,9 +lre22_dev_rfkja,xho-xho,11 +lre22_dev_rflev,ven-ven,3 +lre22_dev_rfqcx,nbl-nbl,14 +lre22_dev_rfwuv,eng-ens,1 +lre22_dev_rgsil,fra-ntf,6 +lre22_dev_rhcuj,ara-aeb,8 +lre22_dev_rhdgz,eng-iaf,12 +lre22_dev_rhpmn,ven-ven,7 +lre22_dev_rhtoe,eng-iaf,11 +lre22_dev_rhyqq,ara-aeb,2 +lre22_dev_riltn,ara-aeb,10 +lre22_dev_rinti,xho-xho,12 +lre22_dev_rioxh,xho-xho,12 +lre22_dev_ripix,tir-tir,10 +lre22_dev_rjbji,ven-ven,10 +lre22_dev_rjqbz,eng-iaf,0 +lre22_dev_rkemd,tir-tir,8 +lre22_dev_rktzl,nbl-nbl,13 +lre22_dev_rkuni,xho-xho,15 +lre22_dev_rlsgd,fra-ntf,5 +lre22_dev_rlypa,afr-afr,7 +lre22_dev_rmeav,ven-ven,8 +lre22_dev_rmejy,fra-ntf,12 +lre22_dev_rmeuz,zul-zul,6 +lre22_dev_rmjsj,nbl-nbl,5 +lre22_dev_rmtxj,eng-iaf,13 +lre22_dev_rnpyc,ara-ayl,2 +lre22_dev_rnunw,orm-orm,9 +lre22_dev_rnvvw,tso-tso,9 +lre22_dev_roavh,fra-ntf,6 +lre22_dev_rodbi,xho-xho,15 +lre22_dev_roeph,xho-xho,13 +lre22_dev_rolun,ara-ayl,3 +lre22_dev_roydh,xho-xho,7 +lre22_dev_rpajy,ara-aeb,8 +lre22_dev_rpdsm,ara-ayl,5 +lre22_dev_rpfae,afr-afr,9 +lre22_dev_rpvyc,eng-iaf,9 +lre22_dev_rqxot,tso-tso,9 +lre22_dev_rumiv,ara-aeb,9 +lre22_dev_runhh,afr-afr,6 +lre22_dev_ruvpd,eng-iaf,4 +lre22_dev_rvpkd,fra-ntf,1 +lre22_dev_rvqxq,orm-orm,12 +lre22_dev_rvstc,ara-arq,7 +lre22_dev_rwbea,tir-tir,9 +lre22_dev_rweyk,nbl-nbl,2 +lre22_dev_rwnfb,eng-ens,8 +lre22_dev_rwrhn,afr-afr,11 +lre22_dev_rxhkp,ara-arq,3 +lre22_dev_rxixz,nbl-nbl,15 +lre22_dev_rxmft,zul-zul,7 +lre22_dev_ryknh,ara-ayl,5 +lre22_dev_rytyf,zul-zul,12 +lre22_dev_rywss,tso-tso,1 +lre22_dev_rzjrd,nbl-nbl,7 +lre22_dev_rzpyx,tso-tso,2 +lre22_dev_satbk,ven-ven,7 +lre22_dev_sbfhc,fra-ntf,6 +lre22_dev_sboxi,xho-xho,15 +lre22_dev_scxxn,eng-iaf,5 +lre22_dev_scyvp,ara-aeb,6 +lre22_dev_sdbou,tir-tir,10 +lre22_dev_sddua,tir-tir,11 +lre22_dev_seasj,afr-afr,7 +lre22_dev_sevcw,tir-tir,12 +lre22_dev_sfevx,tso-tso,4 +lre22_dev_sfqgm,fra-ntf,1 +lre22_dev_sgaza,ara-aeb,8 +lre22_dev_sgkrh,afr-afr,9 +lre22_dev_sgmjh,nbl-nbl,14 +lre22_dev_shafn,ven-ven,8 +lre22_dev_shaob,orm-orm,10 +lre22_dev_shnns,afr-afr,6 +lre22_dev_siprc,ven-ven,7 +lre22_dev_sisge,afr-afr,13 +lre22_dev_siuwu,ara-arq,10 +lre22_dev_sivik,fra-ntf,2 +lre22_dev_sjyoo,afr-afr,1 +lre22_dev_skacz,fra-ntf,13 +lre22_dev_skcai,orm-orm,12 +lre22_dev_skctw,nbl-nbl,0 +lre22_dev_skygk,afr-afr,13 +lre22_dev_slraf,ara-aeb,6 +lre22_dev_slrzl,eng-ens,11 +lre22_dev_sltzh,xho-xho,6 +lre22_dev_sluki,ven-ven,1 +lre22_dev_slyez,tso-tso,8 +lre22_dev_slzuh,xho-xho,15 +lre22_dev_smdsm,nbl-nbl,7 +lre22_dev_smhae,ara-ayl,3 +lre22_dev_smxhe,ara-aeb,10 +lre22_dev_snayr,afr-afr,2 +lre22_dev_snbxs,eng-ens,8 +lre22_dev_sngol,tso-tso,9 +lre22_dev_snhun,fra-ntf,13 +lre22_dev_snkib,ven-ven,8 +lre22_dev_snqld,eng-iaf,2 +lre22_dev_sntvb,eng-ens,11 +lre22_dev_snzbl,tir-tir,12 +lre22_dev_sobid,afr-afr,3 +lre22_dev_soknx,orm-orm,15 +lre22_dev_spesw,ven-ven,13 +lre22_dev_sphuq,eng-iaf,12 +lre22_dev_spqcy,xho-xho,11 +lre22_dev_sqcyu,zul-zul,9 +lre22_dev_sqdkr,eng-iaf,13 +lre22_dev_sqfnt,ara-aeb,9 +lre22_dev_sqhrr,eng-ens,11 +lre22_dev_sqyiu,ara-ayl,4 +lre22_dev_srbwp,ara-aeb,10 +lre22_dev_srokn,afr-afr,6 +lre22_dev_srzck,ara-ayl,3 +lre22_dev_ssbei,tso-tso,10 +lre22_dev_ssfmz,eng-iaf,12 +lre22_dev_ssmgk,xho-xho,10 +lre22_dev_ssmsy,xho-xho,4 +lre22_dev_stgcb,afr-afr,10 +lre22_dev_stihb,afr-afr,0 +lre22_dev_stkav,ara-aeb,9 +lre22_dev_stkrw,xho-xho,3 +lre22_dev_sttnk,fra-ntf,8 +lre22_dev_stwkk,eng-iaf,12 +lre22_dev_stwrt,nbl-nbl,1 +lre22_dev_subio,afr-afr,1 +lre22_dev_sumjk,ara-arq,6 +lre22_dev_suocb,nbl-nbl,6 
+lre22_dev_svcbx,tso-tso,9 +lre22_dev_svllg,fra-ntf,14 +lre22_dev_svvqs,afr-afr,3 +lre22_dev_svxyz,ara-ayl,1 +lre22_dev_swhlf,ara-aeb,10 +lre22_dev_swhnk,fra-ntf,12 +lre22_dev_swnrg,ven-ven,12 +lre22_dev_swofz,zul-zul,4 +lre22_dev_swuls,tso-tso,8 +lre22_dev_sxfkn,ara-aeb,2 +lre22_dev_sycoz,tir-tir,10 +lre22_dev_syoek,fra-ntf,5 +lre22_dev_sypnb,ven-ven,13 +lre22_dev_syvrt,eng-iaf,8 +lre22_dev_szmoc,ven-ven,6 +lre22_dev_szmwp,eng-ens,8 +lre22_dev_talec,ven-ven,11 +lre22_dev_tasfs,ven-ven,7 +lre22_dev_tbbrr,xho-xho,5 +lre22_dev_tbcun,ara-aeb,3 +lre22_dev_tbhnw,nbl-nbl,15 +lre22_dev_tblhf,ven-ven,12 +lre22_dev_tbozq,xho-xho,1 +lre22_dev_tcckd,ara-ayl,3 +lre22_dev_tcele,tso-tso,11 +lre22_dev_tciob,tso-tso,10 +lre22_dev_tcpxj,tir-tir,9 +lre22_dev_tdejo,tir-tir,6 +lre22_dev_tdfqo,tso-tso,0 +lre22_dev_tdhhf,zul-zul,10 +lre22_dev_tdjje,ven-ven,10 +lre22_dev_tdkrp,orm-orm,6 +lre22_dev_tebop,tso-tso,10 +lre22_dev_teeqm,ven-ven,6 +lre22_dev_tejsn,tir-tir,12 +lre22_dev_teptc,ara-arq,10 +lre22_dev_tetmt,orm-orm,9 +lre22_dev_tfkij,ara-aeb,2 +lre22_dev_tfnin,tir-tir,3 +lre22_dev_tfyqz,tir-tir,3 +lre22_dev_tgbui,ara-aeb,5 +lre22_dev_tgixi,xho-xho,13 +lre22_dev_tgmud,eng-iaf,6 +lre22_dev_tgult,eng-ens,2 +lre22_dev_thcjv,tso-tso,5 +lre22_dev_thzir,eng-ens,11 +lre22_dev_tisfm,fra-ntf,9 +lre22_dev_tixou,xho-xho,2 +lre22_dev_tiyuw,afr-afr,5 +lre22_dev_tjdcc,afr-afr,13 +lre22_dev_tjikt,zul-zul,12 +lre22_dev_tjpdw,ara-arq,8 +lre22_dev_tkadi,ven-ven,12 +lre22_dev_tkcbm,afr-afr,6 +lre22_dev_tkgfw,eng-ens,11 +lre22_dev_tkiks,ara-aeb,6 +lre22_dev_tlgzi,xho-xho,1 +lre22_dev_tlhlw,tir-tir,6 +lre22_dev_tloqn,afr-afr,6 +lre22_dev_tmcje,eng-ens,4 +lre22_dev_tmjpw,eng-iaf,2 +lre22_dev_tmxtu,ven-ven,2 +lre22_dev_tngwh,tir-tir,8 +lre22_dev_tnqdv,ara-aeb,9 +lre22_dev_tnqro,xho-xho,15 +lre22_dev_tnqzy,orm-orm,7 +lre22_dev_tnskm,xho-xho,12 +lre22_dev_tnvhc,ven-ven,12 +lre22_dev_tofhy,zul-zul,6 +lre22_dev_tohkd,zul-zul,9 +lre22_dev_tonqb,ven-ven,6 +lre22_dev_tpbib,tso-tso,1 +lre22_dev_tpejq,ara-arq,3 +lre22_dev_tpfir,eng-ens,11 +lre22_dev_tphgn,zul-zul,12 +lre22_dev_tpidd,ara-arq,6 +lre22_dev_tpkce,eng-ens,11 +lre22_dev_tpszi,orm-orm,15 +lre22_dev_tpwcn,eng-iaf,6 +lre22_dev_trdfy,ara-ayl,3 +lre22_dev_tsbms,ara-ayl,4 +lre22_dev_tslui,tso-tso,6 +lre22_dev_tsvvy,zul-zul,10 +lre22_dev_tsyey,xho-xho,10 +lre22_dev_ttlco,eng-iaf,12 +lre22_dev_tubpr,orm-orm,13 +lre22_dev_tugpl,eng-ens,9 +lre22_dev_tuoiq,tir-tir,4 +lre22_dev_tuxfx,zul-zul,3 +lre22_dev_tvahj,tir-tir,9 +lre22_dev_tvewc,eng-iaf,3 +lre22_dev_tvfvc,ara-ayl,8 +lre22_dev_tvkod,xho-xho,5 +lre22_dev_tvkwe,zul-zul,9 +lre22_dev_tvopo,xho-xho,12 +lre22_dev_tvqui,eng-ens,7 +lre22_dev_tvsbw,ara-arq,6 +lre22_dev_tvxvk,ven-ven,8 +lre22_dev_twbkf,nbl-nbl,9 +lre22_dev_twfot,ara-arq,6 +lre22_dev_twkns,ara-ayl,4 +lre22_dev_twuvf,eng-ens,10 +lre22_dev_txahv,eng-ens,8 +lre22_dev_txcob,ara-aeb,6 +lre22_dev_txnvi,zul-zul,3 +lre22_dev_txurh,afr-afr,7 +lre22_dev_txzkl,ara-arq,5 +lre22_dev_tyfad,tso-tso,7 +lre22_dev_tyhwp,ara-aeb,8 +lre22_dev_tzism,tir-tir,12 +lre22_dev_tzsfj,tir-tir,12 +lre22_dev_tzwof,eng-iaf,9 +lre22_dev_uahzm,afr-afr,5 +lre22_dev_uajwt,tso-tso,7 +lre22_dev_uanlr,zul-zul,13 +lre22_dev_uaoju,zul-zul,8 +lre22_dev_uaryk,xho-xho,15 +lre22_dev_ubfaf,ven-ven,12 +lre22_dev_ucbje,ara-aeb,8 +lre22_dev_ucrpa,ara-arq,3 +lre22_dev_udtzx,eng-iaf,7 +lre22_dev_uduja,fra-ntf,6 +lre22_dev_udxpl,tso-tso,2 +lre22_dev_uesmx,eng-iaf,5 +lre22_dev_ufewk,eng-iaf,8 +lre22_dev_ugjxy,tir-tir,4 +lre22_dev_ugsxl,eng-ens,3 +lre22_dev_ugvov,tso-tso,8 +lre22_dev_uhmdw,tso-tso,10 +lre22_dev_uhqng,nbl-nbl,12 
+lre22_dev_uhymw,tir-tir,8 +lre22_dev_uhzmr,eng-ens,2 +lre22_dev_uimtg,ara-ayl,4 +lre22_dev_uirdr,nbl-nbl,13 +lre22_dev_uiszj,ara-aeb,8 +lre22_dev_ujada,ara-ayl,9 +lre22_dev_ujmqw,ven-ven,4 +lre22_dev_ujswr,afr-afr,11 +lre22_dev_ujvve,xho-xho,10 +lre22_dev_ukfha,ara-ayl,6 +lre22_dev_ukkpr,eng-ens,10 +lre22_dev_ukpdg,fra-ntf,13 +lre22_dev_ukpoy,nbl-nbl,15 +lre22_dev_uktod,ara-ayl,4 +lre22_dev_uktvh,zul-zul,13 +lre22_dev_ukuwo,ara-ayl,5 +lre22_dev_ukynv,zul-zul,12 +lre22_dev_ulepv,ara-ayl,5 +lre22_dev_ulgtj,zul-zul,7 +lre22_dev_ulofk,eng-iaf,11 +lre22_dev_uluog,ara-arq,3 +lre22_dev_umbpy,zul-zul,13 +lre22_dev_umjzo,tso-tso,5 +lre22_dev_uncdb,ara-arq,9 +lre22_dev_unffr,ara-ayl,8 +lre22_dev_unpif,eng-ens,9 +lre22_dev_uoikj,eng-iaf,13 +lre22_dev_uopfp,nbl-nbl,7 +lre22_dev_upenl,eng-iaf,13 +lre22_dev_uphuw,xho-xho,11 +lre22_dev_upkbw,ara-ayl,4 +lre22_dev_uplen,xho-xho,9 +lre22_dev_upqod,orm-orm,6 +lre22_dev_upspe,afr-afr,12 +lre22_dev_uqnkk,tir-tir,12 +lre22_dev_uqvxc,eng-ens,0 +lre22_dev_urgqx,ara-ayl,8 +lre22_dev_urkgk,tir-tir,12 +lre22_dev_uscky,xho-xho,3 +lre22_dev_usiwx,tir-tir,9 +lre22_dev_usnzj,zul-zul,5 +lre22_dev_usopt,xho-xho,8 +lre22_dev_uswgv,nbl-nbl,11 +lre22_dev_uszcb,ara-arq,4 +lre22_dev_utahf,ara-ayl,7 +lre22_dev_utaxq,tso-tso,9 +lre22_dev_utcwb,afr-afr,10 +lre22_dev_uuhry,tir-tir,9 +lre22_dev_uuprr,eng-ens,7 +lre22_dev_uuvqh,zul-zul,2 +lre22_dev_uwcmh,orm-orm,4 +lre22_dev_uwiev,zul-zul,13 +lre22_dev_uwjzb,ven-ven,10 +lre22_dev_uwony,orm-orm,1 +lre22_dev_uwqeq,orm-orm,2 +lre22_dev_uwvfl,nbl-nbl,5 +lre22_dev_uxdjn,xho-xho,12 +lre22_dev_uxqte,zul-zul,13 +lre22_dev_uxryh,ven-ven,11 +lre22_dev_uyhzp,orm-orm,15 +lre22_dev_uyrjl,tso-tso,10 +lre22_dev_uyzcl,eng-ens,11 +lre22_dev_uzbqz,fra-ntf,4 +lre22_dev_uzoxq,ara-aeb,9 +lre22_dev_vabxl,nbl-nbl,11 +lre22_dev_vafyo,nbl-nbl,15 +lre22_dev_vascl,nbl-nbl,0 +lre22_dev_vauqx,ara-arq,10 +lre22_dev_vbscm,xho-xho,3 +lre22_dev_vbulh,xho-xho,12 +lre22_dev_vbwwp,xho-xho,15 +lre22_dev_vbznk,ara-arq,6 +lre22_dev_vcibu,nbl-nbl,9 +lre22_dev_vcjun,zul-zul,12 +lre22_dev_vckxt,xho-xho,7 +lre22_dev_vdkjy,fra-ntf,14 +lre22_dev_vdmyt,ara-ayl,0 +lre22_dev_vdoif,ven-ven,13 +lre22_dev_vdvjv,orm-orm,12 +lre22_dev_vebet,ara-aeb,1 +lre22_dev_velkr,ara-aeb,1 +lre22_dev_vgbmm,tir-tir,9 +lre22_dev_vgucw,nbl-nbl,7 +lre22_dev_vhiyb,afr-afr,9 +lre22_dev_vhoej,tir-tir,5 +lre22_dev_vhryd,orm-orm,13 +lre22_dev_vhzdh,tso-tso,10 +lre22_dev_viapx,tso-tso,3 +lre22_dev_vifdj,ara-ayl,4 +lre22_dev_vijbo,zul-zul,12 +lre22_dev_virnr,eng-ens,6 +lre22_dev_vjhbd,orm-orm,6 +lre22_dev_vjoca,ara-aeb,10 +lre22_dev_vjtou,eng-ens,5 +lre22_dev_vjxpv,ara-aeb,10 +lre22_dev_vkmab,fra-ntf,2 +lre22_dev_vkrvz,tir-tir,8 +lre22_dev_vkwwf,tso-tso,9 +lre22_dev_vlbdk,zul-zul,6 +lre22_dev_vliie,orm-orm,9 +lre22_dev_vlrve,eng-iaf,2 +lre22_dev_vmaet,tir-tir,3 +lre22_dev_vmdhi,eng-ens,10 +lre22_dev_vmdjw,nbl-nbl,13 +lre22_dev_vmjut,fra-ntf,9 +lre22_dev_vmrrg,eng-ens,3 +lre22_dev_vnjxn,nbl-nbl,7 +lre22_dev_vnmxm,ven-ven,12 +lre22_dev_vnykj,zul-zul,10 +lre22_dev_vovab,zul-zul,11 +lre22_dev_vovvl,zul-zul,11 +lre22_dev_vpcey,tir-tir,6 +lre22_dev_vpodd,nbl-nbl,11 +lre22_dev_vptke,eng-ens,4 +lre22_dev_vpulr,xho-xho,15 +lre22_dev_vpuve,tir-tir,8 +lre22_dev_vqttr,eng-iaf,12 +lre22_dev_vqzae,eng-iaf,11 +lre22_dev_vrnsg,tso-tso,8 +lre22_dev_vshpc,ara-aeb,6 +lre22_dev_vslbh,ara-arq,9 +lre22_dev_vsmaz,tir-tir,5 +lre22_dev_vsnez,tso-tso,8 +lre22_dev_vsnjp,fra-ntf,14 +lre22_dev_vsocn,ven-ven,7 +lre22_dev_vsvom,afr-afr,8 +lre22_dev_vtnfc,tir-tir,4 +lre22_dev_vtnlb,eng-ens,4 +lre22_dev_vubwb,eng-ens,8 
+lre22_dev_vufsn,ara-aeb,3 +lre22_dev_vuiqu,tir-tir,8 +lre22_dev_vumeq,xho-xho,0 +lre22_dev_vupse,ven-ven,6 +lre22_dev_vvauz,xho-xho,14 +lre22_dev_vvfze,eng-ens,11 +lre22_dev_vviyr,zul-zul,12 +lre22_dev_vvwiq,fra-ntf,5 +lre22_dev_vwnkj,zul-zul,5 +lre22_dev_vwoww,orm-orm,7 +lre22_dev_vwtne,afr-afr,5 +lre22_dev_vwxgt,ara-arq,10 +lre22_dev_vxabl,eng-ens,8 +lre22_dev_vxnsl,afr-afr,7 +lre22_dev_vxslj,tir-tir,10 +lre22_dev_vxsvc,tir-tir,11 +lre22_dev_vxuiz,ara-aeb,10 +lre22_dev_vzarl,ara-ayl,7 +lre22_dev_vzeew,ven-ven,6 +lre22_dev_vzjtc,ara-arq,0 +lre22_dev_vzkdb,tso-tso,10 +lre22_dev_vzvpq,ara-arq,9 +lre22_dev_waqyh,xho-xho,15 +lre22_dev_wawwu,xho-xho,14 +lre22_dev_wbgqi,tso-tso,11 +lre22_dev_wcctp,eng-ens,10 +lre22_dev_wdcer,afr-afr,3 +lre22_dev_wdeor,fra-ntf,14 +lre22_dev_wdfdd,eng-iaf,2 +lre22_dev_wdkvb,eng-ens,11 +lre22_dev_wdogx,ara-aeb,7 +lre22_dev_wdqdq,ara-arq,10 +lre22_dev_wdxwu,tir-tir,5 +lre22_dev_weaek,ara-arq,4 +lre22_dev_wefui,tso-tso,10 +lre22_dev_wehjh,tir-tir,10 +lre22_dev_weypz,nbl-nbl,12 +lre22_dev_wffdy,zul-zul,12 +lre22_dev_wffgq,tso-tso,8 +lre22_dev_wfvlh,ven-ven,8 +lre22_dev_wgago,eng-ens,5 +lre22_dev_wglzd,afr-afr,11 +lre22_dev_wgsbu,afr-afr,5 +lre22_dev_whdhw,nbl-nbl,7 +lre22_dev_whogu,eng-iaf,13 +lre22_dev_whpee,tso-tso,9 +lre22_dev_whqpd,ara-aeb,9 +lre22_dev_wikrr,ven-ven,11 +lre22_dev_witju,fra-ntf,11 +lre22_dev_wjcme,orm-orm,10 +lre22_dev_wkare,ara-arq,2 +lre22_dev_wkbfe,afr-afr,9 +lre22_dev_wkecn,xho-xho,13 +lre22_dev_wkhxo,afr-afr,9 +lre22_dev_wlgae,ara-arq,6 +lre22_dev_wlnls,eng-iaf,7 +lre22_dev_wlsxb,eng-ens,1 +lre22_dev_wlwuc,nbl-nbl,8 +lre22_dev_wnaqr,nbl-nbl,9 +lre22_dev_wndpq,fra-ntf,13 +lre22_dev_wnkdc,ara-ayl,2 +lre22_dev_wnknc,nbl-nbl,9 +lre22_dev_wnppz,orm-orm,15 +lre22_dev_wpzgm,afr-afr,13 +lre22_dev_wqhqj,ara-ayl,9 +lre22_dev_wqreb,afr-afr,11 +lre22_dev_wqrez,eng-ens,4 +lre22_dev_wqtsf,ara-arq,8 +lre22_dev_wqwtc,orm-orm,3 +lre22_dev_wrfwf,ven-ven,7 +lre22_dev_wrqqt,orm-orm,15 +lre22_dev_wrutf,afr-afr,7 +lre22_dev_wrvzk,nbl-nbl,1 +lre22_dev_wrxly,fra-ntf,13 +lre22_dev_wsbiw,ara-aeb,8 +lre22_dev_wshay,zul-zul,8 +lre22_dev_wsous,tso-tso,5 +lre22_dev_wszpj,ven-ven,7 +lre22_dev_wtksi,afr-afr,8 +lre22_dev_wugbw,xho-xho,6 +lre22_dev_wujfv,afr-afr,11 +lre22_dev_wuwek,xho-xho,12 +lre22_dev_wvhhk,fra-ntf,2 +lre22_dev_wvosz,nbl-nbl,3 +lre22_dev_wwagu,xho-xho,14 +lre22_dev_wwbuj,eng-iaf,2 +lre22_dev_wwgnr,afr-afr,10 +lre22_dev_wwjev,afr-afr,12 +lre22_dev_wwmsu,ara-arq,4 +lre22_dev_wwrmy,ven-ven,7 +lre22_dev_wwvhd,ara-arq,9 +lre22_dev_wxdjv,ara-ayl,6 +lre22_dev_wygox,tir-tir,6 +lre22_dev_wyhuq,zul-zul,13 +lre22_dev_wzoir,xho-xho,15 +lre22_dev_wzvwa,orm-orm,6 +lre22_dev_xapvn,tso-tso,8 +lre22_dev_xarkl,eng-ens,5 +lre22_dev_xavhh,nbl-nbl,10 +lre22_dev_xazuy,orm-orm,3 +lre22_dev_xbnft,eng-iaf,0 +lre22_dev_xbqbc,fra-ntf,7 +lre22_dev_xbzfw,tir-tir,11 +lre22_dev_xccde,ara-arq,3 +lre22_dev_xcdty,zul-zul,8 +lre22_dev_xcjkb,ara-ayl,7 +lre22_dev_xcmty,ara-arq,10 +lre22_dev_xcsbc,tso-tso,1 +lre22_dev_xdkjb,nbl-nbl,11 +lre22_dev_xdknq,nbl-nbl,11 +lre22_dev_xdoik,eng-ens,10 +lre22_dev_xdtyd,nbl-nbl,4 +lre22_dev_xearl,eng-iaf,3 +lre22_dev_xedqa,nbl-nbl,11 +lre22_dev_xefnx,eng-ens,11 +lre22_dev_xeipr,tir-tir,11 +lre22_dev_xekhs,zul-zul,9 +lre22_dev_xelzr,ara-aeb,9 +lre22_dev_xenhb,ara-aeb,3 +lre22_dev_xfdsx,xho-xho,12 +lre22_dev_xfggl,xho-xho,9 +lre22_dev_xgspz,eng-iaf,13 +lre22_dev_xgwmu,tso-tso,8 +lre22_dev_xhbmk,orm-orm,15 +lre22_dev_xhdtl,orm-orm,3 +lre22_dev_xisjn,ara-arq,8 +lre22_dev_xitdz,nbl-nbl,10 +lre22_dev_xizbg,xho-xho,14 +lre22_dev_xjcph,xho-xho,10 
+lre22_dev_xjcvd,zul-zul,7 +lre22_dev_xjlgm,ara-aeb,3 +lre22_dev_xjxzy,eng-ens,2 +lre22_dev_xkfsd,ven-ven,12 +lre22_dev_xkktj,eng-iaf,12 +lre22_dev_xkmmy,ara-aeb,10 +lre22_dev_xltgz,ara-ayl,5 +lre22_dev_xmbby,orm-orm,3 +lre22_dev_xmcmv,xho-xho,14 +lre22_dev_xngam,fra-ntf,14 +lre22_dev_xnsev,ara-ayl,8 +lre22_dev_xnwsq,ara-arq,8 +lre22_dev_xnwwh,zul-zul,13 +lre22_dev_xobeh,tir-tir,11 +lre22_dev_xolau,ven-ven,13 +lre22_dev_xoqtn,eng-iaf,10 +lre22_dev_xovpd,eng-iaf,10 +lre22_dev_xpaff,eng-ens,9 +lre22_dev_xpahm,ara-arq,4 +lre22_dev_xpcrs,tso-tso,5 +lre22_dev_xpdsg,eng-iaf,5 +lre22_dev_xpjqj,nbl-nbl,6 +lre22_dev_xqwtk,ara-arq,10 +lre22_dev_xrfge,ara-arq,8 +lre22_dev_xrhka,orm-orm,9 +lre22_dev_xrpup,zul-zul,8 +lre22_dev_xsbff,ara-aeb,9 +lre22_dev_xsffv,tso-tso,1 +lre22_dev_xstnu,eng-ens,5 +lre22_dev_xthfd,ara-aeb,8 +lre22_dev_xthzz,ven-ven,4 +lre22_dev_xtmgg,eng-iaf,13 +lre22_dev_xtyic,nbl-nbl,14 +lre22_dev_xucyl,eng-ens,7 +lre22_dev_xudii,ara-ayl,3 +lre22_dev_xugux,afr-afr,0 +lre22_dev_xuqnj,ara-ayl,4 +lre22_dev_xvaoh,nbl-nbl,9 +lre22_dev_xvclh,afr-afr,9 +lre22_dev_xveae,xho-xho,4 +lre22_dev_xxpqz,ara-arq,9 +lre22_dev_xxqad,tso-tso,10 +lre22_dev_xybed,tir-tir,9 +lre22_dev_xyrex,eng-ens,11 +lre22_dev_xzlas,eng-iaf,9 +lre22_dev_xztyr,orm-orm,9 +lre22_dev_yaxkb,zul-zul,12 +lre22_dev_ybcvu,xho-xho,13 +lre22_dev_ybjon,orm-orm,2 +lre22_dev_ybubm,ven-ven,5 +lre22_dev_ycarc,eng-ens,6 +lre22_dev_ychjj,orm-orm,2 +lre22_dev_ycnyc,tir-tir,7 +lre22_dev_ycsvt,afr-afr,12 +lre22_dev_ydaxa,nbl-nbl,8 +lre22_dev_ydrxu,nbl-nbl,1 +lre22_dev_yeekw,fra-ntf,13 +lre22_dev_yevan,tir-tir,11 +lre22_dev_yfaan,tir-tir,10 +lre22_dev_yfayx,afr-afr,6 +lre22_dev_yfpsd,fra-ntf,1 +lre22_dev_yfxkm,ven-ven,7 +lre22_dev_yguqk,ven-ven,3 +lre22_dev_yhrgj,afr-afr,8 +lre22_dev_yhzyq,ara-ayl,5 +lre22_dev_yiqui,eng-iaf,12 +lre22_dev_yjens,ara-ayl,7 +lre22_dev_yjkxx,eng-ens,8 +lre22_dev_yjypk,ara-ayl,9 +lre22_dev_ykchd,ven-ven,8 +lre22_dev_ykktl,xho-xho,0 +lre22_dev_ylhwh,orm-orm,9 +lre22_dev_ylnms,tso-tso,2 +lre22_dev_ylsdz,ven-ven,7 +lre22_dev_ymcmp,eng-iaf,8 +lre22_dev_ymfzx,tso-tso,7 +lre22_dev_ymizm,fra-ntf,0 +lre22_dev_ympvj,tir-tir,9 +lre22_dev_ymslh,tir-tir,12 +lre22_dev_ynavg,zul-zul,9 +lre22_dev_ynhlk,tir-tir,9 +lre22_dev_ynnkb,eng-ens,10 +lre22_dev_yogkc,fra-ntf,7 +lre22_dev_yokld,eng-ens,4 +lre22_dev_yokve,tir-tir,6 +lre22_dev_yomdz,ara-ayl,6 +lre22_dev_yomuu,xho-xho,12 +lre22_dev_yoobm,ara-ayl,8 +lre22_dev_yoocz,eng-ens,10 +lre22_dev_yopyf,eng-iaf,5 +lre22_dev_yoxoc,tir-tir,8 +lre22_dev_ypaem,afr-afr,5 +lre22_dev_ypamp,afr-afr,7 +lre22_dev_ypjpq,tir-tir,8 +lre22_dev_yplba,ara-arq,9 +lre22_dev_ypnrh,fra-ntf,1 +lre22_dev_ypqfg,eng-ens,7 +lre22_dev_yrdsl,eng-ens,2 +lre22_dev_yrtkv,afr-afr,7 +lre22_dev_yrwrb,nbl-nbl,9 +lre22_dev_ysmlk,eng-ens,11 +lre22_dev_yspja,orm-orm,5 +lre22_dev_ytfnn,fra-ntf,14 +lre22_dev_yturp,ara-aeb,6 +lre22_dev_ytvbd,afr-afr,4 +lre22_dev_yuhvo,tso-tso,8 +lre22_dev_yundi,ara-arq,3 +lre22_dev_yvmnx,ara-arq,10 +lre22_dev_yvqud,xho-xho,15 +lre22_dev_yvxdd,ara-ayl,4 +lre22_dev_ywjtq,xho-xho,5 +lre22_dev_ywnza,fra-ntf,12 +lre22_dev_yxnno,tso-tso,10 +lre22_dev_yxoww,tir-tir,7 +lre22_dev_yxpgi,ara-arq,5 +lre22_dev_yxsta,eng-ens,7 +lre22_dev_yyltz,xho-xho,8 +lre22_dev_yyqqx,fra-ntf,12 +lre22_dev_yzloh,ara-ayl,7 +lre22_dev_zacdy,ara-ayl,3 +lre22_dev_zadkk,tir-tir,9 +lre22_dev_zalpc,afr-afr,6 +lre22_dev_zarod,orm-orm,8 +lre22_dev_zasvb,afr-afr,11 +lre22_dev_zazom,ara-arq,9 +lre22_dev_zbfqk,afr-afr,13 +lre22_dev_zbqew,tso-tso,2 +lre22_dev_zbrkn,eng-ens,7 +lre22_dev_zbubp,zul-zul,9 +lre22_dev_zbytc,ara-arq,8 
+lre22_dev_zcfns,tir-tir,6 +lre22_dev_zcfzk,afr-afr,7 +lre22_dev_zcrgv,ara-arq,10 +lre22_dev_zdxdn,ara-ayl,7 +lre22_dev_zdydi,eng-ens,1 +lre22_dev_zebzq,ven-ven,4 +lre22_dev_zedlk,xho-xho,14 +lre22_dev_zeqpp,tir-tir,12 +lre22_dev_zfjbm,ara-arq,10 +lre22_dev_zfkne,nbl-nbl,13 +lre22_dev_zflnr,ven-ven,13 +lre22_dev_zfoyd,xho-xho,4 +lre22_dev_zgdyu,eng-iaf,8 +lre22_dev_zgmja,zul-zul,9 +lre22_dev_zgvfs,ara-arq,6 +lre22_dev_zhmud,orm-orm,14 +lre22_dev_zhoml,tso-tso,9 +lre22_dev_zijcb,xho-xho,10 +lre22_dev_ziktm,ara-aeb,10 +lre22_dev_zipxy,ara-arq,9 +lre22_dev_ziqxc,eng-iaf,1 +lre22_dev_zjhir,ven-ven,7 +lre22_dev_zjmqp,orm-orm,13 +lre22_dev_zjrrk,tso-tso,11 +lre22_dev_zjtwd,ara-aeb,3 +lre22_dev_zkfcf,xho-xho,6 +lre22_dev_zkftc,nbl-nbl,4 +lre22_dev_zkqei,ara-ayl,7 +lre22_dev_zkwqo,zul-zul,11 +lre22_dev_zlamn,nbl-nbl,6 +lre22_dev_zlbor,xho-xho,14 +lre22_dev_zloet,ven-ven,8 +lre22_dev_zlvhk,zul-zul,5 +lre22_dev_zlzqv,fra-ntf,12 +lre22_dev_zmobq,ara-ayl,7 +lre22_dev_zmuiv,zul-zul,9 +lre22_dev_znvqw,zul-zul,4 +lre22_dev_znzuu,tir-tir,0 +lre22_dev_zoava,eng-iaf,6 +lre22_dev_zodvu,tso-tso,0 +lre22_dev_zosdw,nbl-nbl,15 +lre22_dev_zpnvq,xho-xho,6 +lre22_dev_zqeby,eng-iaf,12 +lre22_dev_zqgdd,nbl-nbl,9 +lre22_dev_zqhaw,nbl-nbl,5 +lre22_dev_zqkau,orm-orm,8 +lre22_dev_zqkel,ara-ayl,9 +lre22_dev_zqlnd,ara-aeb,8 +lre22_dev_zrnpw,orm-orm,8 +lre22_dev_zrqvc,afr-afr,9 +lre22_dev_zrrgq,ven-ven,8 +lre22_dev_zryit,zul-zul,8 +lre22_dev_zsckt,zul-zul,4 +lre22_dev_zucqq,orm-orm,4 +lre22_dev_zusln,orm-orm,11 +lre22_dev_zuxzw,tir-tir,0 +lre22_dev_zvabs,tir-tir,11 +lre22_dev_zvlid,tso-tso,11 +lre22_dev_zvned,eng-iaf,5 +lre22_dev_zvtwr,xho-xho,11 +lre22_dev_zwmim,orm-orm,11 +lre22_dev_zwnsu,ara-arq,8 +lre22_dev_zwtxn,ara-arq,10 +lre22_dev_zxfcm,orm-orm,3 +lre22_dev_zxsgm,tir-tir,5 +lre22_dev_zybya,eng-iaf,10 +lre22_dev_zygak,zul-zul,1 +lre22_dev_zylqc,eng-ens,3 +lre22_dev_zyppc,fra-ntf,8 +lre22_dev_zywem,eng-ens,8 +lre22_dev_zzapx,ara-ayl,5 +lre22_dev_zzumc,ara-arq,2 +lre22_dev_zzvdl,fra-ntf,5 +lre22_dev_zzvjv,nbl-nbl,14 diff --git a/egs/lre22/fixed.v1.8k/resources/lre17_ara-ary/segs_ara-ary.csv b/egs/lre22/fixed.v1.8k/resources/lre17_ara-ary/segs_ara-ary.csv new file mode 100644 index 00000000..4f5caa4d --- /dev/null +++ b/egs/lre22/fixed.v1.8k/resources/lre17_ara-ary/segs_ara-ary.csv @@ -0,0 +1,1306 @@ +id,class_id,logp +20110112_085632_25-a.sph,ara-arq,0.9999215882183581 +20110112_085632_25-b.sph,ara-arq,0.9933264028811798 +20110112_093821_26-a.sph,ara-arq,0.9982419072530201 +20110112_093821_26-b.sph,ara-arq,0.9877989962861538 +20110112_100739_27-a.sph,ara-arq,0.9998601825318931 +20110112_100739_27-b.sph,ara-arq,0.9998461026324816 +20110112_102931_28-a.sph,ara-arq,0.9988240996203235 +20110112_102931_28-b.sph,ara-arq,0.9992945069664346 +20110112_110035_29-a.sph,ara-arq,0.9956050254373241 +20110112_110035_29-b.sph,ara-arq,0.9998179655506749 +20110112_120034_30-a.sph,ara-arq,0.9999956254632509 +20110112_120034_30-b.sph,ara-arq,0.9999961650306969 +20110112_121837_31-a.sph,ara-arq,0.9992484722468692 +20110112_121837_31-b.sph,ara-arq,0.9988521768999281 +20110112_125124_32-a.sph,ara-arq,0.9992562768302394 +20110112_125124_32-b.sph,ara-arq,0.9965659470692162 +20110112_131159_33-a.sph,ara-arq,0.9999484673386585 +20110112_131159_33-b.sph,ara-arq,0.999988595598022 +20110112_135057_34-a.sph,ara-arq,0.9964035382836839 +20110112_140409_35-b.sph,ara-arq,0.9993163776118849 +20110112_143151_37-a.sph,ara-arq,0.9978091561892594 +20110112_144321_38-a.sph,ara-arq,0.9882030389155663 +20110112_151915_40-a.sph,ara-arq,0.9999863814255752 
+20110112_151915_40-b.sph,ara-arq,0.9999867872908822 +20110112_164438_41-a.sph,ara-arq,0.9911464286505152 +20110112_164438_41-b.sph,ara-arq,0.9982784286981317 +20110112_170310_42-b.sph,ara-arq,0.9957287722811907 +20110112_174334_44-a.sph,ara-arq,0.9994637389176244 +20110112_174334_44-b.sph,ara-arq,0.9999979913404728 +20110112_175917_46-a.sph,ara-arq,0.999896350765443 +20110112_175917_46-b.sph,ara-arq,0.9992906517833624 +20110112_181316_47-a.sph,ara-arq,0.9865135533386489 +20110112_181316_47-b.sph,ara-arq,0.9799283164010801 +20110112_184303_48-a.sph,ara-arq,0.9999941253436267 +20110112_184303_48-b.sph,ara-arq,0.9894727409462367 +20110112_185018_49-a.sph,ara-arq,0.9993235515178335 +20110112_185018_49-b.sph,ara-arq,0.9964110986149859 +20110112_190919_51-a.sph,ara-arq,0.9975136717392243 +20110112_195355_54-a.sph,ara-arq,0.9994973714549525 +20110112_210716_56-b.sph,ara-arq,0.9999953505624561 +20110113_154325_58-a.sph,ara-arq,0.9999971862717217 +20110113_155707_60-a.sph,ara-aeb,0.999254762801491 +20110113_155707_60-b.sph,ara-arq,0.9999588882350571 +20110113_160907_61-a.sph,ara-arq,0.9999960880781135 +20110113_160907_61-b.sph,ara-arq,0.9999863740819315 +20110113_210803_66-a.sph,ara-arq,0.9973323794741106 +20110113_210803_66-b.sph,ara-arq,0.9989444092443852 +20110114_010743_69-a.sph,ara-arq,0.9999099284602719 +20110114_010743_69-b.sph,ara-arq,0.9763890642555946 +20110114_132253_70-a.sph,ara-arq,0.9999970659217092 +20110114_170901_71-a.sph,ara-arq,0.9999919868727419 +20110114_170901_71-b.sph,ara-arq,0.9854190368540645 +20110114_174847_72-a.sph,ara-arq,0.9999794141191631 +20110114_174847_72-b.sph,ara-arq,0.9999942469709167 +20110115_083054_73-a.sph,ara-arq,0.999835593793634 +20110115_083054_73-b.sph,ara-arq,0.9874468440342952 +20110115_090248_75-a.sph,ara-arq,0.9999406720119036 +20110115_090248_75-b.sph,ara-arq,0.992340537440409 +20110115_093602_76-a.sph,ara-arq,0.999989554911121 +20110115_093602_76-b.sph,ara-arq,0.9999977173414476 +20110115_094928_77-a.sph,ara-arq,0.9978338195939027 +20110115_094928_77-b.sph,ara-arq,0.9999543347173833 +20110115_101940_80-a.sph,ara-arq,0.9999919712722404 +20110115_101940_80-b.sph,ara-arq,0.9999936335315381 +20110115_114622_88-a.sph,ara-arq,0.9994230181724559 +20110115_114622_88-b.sph,ara-arq,0.9987494039544832 +20110115_115414_89-a.sph,ara-arq,0.9997873068050884 +20110115_115414_89-b.sph,ara-arq,0.9996879869582883 +20110115_120333_90-a.sph,ara-arq,0.9999911829644114 +20110115_120333_90-b.sph,ara-aeb,0.9907357013205295 +20110115_154229_93-a.sph,ara-arq,0.9999908976134848 +20110115_160534_94-a.sph,ara-arq,0.9999941336401025 +20110115_160534_94-b.sph,ara-arq,0.9970317402712736 +20110115_170405_96-a.sph,ara-arq,0.9991143532054212 +20110115_170405_96-b.sph,ara-arq,0.9996419863942765 +20110115_172633_97-a.sph,ara-arq,0.983493662520541 +20110115_172633_97-b.sph,ara-arq,0.9999960891934105 +20110115_173918_98-b.sph,ara-arq,0.9899208181081557 +20110116_212111_99-a.sph,ara-arq,0.9998892280312053 +20110118_122005_102-a.sph,ara-arq,0.9999988862765399 +20110118_122005_102-b.sph,ara-arq,0.9909318482173783 +20110118_154651_104-a.sph,ara-arq,0.999987636189934 +20110119_101115_108-a.sph,ara-arq,0.9999649201104261 +20110119_101115_108-b.sph,ara-arq,0.9970793271378511 +20110119_103907_109-a.sph,ara-arq,0.999982064110703 +20110119_103907_109-b.sph,ara-arq,0.9930181910779016 +20110119_123138_110-b.sph,ara-arq,0.9865006019221569 +20110119_130923_111-a.sph,ara-arq,0.997152167078472 +20110119_130923_111-b.sph,ara-arq,0.9981643655714929 +20110119_131501_113-b.sph,ara-arq,0.9999035286024007 
+20110119_162158_114-a.sph,ara-arq,0.9999999535234435 +20110119_162158_114-b.sph,ara-arq,0.9972175314415649 +20110119_164045_115-a.sph,ara-arq,0.999995752838363 +20110119_164045_115-b.sph,ara-arq,0.9997628034439311 +20110119_185412_118-a.sph,ara-arq,0.9998202289146529 +20110119_185412_118-b.sph,ara-arq,0.9984931074535468 +20110119_191933_119-a.sph,ara-arq,0.9998427532239414 +20110119_191933_119-b.sph,ara-arq,0.9999158034849497 +20110120_063303_126-a.sph,ara-aeb,0.9983408495530686 +20110120_065333_127-a.sph,ara-arq,0.9992096176188296 +20110120_065333_127-b.sph,ara-arq,0.9999999917747343 +20110120_103241_131-a.sph,ara-arq,0.9993182345500153 +20110121_133744_171-a.sph,ara-arq,0.9999999215696308 +20110121_135108_172-a.sph,ara-arq,0.9997247255224011 +20110121_135108_172-b.sph,ara-arq,0.9948115916234687 +20110121_150759_174-a.sph,ara-arq,0.9999584736982609 +20110121_205639_189-a.sph,ara-arq,0.9999897466147079 +20110121_205639_189-b.sph,ara-arq,0.9999990343864227 +20110122_102217_196-a.sph,ara-arq,0.980904081808585 +20110122_182307_237-b.sph,ara-arq,0.9999628382800599 +20110122_213252_254-a.sph,ara-aeb,0.9977468847806278 +20110123_065916_259-a.sph,ara-arq,0.9998361236710239 +20110123_065916_259-b.sph,ara-arq,0.9999198626609019 +20110123_082139_260-a.sph,ara-arq,0.9998122368739342 +20110123_091452_261-a.sph,ara-arq,0.9937530884216169 +20110124_150410_307-b.sph,ara-arq,0.9998523760407785 +20110124_160331_310-a.sph,ara-arq,0.9999119580552518 +20110124_160331_310-b.sph,ara-arq,0.9996355765737956 +20110126_231521_427-a.sph,ara-arq,0.9999990527242926 +20110126_231521_427-b.sph,ara-arq,0.999082522282352 +20110126_233137_428-a.sph,ara-arq,0.9999980559906911 +20110126_233137_428-b.sph,ara-arq,0.999549593523283 +20110127_113123_434-a.sph,ara-arq,0.9998900000422434 +20110127_133351_443-a.sph,ara-arq,0.9999924768392251 +20110127_133351_443-b.sph,ara-arq,0.9984673062603949 +20110127_200135_452-a.sph,ara-arq,0.9995916079616751 +20110127_200135_452-b.sph,ara-arq,0.9928940070693326 +20110127_201455_453-a.sph,ara-arq,0.9999998409583539 +20110127_201455_453-b.sph,ara-arq,0.9917325527348603 +20110127_211633_454-a.sph,ara-arq,0.9999252600002078 +20110128_182748_472-a.sph,ara-arq,0.9999937657077407 +20110128_182748_472-b.sph,ara-arq,0.996288694787586 +20110128_185835_473-b.sph,ara-arq,0.9993941153143588 +20110128_193520_475-a.sph,ara-arq,0.9999530106214491 +20110128_193520_475-b.sph,ara-arq,0.9999527486872603 +20110128_200815_476-a.sph,ara-arq,0.9999873059955551 +20110128_200815_476-b.sph,ara-arq,0.9964487496070723 +20110128_203824_477-b.sph,ara-arq,0.9997165845051637 +20110128_222333_480-a.sph,ara-arq,0.9999970783462304 +20110130_080611_510-a.sph,ara-arq,0.9999749215499425 +20110130_080611_510-b.sph,ara-aeb,0.9999966593413755 +20110130_085820_512-a.sph,ara-arq,0.9999636011539246 +20110130_085820_512-b.sph,ara-arq,0.9813498028090423 +20110130_092246_513-b.sph,ara-arq,0.9999059749555838 +20110130_100253_514-a.sph,ara-arq,0.9921278081859116 +20110130_100253_514-b.sph,ara-arq,0.9999516899828312 +20110130_155522_528-a.sph,ara-arq,0.9999069680862643 +20110130_155522_528-b.sph,ara-arq,0.9997900343245884 +20110130_161649_529-a.sph,ara-arq,0.9998613523919572 +20110130_161649_529-b.sph,ara-arq,0.9977029871347945 +20110130_164452_531-a.sph,ara-arq,0.9999976354441193 +20110130_164452_531-b.sph,ara-arq,0.9965312960271767 +20110130_184540_532-a.sph,ara-ayl,0.999999999998713 +20110130_184540_532-b.sph,ara-arq,0.9849336598416535 +20110201_140835_576-a.sph,ara-arq,0.9975300712529358 
+20110201_140835_576-b.sph,ara-arq,0.9999993561949782 +20110201_163316_581-a.sph,ara-arq,0.9999983446810359 +20110201_163316_581-b.sph,ara-aeb,0.9924557962400737 +20110203_191239_616-b.sph,ara-arq,0.9998885155856538 +20110204_153604_625-a.sph,ara-arq,0.9999972105890849 +20110204_153604_625-b.sph,ara-arq,0.9999410840414572 +20110204_163201_626-a.sph,ara-arq,0.9976110910766881 +20110204_163201_626-b.sph,ara-arq,0.9999876934539865 +20110204_164625_627-b.sph,ara-arq,0.9999477802299377 +20110204_171649_628-a.sph,ara-arq,0.999905415195764 +20110204_171649_628-b.sph,ara-arq,0.9999920273859438 +20110204_174823_629-a.sph,ara-arq,0.9999950593350131 +20110204_183311_631-a.sph,ara-arq,0.9999953899932322 +20110204_183311_631-b.sph,ara-arq,0.9999635149153278 +20110204_190013_632-a.sph,ara-arq,0.9999387012014681 +20110204_190013_632-b.sph,ara-arq,0.9999962749405605 +20110204_190208_633-a.sph,ara-arq,0.9999999768635758 +20110204_190208_633-b.sph,ara-arq,0.9977926594648339 +20110204_200618_634-a.sph,ara-arq,0.9936747368214611 +20110204_200618_634-b.sph,ara-arq,0.9999479114428771 +20110204_203655_635-b.sph,ara-arq,0.9996727006128494 +20110204_205300_638-a.sph,ara-arq,0.9992486308165472 +20110205_153631_666-b.sph,ara-arq,0.9996547635561664 +20110205_172120_671-a.sph,ara-arq,0.9999915914855924 +20110206_095118_685-a.sph,ara-arq,0.9999999541857382 +20110206_105102_688-a.sph,ara-arq,0.9999997707236105 +20110206_105102_688-b.sph,ara-arq,0.9993810260864877 +20110206_105820_689-a.sph,ara-arq,0.9987592088484722 +20110206_113326_691-a.sph,ara-arq,0.9998279296292647 +20110206_120354_693-b.sph,ara-arq,0.9980948428595703 +20110206_122113_696-a.sph,ara-arq,0.9995909791238744 +20110206_122113_696-b.sph,ara-arq,0.9999192768200029 +20110206_132644_702-a.sph,ara-arq,0.9999998034776016 +20110206_152016_714-a.sph,ara-arq,0.9999867036857935 +20110206_155159_717-a.sph,ara-arq,0.9998418634479207 +20110206_155159_717-b.sph,ara-arq,0.9888916327994056 +20110206_165119_720-a.sph,ara-arq,0.9999902498906699 +20110206_165119_720-b.sph,ara-arq,0.9920166455204702 +20110206_172320_721-a.sph,ara-arq,0.9997423969682552 +20110206_172320_721-b.sph,ara-arq,0.9804934273026574 +20110206_192709_726-a.sph,ara-arq,0.9840175835722847 +20110206_192709_726-b.sph,ara-arq,0.9977287985909209 +20110206_194621_727-a.sph,ara-arq,0.9831344460618126 +20110208_175519_758-b.sph,ara-arq,0.9999988769528441 +20110209_181948_779-a.sph,ara-arq,0.9936522404471088 +20110209_181948_779-b.sph,ara-arq,0.9997791918710421 +20110209_183724_782-a.sph,ara-arq,0.9998784380878949 +20110209_183724_782-b.sph,ara-arq,0.9999246025038221 +20110210_183402_800-a.sph,ara-arq,0.9984594573811986 +20110210_183402_800-b.sph,ara-arq,0.9980188028033471 +20110210_185230_803-a.sph,ara-arq,0.9999982013809502 +20110210_185230_803-b.sph,ara-arq,0.9995995024241807 +20110211_120852_808-a.sph,ara-arq,0.9932259220823395 +20110211_152026_820-a.sph,ara-arq,0.9999980021493747 +20110211_153702_822-a.sph,ara-arq,0.9969999901106751 +20110211_153702_822-b.sph,ara-arq,0.9997316547372318 +20110211_155607_823-a.sph,ara-arq,0.9999999999999418 +20110211_155607_823-b.sph,ara-arq,0.9998718954393995 +20110212_181444_870-a.sph,ara-arq,0.9989480808880217 +20110212_181444_870-b.sph,ara-arq,0.9999872566336785 +20110212_183328_871-a.sph,ara-arq,0.9903448729423271 +20110212_183328_871-b.sph,ara-arq,0.9810930960909464 +20110212_185203_872-a.sph,ara-arq,0.981273333555111 +20110212_191246_873-b.sph,ara-arq,0.9995332526132436 +20110213_114400_879-a.sph,ara-arq,0.9960907157237656 
+20110213_114400_879-b.sph,ara-arq,0.9857366670372355 +20110213_120949_881-a.sph,ara-arq,0.9844207993983157 +20110213_120949_881-b.sph,ara-arq,0.9903953303779026 +20110213_122816_882-a.sph,ara-arq,0.9891871489623377 +20110213_122816_882-b.sph,ara-arq,0.9999935070606318 +20110213_131054_885-b.sph,ara-arq,0.9999028302287125 +20110213_133818_888-b.sph,ara-arq,0.9970198413986752 +20110213_142146_896-a.sph,ara-arq,0.9997721189705135 +20110213_142146_896-b.sph,ara-arq,0.998800376513235 +20110213_144952_900-a.sph,ara-arq,0.9968885288172212 +20110213_164838_913-b.sph,ara-arq,0.9999808228052904 +20110213_181716_914-a.sph,ara-arq,0.9996267031488214 +20110213_181716_914-b.sph,ara-arq,0.9978692419798673 +20110214_210504_930-a.sph,ara-arq,0.9999999993069508 +20110214_212408_932-b.sph,ara-arq,0.9994783176628016 +20110217_133012_975-b.sph,ara-arq,0.9999978652333108 +20110217_134937_976-b.sph,ara-arq,0.9981620062883075 +20110217_135627_977-a.sph,ara-arq,0.9923823762680786 +20110217_135627_977-b.sph,ara-ayl,0.99994304153697 +20110217_140828_980-b.sph,ara-arq,0.9998865552711499 +20110217_142557_982-a.sph,ara-arq,0.9927005912288147 +20110217_145020_983-a.sph,ara-arq,0.9992976883509107 +20110217_145020_983-b.sph,ara-arq,0.9994242961468159 +20110217_171932_987-a.sph,ara-arq,0.9998300688490686 +20110217_171932_987-b.sph,ara-arq,0.9999938868544527 +20110217_173619_988-a.sph,ara-arq,0.9998859524996734 +20110217_173619_988-b.sph,ara-arq,0.9993538778019107 +20110218_143916_1008-a.sph,ara-arq,0.9995276489555424 +20110218_152219_1012-a.sph,ara-arq,0.9989564931737804 +20110218_152219_1012-b.sph,ara-arq,0.983424400422087 +20110218_154208_1013-a.sph,ara-arq,0.9996610622089865 +20110218_171114_1015-a.sph,ara-arq,0.9983391451448304 +20110218_171114_1015-b.sph,ara-aeb,0.9839049923252243 +20110219_130356_1026-a.sph,ara-arq,0.9800875696973508 +20110219_130356_1026-b.sph,ara-arq,0.9981097426100024 +20110220_153604_1050-b.sph,ara-arq,0.9992922307310227 +20110226_102551_1168-a.sph,ara-arq,0.9956164145717841 +20110226_104245_1169-a.sph,ara-arq,0.999615990129898 +20110226_104245_1169-b.sph,ara-arq,0.9999432762407108 +20110226_105951_1171-b.sph,ara-arq,0.9963320797245594 +20110227_115638_1179-a.sph,ara-arq,0.9979731787181892 +20110227_115638_1179-b.sph,ara-arq,0.9999984412629119 +20110227_123734_1181-a.sph,ara-arq,0.9996205551160018 +20110227_125439_1182-a.sph,ara-arq,0.9996946469526378 +20110227_125439_1182-b.sph,ara-arq,0.9995090769571758 +20110227_131635_1183-a.sph,ara-arq,0.9981612635005762 +20110227_131635_1183-b.sph,ara-arq,0.9999225601135852 +20110227_134655_1184-a.sph,ara-arq,0.9993111982898277 +20110227_140420_1185-b.sph,ara-arq,0.9988598965943818 +20110227_142125_1186-a.sph,ara-arq,0.9948006601325144 +20110227_142125_1186-b.sph,ara-arq,0.9988307215422513 +20110227_154132_1189-a.sph,ara-arq,0.9938418813114719 +20110227_154132_1189-b.sph,ara-arq,0.9999920687652308 +20110227_155909_1191-b.sph,ara-arq,0.998912461185742 +20110227_162241_1192-a.sph,ara-arq,0.9984703552540448 +20110227_162241_1192-b.sph,ara-arq,0.9999895197829509 +20110227_163935_1195-b.sph,ara-arq,0.9949971842748578 +20110228_174826_1217-a.sph,ara-arq,0.9999886137143743 +20110228_174826_1217-b.sph,ara-arq,0.9993742634620741 +20110301_154921_1230-a.sph,ara-arq,0.9997093954036395 +20110301_154921_1230-b.sph,ara-arq,0.9986268296407617 +20110305_101932_1323-a.sph,ara-arq,0.9961912502914853 +20110305_101932_1323-b.sph,ara-arq,0.9992166940246625 +20110305_103655_1326-a.sph,ara-arq,0.9964829278234681 +20110305_175842_1332-a.sph,ara-arq,0.9971054262198007 
+20110305_175842_1332-b.sph,ara-arq,0.9999802993572223 +20110305_181929_1333-b.sph,ara-arq,0.9999661470430923 +20110306_111437_1342-b.sph,ara-arq,0.9997018164726993 +20110306_113229_1344-b.sph,ara-arq,0.9999988318789892 +20110306_115706_1347-b.sph,ara-arq,0.9978191157389156 +20110306_121619_1348-a.sph,ara-arq,0.9827291278544082 +20110306_123404_1350-a.sph,ara-arq,0.9988325513754784 +20110306_155835_1353-b.sph,ara-arq,0.9841830936168241 +20110308_141939_1391-a.sph,ara-arq,0.9927596828605247 +20110309_090633_1407-a.sph,ara-arq,0.9974682774936627 +20110309_090633_1407-b.sph,ara-arq,0.9999472047179431 +20110309_092426_1408-a.sph,ara-arq,0.9891944796228922 +20110309_190600_1415-b.sph,ara-arq,0.9992401337610775 +20110312_100116_1442-b.sph,ara-aeb,0.9999964907283014 +lre11ablk.sph,ara-arq,0.9999907113220694 +lre11aedq.sph,ara-arq,0.9998876721921264 +lre11afar.sph,ara-arq,0.9998482741863788 +lre11aglc.sph,ara-arq,0.9999898456118099 +lre11ahqo.sph,ara-arq,0.9918603244627976 +lre11alas.sph,ara-arq,0.9908901959831016 +lre11alwj.sph,ara-arq,0.9894410455076774 +lre11amzo.sph,ara-arq,0.9786892682080185 +lre11anjz.sph,ara-arq,0.9999981759198692 +lre11aojl.sph,ara-arq,0.9997424488929775 +lre11apsf.sph,ara-arq,0.9849324254750552 +lre11avmm.sph,ara-arq,0.9999504288541092 +lre11axmy.sph,ara-arq,0.9999523003483873 +lre11azex.sph,ara-arq,0.9995398052261274 +lre11bbvj.sph,ara-arq,0.9999335460724018 +lre11bcek.sph,ara-arq,0.9999851926128998 +lre11bcpv.sph,ara-arq,0.984682382200886 +lre11biws.sph,ara-arq,0.9998120788157547 +lre11bnkp.sph,ara-arq,0.9995987125056803 +lre11bnsh.sph,ara-arq,0.9996962089592643 +lre11bnsx.sph,ara-arq,0.9977111633190938 +lre11bpyg.sph,ara-arq,0.9959055192068114 +lre11bpzi.sph,ara-arq,0.9905831011969168 +lre11bqon.sph,ara-arq,0.9969555758000546 +lre11brct.sph,ara-arq,0.9975246452439199 +lre11bsry.sph,ara-arq,0.9894503783769195 +lre11byco.sph,ara-arq,0.9999976971030877 +lre11bzjc.sph,ara-arq,0.999543540290902 +lre11bzlo.sph,ara-arq,0.9999970319421916 +lre11cejv.sph,ara-arq,0.9982811975618141 +lre11cesz.sph,ara-arq,0.998990976426668 +lre11cfgz.sph,ara-arq,0.9935160910053205 +lre11cfwm.sph,ara-arq,0.9999964881312391 +lre11cgay.sph,ara-arq,0.9996390794313366 +lre11cgxl.sph,ara-arq,0.9888137475556131 +lre11cian.sph,ara-arq,0.9999456258260779 +lre11cjxu.sph,ara-arq,0.9999091559138382 +lre11cmat.sph,ara-arq,0.999993807597705 +lre11cmnm.sph,ara-arq,0.9968753878911075 +lre11cpyg.sph,ara-arq,0.99778707361397 +lre11cysx.sph,ara-arq,0.9999531509783556 +lre11czoc.sph,ara-arq,0.9999723971738445 +lre11czzz.sph,ara-arq,0.9967642546011689 +lre11dcxm.sph,ara-arq,0.9988254822426347 +lre11dkdu.sph,ara-arq,0.9995208011254988 +lre11dmgu.sph,ara-arq,0.9999964088803398 +lre11dnsn.sph,ara-arq,0.9999932007597087 +lre11dtba.sph,ara-arq,0.9999993605172935 +lre11dtee.sph,ara-arq,0.9998725513672501 +lre11dtma.sph,ara-arq,0.977093779334459 +lre11dwvy.sph,ara-arq,0.9912064733461909 +lre11dzmv.sph,ara-arq,0.9893557384712441 +lre11edst.sph,ara-arq,0.9996483218194355 +lre11efjk.sph,ara-arq,0.9997175005082578 +lre11eiyw.sph,ara-arq,0.9876725818861913 +lre11ekip.sph,ara-arq,0.9996941527919115 +lre11eohx.sph,ara-arq,0.9999635932561415 +lre11erez.sph,ara-arq,0.9965516335686703 +lre11erxq.sph,ara-arq,0.9996087806099485 +lre11erxr.sph,ara-arq,0.9775705269985646 +lre11eufb.sph,ara-arq,0.9843849296538485 +lre11fagv.sph,ara-arq,0.9998946883634996 +lre11fbda.sph,ara-arq,0.9978295206364557 +lre11fcjj.sph,ara-ayl,0.993383627457564 +lre11fkvi.sph,ara-arq,0.9995855863185078 +lre11fodl.sph,ara-aeb,0.9926903655566781 
+lre11fqsk.sph,ara-arq,0.9999977131862029 +lre11ftfz.sph,ara-arq,0.9992187521546018 +lre11fvvi.sph,ara-arq,0.9998858777480336 +lre11fwev.sph,ara-arq,0.9895982019894587 +lre11fwgy.sph,ara-arq,0.9999585016943806 +lre11fxxe.sph,ara-arq,0.9999324150603885 +lre11fyfu.sph,ara-arq,0.9998347600563473 +lre11fyul.sph,ara-arq,0.9936234560510464 +lre11fzut.sph,ara-arq,0.9842511617654447 +lre11gdzy.sph,ara-arq,0.999889190713416 +lre11gezd.sph,ara-arq,0.9998699060210541 +lre11gfzz.sph,ara-arq,0.9999891185814981 +lre11ggoj.sph,ara-arq,0.9939957237418172 +lre11ggpo.sph,ara-arq,0.9856255396828421 +lre11goba.sph,ara-arq,0.990344531513245 +lre11gobo.sph,ara-arq,0.9999875942826566 +lre11grhs.sph,ara-arq,0.9997556743150159 +lre11grvu.sph,ara-arq,0.9999968541025068 +lre11gugo.sph,ara-arq,0.9965937007797033 +lre11hfea.sph,ara-arq,0.9998608609896967 +lre11hhql.sph,ara-arq,0.9998338106542838 +lre11hnhd.sph,ara-arq,0.9772497743518446 +lre11honr.sph,ara-arq,0.9998932196048694 +lre11hqam.sph,ara-arq,0.9981273699564014 +lre11hqxf.sph,ara-arq,0.9994436545087109 +lre11hxhj.sph,ara-arq,0.9999969785031192 +lre11iape.sph,ara-arq,0.9805984363079878 +lre11ibqb.sph,ara-arq,0.9818806715807676 +lre11ijgj.sph,ara-arq,0.9990861345342709 +lre11ilih.sph,ara-arq,0.9999502485361578 +lre11imki.sph,ara-ayl,0.9999999999342657 +lre11iqwq.sph,ara-arq,0.9989577296816973 +lre11irup.sph,ara-arq,0.9987522767488314 +lre11itok.sph,ara-arq,0.9999898498294778 +lre11ivvj.sph,ara-arq,0.9989286012303883 +lre11ixke.sph,ara-arq,0.9999500945534188 +lre11jepu.sph,ara-arq,0.9972127532313085 +lre11jfrt.sph,ara-arq,0.999937410524841 +lre11jgdm.sph,ara-arq,0.9999313841413808 +lre11jgex.sph,ara-arq,0.9996981300593533 +lre11jjzk.sph,ara-arq,0.9969824384056464 +lre11jkcg.sph,ara-arq,0.9999901763442688 +lre11jlbb.sph,ara-ayl,0.9999797418290254 +lre11jmkp.sph,ara-aeb,0.9829506107982495 +lre11jnbo.sph,ara-arq,0.9955088149420692 +lre11joqm.sph,ara-arq,0.9979145377503337 +lre11jpnp.sph,ara-arq,0.9820469567638314 +lre11jqay.sph,ara-arq,0.9999059550640546 +lre11jsem.sph,ara-arq,0.9999659015380002 +lre11jtsu.sph,ara-arq,0.9999985874275487 +lre11jxjq.sph,ara-arq,0.9996252775252656 +lre11kcxw.sph,ara-arq,0.9999980198875125 +lre11kghl.sph,ara-arq,0.9999969246150942 +lre11khai.sph,ara-arq,0.9999850849749775 +lre11khpn.sph,ara-arq,0.9996319733879265 +lre11kizl.sph,ara-arq,0.9946944777387341 +lre11kjhr.sph,ara-arq,0.9999999478107263 +lre11kkvl.sph,ara-arq,0.9999331725841073 +lre11kmrd.sph,ara-arq,0.998407471871956 +lre11kvlp.sph,ara-arq,0.9881647482815468 +lre11kweb.sph,ara-arq,0.9959584099323461 +lre11laah.sph,ara-arq,0.9999994459184748 +lre11laym.sph,ara-arq,0.9996946633832319 +lre11lcve.sph,ara-ayl,0.9754071685038981 +lre11lgwf.sph,ara-arq,0.999998185212916 +lre11lkfn.sph,ara-arq,0.9871985523328064 +lre11lljy.sph,ara-arq,0.9999951992577 +lre11ltbl.sph,ara-arq,0.9925424060658127 +lre11lvrc.sph,ara-arq,0.999994692352155 +lre11lxeu.sph,ara-arq,0.9998494543892836 +lre11mciy.sph,ara-arq,0.9991649802638065 +lre11mcpb.sph,ara-arq,0.9998812879498469 +lre11mdaq.sph,ara-arq,0.9999993536606017 +lre11mdlw.sph,ara-arq,0.9999933631714529 +lre11megz.sph,ara-arq,0.999991578971874 +lre11mgcj.sph,ara-arq,0.9997465534552311 +lre11mgda.sph,ara-arq,0.9998675569243112 +lre11mhts.sph,ara-arq,0.9993162979080465 +lre11mimu.sph,ara-arq,0.9853693267657061 +lre11mmil.sph,ara-arq,0.9998013743994384 +lre11moic.sph,ara-arq,0.9999975487340111 +lre11mpyg.sph,ara-arq,0.9999918690235887 +lre11mrgx.sph,ara-arq,0.9916396537114318 +lre11muco.sph,ara-arq,0.9990051655062514 
+lre11myev.sph,ara-arq,0.9999999709774632 +lre11mzsf.sph,ara-arq,0.9814633660769434 +lre11nfrs.sph,ara-arq,0.9993363405655639 +lre11nhol.sph,ara-arq,0.9997583313854068 +lre11nhpm.sph,ara-arq,0.9999766347438241 +lre11nisq.sph,ara-arq,0.9999493082338785 +lre11njwd.sph,ara-arq,0.9999968670068229 +lre11nlof.sph,ara-arq,0.997025558783378 +lre11nsiw.sph,ara-arq,0.9759841774215573 +lre11ntcf.sph,ara-arq,0.9777836287483279 +lre11nted.sph,ara-arq,0.9998141941637515 +lre11nvno.sph,ara-arq,0.9999964888490706 +lre11oavt.sph,ara-arq,0.9965200220599933 +lre11ocsv.sph,ara-arq,0.9999827956617842 +lre11oege.sph,ara-arq,0.9950563664377374 +lre11ofei.sph,ara-arq,0.9998876820742316 +lre11ohag.sph,ara-arq,0.9985768786597264 +lre11oije.sph,ara-arq,0.992072008446929 +lre11ojgd.sph,ara-arq,0.9940659470468965 +lre11ojvf.sph,ara-arq,0.9993122001400614 +lre11okxt.sph,ara-arq,0.9994222299446635 +lre11omni.sph,ara-arq,0.9998835501527243 +lre11onrg.sph,ara-arq,0.9916365803164747 +lre11ontl.sph,ara-arq,0.9995751330000039 +lre11opue.sph,ara-arq,0.9997738097677723 +lre11oqro.sph,ara-arq,0.999967793829243 +lre11otxd.sph,ara-arq,0.9999418179160698 +lre11ouii.sph,ara-arq,0.9998576517513276 +lre11ovwf.sph,ara-arq,0.9889696901091004 +lre11oydk.sph,ara-arq,0.9819224710181479 +lre11ozdn.sph,ara-arq,0.9998106016178588 +lre11pagq.sph,ara-arq,0.9801218480437873 +lre11paur.sph,ara-arq,0.999987431916008 +lre11pfti.sph,ara-arq,0.9863893498230394 +lre11pfzy.sph,ara-arq,0.9999851535829934 +lre11pprb.sph,ara-arq,0.9907367260657962 +lre11pqno.sph,ara-arq,0.9999947908245772 +lre11pvoj.sph,ara-arq,0.9999986601480322 +lre11pysj.sph,ara-arq,0.9818927362425611 +lre11pzsc.sph,ara-arq,0.9777996051185309 +lre11qaml.sph,ara-arq,0.9901897881820463 +lre11qcse.sph,ara-arq,0.9786650402081483 +lre11qhrk.sph,ara-arq,0.99996447871608 +lre11qilb.sph,ara-arq,0.9999944023407891 +lre11qjbu.sph,ara-arq,0.999998909993637 +lre11qoxa.sph,ara-arq,0.9992628389476516 +lre11qpqk.sph,ara-arq,0.9988889605651897 +lre11qrlt.sph,ara-arq,0.9993564789200029 +lre11qtkd.sph,ara-arq,0.9964050771568211 +lre11qupc.sph,ara-arq,0.9758848861520171 +lre11qwil.sph,ara-arq,0.9999982738573114 +lre11qwqs.sph,ara-arq,0.9998351735862747 +lre11rafm.sph,ara-arq,0.9999610086553676 +lre11rdod.sph,ara-arq,0.9883226566986822 +lre11rdqv.sph,ara-ayl,0.985738149564001 +lre11relv.sph,ara-arq,0.999999468502387 +lre11rjui.sph,ara-arq,0.9999924251546126 +lre11rkhs.sph,ara-arq,0.9956440478348277 +lre11rldx.sph,ara-arq,0.9989857178552667 +lre11rwqr.sph,ara-arq,0.9999995661106464 +lre11sgia.sph,ara-arq,0.9792334621995509 +lre11skba.sph,ara-arq,0.9971597662211976 +lre11smpy.sph,ara-arq,0.9999299231162642 +lre11snqm.sph,ara-arq,0.9998770993281684 +lre11snzs.sph,ara-arq,0.9998957560021872 +lre11svhq.sph,ara-arq,0.9926690106361062 +lre11sxdk.sph,ara-arq,0.9999894072598812 +lre11szjx.sph,ara-arq,0.9997496078258093 +lre11tisp.sph,ara-aeb,0.9935967845696344 +lre11tkeq.sph,ara-arq,0.999992668852024 +lre11tkgv.sph,ara-arq,0.9999634239721431 +lre11tlbn.sph,ara-arq,0.9916960032980205 +lre11tlgc.sph,ara-arq,0.9921670149117343 +lre11tnbu.sph,ara-arq,0.9999917662026707 +lre11tqjp.sph,ara-arq,0.9999997186207273 +lre11trmj.sph,ara-aeb,0.9985641023002019 +lre11txsn.sph,ara-arq,0.9895624098081941 +lre11ubjy.sph,ara-arq,0.9991221016840601 +lre11ubmu.sph,ara-arq,0.9999965435512681 +lre11uhux.sph,ara-arq,0.997435675415044 +lre11ujqi.sph,ara-arq,0.9939996473996353 +lre11ullo.sph,ara-arq,0.9999999998276814 +lre11umdt.sph,ara-arq,0.9962758965298869 +lre11unmt.sph,ara-arq,0.9985618690998576 
+lre11uqzm.sph,ara-arq,0.9985040805093104 +lre11urlw.sph,ara-arq,0.9998924522602656 +lre11usmv.sph,ara-arq,0.9997207805439943 +lre11uvte.sph,ara-arq,0.9983265140452946 +lre11uwxi.sph,ara-arq,0.9993165982905879 +lre11vcwy.sph,ara-arq,0.9969565461227344 +lre11veuu.sph,ara-arq,0.9999796896377858 +lre11vezt.sph,ara-arq,0.9767680653788202 +lre11vfna.sph,ara-aeb,0.9964120446674009 +lre11vhhz.sph,ara-arq,0.998103902690531 +lre11vhvh.sph,ara-arq,0.9999927635072146 +lre11vjcl.sph,ara-arq,0.9999819610209169 +lre11vkma.sph,ara-arq,0.9945794427407135 +lre11vncd.sph,ara-arq,0.9907523248594148 +lre11vrrg.sph,ara-arq,0.99460105641934 +lre11vsry.sph,ara-arq,0.9951631752607728 +lre11vssm.sph,ara-arq,0.9804358152668605 +lre11vukq.sph,ara-arq,0.998095638681888 +lre11vwzy.sph,ara-arq,0.9999994453927953 +lre11vxev.sph,ara-arq,0.9986174583419248 +lre11vyma.sph,ara-arq,0.9935618499200927 +lre11vzdv.sph,ara-arq,0.9940242404482954 +lre11wjmo.sph,ara-arq,0.9946379138594132 +lre11wlmf.sph,ara-arq,0.9985332278711876 +lre11wogz.sph,ara-arq,0.9999996831958213 +lre11wpeu.sph,ara-arq,0.987053003738009 +lre11xesf.sph,ara-arq,0.9926552158707163 +lre11xlhq.sph,ara-arq,0.9861699702078971 +lre11xmop.sph,ara-arq,0.9998106693232437 +lre11xncb.sph,ara-arq,0.9997518363731595 +lre11xsib.sph,ara-arq,0.9999999575778398 +lre11yfkq.sph,ara-arq,0.9978416409757853 +lre11yfuh.sph,ara-arq,0.9852732209813364 +lre11yjtr.sph,ara-arq,0.9999875286020034 +lre11ykqy.sph,ara-arq,0.999994599228917 +lre11ynky.sph,ara-arq,0.9870374231633215 +lre11ynub.sph,ara-arq,0.9975464312675795 +lre11ynut.sph,ara-arq,0.994865816956002 +lre11yohv.sph,ara-arq,0.9999860886998846 +lre11ypuu.sph,ara-arq,0.999939937136318 +lre11yqmg.sph,ara-arq,0.9988753925216532 +lre11yskr.sph,ara-arq,0.9990482482843873 +lre11yysp.sph,ara-arq,0.9909036055745113 +lre11zaix.sph,ara-arq,0.9993370825297897 +lre11zcny.sph,ara-arq,0.9863060880274914 +lre11zgmi.sph,ara-arq,0.9852780607073358 +lre11znqr.sph,ara-arq,0.999970678330779 +lre11zosk.sph,ara-aeb,0.9992041227652806 +lre11zsfl.sph,ara-arq,0.9999316377930149 +lre11zvte.sph,ara-arq,0.9994693690287269 +lre11zwzv.sph,ara-arq,0.998921262378038 +lre11zxvd.sph,ara-arq,0.9970232759937795 +lre11zzww.sph,ara-arq,0.9968858579018414 +lre17_abtnjqwo.sph,ara-arq,0.999946799291851 +lre17_acckmchx.sph,ara-aeb,0.9986711227355097 +lre17_acoxtkfz.sph,ara-aeb,0.9842666785196265 +lre17_adharrss.flac,ara-arq,0.9947182189896948 +lre17_adharrss.flac-gsm,ara-arq,0.9956384390235652 +lre17_aduhvtel.sph,ara-arq,0.9999889349109844 +lre17_aekqbfnc.sph,ara-arq,0.9984883022389972 +lre17_afpeboji.sph,ara-arq,0.9999593134555593 +lre17_agovhiqf.sph,ara-arq,0.9980746218401729 +lre17_aiesnqgx.sph,ara-arq,0.9999860691612521 +lre17_aipdwmxb.sph,ara-aeb,0.9989779255268202 +lre17_anlfomhj.flac,ara-ayl,0.9994229196677115 +lre17_anocqhav.sph,ara-arq,0.9997793277424729 +lre17_ansqkmxg.sph,ara-arq,0.9990411219484128 +lre17_aokadzsc.sph,ara-arq,0.9999717598295709 +lre17_aqfwvqpg.sph,ara-arq,0.9999999815159134 +lre17_aquebikd.sph,ara-arq,0.9999983875736405 +lre17_arzwedtw.sph,ara-arq,0.9945282050060794 +lre17_asbkyxts.sph,ara-arq,0.9902930286287294 +lre17_astrrnby.flac,ara-arq,0.9755268847782662 +lre17_astrrnby.flac-g711a,ara-arq,0.9854914061367727 +lre17_avfgalrs.flac-gsm,ara-arq,0.9999972882283532 +lre17_avrkkwph.sph,ara-arq,0.9988768871380727 +lre17_axiutoza.sph,ara-arq,0.9997779954741397 +lre17_ayizvbkc.sph,ara-arq,0.9999840812557923 +lre17_azjbrozk.sph,ara-arq,0.9953169139803245 +lre17_bdpgpxku.flac-g711a,ara-ayl,0.9999977914337187 
+lre17_beeeutoh.flac,ara-arq,0.9942343106320238 +lre17_beeeutoh.flac-g723_1,ara-arq,0.9974204443646679 +lre17_bfaxqjqb.sph,ara-arq,0.999936007945235 +lre17_bfdjopui.sph,ara-aeb,0.9781197289083158 +lre17_bjfsfjit.flac,ara-arq,0.9879397124065223 +lre17_bjkkkuno.sph,ara-arq,0.999487026079353 +lre17_bjoiupem.sph,ara-arq,0.9880927563645315 +lre17_bjzozier.sph,ara-aeb,0.9997062951168889 +lre17_bkcpyhve.sph,ara-arq,0.9993564259542776 +lre17_bkjcaggk.flac,ara-arq,0.9994997776308957 +lre17_bkjcaggk.flac-g711mu,ara-arq,0.9993395177228688 +lre17_bktxvmar.sph,ara-arq,0.9999917820037525 +lre17_blazxkfa.sph,ara-arq,0.9999639560062823 +lre17_blbxhpiv.sph,ara-arq,0.9949809784805863 +lre17_blljbkpf.sph,ara-arq,0.9934970373407573 +lre17_bmujmfhj.sph,ara-arq,0.9775708820251491 +lre17_boryyjhf.sph,ara-arq,0.9977484576765338 +lre17_bowjoyjr.sph,ara-arq,0.9816765571144581 +lre17_bqxjnfxx.sph,ara-aeb,0.9929235091866946 +lre17_bqyznxui.flac-g711a,ara-arq,0.9999829540600199 +lre17_brpyutxm.sph,ara-aeb,0.9832823165542701 +lre17_bswfxzyr.sph,ara-arq,0.9999980843040208 +lre17_btafdkdg.flac,ara-arq,0.9992005312956213 +lre17_btafdkdg.flac-gsm,ara-arq,0.9992352599146054 +lre17_buwtqeqb.flac,ara-arq,0.987792729025184 +lre17_buwtqeqb.flac-g726,ara-arq,0.999145758855682 +lre17_bvqgsidl.sph,ara-aeb,0.9862447101786331 +lre17_bwxfqusr.sph,ara-aeb,0.9936207466860856 +lre17_bxwvpnfw.flac-opus,ara-arq,0.9999995104818462 +lre17_bymvcgmj.sph,ara-aeb,0.9997234164043465 +lre17_byzcayjn.flac,ara-arq,0.998985645903786 +lre17_byzcayjn.flac-opus,ara-arq,0.9999450148660525 +lre17_bzmjxehu.flac-opus,ara-arq,0.9999664531787248 +lre17_cairjuvk.sph,ara-arq,0.9987867029788313 +lre17_campowcv.sph,ara-arq,0.999787205693806 +lre17_ccazlpob.flac-g723_1,ara-arq,0.9974334229828895 +lre17_cccspkdm.flac,ara-arq,0.9880424513373823 +lre17_cccspkdm.flac-g726,ara-arq,0.9954627517627038 +lre17_ccjrvsph.sph,ara-arq,0.9999054835372087 +lre17_ccoewvvh.sph,ara-arq,0.9893765175449871 +lre17_ccypdbbu.sph,ara-arq,0.9999252890248448 +lre17_cdavzdsz.flac-g722,ara-arq,0.9997403404294325 +lre17_cflgybxg.sph,ara-arq,0.9940658569786607 +lre17_cfnizhql.sph,ara-arq,0.9961060724590401 +lre17_cfwfsjev.sph,ara-arq,0.9999998965888017 +lre17_cfznpgjd.sph,ara-arq,0.9971102748785 +lre17_cgmytvfk.sph,ara-aeb,0.9997875043473322 +lre17_cipisqbs.flac-g723_1,ara-arq,0.999066551053694 +lre17_cluexwgz.sph,ara-arq,0.9998680970232108 +lre17_cnfohesd.flac,ara-arq,0.9999887142160848 +lre17_cnfohesd.flac-g723_1,ara-arq,0.9995963322893509 +lre17_cphdyjdq.sph,ara-arq,0.9906215457248232 +lre17_csenuaki.sph,ara-arq,0.9999827796312072 +lre17_ctilbvnd.sph,ara-arq,0.9941490810957968 +lre17_ctrojttf.sph,ara-arq,0.9997415475987689 +lre17_ctudkyri.sph,ara-arq,0.999816925839242 +lre17_cupizrsx.sph,ara-arq,0.9977854992043466 +lre17_cureptst.flac,ara-arq,0.9966603829855006 +lre17_cureptst.flac-g726,ara-arq,0.9981724402252548 +lre17_cuvtxdbp.sph,ara-arq,0.9992149913711046 +lre17_cvdmebty.sph,ara-arq,0.9986034345626393 +lre17_cwbzqjzi.sph,ara-arq,0.9947360860803344 +lre17_cwdccgrs.sph,ara-arq,0.9954471876114118 +lre17_cwlcovrq.flac,ara-arq,0.9836373295795586 +lre17_cwlcovrq.flac-g726,ara-arq,0.9772066393285307 +lre17_cxfmtvjk.flac,ara-arq,0.998626631969006 +lre17_cxfmtvjk.flac-gsm,ara-arq,0.9995994571705398 +lre17_czdgssvb.sph,ara-arq,0.9839800680795405 +lre17_daifemlo.flac,ara-arq,0.9999923124695197 +lre17_daifemlo.flac-opus,ara-arq,0.9984999152656753 +lre17_dbwacwxo.sph,ara-aeb,0.9959928678878073 +lre17_dctjgdcf.sph,ara-arq,0.9999693971426666 +lre17_degmucpq.flac,ara-arq,0.9998238238013132 
+lre17_degmucpq.flac-gsm,ara-arq,0.9997379324783554 +lre17_dfotbhmi.sph,ara-arq,0.9999967791115594 +lre17_dhsngizg.flac,ara-arq,0.9982745689022119 +lre17_dhsngizg.flac-g722,ara-arq,0.9899076755215684 +lre17_dhttmloy.flac,ara-aeb,0.9965398617526191 +lre17_dhttmloy.flac-g726,ara-arq,0.9999896912229259 +lre17_dimkfdga.sph,ara-aeb,0.9968909392210388 +lre17_dkorjmpr.sph,ara-arq,0.9993720459297395 +lre17_dlkdkiml.sph,ara-arq,0.9998570457972521 +lre17_dmptasts.sph,ara-arq,0.9999900227916614 +lre17_dmxpkcsa.sph,ara-aeb,0.9886627341769952 +lre17_dqynyyeg.flac-gsm,ara-arq,0.9885091570772832 +lre17_dreturny.sph,ara-arq,0.9999666233198918 +lre17_drvwwpat.sph,ara-arq,0.9986475256791328 +lre17_dsyovtja.sph,ara-arq,0.9993324032717326 +lre17_dtfklpze.flac,ara-arq,0.980875299084855 +lre17_duwzoctt.sph,ara-arq,0.9944810760803787 +lre17_dxcmnnvm.sph,ara-arq,0.998013143951749 +lre17_dyhzanuz.flac,ara-arq,0.9990022950435253 +lre17_dzdfoalc.flac-g711mu,ara-arq,0.9992312655669962 +lre17_dzpjlevc.sph,ara-arq,0.9999957026648084 +lre17_ecoxuoxn.sph,ara-arq,0.9991002623934314 +lre17_ecphppxx.sph,ara-arq,0.9999618844845741 +lre17_ecwsvpey.sph,ara-arq,0.9976190788174759 +lre17_edrerhyd.flac,ara-ayl,0.9999948782776695 +lre17_edrerhyd.flac-g726,ara-ayl,0.999989105843972 +lre17_efuktxso.sph,ara-arq,0.9941980497799132 +lre17_ehubbeoo.sph,ara-arq,0.9996933627528989 +lre17_ejqromcl.sph,ara-arq,0.9925673764218849 +lre17_ekecvked.sph,ara-arq,0.9994069294232564 +lre17_eknvksdj.sph,ara-arq,0.9999954297332013 +lre17_elihcnoy.sph,ara-arq,0.9999453376193964 +lre17_elkbffyz.sph,ara-aeb,0.9994985958843845 +lre17_elyyerit.sph,ara-arq,0.9969900208425778 +lre17_emvsmkok.sph,ara-arq,0.9999992133320064 +lre17_enrykydq.flac,ara-arq,0.9997480550009473 +lre17_enrykydq.flac-g726,ara-ayl,0.9999971853826948 +lre17_eqodnzbt.flac,ara-arq,0.9975299669768147 +lre17_eqzepcqb.sph,ara-arq,0.9999815043421167 +lre17_erwwlbkn.sph,ara-arq,0.997330169701168 +lre17_esimguhv.flac-g723_1,ara-arq,0.9990357824171696 +lre17_ewmwbivr.flac,ara-aeb,0.9941407951069882 +lre17_ewsegeoy.sph,ara-aeb,0.9999996013414438 +lre17_extnxkey.sph,ara-arq,0.9899611206184787 +lre17_fgvuyqrc.sph,ara-arq,0.9999740198189395 +lre17_fheeozab.sph,ara-arq,0.9999998531340115 +lre17_fhjonuvo.flac,ara-arq,0.9912514441770434 +lre17_fhjonuvo.flac-gsm,ara-arq,0.9986942437203434 +lre17_fhobhhji.flac-g711mu,ara-arq,0.9999982807618869 +lre17_fhqkrhdc.sph,ara-aeb,0.9974882737208554 +lre17_fjeaknag.sph,ara-arq,0.9999528433654035 +lre17_flghevgj.sph,ara-arq,0.9999275347920555 +lre17_flllshvw.sph,ara-arq,0.9995890566887757 +lre17_fmaaifty.sph,ara-arq,0.9967330411235821 +lre17_fmyxmvuh.sph,ara-arq,0.9947613058963332 +lre17_fobsmsvj.sph,ara-arq,0.9985126437061927 +lre17_fosfumyj.flac,ara-arq,0.9981175606728612 +lre17_fosfumyj.flac-opus,ara-arq,0.9973412918879427 +lre17_fovgucqc.flac-gsm,ara-arq,0.9902063770771822 +lre17_fpzybapz.flac,ara-arq,0.9999831117898691 +lre17_fpzybapz.flac-g722,ara-arq,0.9999745011543022 +lre17_frfvxgkm.flac,ara-arq,0.9982105807022026 +lre17_frfvxgkm.flac-g723_1,ara-arq,0.9999990437299212 +lre17_frldxzov.flac,ara-arq,0.9999911187378006 +lre17_frnemphs.sph,ara-aeb,0.9999309467267882 +lre17_frrujsta.sph,ara-aeb,0.9827694350674886 +lre17_fsibsssn.flac,ara-arq,0.9967716611519729 +lre17_fsibsssn.flac-opus,ara-arq,0.9943914629735336 +lre17_fstjhoom.sph,ara-arq,0.9999560494459958 +lre17_fuelrqpq.sph,ara-arq,0.998314155825479 +lre17_fwyhddxz.sph,ara-arq,0.9999653658276243 +lre17_fxhpiabv.flac-g722,ara-arq,0.9785533261819718 +lre17_fyoimwzn.sph,ara-aeb,0.9933070038389972 
+lre17_fyousbwl.sph,ara-arq,0.9997738038053198 +lre17_fzetpzrs.sph,ara-arq,0.9845858022736108 +lre17_gbdwksrl.flac-opus,ara-ayl,0.9999810055915502 +lre17_gbkeixqy.sph,ara-arq,0.9995010489207078 +lre17_gbmrfptf.sph,ara-arq,0.9995997838188411 +lre17_gcwvbecw.flac,ara-arq,0.9999989525506976 +lre17_gcwvbecw.flac-g726,ara-arq,0.9999995241973817 +lre17_gekpnsqw.flac,ara-arq,0.9995617602232915 +lre17_gekpnsqw.flac-g711a,ara-arq,0.9990205101656683 +lre17_gfmhcimo.flac,ara-arq,0.9843261830443644 +lre17_gfmhcimo.flac-g711a,ara-arq,0.9920939572460264 +lre17_giljetfl.sph,ara-arq,0.9998866157683133 +lre17_givvturo.flac,ara-arq,0.9999960772188857 +lre17_givvturo.flac-g722,ara-arq,0.9998983053609016 +lre17_gkfwivzq.sph,ara-arq,0.9980134657798864 +lre17_gokkodsj.flac-g722,ara-ayl,0.9962500403266442 +lre17_gpvtlzov.flac-g711a,ara-arq,0.9999996204042616 +lre17_gqcxwuze.sph,ara-arq,0.9997783053110009 +lre17_gqpcfrwm.flac-g711mu,ara-arq,0.9817550583044142 +lre17_grjpzakf.sph,ara-arq,0.9839396690676935 +lre17_grjzqftr.sph,ara-arq,0.9877772556918923 +lre17_gszgcsjf.sph,ara-arq,0.998185259970527 +lre17_gvcqvsap.sph,ara-ayl,0.9997241868465031 +lre17_gxvmjddr.sph,ara-arq,0.9986899594684224 +lre17_hbopaybj.flac,ara-arq,0.987858946064221 +lre17_hbopaybj.flac-g726,ara-arq,0.999994436902088 +lre17_hchvsbqr.sph,ara-arq,0.999906917330984 +lre17_hdofrwsf.sph,ara-arq,0.9933958450004624 +lre17_heemkdqp.flac,ara-arq,0.9999909671052553 +lre17_heemkdqp.flac-g711mu,ara-arq,0.9985133817101537 +lre17_hezbzaqo.flac,ara-arq,0.9850199928962854 +lre17_hezbzaqo.flac-opus,ara-arq,0.9999841295369671 +lre17_hfcpmeoa.flac,ara-arq,0.9947181969213107 +lre17_hfcpmeoa.flac-g711a,ara-arq,0.9999138159106336 +lre17_hfjennzi.sph,ara-aeb,0.9983301362771589 +lre17_hhbqfxfc.sph,ara-aeb,0.9906856058776015 +lre17_hhdplflf.sph,ara-arq,0.9999949491011441 +lre17_hjimhzob.sph,ara-arq,0.9983113233299764 +lre17_hkeqbypc.flac,ara-arq,0.999999661618148 +lre17_hkeqbypc.flac-gsm,ara-arq,0.9966513627962669 +lre17_hlegmknx.sph,ara-arq,0.9999782289720263 +lre17_hmmdberw.sph,ara-aeb,0.9998194744091253 +lre17_hmptzweu.sph,ara-arq,0.9999971779992906 +lre17_hmqodybe.sph,ara-ayl,0.999996399948908 +lre17_hqrhzhyj.sph,ara-arq,0.9999231926652757 +lre17_hqzkhrhn.sph,ara-arq,0.9998770075415304 +lre17_hromittp.flac-g711a,ara-arq,0.9873451303247496 +lre17_hsdzydln.flac,ara-arq,0.9821628698106489 +lre17_hsdzydln.flac-g722,ara-arq,0.9988122191294789 +lre17_hsyuvhtp.sph,ara-arq,0.9990478816052286 +lre17_hvweyrfw.sph,ara-aeb,0.9988668377871749 +lre17_hwnjyblc.sph,ara-arq,0.9999249104513325 +lre17_hxpvwduf.flac-g711a,ara-aeb,0.9938488854312174 +lre17_hyhwjuli.sph,ara-arq,0.998858421685253 +lre17_hyreqvpy.flac,ara-arq,0.9878561156668769 +lre17_hyreqvpy.flac-g711mu,ara-arq,0.9999928431157828 +lre17_ibclsyjb.sph,ara-aeb,0.9846988495735338 +lre17_ifdrxwfj.sph,ara-arq,0.9988623308711881 +lre17_igayvnul.sph,ara-arq,0.9858583197264382 +lre17_igvjetcy.sph,ara-arq,0.9997565397210374 +lre17_igvlwujq.sph,ara-aeb,0.9942243168589683 +lre17_iibcchiq.flac-gsm,ara-arq,0.9989051845669153 +lre17_ilmlmyvv.sph,ara-aeb,0.9879290883225061 +lre17_inhzmrxh.sph,ara-arq,0.9999602544207984 +lre17_inufxzrc.sph,ara-arq,0.9997778215419035 +lre17_iqtqtuvc.flac,ara-arq,0.999987834966952 +lre17_iqtqtuvc.flac-opus,ara-arq,0.9817938892370449 +lre17_itjgcxig.sph,ara-arq,0.999561265994042 +lre17_itsqwgkz.sph,ara-arq,0.9999999957865953 +lre17_ittvvvfb.sph,ara-arq,0.9999964209775712 +lre17_ivcdeiky.flac,ara-arq,0.9873438502201111 +lre17_iwtlmazd.sph,ara-arq,0.9873719419778358 
+lre17_ixbvjxte.sph,ara-arq,0.9997976143150719 +lre17_iycttrsq.sph,ara-arq,0.987846742780538 +lre17_iyqnjpod.sph,ara-arq,0.9936664779953471 +lre17_izhxudfa.sph,ara-arq,0.9999249686091597 +lre17_javisjpg.sph,ara-arq,0.9933263960275387 +lre17_jclfqqom.sph,ara-arq,0.9996552571484193 +lre17_jcperagi.sph,ara-arq,0.9999535310829344 +lre17_jcueuvkk.sph,ara-arq,0.9998819304923648 +lre17_jgqtrgqt.sph,ara-arq,0.9999823610331084 +lre17_jgzyarns.sph,ara-arq,0.9999898713367306 +lre17_jhjgasxv.sph,ara-aeb,0.9991139740455672 +lre17_jhoqfjpk.flac,ara-arq,0.9999995523948527 +lre17_jhoqfjpk.flac-g711a,ara-arq,0.9944430263756097 +lre17_jiakkjtr.sph,ara-arq,0.9999993323735444 +lre17_jilypibp.flac-gsm,ara-arq,0.9996434093761065 +lre17_jiowcahg.sph,ara-arq,0.992648625274396 +lre17_jlvgsuxh.sph,ara-aeb,0.9948123012485498 +lre17_jlvtorab.sph,ara-arq,0.983513534636461 +lre17_jmkuwbpc.sph,ara-arq,0.9994527050835158 +lre17_jnipskqx.flac-g711mu,ara-arq,0.9999742870473751 +lre17_jpeqxepv.sph,ara-aeb,0.9994104144919757 +lre17_jpeyombi.sph,ara-arq,0.9999998044387237 +lre17_jpjtuxvw.flac,ara-arq,0.9996847495267612 +lre17_jqmoqqfm.flac,ara-arq,0.9999869216223071 +lre17_jqmoqqfm.flac-g726,ara-arq,0.9999999894357187 +lre17_jtdfvpln.sph,ara-arq,0.999997871153658 +lre17_jtqoxxtm.sph,ara-aeb,0.9965086342211626 +lre17_jvpfjwdp.flac,ara-ayl,0.9999999997451912 +lre17_jvurmddm.flac,ara-arq,0.9987187046194855 +lre17_jvurmddm.flac-gsm,ara-arq,0.9983730947085013 +lre17_jwkybctt.sph,ara-arq,0.9999989593481196 +lre17_jxcmtrxm.sph,ara-ayl,0.9996284167713838 +lre17_jywavsuu.flac,ara-arq,0.9913930400541082 +lre17_kaaesmko.flac,ara-arq,0.9999294732363818 +lre17_kbmrgfwm.sph,ara-arq,0.9999528508197458 +lre17_kbodxjcn.flac,ara-arq,0.9982992372902407 +lre17_kcdcpzly.sph,ara-arq,0.9759220472115765 +lre17_keetepyz.flac,ara-arq,0.9927333842986636 +lre17_keetepyz.flac-g722,ara-arq,0.9999998412855006 +lre17_kfmsssrs.sph,ara-arq,0.9997193659178423 +lre17_kfsotues.sph,ara-arq,0.999998669422541 +lre17_khygxcdj.sph,ara-arq,0.9868874065356342 +lre17_kjtqnjgt.sph,ara-arq,0.9791232775307577 +lre17_kkcxpjzr.flac,ara-arq,0.9999999887350973 +lre17_kkcxpjzr.flac-g711a,ara-arq,0.9999944954328739 +lre17_kmzwffxp.sph,ara-arq,0.9782750887595135 +lre17_knkvczhw.flac-g722,ara-arq,0.9999106890724243 +lre17_kpbzatbg.sph,ara-arq,0.9935992901995973 +lre17_kpcquycc.sph,ara-arq,0.9988726069205118 +lre17_kugvrfiw.sph,ara-arq,0.9986475838029554 +lre17_kuzbruhc.sph,ara-arq,0.9998879396014413 +lre17_kwvzftsa.sph,ara-arq,0.9906568648956764 +lre17_kzutiwjm.sph,ara-arq,0.9883352489803169 +lre17_larfsawf.sph,ara-arq,0.9833813699639339 +lre17_lectmxiy.sph,ara-arq,0.9997601079206343 +lre17_lfdmjqzk.sph,ara-arq,0.9997174449458649 +lre17_lfqfgpty.flac,ara-arq,0.9877470265323836 +lre17_lfqfgpty.flac-opus,ara-arq,0.9998989537391589 +lre17_lgimdxjv.sph,ara-arq,0.9897020483952464 +lre17_lgmtfuaf.sph,ara-arq,0.9997034751344174 +lre17_lgzhdvir.flac,ara-arq,0.9999666603862899 +lre17_lgzhdvir.flac-g723_1,ara-arq,0.9999814711501472 +lre17_litfqatc.sph,ara-arq,0.9999697307957149 +lre17_ljqkqvuk.sph,ara-arq,0.9931204062930487 +lre17_lkeepofx.sph,ara-aeb,0.9815662246718163 +lre17_lkvpiaco.flac-g711mu,ara-aeb,0.9894714726927342 +lre17_llwfixbt.flac,ara-arq,0.9998571656021117 +lre17_llwfixbt.flac-opus,ara-arq,0.9999840733365404 +lre17_llxcpovx.sph,ara-arq,0.9999726611852431 +lre17_lmtexhdt.sph,ara-arq,0.9955378310817409 +lre17_lnlzbiqv.sph,ara-arq,0.9845486175862881 +lre17_lnwqjgum.sph,ara-arq,0.9994084640832857 +lre17_logsuwkc.sph,ara-ayl,0.9973010083242871 
+lre17_lpdrjcmf.sph,ara-arq,0.9999999987748978 +lre17_lpnxjclp.sph,ara-arq,0.999990551187932 +lre17_lpwlbnvd.sph,ara-aeb,0.9985688096228789 +lre17_lqcxhbgx.flac,ara-aeb,0.9789437284063228 +lre17_lqeynset.sph,ara-arq,0.9946714116424836 +lre17_lqqtwkna.sph,ara-arq,0.9959561849558086 +lre17_lrchzlnf.sph,ara-arq,0.9999964986490049 +lre17_lriptaxa.sph,ara-arq,0.9999861992203515 +lre17_lrmpuslv.sph,ara-arq,0.9999833158908321 +lre17_lsglcrqu.sph,ara-arq,0.9994974702145716 +lre17_ltobvlca.flac,ara-aeb,0.9998321920878662 +lre17_ltobvlca.flac-g726,ara-arq,0.9808235035540288 +lre17_lumlsydt.flac,ara-arq,0.9993847597571562 +lre17_lvwbcjui.sph,ara-arq,0.9999990130410765 +lre17_lwbqplua.flac,ara-arq,0.9985377563185653 +lre17_lyvsulsp.sph,ara-arq,0.9996779798745427 +lre17_lzzfbiwk.sph,ara-arq,0.9999780111773144 +lre17_mazmicwf.flac,ara-arq,0.9761802678092957 +lre17_mazmicwf.flac-g726,ara-arq,0.9999987851548972 +lre17_mcchuzqa.flac,ara-arq,0.9994453172253329 +lre17_mcchuzqa.flac-g726,ara-arq,0.9994722195518764 +lre17_mhelcckx.sph,ara-aeb,0.9921911924278494 +lre17_minmrdvv.flac,ara-arq,0.9999806822091847 +lre17_minmrdvv.flac-g711mu,ara-arq,0.9999798110768492 +lre17_miyeplrp.flac,ara-ayl,0.999967537246669 +lre17_miyeplrp.flac-g722,ara-ayl,0.9999974966675732 +lre17_mjkrjctc.sph,ara-arq,0.9996993211891599 +lre17_mjuhytod.flac-g722,ara-aeb,0.9942683057158186 +lre17_mjxevtqw.flac,ara-arq,0.9789540403486894 +lre17_mllyvrkw.sph,ara-arq,0.9984655364684033 +lre17_mneiaioi.sph,ara-aeb,0.985452121186191 +lre17_mnoswtar.flac,ara-arq,0.9999465324732042 +lre17_mnoswtar.flac-g722,ara-arq,0.9994314827928369 +lre17_moihuogw.sph,ara-arq,0.9999981054273598 +lre17_moohuqbu.flac-opus,ara-arq,0.9946789594259231 +lre17_mpewcntj.sph,ara-arq,0.9999987697239342 +lre17_mtyfveku.sph,ara-arq,0.9829721690668127 +lre17_mvbpdkqz.sph,ara-ayl,0.9948321204607391 +lre17_mxcghtfj.sph,ara-arq,0.9927362055311203 +lre17_mxhoedfe.sph,ara-arq,0.9999999146737504 +lre17_mxmdmamo.sph,ara-aeb,0.988517700201585 +lre17_mxmjurdd.sph,ara-arq,0.9980132953988482 +lre17_mzdpsrvs.sph,ara-aeb,0.992494003405007 +lre17_mzsfsjad.sph,ara-aeb,0.99997586512649 +lre17_naeguqak.sph,ara-arq,0.9999929946428248 +lre17_nblzukhx.flac,ara-arq,0.9999359009222737 +lre17_nblzukhx.flac-gsm,ara-arq,0.9999999966088737 +lre17_ndkkdwgy.sph,ara-arq,0.9975163999653704 +lre17_negphusk.sph,ara-arq,0.9998989889366274 +lre17_nhdlsoit.sph,ara-arq,0.9916584056978099 +lre17_njbwudbl.sph,ara-arq,0.999934592749547 +lre17_njontgtu.sph,ara-arq,0.9794969009896114 +lre17_nkgdldta.sph,ara-arq,0.9916521956821477 +lre17_nkqygxxz.sph,ara-arq,0.9997656446176615 +lre17_nocucjva.sph,ara-arq,0.9995204775364295 +lre17_nojsrnhx.sph,ara-arq,0.999919125620621 +lre17_nowvnwzc.sph,ara-arq,0.995585233402159 +lre17_nqfliycm.sph,ara-arq,0.9999912069728009 +lre17_nqkyimjt.sph,ara-arq,0.9881291130932576 +lre17_nqxowwop.flac,ara-arq,0.9997443038852292 +lre17_nqxowwop.flac-g723_1,ara-arq,0.9914457208775102 +lre17_nrunzxja.flac,ara-arq,0.9993221612062564 +lre17_nrunzxja.flac-g711mu,ara-arq,0.9999566225291738 +lre17_nsiynodu.sph,ara-arq,0.9908214588078317 +lre17_nssuzfbr.sph,ara-arq,0.9999396177844772 +lre17_ntbrwymu.sph,ara-arq,0.9993012725372231 +lre17_nuvzuxee.sph,ara-arq,0.9996041721916568 +lre17_nvgpubxb.sph,ara-arq,0.9759857598176621 +lre17_nxjuqezl.flac,ara-arq,0.9995754800524955 +lre17_nxjuqezl.flac-gsm,ara-arq,0.9793987540104333 +lre17_nxvquxsr.sph,ara-arq,0.9990399807148835 +lre17_nzeyrrcl.sph,ara-arq,0.9999461953593082 +lre17_nzmnjjpc.flac,ara-aeb,0.9847092271434903 
+lre17_obbtvsaj.flac-g711a,ara-arq,0.9999409869224803 +lre17_obkyiehe.sph,ara-arq,0.9998679965082828 +lre17_obrcwlmw.sph,ara-aeb,0.9998778475538475 +lre17_ogwcxkjw.sph,ara-arq,0.9895315802827847 +lre17_oireqedt.sph,ara-arq,0.9917281473076983 +lre17_oirnebxz.flac,ara-arq,0.9967110495563957 +lre17_oirnebxz.flac-opus,ara-aeb,0.9933118074655622 +lre17_oiveluew.sph,ara-arq,0.9968284888503907 +lre17_oizxklej.sph,ara-arq,0.9986908296100067 +lre17_olqpjrwd.sph,ara-ayl,0.985872505893845 +lre17_olwownje.sph,ara-arq,0.9999818242744661 +lre17_onckhujt.sph,ara-aeb,0.9979436467237117 +lre17_onknnaim.sph,ara-aeb,0.9987555260169619 +lre17_opsncnkb.sph,ara-aeb,0.9997592175168953 +lre17_opxoeses.flac-g711mu,ara-arq,0.9895267895164883 +lre17_oqnuceey.flac,ara-arq,0.9818564260274837 +lre17_oqnuceey.flac-opus,ara-arq,0.9960254767681471 +lre17_orthumig.sph,ara-arq,0.9979787737264081 +lre17_ouhrqmvj.sph,ara-arq,0.9999999735219096 +lre17_oukunjzc.flac,ara-arq,0.9999998874015028 +lre17_oukunjzc.flac-g722,ara-arq,0.9999998022867953 +lre17_ouvsypqp.sph,ara-arq,0.9999862194709894 +lre17_ownmyzum.sph,ara-arq,0.9983899224785795 +lre17_owxndsay.sph,ara-arq,0.9991660737491793 +lre17_oxoeettt.sph,ara-aeb,0.994764323060291 +lre17_oxvlijdf.sph,ara-arq,0.9980756971870425 +lre17_oylngzoh.sph,ara-arq,0.9999772205491734 +lre17_pbmuxcky.flac,ara-arq,0.9980374961356401 +lre17_pbmuxcky.flac-opus,ara-arq,0.9999987347640981 +lre17_pdcigndc.sph,ara-arq,0.9956870254382242 +lre17_pfcsmyfp.flac,ara-arq,0.999994210275427 +lre17_pfcsmyfp.flac-opus,ara-arq,0.9995142367581035 +lre17_pfecwivw.flac,ara-arq,0.997995447936321 +lre17_pfecwivw.flac-gsm,ara-arq,0.979556413285578 +lre17_pfenqxed.sph,ara-arq,0.9935592984355501 +lre17_pgqzdpfq.sph,ara-arq,0.9987770018281733 +lre17_phvwlddn.sph,ara-arq,0.9996126414779914 +lre17_piiiaqsg.sph,ara-arq,0.9883006332201746 +lre17_piixpsbr.flac,ara-arq,0.9999739707108446 +lre17_piixpsbr.flac-g722,ara-arq,0.9996474019470863 +lre17_pixqbtbm.flac,ara-arq,0.9949782923210799 +lre17_pixqbtbm.flac-g726,ara-ayl,0.9966265706203424 +lre17_pjfvtjab.sph,ara-arq,0.999885331527543 +lre17_pklmiexr.sph,ara-arq,0.9992720760130763 +lre17_pnlxhqnm.sph,ara-ayl,0.9999929993566911 +lre17_pnrhsfou.flac-g722,ara-arq,0.9999640996706576 +lre17_pnwenjwm.sph,ara-arq,0.9960418034539658 +lre17_poheolla.sph,ara-aeb,0.9985638045139876 +lre17_poysotsv.sph,ara-aeb,0.9856503882631178 +lre17_ppvtutvt.sph,ara-arq,0.9998354077570467 +lre17_pqawpvfb.flac,ara-arq,0.9950780853489194 +lre17_pqawpvfb.flac-g711a,ara-arq,0.9994370686040279 +lre17_pqwwzwxo.sph,ara-arq,0.999990243213515 +lre17_psacvdup.flac,ara-arq,0.9999630353117823 +lre17_psacvdup.flac-opus,ara-arq,0.9994055674097663 +lre17_pslkpzhl.sph,ara-arq,0.9997159742438066 +lre17_pufnzdvd.flac,ara-arq,0.9997950371376702 +lre17_pufnzdvd.flac-gsm,ara-arq,0.9999959634431062 +lre17_pujabbev.sph,ara-arq,0.9994079718102534 +lre17_pvfvlhsq.flac,ara-aeb,0.9999713112925558 +lre17_pvfvlhsq.flac-gsm,ara-arq,0.9999254611931253 +lre17_pwhqsovd.sph,ara-arq,0.9917133657396171 +lre17_pxekbodb.sph,ara-arq,0.9984745978775882 +lre17_qdwsexfm.sph,ara-arq,0.9999974580577462 +lre17_qhiyavse.sph,ara-arq,0.9912788455576231 +lre17_qivtcmgk.sph,ara-arq,0.9836174820047392 +lre17_qjitoyxc.sph,ara-arq,0.9999874042742806 +lre17_qkxouubm.sph,ara-arq,0.9998872491727429 +lre17_qljscllj.sph,ara-aeb,0.9979913152483216 +lre17_qlzldcpe.sph,ara-aeb,0.997053186781475 +lre17_qmcrgdzz.sph,ara-arq,0.9996004571017476 +lre17_qmjbylrs.flac,ara-arq,0.9999120076761361 +lre17_qmjbylrs.flac-g723_1,ara-arq,0.9942971237057362 
+lre17_qmjpvlvg.sph,ara-arq,0.9998030655795183 +lre17_qogybjhz.sph,ara-arq,0.9994621165166646 +lre17_qpntxzjb.sph,ara-aeb,0.9957626204201693 +lre17_qpredbkv.sph,ara-arq,0.9990911968221025 +lre17_qrbdlmjx.sph,ara-arq,0.9999999999999865 +lre17_qrcvlqts.sph,ara-arq,0.9767514676069964 +lre17_qscgrzxe.flac,ara-arq,0.9998709648180928 +lre17_qsewfkyh.sph,ara-arq,0.9999999864987743 +lre17_qstdyztt.flac-g711mu,ara-arq,0.9999759932517555 +lre17_qszrgiyz.sph,ara-arq,0.9999496361715189 +lre17_qtaulytr.sph,ara-arq,0.9770776315818761 +lre17_qudalolg.sph,ara-arq,0.998897750323492 +lre17_qufteqvo.sph,ara-aeb,0.992027129263138 +lre17_qwiyjayz.sph,ara-arq,0.9995498080059056 +lre17_qwvrxfzu.sph,ara-arq,0.9988604816072997 +lre17_qyiarywg.flac,ara-arq,0.9999605279920688 +lre17_qyiarywg.flac-g723_1,ara-arq,0.9999999619112184 +lre17_qyzhxzvj.sph,ara-arq,0.9759510962602079 +lre17_rajrtwbo.sph,ara-aeb,0.9999999999944702 +lre17_rcryqfgn.sph,ara-arq,0.9972776568740012 +lre17_rcueudci.flac,ara-arq,0.9988837735514282 +lre17_rcueudci.flac-gsm,ara-arq,0.9999697760125505 +lre17_reicsaat.sph,ara-arq,0.9997754476127328 +lre17_reyualuk.flac,ara-arq,0.9992844879623304 +lre17_rfwyqutk.sph,ara-arq,0.9997722244477082 +lre17_rggtfbrd.sph,ara-aeb,0.9931773563621665 +lre17_rhepwrug.sph,ara-arq,0.9998787750778266 +lre17_rkocbhzs.sph,ara-arq,0.9960601282813184 +lre17_rlcyzlcy.sph,ara-arq,0.9994472570443922 +lre17_rlpbjbed.sph,ara-arq,0.9942325885969098 +lre17_rlqkwaeh.sph,ara-arq,0.9953431894962037 +lre17_rnveyooi.sph,ara-ayl,0.9996778752622651 +lre17_rnvyrkwg.flac-g723_1,ara-arq,0.9853171747622366 +lre17_rqacreai.sph,ara-arq,0.999207518939918 +lre17_rqlzthlg.sph,ara-aeb,0.989048786309874 +lre17_ruzqcwpn.sph,ara-arq,0.9999863065050799 +lre17_rwvdctfg.flac,ara-aeb,0.9815766373873294 +lre17_rynppewk.flac,ara-arq,0.9999708695439152 +lre17_rypzhghv.flac,ara-arq,0.9947011510267938 +lre17_rypzhghv.flac-g711a,ara-arq,0.9847132293141271 +lre17_sagynpjo.sph,ara-arq,0.999744534125517 +lre17_sbxerjvo.sph,ara-arq,0.999987473908599 +lre17_scfolxob.flac,ara-arq,0.9999713591244429 +lre17_serpsscu.flac,ara-arq,0.9793042704401821 +lre17_serpsscu.flac-g723_1,ara-arq,0.9999130486126522 +lre17_sffusbzg.sph,ara-arq,0.9999268097555194 +lre17_sfjwayps.flac,ara-ayl,0.9946715937173086 +lre17_sfjwayps.flac-g726,ara-arq,0.9982714238405073 +lre17_sgkgyjvk.flac,ara-arq,0.9995433812540649 +lre17_sgkgyjvk.flac-gsm,ara-arq,0.9996437880979923 +lre17_sjnfbigi.sph,ara-arq,0.99999366855751 +lre17_skdclppi.sph,ara-arq,0.9889180838738156 +lre17_smjdgznr.flac,ara-aeb,0.9964248254318828 +lre17_snfzxijz.sph,ara-ayl,0.9827938327458273 +lre17_sofspqyi.sph,ara-arq,0.9999994382673698 +lre17_sqoxhftl.sph,ara-arq,0.9996790538981134 +lre17_stbhhhou.sph,ara-arq,0.9945710415211226 +lre17_stpksvvi.sph,ara-aeb,0.996004757361174 +lre17_stxkelkq.sph,ara-ayl,0.9752463023195366 +lre17_suqttdyg.sph,ara-arq,0.997742051798683 +lre17_susdosey.sph,ara-arq,0.9977231084345539 +lre17_suvxbjhl.sph,ara-arq,0.9986381780682658 +lre17_svetuuie.sph,ara-arq,0.9998479453288084 +lre17_svzozbfk.sph,ara-arq,0.9998881824847226 +lre17_swgrlydv.sph,ara-arq,0.9990560832648376 +lre17_sxgfwork.sph,ara-arq,0.9999763564539524 +lre17_syatmwze.sph,ara-arq,0.9848649335693501 +lre17_syxmxolu.sph,ara-aeb,0.9867936744030255 +lre17_tbbuisna.sph,ara-arq,0.9999805669714006 +lre17_tbplljcp.flac,ara-arq,0.9998129818454303 +lre17_tbplljcp.flac-gsm,ara-arq,0.9999999839340195 +lre17_tcmjqsvf.sph,ara-arq,0.9936464811055075 +lre17_tcvunuvp.sph,ara-aeb,0.9946331881971427 +lre17_tduxpzqq.sph,ara-aeb,0.9996190225252365 
+lre17_teyvymzd.flac-g711mu,ara-ayl,0.9787413632582724 +lre17_tfngvqdf.flac,ara-arq,0.999990203549186 +lre17_tfngvqdf.flac-g726,ara-arq,0.9983466771871533 +lre17_tforvtmc.sph,ara-arq,0.9935448102639823 +lre17_tfxmolis.sph,ara-arq,0.9998286292942293 +lre17_thjcyqwr.flac,ara-arq,0.9994467118163807 +lre17_thjcyqwr.flac-opus,ara-arq,0.9976422823383214 +lre17_thxeccdu.sph,ara-arq,0.9998750920305819 +lre17_ticjhhbi.sph,ara-arq,0.9794445128724558 +lre17_tjcshvrl.sph,ara-arq,0.9999926102290503 +lre17_tjremugr.sph,ara-arq,0.9999991435250514 +lre17_tlutsejs.sph,ara-arq,0.9988207070133517 +lre17_tnrvafxe.sph,ara-arq,0.9944573271724075 +lre17_tnxtgdnc.sph,ara-aeb,0.9942622096810594 +lre17_tolpbvsc.flac,ara-arq,0.9999996570853448 +lre17_tolpbvsc.flac-g711mu,ara-arq,0.9999930938174156 +lre17_totoyxhm.sph,ara-arq,0.999996710305506 +lre17_tqkpkxgu.sph,ara-arq,0.9999960590328173 +lre17_tqmnzgyb.sph,ara-arq,0.9999682738390965 +lre17_tsppppzj.sph,ara-arq,0.9999889892476231 +lre17_tssmuwge.sph,ara-arq,0.9993471439476459 +lre17_ttkmfmkk.sph,ara-arq,0.9961646913300042 +lre17_ttvvzlvt.sph,ara-arq,0.9945500617775027 +lre17_twkrspxj.flac-g711mu,ara-arq,0.9970066534454132 +lre17_tyqxhlrh.sph,ara-arq,0.9947460028171129 +lre17_tzwuzntv.flac,ara-arq,0.9999848694087901 +lre17_tzwuzntv.flac-g711a,ara-arq,0.9999999505751382 +lre17_uawwqpsa.sph,ara-arq,0.9857085643990153 +lre17_ubnnanex.sph,ara-arq,0.9999993559208963 +lre17_ucfvsgyr.flac,ara-arq,0.9961772950424368 +lre17_ucfvsgyr.flac-g711mu,ara-arq,0.9999597640499912 +lre17_ufifckts.flac-gsm,ara-ayl,0.9928071586629514 +lre17_uiyescxr.sph,ara-arq,0.9997891506043249 +lre17_ukkxkxxt.sph,ara-aeb,0.9935449493739165 +lre17_umissmzv.sph,ara-arq,0.9990005119204275 +lre17_unxhwqmy.flac,ara-aeb,0.9963270393303603 +lre17_upseluva.sph,ara-arq,0.9909948021770557 +lre17_upvapoke.sph,ara-arq,0.9930629481999376 +lre17_uqtiiong.sph,ara-arq,0.9999999685612003 +lre17_usdeaflg.sph,ara-arq,0.9857851998633298 +lre17_uszjbbko.sph,ara-arq,0.9999941374675029 +lre17_utjkjjcn.sph,ara-arq,0.9979757853366961 +lre17_utooogzo.sph,ara-ayl,0.9959130999661093 +lre17_uwescwtn.sph,ara-arq,0.9999992993153919 +lre17_uwldzayo.sph,ara-arq,0.9999916059792026 +lre17_uwuytsxe.sph,ara-arq,0.9947486052008054 +lre17_uzxmtvue.sph,ara-arq,0.9968064808522498 +lre17_vaugwmvv.sph,ara-arq,0.9994338833370221 +lre17_vbgmqfuo.flac,ara-arq,0.980847451266026 +lre17_vbjsoyeh.sph,ara-arq,0.9999116642269064 +lre17_vcksyiuy.flac-g711a,ara-aeb,0.9835977136688748 +lre17_vgxwjuno.sph,ara-arq,0.999983609611863 +lre17_vingckxa.flac,ara-arq,0.9986237456025335 +lre17_vjffccpz.sph,ara-arq,0.9999954081046549 +lre17_vjfjqitw.flac-gsm,ara-arq,0.9994035904368442 +lre17_vjtprfjw.flac,ara-ayl,0.9964430683674823 +lre17_vjtprfjw.flac-g726,ara-arq,0.9805106671407414 +lre17_vjvrlhfs.sph,ara-arq,0.9833183398241712 +lre17_vkqxvmtc.sph,ara-arq,0.9994368014427134 +lre17_vmssxzzd.sph,ara-arq,0.9996477418713372 +lre17_vndndpzq.sph,ara-arq,0.9791108970484209 +lre17_vnlvmhpc.flac-g711mu,ara-arq,0.9999365684834217 +lre17_vnxwpwge.sph,ara-aeb,0.9986622760430225 +lre17_vovpsxcd.sph,ara-arq,0.9998378398538086 +lre17_vpossvdt.flac-g711a,ara-arq,0.9999460791991176 +lre17_vrejajcm.sph,ara-arq,0.9776138943860346 +lre17_vswsposp.sph,ara-arq,0.9994020716061514 +lre17_vtigorkv.sph,ara-arq,0.9999623163215805 +lre17_vtkffspm.flac,ara-arq,0.9997898857084506 +lre17_vuznysrk.flac-g711mu,ara-arq,0.9979511877631668 +lre17_vvyqmniq.sph,ara-arq,0.9999515987872877 +lre17_vwijmoke.sph,ara-arq,0.9989975351908933 +lre17_vynkvprp.sph,ara-arq,0.9999845129327151 
+lre17_wagzvxqz.sph,ara-arq,0.9973278031174633 +lre17_wahbanqs.sph,ara-arq,0.9998842310344779 +lre17_wairvblk.sph,ara-arq,0.9987016619787147 +lre17_wesfzmws.sph,ara-arq,0.9908093051797177 +lre17_wfvvkjuv.sph,ara-arq,0.999944678936039 +lre17_wggunlcp.sph,ara-arq,0.9996749635832616 +lre17_wgnbrmfd.sph,ara-aeb,0.980616930826586 +lre17_whjqstnl.sph,ara-ayl,0.999979615699586 +lre17_whqbhubs.sph,ara-arq,0.9898499177391032 +lre17_widuepdg.sph,ara-aeb,0.9817918573044054 +lre17_wkhkxpmr.sph,ara-ayl,0.9998286566004881 +lre17_wmtiighi.sph,ara-arq,0.987786199224793 +lre17_wnevoywa.sph,ara-arq,0.9999869741085323 +lre17_woccwvjw.sph,ara-arq,0.9999711680680078 +lre17_wrwmvkyy.sph,ara-arq,0.9999793167856921 +lre17_wryaaaay.sph,ara-arq,0.9791656416113507 +lre17_wtkatcwm.sph,ara-arq,0.9850427630920813 +lre17_wvgqdrqk.sph,ara-arq,0.9791490717212691 +lre17_wvyabqbx.sph,ara-arq,0.9990947686607856 +lre17_wwypkyea.sph,ara-aeb,0.9892611131137926 +lre17_wxaxnvpq.sph,ara-arq,0.996190338175843 +lre17_wxirsbfe.sph,ara-arq,0.9998855761433262 +lre17_wxwauidm.sph,ara-arq,0.9899615630910984 +lre17_wyjetcgf.sph,ara-arq,0.9999367576307792 +lre17_xaowthgy.sph,ara-arq,0.9983944376668455 +lre17_xdcmpfbl.sph,ara-arq,0.9914795091793974 +lre17_xdhrhgmk.flac,ara-arq,0.9999939811339105 +lre17_xdhrhgmk.flac-opus,ara-arq,0.9999380489585059 +lre17_xepisjpn.sph,ara-arq,0.998123428276411 +lre17_xhpkbvei.sph,ara-aeb,0.9896471029490118 +lre17_xhqfsfkf.sph,ara-arq,0.9999985545272336 +lre17_xkayfgzq.sph,ara-arq,0.9985227970239359 +lre17_xllwincb.sph,ara-arq,0.9999999189534862 +lre17_xlqqxoym.sph,ara-arq,0.9999541071953805 +lre17_xmvmloxn.flac,ara-arq,0.9827789901954631 +lre17_xmvmloxn.flac-g723_1,ara-arq,0.9998467402490113 +lre17_xnyjhsyy.sph,ara-arq,0.9999956069402056 +lre17_xovcjkso.sph,ara-aeb,0.9920305514596128 +lre17_xroveufz.sph,ara-aeb,0.9993682416393447 +lre17_xskjgkzq.sph,ara-arq,0.9970587807073615 +lre17_xsuhxjmz.sph,ara-arq,0.9999908698954791 +lre17_xtwbrgfu.sph,ara-aeb,0.9964922765834566 +lre17_xujatdxg.sph,ara-arq,0.9999544795771792 +lre17_xviuupwl.sph,ara-arq,0.9999998045117351 +lre17_xvxlncwz.sph,ara-arq,0.9999098345731946 +lre17_xyyhmsku.flac,ara-arq,0.998537851661698 +lre17_ybtygbuu.sph,ara-arq,0.9999292474625724 +lre17_yctdihii.sph,ara-arq,0.999999574179243 +lre17_ydmmannh.flac,ara-arq,0.9983920334462054 +lre17_ydmmannh.flac-g723_1,ara-arq,0.9995348938940224 +lre17_yekpxxwc.flac,ara-arq,0.9988877246272224 +lre17_yekpxxwc.flac-g711mu,ara-aeb,0.9923589185563311 +lre17_ygndvzfp.sph,ara-arq,0.9999911328372215 +lre17_yhjzokrv.sph,ara-arq,0.9958319219062072 +lre17_yilroulj.flac-g711a,ara-aeb,0.9795574945355306 +lre17_yivtnzkg.sph,ara-arq,0.9999715099952994 +lre17_yjoblztq.flac,ara-arq,0.9999973207194949 +lre17_yjoblztq.flac-opus,ara-arq,0.9998238058972634 +lre17_ykxiohej.sph,ara-arq,0.9990315895452987 +lre17_ylsidleu.flac-g711a,ara-arq,0.9984473055243461 +lre17_yltydxpy.sph,ara-arq,0.9956478528044228 +lre17_yownwnlt.flac,ara-arq,0.9998812691288554 +lre17_yownwnlt.flac-g722,ara-arq,0.9916894249368101 +lre17_ypetotbw.flac,ara-arq,0.9999949516854025 +lre17_ypetotbw.flac-gsm,ara-aeb,0.9972546803878858 +lre17_yqgdczse.flac,ara-arq,0.9930023849936759 +lre17_yqhtqtnl.flac,ara-arq,0.9993399870284819 +lre17_yqhtqtnl.flac-opus,ara-arq,0.9998349097677763 +lre17_yrzjdbif.sph,ara-aeb,0.9999958037591928 +lre17_ysadxqiw.sph,ara-arq,0.9999106154927021 +lre17_ysdzkrmo.flac-g711a,ara-arq,0.9974528853995988 +lre17_ytgfvwpa.flac,ara-arq,0.9998217502864875 +lre17_ytgfvwpa.flac-opus,ara-arq,0.9998524581773589 
+lre17_yuduwhrd.flac,ara-arq,0.9955028997292512 +lre17_yuoequzk.sph,ara-arq,0.9999099394309094 +lre17_yuxtqtbd.sph,ara-arq,0.9999994359070692 +lre17_yvybpria.sph,ara-arq,0.9973286645664943 +lre17_ywssuzqt.sph,ara-arq,0.9999990761916652 +lre17_yygbpsdg.sph,ara-arq,0.9999490833528133 +lre17_yynyldnq.sph,ara-arq,0.9999228504305794 +lre17_yzbbhyzt.sph,ara-arq,0.9993694348705324 +lre17_yzjlvluy.sph,ara-arq,0.9996272929446929 +lre17_zaopfwhd.flac,ara-arq,0.9904595900538987 +lre17_zarwpotk.sph,ara-arq,0.9999289722738884 +lre17_zbolnsoz.sph,ara-arq,0.9999999973417777 +lre17_zcjklnfe.sph,ara-arq,0.9996813639473103 +lre17_zcxzxqos.sph,ara-arq,0.9999489746159186 +lre17_zdhatipt.flac,ara-ayl,0.9990280980242568 +lre17_zdhatipt.flac-g726,ara-aeb,0.9996834792362927 +lre17_zfajxywc.sph,ara-arq,0.9999934420544287 +lre17_zgdxpveq.flac-g711a,ara-arq,0.9999999408428903 +lre17_zgiksrvx.flac-g711mu,ara-arq,0.9993189757834657 +lre17_zhdbyfcw.sph,ara-arq,0.9997585048527508 +lre17_zhdfyrxw.sph,ara-arq,0.999148999070484 +lre17_zilbjisa.flac,ara-ayl,0.9999999857011468 +lre17_zilbjisa.flac-gsm,ara-ayl,0.9998704515922084 +lre17_zkdjfgbp.sph,ara-aeb,0.9951005308493373 +lre17_zmebjusq.sph,ara-arq,0.9993030221446528 +lre17_zmodeuem.sph,ara-arq,0.9999743212165114 +lre17_zmyziuxc.flac,ara-aeb,0.9999827977872252 +lre17_zmyziuxc.flac-g711a,ara-aeb,0.9950355679983685 +lre17_zpjrydvx.sph,ara-aeb,0.9994743369849513 +lre17_zrnsvuzf.sph,ara-arq,0.994174882388934 +lre17_zruejjuh.flac,ara-arq,0.9992671462220715 +lre17_zsrybjvn.sph,ara-arq,0.9993004483160852 +lre17_zvvdwwpv.flac,ara-aeb,0.9997026564129534 +lre17_zvvdwwpv.flac-g726,ara-aeb,0.9999912235110034 +lre17_zzkdjfea.sph,ara-arq,0.9842091321709953 diff --git a/egs/lre22/fixed.v1.8k/run_001_prepare_data.sh b/egs/lre22/fixed.v1.8k/run_001_prepare_data.sh new file mode 100755 index 00000000..60eb6891 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_001_prepare_data.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +stage=1 + +. parse_options.sh || exit 1; +. 
datapath.sh
+
+
+if [ $stage -le 1 ];then
+  # Prepares voxlingua 107 for training
+  hyp_utils/conda_env.sh \
+    local/prepare_voxlingua107.py \
+    --corpus-dir $voxlingua_root \
+    --output-dir data/voxlingua107 \
+    --remove-langs en-en es-es ar-ar pt-pt \
+    --map-langs-to-lre-codes \
+    --target-fs 8000
+
+fi
+
+if [ $stage -le 2 ];then
+  # Prepare LRE17 Training data
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_train_root \
+    --output-dir data/lre17_train \
+    --subset train \
+    --target-fs 8000
+
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_train_root \
+    --output-dir data/lre17_dev_cts \
+    --subset dev \
+    --source mls14 \
+    --target-fs 8000
+
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_train_root \
+    --output-dir data/lre17_dev_afv \
+    --subset dev \
+    --source vast \
+    --target-fs 8000
+
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_eval_root \
+    --output-dir data/lre17_eval_cts \
+    --subset eval \
+    --source mls14 \
+    --target-fs 8000
+
+  hyp_utils/conda_env.sh \
+    local/prepare_lre17.py \
+    --corpus-dir $lre17_eval_root \
+    --output-dir data/lre17_eval_afv \
+    --subset eval \
+    --source vast \
+    --target-fs 8000
+
+fi
+
+if [ $stage -le 3 ];then
+  hyp_utils/conda_env.sh \
+    local/prepare_lre22_dev.py \
+    --corpus-dir $lre22_dev_root \
+    --output-dir data/lre22_dev \
+    --target-fs 8000
+
+fi
+
+if [ $stage -le 4 ];then
+  hyp_utils/conda_env.sh \
+    local/prepare_lre22_eval.py \
+    --corpus-dir $lre22_eval_root \
+    --output-dir data/lre22_eval \
+    --target-fs 8000
+
+fi
+
+if [ $stage -le 5 ];then
+  local/download_lre22_scorer.sh
+  local/download_focal.sh
+fi
diff --git a/egs/lre22/fixed.v1.8k/run_002_compute_evad.sh b/egs/lre22/fixed.v1.8k/run_002_compute_evad.sh
new file mode 100755
index 00000000..676ed335
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_002_compute_evad.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Copyright
+#  2018  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+nodes=b1
+storage_name=$(date +'%m_%d_%H_%M')
+vaddir=`pwd`/exp/vad_e
+
+stage=1
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+
+
+if [ $stage -le 1 ]; then
+  # Prepare to distribute data over multiple machines
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then
+    dir_name=$USER/hyp-data/lre22-fixed-v1.8k-$storage_name/vad/storage
+    if [ "$nodes" == "b0" ];then
+      utils/create_split_dir.pl \
+        /export/b{04,05,06,07}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "b1" ];then
+      utils/create_split_dir.pl \
+        /export/b1{0,1,2,3,4,5,6,7,8,9}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "c0" ];then
+      utils/create_split_dir.pl \
+        /export/c{06,07,08,09}/$dir_name $vaddir/storage
+    elif [ "$nodes" == "fs01" ];then
+      utils/create_split_dir.pl \
+        /export/fs01/$dir_name $vaddir/storage
+    else
+      echo "we do not distribute data over multiple machines"
+    fi
+  fi
+fi
+
+# VAD Train/Test Datasets
+if [ $stage -le 2 ];then
+  for name in voxlingua107 \
+      lre17_train \
+      lre17_dev_cts lre17_dev_afv \
+      lre17_eval_cts lre17_eval_afv \
+      lre22_dev lre22_eval
+  do
+    num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
+    nj=$(($num_spk < 40 ? 
$num_spk:40)) + hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + diff --git a/egs/lre22/fixed.v1.8k/run_003_prepare_noises_rirs.sh b/egs/lre22/fixed.v1.8k/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..638143f0 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_003_prepare_noises_rirs.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 8 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../v1.16k/RIRS_NOISES ];then + ln -s ../v1.16k/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 8 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 8 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 8 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/lre22/fixed.v1.8k/run_004_apply_codecs.sh b/egs/lre22/fixed.v1.8k/run_004_apply_codecs.sh new file mode 100755 index 00000000..afd6a8ed --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_004_apply_codecs.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file + +if [ $stage -le 1 ];then + + for data in voxlingua107 lre17_dev_afv lre17_eval_afv + do + hyp_utils/conda_env.sh \ + local/apply_tel_codecs_to_kaldi_datadir.py \ + --input-dir data/$data \ + --output-dir data/${data}_codecs + done + +fi diff --git a/egs/lre22/fixed.v1.8k/run_010_prepare_xvec_train_data.sh b/egs/lre22/fixed.v1.8k/run_010_prepare_xvec_train_data.sh new file mode 100755 index 00000000..fbff4a02 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_010_prepare_xvec_train_data.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. 
./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. $config_file + +if [ $stage -le 1 ]; then + # This script preprocess audio for x-vector training + for name in voxlingua107_codecs \ + lre17_train \ + lre17_{dev,eval}_{cts,afv,afv_codecs} + do + steps_xvec/preprocess_audios_for_nnet_train.sh \ + --nj 40 --cmd "$train_cmd" \ + --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${name} data/${name}_proc_audio_no_sil exp/${name}_proc_audio_no_sil + utils/fix_data_dir.sh data/${name}_proc_audio_no_sil + done +fi + +if [ $stage -le 2 ];then + utils/combine_data.sh \ + data/lre17_proc_audio_no_sil \ + data/lre17_train_proc_audio_no_sil \ + data/lre17_{dev,eval}_{cts,afv,afv_codecs}_proc_audio_no_sil +fi + +if [ $stage -le 3 ]; then + # Now, we remove files with less than 3s + hyp_utils/remove_short_audios.sh --min-len 3 data/voxlingua107_codecs_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/lre17_proc_audio_no_sil +fi + +if [ $stage -le 4 ];then + # merge voxlingua and lre17 + utils/combine_data.sh \ + data/voxlingua107_lre17_proc_audio_no_sil \ + data/voxlingua107_codecs_proc_audio_no_sil \ + data/lre17_proc_audio_no_sil +fi + +if [ $stage -le 5 ]; then + for name in lre17_proc_audio_no_sil voxlingua107_lre17_proc_audio_no_sil + do + hyp_utils/conda_env.sh \ + local/split_segments_train_val.py \ + --segments-file data/$name/utt2lang \ + --recordings-file data/$name/wav.scp \ + --durations-file data/$name/utt2dur \ + --val-percent 2. \ + --output-dir data/$name/train_val_split + done +fi + +if [ $stage -le 6 ]; then + for name in voxlingua107_lre17_proc_audio_no_sil + do + hyp_utils/conda_env.sh \ + local/split_segments_train_val.py \ + --segments-file data/$name/utt2lang \ + --recordings-file data/$name/wav.scp \ + --durations-file data/$name/utt2dur \ + --remove-langs en-en es-es ar-ar pt-pt \ + --val-percent 2. \ + --ara-ary-seg-file resources/lre17_ara-ary/segs_ara-ary.csv \ + --output-dir data/$name/train_val_split_noary + done + mkdir data/voxlingua107_lre17_noary_proc_audio_no_sil + cd data/voxlingua107_lre17_noary_proc_audio_no_sil + ln -s ../voxlingua107_lre17_proc_audio_no_sil/wav.scp + ln -s ../voxlingua107_lre17_proc_audio_no_sil/train_val_split_noary train_val_split + cd - + +fi + +if [ $stage -le 7 ]; then + awk 'BEGIN{ +adapt_langs_list="ara-acm ara-aeb ara-apc ara-arq ara-arz ara-ayl eng-gbr eng-usg por-brz zho-cmn zho-nan am-am sn-sn fra-mix haw-haw ia-ia ceb-ceb tl-tl sa-sa su-su te-te yo-yo sw-sw war-war km-km tr-tr gn-gn ha-ha ln-ln mg-mg"; +nf=split(adapt_langs_list, f, " "); +for(i=1;i<=nf;i++){ adapt_langs[f[i]]=1;}; +FS=","; OFS=","; +getline; print $0; +} +{if ($1 in adapt_langs) { $3="1."} else{ $3="0.01"}; print $0}' \ + data/voxlingua107_lre17_noary_proc_audio_no_sil/train_val_split/class_file.csv > \ + data/voxlingua107_lre17_noary_proc_audio_no_sil/train_val_split/class_file_adapt_1.csv +fi diff --git a/egs/lre22/fixed.v1.8k/run_011_train_xvector.sh b/egs/lre22/fixed.v1.8k/run_011_train_xvector.sh new file mode 100755 index 00000000..c67c8741 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_011_train_xvector.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh
+
+list_dir=data/${nnet_data}_proc_audio_no_sil
+
+# add extra args from the command-line arguments
+if [ -n "$num_workers" ];then
+  extra_args="--data.train.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+  extra_args="$extra_args --trainer.use-tensorboard"
+fi
+if [ "$use_wandb" == "true" ];then
+  extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project lre22-fixed-v1.8k --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)"
+fi
+
+if [ "$interactive" == "true" ];then
+  export cuda_cmd=run.pl
+fi
+
+# Network Training
+if [ $stage -le 1 ]; then
+  mkdir -p $nnet_s1_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s1_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --trainer.exp-path $nnet_s1_dir \
+    --num-gpus $ngpu
+
+fi
+
+# Class-balanced Fine-tuning
+if [ $stage -le 2 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s2_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s2_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s1 \
+    --trainer.exp-path $nnet_s2_dir \
+    --num-gpus $ngpu
+
+fi
+exit
+
+# Class-balanced + hard prototype mining Fine-tuning
+if [ $stage -le 3 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s3_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s3_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s2 \
+    --trainer.exp-path $nnet_s3_dir \
+    --num-gpus $ngpu
+
+fi
+
+exit
+
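+# The stages below continue the fine-tuning cascade: each one starts from the
+# checkpoint saved by the previous stage (passed via --in-model-file), and
+# stage 6 switches to class_file_adapt_1.csv, which (see run_010 stage 7)
+# weights the LRE22 target/adaptation languages with 1.0 and all other
+# languages with 0.01. The exit statements above keep the later stages
+# disabled until they are needed.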
+# Fine-tuning
+if [ $stage -le 4 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s4_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s4_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s4_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s4_base_cfg $nnet_s4_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s3 \
+    --trainer.exp-path $nnet_s4_dir \
+    --num-gpus $ngpu
+
+fi
+
+
+# Fine-tuning
+if [ $stage -le 5 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s5_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s5_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s5_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s5_base_cfg $nnet_s5_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s4 \
+    --trainer.exp-path $nnet_s5_dir \
+    --num-gpus $ngpu
+
+fi
+
+# Fine-tuning
+if [ $stage -le 6 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s6_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s6_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s6_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s6_base_cfg $nnet_s6_args $extra_args \
+    --data.train.dataset.recordings-file $list_dir/wav.scp \
+    --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \
+    --data.train.dataset.class-files $list_dir/train_val_split/class_file_adapt_1.csv \
+    --data.val.dataset.recordings-file $list_dir/wav.scp \
+    --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \
+    --in-model-file $nnet_s5 \
+    --trainer.exp-path $nnet_s6_dir \
+    --num-gpus $ngpu
+
+fi
+
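+# Usage sketch (flags follow Kaldi's parse_options.sh convention; values are
+# illustrative):
+#   ./run_011_train_xvector.sh --stage 2 --ngpu 2 --use-wandb true
+# resumes at the class-balanced fine-tuning stage on 2 GPUs.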
diff --git a/egs/lre22/fixed.v1.8k/run_030_extract_xvectors.sh b/egs/lre22/fixed.v1.8k/run_030_extract_xvectors.sh
new file mode 100755
index 00000000..dc760d5b
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_030_extract_xvectors.sh
@@ -0,0 +1,215 @@
+#!/bin/bash
+# Copyright
+#  2020  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=2
+nnet_stage=2
+config_file=default_config.sh
+use_gpu=false
+do_tsne=true
+split_dev=false
+xvec_chunk_length=12800
+. parse_options.sh || exit 1;
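+# nnet_stage selects which run_011 checkpoint to extract with; the nnet_s*
+# and nnet_s*_name variables below are expected to be defined in $config_file.
+. 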
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 4G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +# if [ $stage -le 1 ]; then +# # Extract xvectors for training +# for name in lre17_proc_audio_no_sil voxlingua107_codecs_proc_audio_no_sil +# do +# steps_xvec/extract_xvectors_from_wav.sh \ +# --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ +# --use-bin-vad false \ +# --random-utt-length true --min-utt-length 300 --max-utt-length 3000 \ +# --feat-config $feat_config \ +# $nnet data/${name} \ +# $xvector_dir/${name} +# done +# fi + +if [ $stage -le 2 ]; then + # Extract xvectors for training + for name in lre22_dev + do + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --use-bin-vad true --num-augs 10 --aug-config conf/reverb_noise_aug.yaml \ + --random-utt-length true --min-utt-length 300 --max-utt-length 3000 \ + --feat-config $feat_config \ + $nnet data/${name} \ + $xvector_dir/${name}_aug \ + data/${name}_aug + done +fi + + +if [ $stage -le 3 ]; then + # Extracts x-vectors for dev and eval + for name in lre22_dev lre22_eval + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + --feat-config $feat_config \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + +if [ $stage -le 4 ]; then + for name in lre22_dev + do + if [ "$do_tsne" == "true" ] || [ "$split_dev" == "true" ];then + $train_cmd \ + $xvector_dir/$name/tsne/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 + + $train_cmd \ + $xvector_dir/$name/tsne_per_class/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne_per_class.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne_per_class \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 \ + --do-ahc --cluster-tsne --ahc-thr -5 + + if [ "$split_dev" == "true" ];then + hyp_utils/conda_env.sh \ + local/split_dev.py \ + --segs-file $xvector_dir/$name/tsne_per_class/segments.csv \ + --output-dir ./resources/dev_splits \ + --num-folds 2 + + # delete the split data dirs so they are regenerated later + rm -rf data/lre22_dev_p{1,2} + + fi + fi + done +fi + +if [ $stage -le 5 ]; then + if [ ! 
-d data/lre22_dev_p1 ];then
+    awk -F "," '$1!="id" { print $1}' \
+      ./resources/dev_splits/fold_0/train_segments.csv \
+      > p1.lst
+    awk -F "," '$1!="id" { print $1}' \
+      ./resources/dev_splits/fold_0/test_segments.csv \
+      > p2.lst
+
+    for p in p1 p2
+    do
+      utils/subset_data_dir.sh \
+        --utt-list $p.lst \
+        data/lre22_dev data/lre22_dev_$p
+    done
+  fi
+fi
+
+if [ $stage -le 6 ]; then
+  if [ -d data/lre22_dev_aug ] && [ ! -d data/lre22_dev_aug_p1 ];then
+    awk -v fsegs=./resources/dev_splits/fold_0/train_segments.csv '
+BEGIN{FS=",";
+getline;
+while(getline < fsegs)
+{
+  segs[$1]=1;
+}
+FS=" ";
+}
+{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \
+      > p1.lst
+
+    awk -v fsegs=./resources/dev_splits/fold_0/test_segments.csv '
+BEGIN{FS=",";
+getline;
+while(getline < fsegs)
+{
+  segs[$1]=1;
+}
+FS=" ";
+}
+{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \
+      > p2.lst
+
+    for p in p1 p2
+    do
+      utils/subset_data_dir.sh \
+        --utt-list $p.lst \
+        data/lre22_dev_aug data/lre22_dev_aug_$p
+    done
+  fi
+fi
+
+if [ $stage -le 7 ];then
+  if [ -f $xvector_dir/lre22_dev_aug/xvector.scp ];then
+    mkdir -p $xvector_dir/lre22_dev_aug_clean
+    cat $xvector_dir/lre22_dev/xvector.scp \
+      $xvector_dir/lre22_dev_aug/xvector.scp \
+      > $xvector_dir/lre22_dev_aug_clean/xvector.scp
+
+    for p in "" _p1 _p2
+    do
+      if [ ! -d data/lre22_dev_aug_clean$p ]; then
+        utils/combine_data.sh \
+          data/lre22_dev_aug_clean$p \
+          data/lre22_dev$p \
+          data/lre22_dev_aug$p
+      fi
+    done
+  fi
+fi
+
+exit
diff --git a/egs/lre22/fixed.v1.8k/run_040_be_final.sh b/egs/lre22/fixed.v1.8k/run_040_be_final.sh
new file mode 100755
index 00000000..fe5b6f18
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/run_040_be_final.sh
@@ -0,0 +1,434 @@
+#!/bin/bash
+# Copyright
+#  2020  Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+nnet_stage=2
+config_file=default_config.sh
+. parse_options.sh || exit 1;
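+# Back-end protocol: the LRE22 dev set is split into two folds (p1/p2); a
+# back-end trained on one fold scores the other, the two half-scores are
+# merged, and the calibration is trained on the merged dev scores. That
+# calibration is then applied to the scores of the back-end trained on the
+# full dev set to produce the final dev and eval results.
+. 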
$config_file + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +fi + +xvector_dir=exp/xvectors/$nnet_name +be_base_dir=exp/be/$nnet_name +score_base_dir=exp/scores/$nnet_name + +if [ $stage -le 1 ];then + for r in 1 #0.9999 0.999 #0.99 0.975 0.95 + do + be_name=pca${r}_cw_lnorm_lgbe_lre22_aug + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + + + ) & + + done + + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}_p12/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + + + done + wait + +fi + +exit +# Back-ends 
below over-fitted + +if [ $stage -le 2 ];then + for r in 1 + do + for penalty in l2 #l1 + do + for c in 1 #0.1 1 + do + for ary_thr in 0.975 #0.85 0.7 #0.99 0.95 0.9 #15 ##1 5 10 20 + do + be_name=pca${r}_cw_lnorm_lsvm_${penalty}_c${c}_sqhinge_lre22_aug_lre17_aryt${ary_thr} + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + done + done + done + done + wait + +fi + +if [ 
$stage -le 3 ];then + for r in 1 # 0.9999 0.99 0.975 0.95 0.9 0.8 + do + for shrinking in true #false + do + for c in 1 10 #0.1 1 10 #0.01 0.1 1 10 # 0.0001 + do + for vl in false #true #false + do + if [ "$vl" == "true" ];then + do_vl="--do-vl" + else + do_vl="--no_do-vl" + fi + ary_thr=0.975 + be_name=pca${r}_cw_lnorm_gsvm_shrinking_${shrinking}_c${c}_lre17_aryt${ary_thr}_vl${vl}_aug_clean + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + #score_dir=$score_base_dir/${be_name}_logpost + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500\ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500 \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --svm.eval-type cat-log-post \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + 
local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + + ) & + done + done + done + done + wait + +fi diff --git a/egs/lre22/fixed.v1.8k/run_050_fusion_v1.sh b/egs/lre22/fixed.v1.8k/run_050_fusion_v1.sh new file mode 100755 index 00000000..ffe3d6c6 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/run_050_fusion_v1.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +score_dir_0=exp/scores +nnet_1=fbank64_stmn_ecapatdnn2048x4_v1.0.s2 +nnet_2=fbank64_stmn_fwseres2net50s8_v1.0.s2 +be_1=pca1_cw_lnorm_lgbe_lre22_aug +score_dirs="$score_dir_0/$nnet_1/$be_1 +$score_dir_0/$nnet_2/$be_1" + +train_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"_p12/cal_v1" }; print $0}') +test_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"/cal_v1" }; print $0}') + +output_dir=exp/fusion/fus_v1.0 + +local/train_fusion_lre22.sh "$train_score_dirs" $output_dir/train +local/score_lre22.sh \ + dev \ + ${output_dir}/train/lre22_dev_scores.tsv \ + ${output_dir}/train/lre22_dev_results + +local/eval_fusion_lre22.sh "$test_score_dirs" $output_dir/train/fus.mat $output_dir/test + +local/score_lre22.sh \ + dev \ + ${output_dir}/test/lre22_dev_scores.tsv \ + ${output_dir}/test/lre22_dev_results + +local/score_lre22.sh eval \ + ${output_dir}/test/lre22_eval_scores.tsv \ + ${output_dir}/test/lre22_eval_results + + + + + + diff --git a/egs/lre22/fixed.v1.8k/steps b/egs/lre22/fixed.v1.8k/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/lre22/fixed.v1.8k/steps_be/eval_be_v1.py b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v1.py new file mode 100755 index 00000000..85fee18c --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v1.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("test acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, 
normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels) + + +def train_be( + v_file, + trial_list, + class_name, + has_labels, + gbe, + model_dir, + score_file, + verbose, +): + config_logger(verbose) + model_dir = Path(model_dir) + output_dir = Path(score_file).parent + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + segs = SegmentSet.load(trial_list) + reader = DRF.create(v_file) + x = reader.read(segs["id"], squeeze=True) + del reader + logging.info("loaded %d samples", x.shape[0]) + + trans_file = model_dir / "transforms.h5" + if trans_file.is_file(): + logging.info("loading transform file %s", trans_file) + trans = TransformList.load(trans_file) + logging.info("applies transform") + x = trans(x) + + gbe_file = model_dir / "model_gbe.h5" + logging.info("loading GBE file %s", gbe_file) + gbe_model = GBE.load(gbe_file) + logging.info("GBE args=%s", str(gbe)) + logging.info("evals GBE") + scores = gbe_model(x, **gbe) + + if has_labels: + class_ids = segs[class_name] + y_true = np.asarray([gbe_model.labels.index(l) for l in class_ids]) + # labels, y_true = np.unique(class_ids, return_inverse=True) + y_pred = np.argmax(scores, axis=-1) + compute_metrics(y_true, y_pred, gbe_model.labels) + + logging.info("Saving scores to %s", score_file) + score_table = {"segmentid": segs["id"]} + for i, key in enumerate(gbe_model.labels): + score_table[key] = scores[:, i] + + score_table = pd.DataFrame(score_table) + score_table.to_csv(score_file, sep="\t", index=False) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Evals linear GBE", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--trial-list", required=True) + GBE.add_eval_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--has-labels", default=False, action=ActionYesNo) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/eval_be_v2.py b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v2.py new file mode 100755 index 00000000..78b50935 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v2.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("test acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion 
matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + trial_list, + class_name, + has_labels, + svm, + model_dir, + score_file, + verbose, +): + config_logger(verbose) + model_dir = Path(model_dir) + output_dir = Path(score_file).parent + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + segs = SegmentSet.load(trial_list) + reader = DRF.create(v_file) + x = reader.read(segs["id"], squeeze=True) + del reader + logging.info("loaded %d samples", x.shape[0]) + + trans_file = model_dir / "transforms.h5" + if trans_file.is_file(): + logging.info("loading transform file %s", trans_file) + trans = TransformList.load(trans_file) + logging.info("applies transform") + x = trans(x) + + svm_file = model_dir / "model_svm.h5" + logging.info("loading SVM file %s", svm_file) + svm_model = SVM.load(svm_file) + logging.info("SVM args=%s", str(svm)) + logging.info("evals SVM") + scores = svm_model(x, **svm) + + if has_labels: + class_ids = segs[class_name] + y_true = np.asarray([svm_model.labels.index(l) for l in class_ids]) + # labels, y_true = np.unique(class_ids, return_inverse=True) + y_pred = np.argmax(scores, axis=-1) + compute_metrics(y_true, y_pred, svm_model.labels) + + logging.info("Saving scores to %s", score_file) + score_table = {"segmentid": segs["id"]} + for i, key in enumerate(svm_model.labels): + score_table[key] = scores[:, i] + + score_table = pd.DataFrame(score_table) + score_table.to_csv(score_file, sep="\t", index=False) + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Evals linear SVM", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--trial-list", required=True) + SVM.add_eval_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--has-labels", default=False, action=ActionYesNo) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/eval_be_v5.py b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v5.py new file mode 100755 index 00000000..ad11a667 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/eval_be_v5.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList +from hyperion.np.classifiers import GaussianSVMC as SVM +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("test 
acc: %.2f %%", acc * 100)
+    logging.info("non-normalized confusion matrix:")
+    label_idxs = [i for i in range(len(labels))]
+    C = compute_confusion_matrix(y_true, y_pred, label_idxs, normalize=False)
+    print_confusion_matrix(C, labels)
+    logging.info("normalized confusion matrix:")
+    C = compute_confusion_matrix(y_true, y_pred, label_idxs, normalize=True)
+    print_confusion_matrix(C * 100, labels, fmt=".2f")
+
+
+def eval_be(
+    v_file,
+    trial_list,
+    class_name,
+    has_labels,
+    svm,
+    model_dir,
+    score_file,
+    verbose,
+):
+    config_logger(verbose)
+    model_dir = Path(model_dir)
+    output_dir = Path(score_file).parent
+    output_dir.mkdir(parents=True, exist_ok=True)
+    logging.info("loading data")
+    segs = SegmentSet.load(trial_list)
+    reader = DRF.create(v_file)
+    x = reader.read(segs["id"], squeeze=True)
+    del reader
+    logging.info("loaded %d samples", x.shape[0])
+
+    trans_file = model_dir / "transforms.h5"
+    if trans_file.is_file():
+        logging.info("loading transform file %s", trans_file)
+        trans = TransformList.load(trans_file)
+        logging.info("applying transform")
+        x = trans(x)
+
+    svm_file = model_dir / "model_svm.h5"
+    logging.info("loading SVM file %s", svm_file)
+    svm_model = SVM.load(svm_file)
+    if not isinstance(svm_model, SVM):
+        raise TypeError("failed to load GaussianSVMC model from %s" % svm_file)
+
+    logging.info("SVM args=%s", str(svm))
+    logging.info("evals SVM")
+    scores = svm_model(x, **svm)
+
+    if has_labels:
+        class_ids = segs[class_name]
+        # keep only segments whose label the model knows, so that y_true and
+        # y_pred stay aligned
+        mask = np.asarray([l in svm_model.labels for l in class_ids])
+        y_true = np.asarray(
+            [svm_model.labels.index(l) for l in class_ids if l in svm_model.labels]
+        )
+        y_pred = np.argmax(scores, axis=-1)[mask]
+        compute_metrics(y_true, y_pred, svm_model.labels)
+
+    logging.info("Saving scores to %s", score_file)
+    score_table = {"segmentid": segs["id"]}
+    for i, key in enumerate(svm_model.labels):
+        score_table[key] = scores[:, i]
+
+    score_table = pd.DataFrame(score_table)
+    score_table.to_csv(score_file, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+
+    parser = ArgumentParser(
+        description="Evals gaussian SVM",
+    )
+
+    parser.add_argument("--v-file", required=True)
+    parser.add_argument("--trial-list", required=True)
+    SVM.add_eval_args(parser, prefix="svm")
+    parser.add_argument("--class-name", default="class_id")
+    parser.add_argument("--has-labels", default=False, action=ActionYesNo)
+    parser.add_argument("--model-dir", required=True)
+    parser.add_argument("--score-file", required=True)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    eval_be(**namespace_to_dict(args))
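Note: apply_nary_lin_fusion is not defined in this patch; it appears to come
from the FoCal tooling fetched by run_001 (local/download_focal.sh). As a rough
sketch of the operation the MATLAB wrapper below applies, assuming K per-system
score matrices of shape (num_classes, num_segments), a length-K weight vector
alpha, and a per-class offset beta:

    import numpy as np

    def apply_nary_lin_fusion(scores, alpha, beta):
        # linear n-ary fusion: weighted sum of the K systems' score matrices
        # plus a per-class offset broadcast over segments
        fused = sum(a * s for a, s in zip(alpha, scores))
        return fused + np.asarray(beta).reshape(-1, 1)

With alpha = [0.6, 0.4] and beta = 0 this reduces to a weighted average of two
systems' scores.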
diff --git a/egs/lre22/fixed.v1.8k/steps_be/eval_fusion.m b/egs/lre22/fixed.v1.8k/steps_be/eval_fusion.m
new file mode 100644
index 00000000..830ee6c8
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/steps_be/eval_fusion.m
@@ -0,0 +1,17 @@
+function eval_fusion(in_files, out_file, model_file)
+
+  load(model_file, 'alpha', 'beta', 'labels');
+  n_files = length(in_files);
+  scores={};
+  for i=1:n_files
+    T_i = readtable(in_files{i}, 'FileType', 'delimitedtext', 'Delimiter','tab', 'ReadRowNames', true, 'VariableNamingRule', 'preserve');
+    T_i = sortrows(T_i, 'RowNames');
+    s_i = T_i.Variables';
+    scores{i}=s_i;
+  end
+  scores = apply_nary_lin_fusion(scores, alpha, beta);
+  T_i.Variables = scores';
+  %T_i.Properties.VariableNames = T_i.Properties.VariableDescriptions;
+  writetable(T_i, out_file, 'FileType', 'text', 'Delimiter','tab', 'WriteRowNames', true)
+
\ No newline at end of file
diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v1.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v1.py
new file mode 100755
index 00000000..983d903d
--- /dev/null
+++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v1.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+"""
+ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import sys
+import os
+import logging
+from jsonargparse import (
+    ArgumentParser,
+    ActionConfigFile,
+    ActionParser,
+    namespace_to_dict,
+    ActionYesNo,
+)
+import time
+from pathlib import Path
+
+import numpy as np
+
+from hyperion.hyp_defs import config_logger
+from hyperion.utils import SegmentSet
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.helpers import VectorClassReader as VCR
+from hyperion.np.transforms import TransformList, PCA, LNorm
+from hyperion.np.classifiers import LinearGBE as GBE
+from hyperion.np.metrics import (
+    compute_accuracy,
+    compute_confusion_matrix,
+    print_confusion_matrix,
+)
+
+
+def compute_metrics(y_true, y_pred, labels):
+
+    acc = compute_accuracy(y_true, y_pred)
+    logging.info("training acc: %.2f %%", acc * 100)
+    logging.info("non-normalized confusion matrix:")
+    C = compute_confusion_matrix(y_true, y_pred, normalize=False)
+    print_confusion_matrix(C, labels)
+    logging.info("normalized confusion matrix:")
+    C = compute_confusion_matrix(y_true, y_pred, normalize=True)
+    print_confusion_matrix(C * 100, labels)
+
+
+def train_be(
+    v_file,
+    train_list,
+    class_name,
+    do_lnorm,
+    whiten,
+    pca,
+    gbe,
+    output_dir,
+    verbose,
+):
+    config_logger(verbose)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    logging.info("loading data")
+    train_segs = SegmentSet.load(train_list)
+    train_reader = DRF.create(v_file)
+    x_trn = train_reader.read(train_segs["id"], squeeze=True)
+    del train_reader
+    class_ids = train_segs[class_name]
+    labels, y_true = np.unique(class_ids, return_inverse=True)
+    logging.info("loaded %d samples", x_trn.shape[0])
+
+    logging.info("PCA args=%s", str(pca))
+    pca_var_r = pca["pca_var_r"]
+    pca_dim = pca["pca_dim"]
+    if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None:
+        logging.info("training PCA")
+        pca = PCA(**pca)
+        pca.fit(x_trn)
+        logging.info("PCA dimension: %d", pca.pca_dim)
+        logging.info("apply PCA")
+        x_trn = pca(x_trn)
+    else:
+        pca = None
+
+    if do_lnorm:
+        lnorm = LNorm()
+        if whiten:
+            logging.info("training whitening")
+            lnorm.fit(x_trn)
+
+        logging.info("apply lnorm")
+        x_trn = lnorm(x_trn)
+    else:
+        lnorm = None
+
+    logging.info("GBE args=%s", str(gbe))
+    gbe = GBE(labels=labels, **gbe)
+    gbe.fit(x_trn, y_true)
+    logging.info("trained GBE")
+    scores = gbe.eval_linear(x_trn)
+    y_pred = np.argmax(scores, axis=-1)
+
+    compute_metrics(y_true, y_pred, labels)
+
+    logging.info("Saving transforms and Gaussian BE")
+    transforms = []
+    if pca is not None:
+        transforms.append(pca)
+    if lnorm is not None:
+        transforms.append(lnorm)
+
+    if transforms:
+        transforms = TransformList(transforms)
+        transforms.save(output_dir / "transforms.h5")
+
+    gbe.save(output_dir / 
"model_gbe.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Train linear GBE", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GBE.add_class_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v2.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v2.py new file mode 100755 index 00000000..599b55c4 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v2.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels) + + +def train_be( + v_file, + train_list, + class_name, + do_lnorm, + whiten, + pca, + svm, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + train_reader = DRF.create(v_file) + x_trn = train_reader.read(train_segs["id"], squeeze=True) + del train_reader + class_ids = train_segs[class_name] + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("loaded %d samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("SVM args=%s", str(svm)) + model = SVM(labels=labels, **svm) + model.fit(x_trn, y_true) + logging.info("trained SVM") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + 
compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and SVM") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.save(output_dir / "model_svm.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Train linear SVM Classifier", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + PCA.add_class_args(parser, prefix="pca") + SVM.add_class_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v3.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v3.py new file mode 100755 index 00000000..87009212 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v3.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + lre17_v_file, + lre17_list, + class_name, + do_lnorm, + whiten, + ary_thr, + num_nons, + pca, + svm, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_lre17 = SegmentSet.load(lre17_list) + ary_idx = segs_lre17[class_name] == "ara-ary" + # lre17_segs.loc[ara_ary_idx, class_name] = "ara-ayl" # "ara-arq" # "ara-aeb" + segs_ary = segs_lre17.loc[ary_idx] + + logging.info("label maghrebi arabic samples") + v_reader = DRF.create(lre17_v_file) + x_ary = v_reader.read(segs_ary["id"], squeeze=True) + 
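The block below pseudo-labels these LRE17 "ara-ary" (Maghrebi Arabic) segments: a GBE trained on the three LRE22 Maghrebi dialects scores them, and only segments whose top softmax posterior exceeds --ary-thr are relabeled and kept. A minimal sketch of that selection with stand-in scores (note that, despite the name `logp_ary`, the script thresholds a posterior in (0, 1], so the default --ary-thr of 10 keeps nothing unless a smaller value is passed):

```python
import numpy as np

def softmax(x, axis=-1):
    x = x - np.max(x, axis=axis, keepdims=True)  # shift for numerical stability
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.default_rng(0)
scores = rng.normal(size=(5, 3))   # 5 segments x 3 Maghrebi dialect classes
p = softmax(scores, axis=-1)       # class posteriors per segment
y_pred = scores.argmax(axis=-1)    # most likely dialect per segment
p_max = p.max(axis=-1)             # confidence of that decision
sel = p_max > 0.5                  # keep only confident segments
print(y_pred[sel], p_max[sel])
```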
logging.info("loaded %d lre17 ara-ary samples", x_ary.shape[0]) + + ara_idx = train_segs[class_name].isin(["ara-ayl", "ara-arq", "ara-aeb"]) + x_ara = x_trn[ara_idx] + class_ids_ara = train_segs.loc[ara_idx, class_name].values + + gbe_ara = GBE() + labels_ara, y_true_ara = np.unique(class_ids_ara, return_inverse=True) + gbe_ara.fit(x_ara, y_true_ara) + scores_ary = gbe_ara(x_ary) + y_pred_ary = np.argmax(scores_ary, axis=-1) + logp_ary = np.max(softmax(scores_ary, axis=-1), axis=-1) + print(logp_ary, y_pred_ary) + # dscores_ary = np.diff(np.sort(scores_ary, axis=-1), axis=-1)[:, -1] + # sel_ary = dscores_ary > ary_thr + sel_ary = logp_ary > ary_thr + segs_ary = segs_ary.loc[sel_ary] + y_pred_ary = y_pred_ary[sel_ary] + x_ary = x_ary[sel_ary] + segs_ary[class_name] = [labels_ara[c] for c in y_pred_ary] + logging.info("selected %d ara-ary segments", x_ary.shape[0]) + segs_ary["logp"] = logp_ary[sel_ary] + SegmentSet(segs_ary).save(output_dir / "segs_ary.csv") + + logging.info("selecting non-target segments") + segs_non = segs_lre17.loc[~ary_idx].copy() + segs_non[class_name] = "zzzzzz" + x_non = v_reader.read(segs_non["id"], squeeze=True) + logging.info("loaded %d lre17 non-tar samples", x_non.shape[0]) + + class_ids = train_segs[class_name].values + labels, y_true = np.unique(class_ids, return_inverse=True) + gbe = GBE() + gbe.fit(x_trn, y_true) + scores_non = np.max(gbe(x_non), axis=1) + sel_non = np.argsort(scores_non)[-num_nons:] + segs_non = segs_non.iloc[sel_non] + x_non = x_non[sel_non] + logging.info("selected %d non-tar segments", x_non.shape[0]) + + class_ids = ( + list(train_segs[class_name].values) + + list(segs_ary[class_name].values) + + list(segs_non[class_name].values) + ) + x_trn = np.concatenate((x_trn, x_ary, x_non), axis=0) + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("SVM args=%s", str(svm)) + model = SVM(labels=labels, **svm) + model.fit(x_trn, y_true) + logging.info("trained SVM") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and SVM") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.svm.coef_ = model.svm.coef_[:-1] + model.svm.intercept_ = model.svm.intercept_[:-1] + model.labels = model.labels[:-1] + model.save(output_dir / "model_svm.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Train linear SVM Classifier", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--lre17-v-file", required=True) + parser.add_argument("--lre17-list", required=True) + PCA.add_class_args(parser, prefix="pca") + SVM.add_class_args(parser, prefix="svm") + parser.add_argument("--class-name", 
default="class_id") + parser.add_argument("--ary-thr", default=10, type=float) + parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v4.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v4.py new file mode 100755 index 00000000..986393a8 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v4.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + lre17_v_file, + lre17_list, + class_name, + do_lnorm, + whiten, + ary_thr, + # num_nons, + pca, + gbe, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_lre17 = SegmentSet.load(lre17_list) + ary_idx = segs_lre17[class_name] == "ara-ary" + # lre17_segs.loc[ara_ary_idx, class_name] = "ara-ayl" # "ara-arq" # "ara-aeb" + segs_ary = segs_lre17.loc[ary_idx] + + logging.info("label maghrebi arabic samples") + v_reader = DRF.create(lre17_v_file) + x_ary = v_reader.read(segs_ary["id"], squeeze=True) + logging.info("loaded %d lre17 ara-ary samples", x_ary.shape[0]) + + ara_idx = train_segs[class_name].isin(["ara-ayl", "ara-arq", "ara-aeb"]) + x_ara = x_trn[ara_idx] + class_ids_ara = train_segs.loc[ara_idx, class_name].values + + gbe_ara = GBE() + labels_ara, y_true_ara = np.unique(class_ids_ara, return_inverse=True) + gbe_ara.fit(x_ara, y_true_ara) + scores_ary = gbe_ara(x_ary) + y_pred_ary = np.argmax(scores_ary, axis=-1) + p_ary = np.max(softmax(scores_ary, axis=-1), axis=-1) + sel_ary = p_ary > ary_thr + segs_ary = segs_ary.loc[sel_ary] + y_pred_ary = y_pred_ary[sel_ary] + x_ary = x_ary[sel_ary] + 
segs_ary[class_name] = [labels_ara[c] for c in y_pred_ary] + logging.info("selected %d ara-ary segments", x_ary.shape[0]) + segs_ary["p"] = p_ary[sel_ary] + SegmentSet(segs_ary).save(output_dir / "segs_ary.csv") + + # logging.info("selecting non-target segments") + # segs_non = segs_lre17.loc[~ary_idx].copy() + # segs_non[class_name] = "zzzzzz" + # x_non = v_reader.read(segs_non["id"], squeeze=True) + # logging.info("loaded %d lre17 non-tar samples", x_non.shape[0]) + + # class_ids = train_segs[class_name].values + # labels, y_true = np.unique(class_ids, return_inverse=True) + # gbe = GBE() + # gbe.fit(x_trn, y_true) + # scores_non = np.max(gbe(x_non), axis=1) + # sel_non = np.argsort(scores_non)[-num_nons:] + # segs_non = segs_non.iloc[sel_non] + # x_non = x_non[sel_non] + # logging.info("selected %d non-tar segments", x_non.shape[0]) + + # class_ids = ( + # list(train_segs[class_name].values) + # + list(segs_ary[class_name].values) + # + list(segs_non[class_name].values) + # ) + # x_trn = np.concatenate((x_trn, x_ary, x_non), axis=0) + class_ids = list(train_segs[class_name].values) + list(segs_ary[class_name].values) + x_trn = np.concatenate((x_trn, x_ary), axis=0) + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("GBE args=%s", str(gbe)) + model = GBE(labels=labels, **gbe) + model.fit(x_trn, y_true) + logging.info("trained GBE") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and GBE") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.save(output_dir / "model_gbe.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser( + description="Train linear GBE Classifier", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--lre17-v-file", required=True) + parser.add_argument("--lre17-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GBE.add_class_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--ary-thr", default=10, type=float) + # parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v5.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v5.py new file mode 100755 index 00000000..32cfd6c9 --- /dev/null +++ 
b/egs/lre22/fixed.v1.8k/steps_be/train_be_v5.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.helpers import VectorClassReader as VCR +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.classifiers import GaussianSVMC as GSVM +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + lre17_v_file, + lre17_list, + voxlingua_v_file, + voxlingua_list, + class_name, + do_lnorm, + whiten, + ary_thr, + num_nons, + pca, + svm, + output_dir, + verbose, + do_vl, + do_lre17, +): + print(locals(), flush=True) + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + x_ary = [] + x_non = [] + y_ary = [] + y_non = [] + + if do_lre17: + segs_lre17 = SegmentSet.load(lre17_list) + ary_idx = segs_lre17[class_name] == "ara-ary" + # lre17_segs.loc[ara_ary_idx, class_name] = "ara-ayl" # "ara-arq" # "ara-aeb" + segs_ary = segs_lre17.loc[ary_idx] + + logging.info("label maghrebi arabic samples") + v_reader = DRF.create(lre17_v_file) + x_ary = v_reader.read(segs_ary["id"], squeeze=True) + logging.info("loaded %d lre17 ara-ary samples", x_ary.shape[0]) + + ara_idx = train_segs[class_name].isin(["ara-ayl", "ara-arq", "ara-aeb"]) + x_ara = x_trn[ara_idx] + class_ids_ara = train_segs.loc[ara_idx, class_name].values + + gbe_ara = GBE() + labels_ara, y_true_ara = np.unique(class_ids_ara, return_inverse=True) + gbe_ara.fit(x_ara, y_true_ara) + scores_ary = gbe_ara(x_ary) + y_pred_ary = np.argmax(scores_ary, axis=-1) + logp_ary = np.max(softmax(scores_ary, axis=-1), axis=-1) + print(logp_ary, y_pred_ary) + # dscores_ary = np.diff(np.sort(scores_ary, axis=-1), axis=-1)[:, -1] + # sel_ary = dscores_ary > ary_thr + sel_ary = logp_ary > ary_thr + segs_ary = segs_ary.loc[sel_ary] + y_pred_ary = y_pred_ary[sel_ary] + x_ary = x_ary[sel_ary] + segs_ary[class_name] = [labels_ara[c] for c in y_pred_ary] + logging.info("selected %d ara-ary segments", x_ary.shape[0]) + segs_ary["logp"] = logp_ary[sel_ary] + SegmentSet(segs_ary).save(output_dir / "segs_ary.csv") + + logging.info("selecting non-target segments") + 
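The lines that follow pool confusable LRE17 languages (and, further down, VoxLingua107 ones) into a single filler class named "zzzzzz", train the classifier with it, and strip it before saving so that evaluation scores cover only the LRE22 targets. The name is load-bearing: `np.unique` sorts the labels, so the filler lands in the last class slot and its parameters can be dropped by slicing, as train_be_v3.py/v7.py do with `svm.coef_` and `svm.intercept_`. A sketch of the idea with a stand-in weight matrix (not hyperion's SVM object):

```python
import numpy as np

labels = np.unique(["afr-afr", "ara-aeb", "zzzzzz"])  # sorted, filler is last
rng = np.random.default_rng(0)
W = rng.normal(size=(len(labels), 4))  # one weight row per class (stand-in)
b = np.zeros(len(labels))              # one bias per class (stand-in)

# Drop the filler class before saving; scoring then covers targets only.
W, b, labels = W[:-1], b[:-1], labels[:-1]
assert "zzzzzz" not in labels
```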
lre17_close_idx = segs_lre17[class_name].isin( + ["ara-acm", "ara-apc", "eng-usg", "por-brz"] + ) + segs_non = segs_lre17.loc[lre17_close_idx].copy() + segs_non[class_name] = "zzzzzz" + x_non = v_reader.read(segs_non["id"], squeeze=True) + logging.info("loaded %d lre17 non-tar samples", x_non.shape[0]) + + y_ary = list(segs_ary[class_name].values) + y_non = list(segs_non[class_name].values) + + # class_ids = train_segs[class_name].values + # labels, y_true = np.unique(class_ids, return_inverse=True) + # gbe = GBE() + # gbe.fit(x_trn, y_true) + # scores_non = np.max(gbe(x_non), axis=1) + # sel_non = np.argsort(scores_non)[-num_nons:] + # segs_non = segs_non.iloc[sel_non] + # x_non = x_non[sel_non] + # logging.info("selected %d non-tar segments", x_non.shape[0]) + + if do_vl: + v_reader_vl = DRF.create(voxlingua_v_file) + segs_voxlingua = SegmentSet.load(voxlingua_list) + vl_close_idx = segs_voxlingua[class_name].isin( + [ + "en-en", + "am-am", + "sn-sn", + "fra-mix", + "haw-haw", + "zho-cmn", + "ia-ia", + "ceb-ceb", + "sa-sa", + "su-su", + "te-te", + "yo-yo", + "sw-sw", + "pt-pt", + "war-war", + "km-km", + "tr-tr", + "gn-gn", + ] + ) + segs_vl_close = segs_voxlingua.loc[vl_close_idx].copy() + segs_vl_close[class_name] = "zzzzzz" + x_non_vl = v_reader_vl.read(segs_vl_close["id"], squeeze=True) + + vl_afk_idx = segs_voxlingua[class_name] == "afr-afr" + if not np.any(vl_afk_idx): + vl_afk_idx = segs_voxlingua[class_name] == "af-af" + segs_vl_afk = segs_voxlingua.loc[vl_afk_idx].copy() + segs_vl_afk[class_name] = "afr-afr" + x_trn_vl = v_reader_vl.read(segs_vl_afk["id"], squeeze=True) + + y_trn_vl = list(segs_vl_afk[class_name].values) + y_non_vl = list(segs_vl_close[class_name].values) + + del v_reader_vl + else: + x_trn_vl = np.zeros((0, x_trn.shape[1])) + x_non_vl = np.zeros((0, x_trn.shape[1])) + y_trn_vl = [] + y_non_vl = [] + + class_ids = ( + list(train_segs[class_name].values) + y_trn_vl + y_ary + y_non + y_non_vl + ) + x_trn = np.concatenate((x_trn, x_trn_vl, x_ary, x_non, x_non_vl), axis=0) + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("Gaussian SVM args=%s", str(svm)) + model = GSVM(labels=labels, **svm) + model.fit(x_trn, y_true) + logging.info("trained SVM") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and SVM") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + # model.svm.coef_ = model.svm.coef_[:-1] + # model.svm.intercept_ = model.svm.intercept_[:-1] + model_labels = list(np.copy(model.labels)) + if "zzzzzz" in model_labels: + model_labels.remove("zzzzzz") + model.labels = model_labels + print("model.labels before save", np.shape(model.labels)) + model.save(output_dir / "model_svm.h5") + + +if 
__name__ == "__main__": + parser = ArgumentParser( + description="Train gaussian SVM Classifier", + ) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--lre17-v-file", required=True) + parser.add_argument("--lre17-list", required=True) + parser.add_argument("--voxlingua-v-file", required=True) + parser.add_argument("--voxlingua-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GSVM.add_class_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--ary-thr", default=10, type=float) + parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--do-vl", default=True, action=ActionYesNo) + parser.add_argument("--do-lre17", default=True, action=ActionYesNo) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + # parser.add_argument("--classifier", default="lsvm", choices=["lsvm", "gsvm", "rf"], required=False) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v6.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v6.py new file mode 100755 index 00000000..d481a18d --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v6.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + +tar_langs = ( + "afr-afr", + "ara-aeb", + "ara-arq", + "ara-ayl", + "eng-ens", + "eng-iaf", + "fra-ntf", + "nbl-nbl", + "orm-orm", + "tir-tir", + "tso-tso", + "ven-ven", + "xho-xho", + "zul-zul", +) + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + cv_v_file, + cv_list, + afr_v_file, + afr_list, + class_name, + do_lnorm, + whiten, + pca, + gbe, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_cv = SegmentSet.load(cv_list) + # 
ary_idx = segs_lre17[class_name] == "ara-ary" + # segs_ary = segs_lre17.loc[ary_idx] + + segs_cv = SegmentSet.load(cv_list) + cv_idx = np.zeros((len(segs_cv),), dtype=bool) + for lang in tar_langs: + cv_idx_i = segs_cv[class_name] == lang + cv_idx = np.logical_or(cv_idx, cv_idx_i) + + segs_cv = segs_cv.loc[cv_idx] + # segs_cv.loc[segs_cv[class_name] == "eng-ine", class_name] = "eng-iaf" + + # v_reader = DRF.create(cv_v_file) + # x_cv = v_reader.read(segs_cv["id"], squeeze=True) + # logging.info("loaded %d cv samples", x_cv.shape[0]) + + segs_afr = SegmentSet.load(afr_list) + afr_idx = np.zeros((len(segs_afr),), dtype=bool) + for lang in tar_langs: + afr_idx_i = segs_afr[class_name] == lang + afr_idx = np.logical_or(afr_idx, afr_idx_i) + + segs_afr = segs_afr.loc[afr_idx] + + v_reader = DRF.create(afr_v_file) + x_afr = v_reader.read(segs_afr["id"], squeeze=True) + logging.info("loaded %d afr samples", x_afr.shape[0]) + + class_ids = ( + list(train_segs[class_name].values) + # + list(segs_cv[class_name].values) + + list(segs_afr[class_name].values) + ) + # x_trn = np.concatenate((x_trn, x_cv, x_afr), axis=0) + x_trn = np.concatenate((x_trn, x_afr), axis=0) + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + else: + lnorm = None + + logging.info("GBE args=%s", str(gbe)) + model = GBE(labels=labels, **gbe) + model.fit(x_trn, y_true) + logging.info("trained GBE") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and GBE") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.save(output_dir / "model_gbe.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train linear GBE Classifier",) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--cv-v-file", required=True) + parser.add_argument("--cv-list", required=True) + parser.add_argument("--afr-v-file", required=True) + parser.add_argument("--afr-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GBE.add_class_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v7.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v7.py new file mode 100755 index 00000000..1b37d92e --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v7.py @@ -0,0 +1,315 @@ 
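train_be_v7.py, added below, pools the LRE22 training set with out-of-domain SRE, LRE17, CommonVoice, and African-corpus segments, and down-weights most OOD sources by --ood-weight when fitting the SVM (LRE17 non-targets keep weight 1). A minimal sketch of how that weight vector lines up with the stacked x-vector matrix; the sizes here are invented for illustration:

```python
import numpy as np

n_lre22, n_sre, n_lre17_non, n_cv = 1000, 200, 300, 150  # invented sizes
ood_weight = 0.1

x = np.zeros((n_lre22 + n_sre + n_lre17_non + n_cv, 256))  # stacked x-vectors
sample_weight = np.concatenate((
    np.ones(n_lre22),                 # in-domain data keeps full weight
    ood_weight * np.ones(n_sre),      # down-weighted OOD source
    np.ones(n_lre17_non),             # LRE17 non-targets keep weight 1 in v7
    ood_weight * np.ones(n_cv),       # down-weighted OOD source
))
assert len(sample_weight) == len(x)   # one weight per stacked sample
```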
+#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearSVMC as SVM +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + +tar_langs = ( + "afr-afr", + "ara-aeb", + "ara-arq", + "ara-ayl", + "eng-ens", + "eng-iaf", + "fra-ntf", + "nbl-nbl", + "orm-orm", + "tir-tir", + "tso-tso", + "ven-ven", + "xho-xho", + "zul-zul", +) + +non_langs = ( + "fra-can", + "fra-fra", + "fra-ntf", + "afr-afr", + "ara-acm", + "ara-arz", + "ara-jor", + "ara-ksa", + "ara-kuw", + "ara-leb", + "ara-mau", + "ara-mor", + "ara-oma", + "ara-pal", + "ara-qat", + "ara-sud", + "ara-syr", + "ara-uae", + "ara-yem", + "ara-apc", + "eng-gbr", + "eng-usg", +) + + +def read_ood_data(train_list, v_file, class_name): + v_reader = DRF.create(v_file) + + segs = SegmentSet.load(train_list) + idx = np.zeros((len(segs),), dtype=bool) + for lang in tar_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_tar = segs.loc[idx].copy() + if len(segs_tar) > 0: + x_tar = v_reader.read(segs_tar["id"], squeeze=True) + else: + x_tar = None + + idx = np.zeros((len(segs),), dtype=bool) + for lang in non_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_non = segs.loc[idx].copy() + segs_non[class_name] = "zzzzzzz" + if len(segs_non) > 0: + x_non = v_reader.read(segs_non["id"], squeeze=True) + else: + x_non = None + + logging.info( + "read %s got ntar: %d nnon: %d", train_list, len(segs_tar), len(segs_non) + ) + return segs_tar, x_tar, segs_non, x_non + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + sre_v_file, + sre_list, + lre17_v_file, + lre17_list, + cv_v_file, + cv_list, + afr_v_file, + afr_list, + class_name, + do_lnorm, + whiten, + pca, + svm, + output_dir, + ood_weight, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_sre_tar, x_sre_tar, segs_sre_non, x_sre_non = read_ood_data( + sre_list, sre_v_file, class_name, + ) + _, _, segs_lre17_non, x_lre17_non = read_ood_data( + lre17_list, lre17_v_file, class_name, + ) + segs_cv_tar, x_cv_tar, segs_cv_non, x_cv_non = read_ood_data( + cv_list, cv_v_file, 
class_name + ) + segs_afr_tar, x_afr_tar, segs_afr_non, x_afr_non = read_ood_data( + afr_list, afr_v_file, class_name, + ) + + # class_ids = train_segs[class_name].values + # labels, y_true = np.unique(class_ids, return_inverse=True) + # gbe = GBE() + # gbe.fit(x_trn, y_true) + # scores_non = np.max(gbe(x_non), axis=1) + # sel_non = np.argsort(scores_non)[-num_nons:] + # segs_non = segs_non.iloc[sel_non] + # x_non = x_non[sel_non] + # logging.info("selected %d non-tar segments", x_non.shape[0]) + + # class_ids = ( + # list(train_segs[class_name].values) + # + list(segs_sre_tar[class_name].values) + # + list(segs_cv_tar[class_name].values) + # + list(segs_afr_tar[class_name].values) + # + list(segs_sre_non[class_name].values) + # + list(segs_lre17_non[class_name].values) + # + list(segs_cv_non[class_name].values) + # + list(segs_afr_non[class_name].values) + # ) + # x_trn = np.concatenate( + # ( + # x_trn, + # x_sre_tar, + # x_cv_tar, + # x_afr_tar, + # x_sre_non, + # x_lre17_non, + # x_cv_non, + # x_afr_non, + # ), + # axis=0, + # ) + class_ids = ( + list(train_segs[class_name].values) + + list(segs_sre_tar[class_name].values) + + list(segs_cv_tar[class_name].values) + + list(segs_afr_tar[class_name].values) + + list(segs_sre_non[class_name].values) + + list(segs_lre17_non[class_name].values) + + list(segs_cv_non[class_name].values) + + list(segs_afr_non[class_name].values) + ) + x = np.concatenate( + ( + x_trn, + x_sre_tar, + x_cv_tar, + x_afr_tar, + x_sre_non, + x_lre17_non, + x_cv_non, + x_afr_non, + ), + axis=0, + ) + sample_weight = np.concatenate( + ( + np.ones((len(train_segs),)), + ood_weight * np.ones((len(segs_sre_tar),)), + ood_weight * np.ones((len(segs_cv_tar),)), + ood_weight * np.ones((len(segs_afr_tar),)), + ood_weight * np.ones((len(segs_sre_non),)), + np.ones((len(segs_lre17_non),)), + ood_weight * np.ones((len(segs_cv_non),)), + ood_weight * np.ones((len(segs_afr_non),)), + ) + ) + + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x = pca(x) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x) + + logging.info("apply lnorm") + x = lnorm(x) + else: + lnorm = None + + logging.info("SVM args=%s", str(svm)) + model = SVM(labels=labels, **svm) + model.fit(x, y_true, sample_weight=sample_weight) + logging.info("trained SVM") + scores = model(x) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and SVM") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.svm.coef_ = model.svm.coef_[:-1] + model.svm.intercept_ = model.svm.intercept_[:-1] + model.labels = model.labels[:-1] + model.save(output_dir / "model_svm.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train linear SVM Classifier",) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--sre-v-file", required=True) + 
parser.add_argument("--sre-list", required=True) + parser.add_argument("--lre17-v-file", required=True) + parser.add_argument("--lre17-list", required=True) + parser.add_argument("--cv-v-file", required=True) + parser.add_argument("--cv-list", required=True) + parser.add_argument("--afr-v-file", required=True) + parser.add_argument("--afr-list", required=True) + PCA.add_class_args(parser, prefix="pca") + SVM.add_class_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + # parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--ood-weight", default=0.1, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v8.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v8.py new file mode 100755 index 00000000..ec9d5e56 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v8.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import GaussianSVMC as GSVM +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + +tar_langs = ( + "afr-afr", + "ara-aeb", + "ara-arq", + "ara-ayl", + "eng-ens", + "eng-iaf", + "fra-ntf", + "nbl-nbl", + "orm-orm", + "tir-tir", + "tso-tso", + "ven-ven", + "xho-xho", + "zul-zul", +) + +non_langs = ( + "fra-can", + "fra-fra", + "fra-ntf", + "afr-afr", + "ara-acm", + "ara-arz", + "ara-jor", + "ara-ksa", + "ara-kuw", + "ara-leb", + "ara-mau", + "ara-mor", + "ara-oma", + "ara-pal", + "ara-qat", + "ara-sud", + "ara-syr", + "ara-uae", + "ara-yem", + "ara-apc", + "eng-gbr", + "eng-usg", +) + + +def read_ood_data(train_list, v_file, class_name): + v_reader = DRF.create(v_file) + + segs = SegmentSet.load(train_list) + idx = np.zeros((len(segs),), dtype=bool) + for lang in tar_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_tar = segs.loc[idx].copy() + if len(segs_tar) > 0: + x_tar = v_reader.read(segs_tar["id"], squeeze=True) + else: + x_tar = None + + idx = np.zeros((len(segs),), dtype=bool) + for lang in non_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_non = segs.loc[idx].copy() + segs_non[class_name] = "zzzzzz" + if len(segs_non) > 0: + x_non = v_reader.read(segs_non["id"], squeeze=True) + else: + x_non = None + + logging.info( + "read %s got ntar: %d nnon: %d", train_list, len(segs_tar), len(segs_non) + ) + return segs_tar, x_tar, segs_non, x_non + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + 
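`compute_metrics`, repeated verbatim in each of these scripts, wraps only a few NumPy operations. An illustrative re-implementation of the two quantities it logs (this is not hyperion's actual code; normalizing each row by its true-class count is an assumption about `compute_confusion_matrix(normalize=True)`):

```python
import numpy as np

def toy_confusion_matrix(y_true, y_pred, num_classes, normalize=False):
    # C[i, j] counts samples of true class i predicted as class j
    C = np.zeros((num_classes, num_classes))
    for t, p in zip(y_true, y_pred):
        C[t, p] += 1.0
    if normalize:
        # each row sums to 1: the per-true-class decision pattern
        C = C / np.clip(C.sum(axis=1, keepdims=True), 1.0, None)
    return C

y_true = np.array([0, 0, 1, 2])
y_pred = np.array([0, 1, 1, 2])
acc = float((y_true == y_pred).mean())                       # 0.75
C = toy_confusion_matrix(y_true, y_pred, 3, normalize=True)
```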
logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + sre_v_file, + sre_list, + lre17_v_file, + lre17_list, + cv_v_file, + cv_list, + afr_v_file, + afr_list, + class_name, + do_lnorm, + whiten, + pca, + svm, + output_dir, + ood_weight, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_sre_tar, x_sre_tar, segs_sre_non, x_sre_non = read_ood_data( + sre_list, sre_v_file, class_name, + ) + _, _, segs_lre17_non, x_lre17_non = read_ood_data( + lre17_list, lre17_v_file, class_name, + ) + segs_cv_tar, x_cv_tar, segs_cv_non, x_cv_non = read_ood_data( + cv_list, cv_v_file, class_name + ) + segs_afr_tar, x_afr_tar, segs_afr_non, x_afr_non = read_ood_data( + afr_list, afr_v_file, class_name, + ) + + # class_ids = train_segs[class_name].values + # labels, y_true = np.unique(class_ids, return_inverse=True) + # gbe = GBE() + # gbe.fit(x_trn, y_true) + # scores_non = np.max(gbe(x_non), axis=1) + # sel_non = np.argsort(scores_non)[-num_nons:] + # segs_non = segs_non.iloc[sel_non] + # x_non = x_non[sel_non] + # logging.info("selected %d non-tar segments", x_non.shape[0]) + + # class_ids = ( + # list(train_segs[class_name].values) + # + list(segs_sre_tar[class_name].values) + # + list(segs_cv_tar[class_name].values) + # + list(segs_afr_tar[class_name].values) + # + list(segs_sre_non[class_name].values) + # + list(segs_lre17_non[class_name].values) + # + list(segs_cv_non[class_name].values) + # + list(segs_afr_non[class_name].values) + # ) + # x_trn = np.concatenate( + # ( + # x_trn, + # x_sre_tar, + # x_cv_tar, + # x_afr_tar, + # x_sre_non, + # x_lre17_non, + # x_cv_non, + # x_afr_non, + # ), + # axis=0, + # ) + class_ids = ( + list(train_segs[class_name].values) + + list(segs_sre_tar[class_name].values) + + list(segs_cv_tar[class_name].values) + + list(segs_afr_tar[class_name].values) + + list(segs_sre_non[class_name].values) + + list(segs_lre17_non[class_name].values) + + list(segs_cv_non[class_name].values) + + list(segs_afr_non[class_name].values) + ) + x = np.concatenate( + ( + x_trn, + x_sre_tar, + x_cv_tar, + x_afr_tar, + x_sre_non, + x_lre17_non, + x_cv_non, + x_afr_non, + ), + axis=0, + ) + sample_weight = np.concatenate( + ( + np.ones((len(train_segs),)), + ood_weight * np.ones((len(segs_sre_tar),)), + ood_weight * np.ones((len(segs_cv_tar),)), + ood_weight * np.ones((len(segs_afr_tar),)), + ood_weight * np.ones((len(segs_sre_non),)), + np.ones((len(segs_lre17_non),)), + ood_weight * np.ones((len(segs_cv_non),)), + ood_weight * np.ones((len(segs_afr_non),)), + ) + ) + + labels, y_true = np.unique(class_ids, return_inverse=True) + logging.info("%d training samples", x_trn.shape[0]) + + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA 
dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x = pca(x) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x) + + logging.info("apply lnorm") + x = lnorm(x) + else: + lnorm = None + + logging.info("SVM args=%s", str(svm)) + model = GSVM(labels=labels, **svm) + model.fit(x, y_true, sample_weight=sample_weight) + logging.info("trained SVM") + scores = model(x) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true, y_pred, labels) + + logging.info("Saving transforms and SVM") + transforms = [] + if pca is not None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model_labels = list(np.copy(model.labels)) + if "zzzzzz" in model_labels: + model_labels.remove("zzzzzz") + model.labels = model_labels + print("model.labels before save", np.shape(model.labels)) + model.save(output_dir / "model_svm.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train linear SVM Classifier",) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--sre-v-file", required=True) + parser.add_argument("--sre-list", required=True) + parser.add_argument("--lre17-v-file", required=True) + parser.add_argument("--lre17-list", required=True) + parser.add_argument("--cv-v-file", required=True) + parser.add_argument("--cv-list", required=True) + parser.add_argument("--afr-v-file", required=True) + parser.add_argument("--afr-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GSVM.add_class_args(parser, prefix="svm") + parser.add_argument("--class-name", default="class_id") + # parser.add_argument("--num-nons", default=10000, type=int) + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--ood-weight", default=0.1, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_be_v9.py b/egs/lre22/fixed.v1.8k/steps_be/train_be_v9.py new file mode 100755 index 00000000..5c174233 --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_be_v9.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import sys +import os +import logging +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, + ActionYesNo, +) +import time +from pathlib import Path + +import numpy as np + +from hyperion.hyp_defs import config_logger +from hyperion.utils import SegmentSet +from hyperion.utils.math import softmax +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.transforms import TransformList, PCA, LNorm +from hyperion.np.classifiers import LinearGBE as GBE +from hyperion.np.metrics import ( + compute_accuracy, + compute_confusion_matrix, + print_confusion_matrix, +) + +tar_langs = ( + "afr-afr", + "ara-aeb", + "ara-arq", + "ara-ayl", + "eng-ens", + "eng-iaf", + "fra-ntf", + "nbl-nbl", + "orm-orm", + "tir-tir", + "tso-tso", + "ven-ven", + "xho-xho", + "zul-zul", +) + + +def 
read_ood_data(train_list, v_file, class_name): + v_reader = DRF.create(v_file) + + segs = SegmentSet.load(train_list) + idx = np.zeros((len(segs),), dtype=bool) + for lang in tar_langs: + idx_i = segs[class_name] == lang + idx = np.logical_or(idx, idx_i) + + segs_tar = segs.loc[idx].copy() + if len(segs_tar) > 0: + x_tar = v_reader.read(segs_tar["id"], squeeze=True) + else: + x_tar = None + + logging.info( + "read %s got ntar: %d", train_list, len(segs_tar), + ) + return segs_tar, x_tar + + +def compute_metrics(y_true, y_pred, labels): + + acc = compute_accuracy(y_true, y_pred) + logging.info("training acc: %.2f %%", acc * 100) + logging.info("non-normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=False) + print_confusion_matrix(C, labels) + logging.info("normalized confusion matrix:") + C = compute_confusion_matrix(y_true, y_pred, normalize=True) + print_confusion_matrix(C * 100, labels, fmt=".2f") + + +def train_be( + v_file, + train_list, + sre_v_file, + sre_list, + cv_v_file, + cv_list, + afr_v_file, + afr_list, + class_name, + do_lnorm, + whiten, + pca, + gbe, + output_dir, + verbose, +): + config_logger(verbose) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + logging.info("loading data") + train_segs = SegmentSet.load(train_list) + v_reader = DRF.create(v_file) + x_trn = v_reader.read(train_segs["id"], squeeze=True) + del v_reader + logging.info("loaded %d train samples", x_trn.shape[0]) + + segs_sre, x_sre = read_ood_data(sre_list, sre_v_file, class_name,) + segs_cv, x_cv = read_ood_data(cv_list, cv_v_file, class_name) + segs_afr, x_afr = read_ood_data(afr_list, afr_v_file, class_name,) + + class_ids_trn = train_segs[class_name].values + x_ood = np.concatenate((x_sre, x_cv, x_afr), axis=0) + class_ids_ood = ( + list(segs_sre[class_name].values) + + list(segs_cv[class_name].values) + + list(segs_afr[class_name].values) + ) + + labels, y_true_trn = np.unique(class_ids_trn, return_inverse=True) + _, y_true_ood = np.unique( + np.concatenate((labels, class_ids_ood)), return_inverse=True + ) + y_true_ood = y_true_ood[len(labels) :] + + logging.info("%d ood samples", x_ood.shape[0]) + logging.info("%d training samples", x_trn.shape[0]) + + x_ood += np.mean(x_trn, axis=0, keepdims=True) - np.mean( + x_ood, axis=0, keepdims=True + ) + logging.info("PCA args=%s", str(pca)) + pca_var_r = pca["pca_var_r"] + pca_dim = pca["pca_dim"] + if pca_var_r is not None and pca_var_r < 1.0 or pca_dim is not None: + logging.info("training PCA") + pca = PCA(**pca) + pca.fit(x_trn) + logging.info("PCA dimension: %d", pca.pca_dim) + logging.info("apply PCA") + x_trn = pca(x_trn) + x_ood = pca(x_ood) + else: + pca = None + + if do_lnorm: + lnorm = LNorm() + if whiten: + logging.info("training whitening") + lnorm.fit(x_trn) + + logging.info("apply lnorm") + x_trn = lnorm(x_trn) + x_ood = lnorm(x_ood) + else: + lnorm = None + + prior_0 = GBE( + mu=np.zeros((len(labels), x_trn.shape[1])), + W=np.eye(x_trn.shape[1]), + beta=16, + nu=x_trn.shape[1], + ) + print(prior_0.__dict__) + prior = GBE(prior=prior_0) + prior.fit(x_ood, y_true_ood) + prior.nu = 0.1 * prior.nu + prior.beta = 0.01 * prior.beta + print(prior.__dict__) + model = GBE(labels=labels, prior=prior) + model.fit(x_trn, y_true_trn) + print(model.__dict__, flush=True) + logging.info("trained GBE") + scores = model(x_trn) + y_pred = np.argmax(scores, axis=-1) + + compute_metrics(y_true_trn, y_pred, labels) + + logging.info("Saving transforms and GBE") + transforms = [] + if pca is not 
None: + transforms.append(pca) + if lnorm is not None: + transforms.append(lnorm) + + if transforms: + transforms = TransformList(transforms) + transforms.save(output_dir / "transforms.h5") + + model.save(output_dir / "model_gbe.h5") + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Train linear GBE Classifier",) + + parser.add_argument("--v-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--sre-v-file", required=True) + parser.add_argument("--sre-list", required=True) + parser.add_argument("--cv-v-file", required=True) + parser.add_argument("--cv-list", required=True) + parser.add_argument("--afr-v-file", required=True) + parser.add_argument("--afr-list", required=True) + PCA.add_class_args(parser, prefix="pca") + GBE.add_class_args(parser, prefix="gbe") + parser.add_argument("--class-name", default="class_id") + parser.add_argument("--do-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--whiten", default=True, action=ActionYesNo) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() + train_be(**namespace_to_dict(args)) diff --git a/egs/lre22/fixed.v1.8k/steps_be/train_fusion.m b/egs/lre22/fixed.v1.8k/steps_be/train_fusion.m new file mode 100644 index 00000000..8f1c3dda --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_be/train_fusion.m @@ -0,0 +1,16 @@ +function train_fusion(train_list, in_files, model_file) + + train_list = readtable(train_list, 'FileType', 'delimitedtext', 'Delimiter', ' ', 'ReadVariableNames', false, 'ReadRowNames', true); + train_list = sortrows(train_list, 'RowNames'); + [labels, ia, ic]=unique(train_list); + n_files = length(in_files); + scores={}; + for i=1:n_files + T_i = readtable(in_files{i}, 'FileType', 'delimitedtext', 'Delimiter','tab', 'ReadRowNames', true, 'VariableNamingRule', 'preserve'); + T_i = sortrows(T_i, 'RowNames'); + s_i = T_i.Variables'; + scores{i}=s_i; + end + [alpha, beta] = train_nary_llr_fusion(scores, ic, 0, 1e-6, [], ones(1,1)) + save(model_file, 'alpha', 'beta', 'labels'); + \ No newline at end of file diff --git a/egs/lre22/fixed.v1.8k/steps_xvec b/egs/lre22/fixed.v1.8k/steps_xvec new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/lre22/fixed.v1.8k/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/egs/lre22/fixed.v1.8k/utils b/egs/lre22/fixed.v1.8k/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/lre22/fixed.v1.8k/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/README.md b/egs/lre22/open.v1.8k/README.md new file mode 100644 index 00000000..877f99ca --- /dev/null +++ b/egs/lre22/open.v1.8k/README.md @@ -0,0 +1,43 @@ +# LRE22 Open Condition V1 + +Recipe for the NIST LRE22 open condition, based on the JHU-MIT Submission. + +## Citing +``` +@inproceedings{villalba23_interspeech, + author={Jesús Villalba and Jonas Borgstrom and Maliha Jahan and Saurabh Kataria and Leibny Paola Garcia and Pedro Torres-Carrasquillo and Najim Dehak}, + title={{Advances in Language Recognition in Low Resource African Languages: The JHU-MIT Submission for NIST LRE22}}, + year=2023, + booktitle={Proc. 
INTERSPEECH 2023}, + pages={521--525}, + doi={10.21437/Interspeech.2023-1094} +} +``` + +## Training Data + + - x-Vector networks trained on: + - VoxLingua107 + - NIST LRE17 Train + Dev + Eval / CTS + AfV + - Gaussian back-end trained on: + - NIST LRE22 dev with 2-fold cross-val + x10 augmentations + +## Usage + + - Run the run_0*.sh scripts in sequence + - By default, it uses an ECAPA-TDNN with 4 layers of dimension 2048. + - To change the default network, run the scripts with the --config-file argument: +```bash +run_011_train_xvector.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh +run_030_extract_xvectors.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh --use-gpu true +run_040_be_final.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh +``` + +## Results + +| Config | Model Type | Model Details | Back-end | Dev MinCp | Dev ActCp | Eval MinCp | Eval ActCp | | ------ | ---------- | ------------- | -------- | :-------: | :-------: | :--------: | :--------: | | config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-2 | GBE | 0.207 | 0.209 | 0.198 | 0.199 | | config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-2 | GBE | 0.227 | 0.229 | 0.213 | 0.215 | | Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal | 0.182 | 0.183 | 0.180 | 0.181 | diff --git a/egs/lre22/open.v1.8k/cmd.sh b/egs/lre22/open.v1.8k/cmd.sh new file mode 100755 index 00000000..f22c66b4 --- /dev/null +++ b/egs/lre22/open.v1.8k/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01][234589]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/lre22/open.v1.8k/conf/clsp.conf b/egs/lre22/open.v1.8k/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_bigmem.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_long.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_rtx.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_short.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_short.conf new file mode 100644 index 
00000000..81de5cb7 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/lre22/open.v1.8k/conf/coe_gpu_v100.conf b/egs/lre22/open.v1.8k/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/lre22/open.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml b/egs/lre22/open.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml new file mode 100644 index 00000000..fce3804a --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/fbank64_specaug1_stmn_8k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
+ freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/open.v1.8k/conf/fbank64_stmn_8k.yaml b/egs/lre22/open.v1.8k/conf/fbank64_stmn_8k.yaml new file mode 100644 index 00000000..dfd0d3e5 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/fbank64_stmn_8k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/open.v1.8k/conf/other_conf/fbank64_stmn_8k.yaml b/egs/lre22/open.v1.8k/conf/other_conf/fbank64_stmn_8k.yaml new file mode 100644 index 00000000..dfd0d3e5 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/fbank64_stmn_8k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 8000 + frame_length: 25 + low_freq: 20 + high_freq: 3700 + num_filters: 64 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/open.v1.8k/conf/other_conf/fbank80_stmn_16k.yaml b/egs/lre22/open.v1.8k/conf/other_conf/fbank80_stmn_16k.yaml new file mode 100644 index 00000000..f4091f5d --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/fbank80_stmn_16k.yaml @@ -0,0 +1,12 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +mvn: + context: 150 + norm_var: false diff --git a/egs/lre22/open.v1.8k/conf/other_conf/reverb_noise_aug.yaml b/egs/lre22/open.v1.8k/conf/other_conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage1_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage1_v2.4.yaml new file mode 100644 index 00000000..16b17c08 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage1_v2.4.yaml @@ -0,0 +1,124 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + 
resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + - 2048 + - 3072 + - 3072 + - 4096 + - 4096 + - 5120 + - 5120 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 2 + - 3 + - 2 + - 3 + - 2 + - 3 + resb_strides: + - 1 + - 1 + - 2 + - 1 + - 2 + - 1 + - 2 + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 7168 + endpoint_layers: + - 2 + - 4 + - 6 + - 8 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage2_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage2_v2.4.yaml new file mode 100644 index 00000000..2bc8675f --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage2_v2.4.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
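+  # note: this stage-2 config re-defines only the margin-loss head; the
+  # encoder architecture is assumed to be inherited from the stage-1
+  # checkpoint loaded by the fine-tuning script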
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage3_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage3_v2.4.yaml new file mode 100644 index 00000000..2bc8675f --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048-5120x8_xvec_stage3_v2.4.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
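+  # stage-3 repeats the stage-2 settings unchanged (this file carries the
+  # same content as the stage-2 v2.4 config) for a further fine-tuning round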
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..f43b3712 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,99 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.1 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 15000 + use_amp: true + swa_start: 9 + swa_lr: 5e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 256 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.1.yaml new file mode 100644 index 00000000..5d98e662 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.1.yaml @@ -0,0 +1,95 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 256 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + cos_scale: 30.0 + margin: 0.2 + intertop_margin: 0.1 + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.2.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.2.yaml new file mode 100644 index 00000000..038e7207 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.2.yaml @@ -0,0 +1,97 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 256 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + 
resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + intertop_margin: 0.1 + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.3.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.3.yaml new file mode 100644 index 00000000..f0200ad2 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.3.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + margin: 0. + margin_warmup_epochs: 0 + intertop_margin: 0. 
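+  # the trainer below updates only the embedding affine layer
+  # (train_mode: ft-embed-affine) with a small SGD learning rate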
+trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + train_mode: ft-embed-affine diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.4.yaml new file mode 100644 index 00000000..3718b10b --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v2.4.yaml @@ -0,0 +1,97 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 256 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml new file mode 100644 index 00000000..d1c87491 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + dropout_rate: 0.3 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.98 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..66c69e8e --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + dropout_rate: 0.3 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.98 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.2.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.2.yaml new file mode 100644 index 00000000..3a4a81a7 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.2.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + dropout_rate: 0.3 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adamw + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.98 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.5.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.5.yaml new file mode 100644 index 00000000..17b1b6cf --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage1_v3.5.yaml @@ -0,0 +1,98 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + hid_act: swish + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + dropout_rate: 0.1 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0. + intertop_margin: 0. 
+ margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.98 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 24000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.1.yaml new file mode 100644 index 00000000..54f76200 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.2.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.2.yaml new file mode 100644 index 00000000..d68860be --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.2.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
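+  # same recipe as stage2_v2.1.1 above, but trained on 6 s chunks
+  # instead of 3 s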
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.yaml new file mode 100644 index 00000000..54f76200 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.3.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.3.yaml new file mode 100644 index 00000000..465d92eb --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.3.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 2 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 8000 + #hold_steps: 10000 + decay_steps: 12000 + hold_steps: 12000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 4000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.4.yaml new file mode 100644 index 00000000..64e71f65 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage2_v2.4.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 
30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.1.yaml new file mode 100644 index 00000000..64e71f65 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
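+  # relative to the stage-2 v2.1 config, this stage enables hard-prototype
+  # mining in the train sampler (num_hard_prototypes: 8 above)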
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.4.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.4.yaml new file mode 100644 index 00000000..64e71f65 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_ecapatdnn2048x4_xvec_stage3_v2.4.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. 
+trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.1.yaml new file mode 100644 index 00000000..fe0171d1 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.1.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 512 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + cos_scale: 30.0 + margin: 0.2 + intertop_margin: 0.1 + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.2.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.2.yaml new file mode 100644 index 00000000..80925cc7 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.2.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + resnet_type: tseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 512 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 128 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + intertop_margin: 0.1 + margin_warmup_epochs: 3.0 + 
dropout_rate: 0.0 + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 40000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.3.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.3.yaml new file mode 100644 index 00000000..11997c55 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage1_v2.3.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + margin: 0. + margin_warmup_epochs: 0 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + train_mode: ft-embed-affine diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.1.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.1.yaml new file mode 100644 index 00000000..cde840fe --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.1.yaml @@ -0,0 +1,79 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + #margin: 0.4 + #margin: 
0.2 + margin: 0. + margin_warmup_epochs: 2 + #intertop_margin: 0.1 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + # decay_steps: 8000 + decay_steps: 4000 + # hold_steps: 10000 + hold_steps: 5000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + # warmup_steps: 10000 + warmup_steps: 5000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.3.yaml b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.3.yaml new file mode 100644 index 00000000..4f704b29 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/train_tseres2net50s8_xvec_stage2_v2.3.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 12 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 1 + class_name: class_id + seg_weight_mode: data-prior + num_hard_prototypes: 6 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 3 + class_name: class_id + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +feats: fbank64_stmn_8k.yaml +model: + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.4 + margin_warmup_epochs: 2 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.001 + momentum: 0.9 + # opt_type: adam + # lr: 0.001 + # amsgrad: true + # beta1: 0.9 + # beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 8000 + #hold_steps: 10000 + decay_steps: 12000 + hold_steps: 12000 + min_lr: 1.0e-05 + update_lr_on_opt_step: true + warmup_steps: 4000 + use_amp: true + swa_start: 9 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 11 + eff_batch_size: 512 + diff --git a/egs/lre22/open.v1.8k/conf/other_conf/vad_16k.yaml b/egs/lre22/open.v1.8k/conf/other_conf/vad_16k.yaml new file mode 100644 index 00000000..5fb0111c --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/vad_16k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 16000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/lre22/open.v1.8k/conf/other_conf/vad_8k.yaml b/egs/lre22/open.v1.8k/conf/other_conf/vad_8k.yaml new file mode 100644 index 00000000..7592c9d1 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/other_conf/vad_8k.yaml @@ -0,0 +1,8 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: 5.5 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 diff --git a/egs/lre22/open.v1.8k/conf/reverb_noise_aug.yaml b/egs/lre22/open.v1.8k/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + 
max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/lre22/open.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml b/egs/lre22/open.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..1448df98 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml @@ -0,0 +1,105 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_enc: + in_feats: 64 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 16 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.2 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + #decay_steps: 16000 + #hold_steps: 40000 + #warmup_steps: 5000 + #min_lr: 1.0e-05 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 15 + #eff_batch_size: 512 + eff_batch_size: 256 diff --git a/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml b/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml new file mode 100644 index 00000000..13ce9445 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml @@ -0,0 +1,82 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 
24 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +feats: fbank64_specaug1_stmn_8k.yaml +model: + resnet_type: fwseres2net50 + in_channels: 1 + in_feats: 64 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + res2net_width_factor: 3.25 + res2net_scale: 8 + se_r: 4 + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.0 + intertop_margin: 0.0 + margin_warmup_epochs: 3.0 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.95 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 65000 + warmup_steps: 15000 + min_lr: 1.0e-06 + #decay_steps: 16000 + #hold_steps: 40000 + #warmup_steps: 5000 + #min_lr: 1.0e-05 + update_lr_on_opt_step: true + use_amp: true + swa_start: 12 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 diff --git a/egs/lre22/open.v1.8k/conf/vad_8k.yaml b/egs/lre22/open.v1.8k/conf/vad_8k.yaml new file mode 100644 index 00000000..1cfe34b0 --- /dev/null +++ b/egs/lre22/open.v1.8k/conf/vad_8k.yaml @@ -0,0 +1,9 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: -4.89 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/lre22/open.v1.8k/datapath.sh b/egs/lre22/open.v1.8k/datapath.sh new file mode 100644 index 00000000..fec52329 --- /dev/null +++ b/egs/lre22/open.v1.8k/datapath.sh @@ -0,0 +1,87 @@ +# Copyright +# 2022 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + +#paths to databases + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + ldc_root3=/export/fs02/corpora3/LDC + ldc_root5=/export/corpora5/LDC + ldc_root=/export/corpora6/LDC + sre16_dev_root=$ldc_root/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=$ldc_root/LDC2019S20/data/eval/R149_0_1 + sre18_dev_root=$ldc_root5/LDC2018E46 + sre18_eval_root=$ldc_root3/LDC2018E51 + sre19cmn2_eval_root=$ldc_root3/LDC2019E58 + sre_superset_root=$ldc_root/LDC2021E08 + sre21_dev_root=$ldc_root/LDC2021E09 + sre21_eval_root=$ldc_root/LDC2021E10 + lre17_train_root=$ldc_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$ldc_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$ldc_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=/export/corpora6/lre22_test_data_v2 + voxlingua_root=/export/corpora6/voxlingua107 + musan_root=/export/corpora5/JHU/musan + babel_assamese_root=$ldc_root/LDC2016S06 + babel_bengali_root=$ldc_root/LDC2016S08 + babel_pashto_root=$ldc_root/LDC2016S09 + babel_turkish_root=$ldc_root/LDC2016S10 + babel_georgian_root=$ldc_root/LDC2016S12 + babel_vietnam_root=$ldc_root/LDC2017S01 + babel_haitian_root=$ldc_root/LDC2017S03 + babel_lao_root=$ldc_root/LDC2017S08 + babel_tamil_root=$ldc_root/LDC2017S13 + babel_zulu_root=$ldc_root/LDC2017S19 + babel_kurmanji_root=$ldc_root/LDC2017S22 + babel_tok_root=$ldc_root/LDC2018S02 + babel_kazakh_root=$ldc_root/LDC2018S13 + babel_telugu_root=$ldc_root/LDC2018S16 + babel_lithuanian_root=$ldc_root/LDC2019S03 + fleurs_root=/export/corpora6/LRE/FLEURS2022 + lwazi_root=/export/corpora6/LRE/Lwazi2009 + nchlt_root=/export/corpora6/LRE/NCHLT2014 + 
ammi_root=/export/corpora6/LRE/AMMI2020 + cv20_root=/export/corpora5/mozilla-common-voice/cv-corpus-5.1-2020-06-22 + cv22_root=/export/corpora6/LRE/CommonVoice2020/cv-corpus-11.0-2022-09-21 + adi_root=/export/corpora6/ADI17 + ast_root=/export/corpora6/LRE/AST2004 +elif [ "$(hostname --domain)" == "cm.gemini" ];then + ldc_root=/export/common/data/corpora/LDC + sre_root=/export/common/data/corpora/NIST/SRE + my_root=/exp/jvillalba/corpora + sre16_dev_root=/exp/jvillalba/corpora/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=/exp/jvillalba/corpora/LDC2019S20/data/eval/R149_0_1 + sre18_dev_root=$sre_root/SRE18/LDC2018E46_2018_NIST_Speaker_Recognition_Evaluation_Development_Set + sre18_eval_root=$sre_root/SRE18/Eval/LDC2018E51 + sre19cmn2_eval_root=/exp/jvillalba/corpora/LDC2019E58 + sre_superset_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E08 + sre21_dev_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E09 + sre21_eval_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E10 + lre17_train_root=$my_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$my_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$my_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=$my_root/lre22_test_data_v2 + voxlingua_root=$my_root/voxlingua107 + musan_root=/expscratch/dgromero/corpora/musan + babel_assamese_root=$ldc_root/LDC2016S06 + babel_bengali_root=$ldc_root/LDC2016S08 + babel_pashto_root=$ldc_root/LDC2016S09 + babel_turkish_root=$my_root/LDC2016S10 + babel_georgian_root=$my_root/LDC2016S12 + babel_vietnam_root=$my_root/LDC2017S01 + babel_haitian_root=$my_root/LDC2017S03 + babel_lao_root=$ldc_root/LDC2017S08 + babel_tamil_root=$ldc_root/LDC2017S13 + babel_zulu_root=$ldc_root/LDC2017S19 + babel_kurmanji_root=$ldc_root/LDC2017S22 + babel_tok_root=$my_root/LDC2018S02 + babel_kazakh_root=$ldc_root/LDC2018S13 + babel_telugu_root=$ldc_root/LDC2018S16 + babel_lithuanian_root=$my_root/LDC2019S03 + adi_root=/exp/jvillalba/corpora/ADI17 + +else + echo "Put your database paths here" + exit 1 +fi diff --git a/egs/lre22/open.v1.8k/default_config.sh b/egs/lre22/open.v1.8k/default_config.sh new file mode 120000 index 00000000..d1be989f --- /dev/null +++ b/egs/lre22/open.v1.8k/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh new file mode 100644 index 00000000..1abb3d3f --- /dev/null +++ b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh @@ -0,0 +1,20 @@ +# acoustic features +feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=open + +# x-vector cfg + +nnet_type=resnet1d +nnet_stages=1 +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v1.0.yaml + +nnet_name=${feat_type}_ecapatdnn2048x4_v1.0 +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0012.pth diff --git a/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh new file mode 100644 index 00000000..6a735e4c --- /dev/null +++ b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh @@ -0,0 +1,45 @@ +# acoustic features 
+feat_config=conf/fbank64_stmn_8k.yaml +feat_type=fbank64_stmn + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=open + +# x-vector cfg + +nnet_type=resnet +nnet_stages=2 +nnet_s1_base_cfg=conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml + +nnet_name=${feat_type}_fwseres2net50s8_v1.0 +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/swa_model_ep0012.pth +#nnet_s1=$nnet_s1_dir/model_ep0001.pth +nnet_s1=$nnet_s1_dir/model_ep0008.pth +nnet_s1=$nnet_s1_dir/model_ep0011.pth +nnet_s1=$nnet_s1_dir/model_ep0015.pth +nnet_s1=$nnet_s1_dir/swa_model_ep0016.pth + +nnet_s2_base_cfg=conf/train_tseres2net50s8_xvec_stage2_v1.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +#nnet_s2=$nnet_s2_dir/swa_model_ep0013.pth +nnet_s2=$nnet_s2_dir/model_ep0001.pth +nnet_s2=$nnet_s2_dir/model_ep0002.pth +nnet_s2=$nnet_s2_dir/model_ep0004.pth +# nnet_s2=$nnet_s2_dir/model_ep0008.pth +# nnet_s2=$nnet_s2_dir/swa_model_ep0012.pth + +nnet_s3_base_cfg=conf/train_tseres2net50s8_xvec_stage3_v2.1.yaml +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +#nnet_s3=$nnet_s3_dir/swa_model_ep0013.pth +#nnet_s3=$nnet_s3_dir/model_ep0007.pth +nnet_s3=$nnet_s3_dir/model_ep0001.pth +nnet_s3=$nnet_s3_dir/model_ep0004.pth +nnet_s3=$nnet_s3_dir/model_ep0008.pth + diff --git a/egs/lre22/open.v1.8k/hyp_utils b/egs/lre22/open.v1.8k/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/lre22/open.v1.8k/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/local b/egs/lre22/open.v1.8k/local new file mode 120000 index 00000000..c2a3fdea --- /dev/null +++ b/egs/lre22/open.v1.8k/local @@ -0,0 +1 @@ +../fixed.v1.8k/local \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/path.sh b/egs/lre22/open.v1.8k/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/lre22/open.v1.8k/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/lre22/open.v1.8k/resources b/egs/lre22/open.v1.8k/resources new file mode 120000 index 00000000..113b3492 --- /dev/null +++ b/egs/lre22/open.v1.8k/resources @@ -0,0 +1 @@ +../fixed.v1.8k/resources \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/run_001_prepare_data.sh b/egs/lre22/open.v1.8k/run_001_prepare_data.sh new file mode 100755 index 00000000..bb64cdbe --- /dev/null +++ b/egs/lre22/open.v1.8k/run_001_prepare_data.sh @@ -0,0 +1,342 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +stage=1 + +. parse_options.sh || exit 1; +. 
datapath.sh + + +if [ $stage -le 1 ];then + # Prepares voxlingua 107 for training + hyp_utils/conda_env.sh \ + local/prepare_voxlingua107.py \ + --corpus-dir $voxlingua_root \ + --output-dir data/voxlingua107 \ + --remove-langs en-en es-es ar-ar pt-pt \ + --map-langs-to-lre-codes \ + --target-fs 8000 + +fi + +if [ $stage -le 2 ];then + # Prepare LRE17 Training data + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_train \ + --subset train \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_dev_cts \ + --subset dev \ + --source mls14 \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_dev_afv \ + --subset dev \ + --source vast \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_eval_root \ + --output-dir data/lre17_eval_cts \ + --subset eval \ + --source mls14 \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_eval_root \ + --output-dir data/lre17_eval_afv \ + --subset eval \ + --source vast \ + --target-fs 8000 + +fi + +if [ $stage -le 3 ];then + hyp_utils/conda_env.sh \ + local/prepare_lre22_dev.py \ + --corpus-dir $lre22_dev_root \ + --output-dir data/lre22_dev \ + --target-fs 8000 + +fi + +if [ $stage -le 4 ];then + hyp_utils/conda_env.sh \ + local/prepare_lre22_eval.py \ + --corpus-dir $lre22_eval_root \ + --output-dir data/lre22_eval \ + --target-fs 8000 + +fi + +if [ $stage -le 5 ];then + local/make_sre16_train_dev.sh $sre16_dev_root 8 data + local/make_sre16_train_eval.sh $sre16_eval_root 8 data +fi + +if [ $stage -le 6 ];then + local/make_sre18_dev_unlabeled.sh $sre18_dev_root 8 data + local/make_sre18_train_dev.sh $sre18_dev_root 8 data + local/make_sre18_train_eval.sh $sre18_eval_root 8 data +fi + +if [ $stage -le 7 ];then + # Prepare sre19 + local/make_sre19cmn2_eval.sh $sre19cmn2_eval_root 8 data +fi + +if [ $stage -le 8 ];then + # Prepare SRE21 dev + hyp_utils/conda_env.sh \ + local/prepare_sre21av_dev_audio.py \ + --corpus-dir $sre21_dev_root \ + --target-fs 8000 \ + --output-path data/sre21_audio_dev \ + --av-output-path data/sre21_audio-visual_dev + # Prepare SRE21 eval + hyp_utils/conda_env.sh \ + local/prepare_sre21av_eval_audio.py \ + --corpus-dir $sre21_eval_root \ + --target-fs 8000 \ + --output-path data/sre21_audio_eval \ + --av-output-path data/sre21_audio-visual_eval + +fi + +if [ $stage -le 9 ];then + # Prepare SRE CTS superset + hyp_utils/conda_env.sh \ + local/prepare_sre_cts_superset.py \ + --corpus-dir $sre_superset_root \ + --target-fs 8000 \ + --output-dir data/sre_cts_superset +fi + +if [ $stage -le 10 ];then + # Prepare babel datasets + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_assamese_root \ + --target-fs 8000 \ + --lang-code as-as \ + --output-dir data/babel_assamese + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_bengali_root \ + --target-fs 8000 \ + --lang-code bn-bn \ + --output-dir data/babel_bengali + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_pashto_root \ + --target-fs 8000 \ + --lang-code ps-ps \ + --output-dir data/babel_pashto + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_turkish_root \ + --target-fs 8000 \ + --lang-code tr-tr \ + --output-dir data/babel_turkish + hyp_utils/conda_env.sh \ + 
local/prepare_babel.py \ + --corpus-dir $babel_georgian_root \ + --target-fs 8000 \ + --lang-code ka-ka \ + --output-dir data/babel_georgian + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_vietnam_root \ + --target-fs 8000 \ + --lang-code vi-vi \ + --output-dir data/babel_vietnam + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_haitian_root \ + --target-fs 8000 \ + --lang-code ht-ht \ + --output-dir data/babel_haitian + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_lao_root \ + --target-fs 8000 \ + --lang-code lo-lo \ + --output-dir data/babel_lao + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_tamil_root \ + --target-fs 8000 \ + --lang-code ta-ta \ + --output-dir data/babel_tamil + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_zulu_root \ + --target-fs 8000 \ + --lang-code zul-zul \ + --output-dir data/babel_zulu + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_kurmanji_root \ + --target-fs 8000 \ + --lang-code kur-kur \ + --output-dir data/babel_kurmanji + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_tok_root \ + --target-fs 8000 \ + --lang-code tok-tok \ + --output-dir data/babel_tok + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_kazakh_root \ + --target-fs 8000 \ + --lang-code kk-kk \ + --output-dir data/babel_kazakh + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_telugu_root \ + --target-fs 8000 \ + --lang-code te-te \ + --output-dir data/babel_telugu + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_lithuanian_root \ + --target-fs 8000 \ + --lang-code lt-lt \ + --output-dir data/babel_lithuanian + +fi + +if [ $stage -le 11 ];then + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre.py \ + --corpus-dir $fleurs_root \ + --output-dir data/fleurs22 \ + --map-langs-to-lre-codes --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $lwazi_root \ + --output-dir data/lwazi09 \ + --map-langs-to-lre-codes --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $nchlt_root \ + --output-dir data/nchlt14 \ + --map-langs-to-lre-codes --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre_cat.py \ + --corpus-dir $ammi_root \ + --output-dir data/ammi20 \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 12 ];then + + hyp_utils/conda_env.sh \ + local/prepare_common_voice_cat.py \ + --corpus-dir $cv22_root \ + --output-dir data/cv22_tir \ + --keep-langs tir-tir \ + --map-langs-to-lre-codes --target-fs 8000 +fi + + +if [ $stage -le 13 ];then + hyp_utils/conda_env.sh \ + local/prepare_common_voice_accents_cat.py \ + --corpus-dir $cv20_root \ + --output-dir data/cv20_eng_ine \ + --lang en \ + --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_common_voice_accents_cat.py \ + --corpus-dir $cv20_root \ + --output-dir data/cv20_fra \ + --lang fr \ + --target-fs 8000 + +fi + +if [ $stage -le 14 ];then + hyp_utils/conda_env.sh \ + local/prepare_adi17.py \ + --corpus-dir $adi_root \ + --output-dir data/adi17 \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 15 ];then + hyp_utils/conda_env.sh \ + local/prepare_ast_cat.py \ + --corpus-dir $ast_root \ + --output-dir data/ast \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 16 ];then + #combine data + 
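+ # Merge the per-corpus data dirs into the grouped sets (babel, cv, sre16, sre18, sre19, sre21_cts, sre21_afv, sre16-21_cts) consumed by the later VAD and preprocessing stages.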
utils/combine_data.sh \ + data/babel \ + data/babel_{a*,b*,g*,k*,l*,p*,t*,v*,zulu} + + utils/combine_data.sh \ + data/cv \ + data/cv20_eng_ine data/cv20_fra data/cv22_tir + + utils/combine_data.sh \ + data/sre16 \ + data/sre16_train_{dev*,eval*} + + utils/combine_data.sh \ + data/sre18 \ + data/sre18_train_{dev*,eval*} data/sre18_dev_unlabeled + + utils/combine_data.sh \ + data/sre19 \ + data/sre19_eval_{enroll,test}_cmn2 + + utils/combine_data.sh \ + data/sre21_cts \ + data/sre21_*_cts + + utils/combine_data.sh \ + data/sre21_afv \ + data/sre21_audio*_{dev*,eval*}_afv + + utils/combine_data.sh \ + data/sre16-21_cts \ + data/sre1{6,8,9} data/sre21_cts + +fi + +if [ $stage -le 17 ];then + if [ -d ../fixed.v1.8k/lre-scorer ];then + ln -s ../fixed.v1.8k/lre-scorer + else + local/download_lre22_scorer.sh + fi + if [ -d ../fixed.v1.8k/focal_multiclass ];then + ln -s ../fixed.v1.8k/focal_multiclass + else + local/download_focal.sh + fi +fi diff --git a/egs/lre22/open.v1.8k/run_002_compute_evad.sh b/egs/lre22/open.v1.8k/run_002_compute_evad.sh new file mode 100755 index 00000000..f7ccdfa7 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_002_compute_evad.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=b1 +storage_name=$(date +'%m_%d_%H_%M') +vaddir=`pwd`/exp/vad_e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $vaddir/storage ]; then + dir_name=$USER/hyp-data/lre22-fixed-v1.8k-$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b1{0,1,2,3,4,5,6,7,8,9}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + elif [ "$nodes" == "fs05" ];then + utils/create_split_dir.pl \ + /export/fs05/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" + fi + fi +fi + +# VAD Train/Test Datasets +if [ $stage -le 2 ];then + for name in voxlingua107 \ + lre17_train \ + lre17_dev_cts lre17_dev_afv \ + lre17_eval_cts lre17_eval_afv \ + lre22_dev lre22_eval \ + babel sre16-21_cts sre21_afv sre_cts_superset \ + lwazi09 nchlt14 adi17 fleurs22 ammi20 \ + ast cv + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? $num_spk:40)) + hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + diff --git a/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh b/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..638143f0 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. 
datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 8 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ ! -d "RIRS_NOISES" ]; then + if [ -d ../v1.16k/RIRS_NOISES ];then + ln -s ../v1.16k/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 8 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 8 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 8 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/lre22/open.v1.8k/run_004_apply_codecs.sh b/egs/lre22/open.v1.8k/run_004_apply_codecs.sh new file mode 100755 index 00000000..6efc016b --- /dev/null +++ b/egs/lre22/open.v1.8k/run_004_apply_codecs.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file + +if [ $stage -le 1 ];then + + for data in voxlingua107 \ + lre17_dev_afv lre17_eval_afv \ + sre21_afv ast cv \ + lwazi09 nchlt14 adi17 fleurs22 ammi20 + do + hyp_utils/conda_env.sh \ + local/apply_tel_codecs_to_kaldi_datadir.py \ + --input-dir data/$data \ + --output-dir data/${data}_codecs + done + +fi diff --git a/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh b/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh new file mode 100755 index 00000000..d261a287 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 1 ]; then + # This script preprocess audio for x-vector training + for name in voxlingua107_codecs \ + lre17_train \ + lre17_{dev,eval}_{cts,afv,afv_codecs} \ + babel sre16-21_cts sre_cts_superset \ + sre21_afv_codecs cv_codecs adi17_codecs \ + lwazi09{,_codecs} nchlt14{,_codecs} fleurs22{,_codecs} ammi20{,_codecs} ast{,_codecs} + do + steps_xvec/preprocess_audios_for_nnet_train.sh \ + --nj 40 --cmd "$train_cmd" \ + --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${name} data/${name}_proc_audio_no_sil exp/${name}_proc_audio_no_sil + utils/fix_data_dir.sh data/${name}_proc_audio_no_sil + done +fi + +if [ $stage -le 2 ];then + utils/combine_data.sh \ + data/lre17_proc_audio_no_sil \ + data/lre17_train_proc_audio_no_sil \ + data/lre17_{dev,eval}_{cts,afv,afv_codecs}_proc_audio_no_sil + + utils/combine_data.sh \ + data/babel_sre_proc_audio_no_sil \ + data/{babel,sre16-21_cts,sre21_afv_codecs,sre_cts_superset}_proc_audio_no_sil + + utils/combine_data.sh \ + data/others_afr_proc_audio_no_sil \ + data/adi17_proc_audio_no_sil \ + data/{lwazi09,nchlt14,fleurs22,ammi20,ast}{,_codecs}_proc_audio_no_sil +fi + +if [ $stage -le 3 ]; then + # Now, we remove files with less than 3s + hyp_utils/remove_short_audios.sh --min-len 3 data/voxlingua107_codecs_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/lre17_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/babel_sre_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/others_afr_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/cv_codecs_proc_audio_no_sil +fi + +if [ $stage -le 4 ];then + # merge all data + utils/combine_data.sh \ + data/open_proc_audio_no_sil \ + data/{voxlingua107_codecs,lre17,babel_sre,cv_codecs,others_afr}_proc_audio_no_sil +fi + + +if [ $stage -le 5 ]; then + for name in open_proc_audio_no_sil + do + hyp_utils/conda_env.sh \ + local/split_segments_train_val.py \ + --segments-file data/$name/utt2lang \ + --recordings-file data/$name/wav.scp \ + --durations-file data/$name/utt2dur \ + --val-percent 2. \ + --remove-langs fra-mix ara-ary en-en es-es pt-pt ar-ar \ + --output-dir data/$name/train_val_split + done +fi + +if [ $stage -le 6 ]; then + awk 'BEGIN{ +adapt_langs_list="ara-acm ara-aeb ara-apc ara-arq ara-ary ara-arz ara-ayl ara-jor ara-ksa ara-kuw ara-leb ara-mau ara-mor ara-oma ara-pal ara-qat ara-sud ara-syr ara-uae ara-yem fra-can fra-fra fra-ntf eng-ens eng-gbr eng-iaf eng-ine eng-usg eng-zho afr-afr nbl-nbl orm-orm tir-tir tso-tso ven-ven xho-xho zul-zul"; +nf=split(adapt_langs_list, f, " "); +for(i=1;i<=nf;i++){ adapt_langs[f[i]]=1;}; +FS=","; OFS=","; +getline; print $0; +} +{ if ($1 in adapt_langs) { $3="1."} else{ $3="0.01"}; print $0}' \ + data/open_proc_audio_no_sil/train_val_split/class_file.csv > \ + data/open_proc_audio_no_sil/train_val_split/class_file_adapt_1.csv +fi diff --git a/egs/lre22/open.v1.8k/run_011_train_xvector.sh b/egs/lre22/open.v1.8k/run_011_train_xvector.sh new file mode 100755 index 00000000..056a9754 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_011_train_xvector.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +list_dir=data/${nnet_data}_proc_audio_no_sil + +# add extra args from the command-line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project lre22-fixed-v1.8k --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + mkdir -p $nnet_s1_dir/log + if [ ! -f "$nnet_s0" ];then + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu --master-port 3456 + else + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s0 \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu + + fi + +fi + + +# Class-balanced Fine-tuning, only for configs that define a second stage +if [ $stage -le 2 ] && [ "$nnet_stages" -ge 2 ]; then + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_xvector_from_wav.py $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu + +fi diff --git a/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh b/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh new file mode 100755 index 00000000..ea2c59f6 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh @@ -0,0 +1,219 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=2 +nnet_stage=2 +config_file=default_config.sh +use_gpu=false +do_tsne=true +split_dev=false +xvec_chunk_length=12800 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 4G" +else + xvec_cmd="$train_cmd --mem 12G" +fi + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +# if [ $stage -le 1 ]; then +# # Extract xvectors for training +# for name in lre17_proc_audio_no_sil \ +# voxlingua107_codecs_proc_audio_no_sil \ +# babel_sre_proc_audio_no_sil \ +# cv_codecs_proc_audio_no_sil \ +# others_afr_proc_audio_no_sil +# do +# steps_xvec/extract_xvectors_from_wav.sh \ +# --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ +# --use-bin-vad false \ +# --random-utt-length true --min-utt-length 300 --max-utt-length 3000 \ +# --feat-config $feat_config \ +# $nnet data/${name} \ +# $xvector_dir/${name} +# done +# fi + +if [ $stage -le 2 ]; then + # Extract xvectors for training + for name in lre22_dev + do + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --use-bin-vad true --num-augs 10 --aug-config conf/reverb_noise_aug.yaml \ + --random-utt-length true --min-utt-length 300 --max-utt-length 3000 \ + --feat-config $feat_config \ + $nnet data/${name} \ + $xvector_dir/${name}_aug \ + data/${name}_aug + done +fi + + +if [ $stage -le 3 ]; then + # Extracts x-vectors for dev and eval + for name in lre22_dev lre22_eval + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? $num_spk:100)) + steps_xvec/extract_xvectors_from_wav.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + --feat-config $feat_config \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + +if [ $stage -le 4 ]; then + for name in lre22_dev + do + if [ "$do_tsne" == "true" ] || [ "$split_dev" == "true" ];then + $train_cmd \ + $xvector_dir/$name/tsne/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 + + $train_cmd \ + $xvector_dir/$name/tsne_per_class/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne_per_class.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne_per_class \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 \ + --do-ahc --cluster-tsne --ahc-thr -5 + + if [ "$split_dev" == "true" ];then + hyp_utils/conda_env.sh \ + local/split_dev.py \ + --segs-file $xvector_dir/$name/tsne_per_class/segments.csv \ + --output-dir ./resources/dev_splits \ + --num-folds 2 + + # delete the split data dirs so they are regenerated later + rm -rf data/lre22_dev_p{1,2} + + fi + fi + done +fi + +if [ $stage -le 5 ]; then + if [ ! 
-d data/lre22_dev_p1 ];then + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/train_segments.csv \ + > p1.lst + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/test_segments.csv \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev data/lre22_dev_$p + done + fi +fi + +if [ $stage -le 6 ]; then + if [ -d data/lre22_dev_aug ] && [ ! -d data/lre22_dev_aug_p1 ];then + awk -v fsegs=./resources/dev_splits/fold_0/train_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1] +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p1.lst + + awk -v fsegs=./resources/dev_splits/fold_0/test_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1]=1; +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev_aug data/lre22_dev_aug_$p + done + fi +fi + +if [ $stage -le 7 ];then + if [ -f $xvector_dir/lre22_dev_aug/xvector.scp ];then + mkdir -p $xvector_dir/lre22_dev_aug_clean + cat $xvector_dir/lre22_dev/xvector.scp \ + $xvector_dir/lre22_dev_aug/xvector.scp \ + > $xvector_dir/lre22_dev_aug_clean/xvector.scp + + for p in "" _p1 _p2 + do + if [ ! -d data/lre22_dev_aug_clean$p ]; then + utils/combine_data.sh \ + data/lre22_dev_aug_clean$p \ + data/lre22_dev$p \ + data/lre22_dev_aug$p + fi + done + fi +fi + +exit diff --git a/egs/lre22/open.v1.8k/run_040_be_final.sh b/egs/lre22/open.v1.8k/run_040_be_final.sh new file mode 100755 index 00000000..fe5b6f18 --- /dev/null +++ b/egs/lre22/open.v1.8k/run_040_be_final.sh @@ -0,0 +1,434 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +. parse_options.sh || exit 1; +. 
$config_file + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +fi + +xvector_dir=exp/xvectors/$nnet_name +be_base_dir=exp/be/$nnet_name +score_base_dir=exp/scores/$nnet_name + +if [ $stage -le 1 ];then + for r in 1 #0.9999 0.999 #0.99 0.975 0.95 + do + be_name=pca${r}_cw_lnorm_lgbe_lre22_aug + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + + + ) & + + done + + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}_p12/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + + + done + wait + +fi + +exit +# Back-ends 
below over-fitted + +if [ $stage -le 2 ];then + for r in 1 + do + for penalty in l2 #l1 + do + for c in 1 #0.1 1 + do + for ary_thr in 0.975 #0.85 0.7 #0.99 0.95 0.9 #15 ##1 5 10 20 + do + be_name=pca${r}_cw_lnorm_lsvm_${penalty}_c${c}_sqhinge_lre22_aug_lre17_aryt${ary_thr} + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + done + done + done + done + wait + +fi + +if [ 
$stage -le 3 ];then + for r in 1 # 0.9999 0.99 0.975 0.95 0.9 0.8 + do + for shrinking in true #false + do + for c in 1 10 #0.1 1 10 #0.01 0.1 1 10 # 0.0001 + do + for vl in false #true #false + do + if [ "$vl" == "true" ];then + do_vl="--do-vl" + else + do_vl="--no_do-vl" + fi + ary_thr=0.975 + be_name=pca${r}_cw_lnorm_gsvm_shrinking_${shrinking}_c${c}_lre17_aryt${ary_thr}_vl${vl}_aug_clean + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + #score_dir=$score_base_dir/${be_name}_logpost + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500\ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500 \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --svm.eval-type cat-log-post \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + 
local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + + ) & + done + done + done + done + wait + +fi diff --git a/egs/lre22/open.v1.8k/steps b/egs/lre22/open.v1.8k/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/lre22/open.v1.8k/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/steps_be b/egs/lre22/open.v1.8k/steps_be new file mode 120000 index 00000000..48aedc5a --- /dev/null +++ b/egs/lre22/open.v1.8k/steps_be @@ -0,0 +1 @@ +../fixed.v1.8k/steps_be \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/steps_xvec b/egs/lre22/open.v1.8k/steps_xvec new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/lre22/open.v1.8k/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/egs/lre22/open.v1.8k/utils b/egs/lre22/open.v1.8k/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/lre22/open.v1.8k/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/cmd.sh b/egs/lre22/open.v2.8k/cmd.sh new file mode 100755 index 00000000..4efc96e1 --- /dev/null +++ b/egs/lre22/open.v2.8k/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
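+# +# For example, on a single machine with no grid engine you could use (a sketch; +# jobs then run locally and sequentially, so watch memory usage): +# export train_cmd=run.pl +# export cuda_cmd=run.pl +# export cuda_eval_cmd=run.pl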
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/lre22/open.v2.8k/conf/clsp.conf b/egs/lre22/open.v2.8k/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_bigmem.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_long.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_rtx.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_short.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_short.conf new file mode 100644 index 
00000000..81de5cb7 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/lre22/open.v2.8k/conf/coe_gpu_v100.conf b/egs/lre22/open.v2.8k/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/lre22/open.v2.8k/conf/reverb_noise_aug.yaml b/egs/lre22/open.v2.8k/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..4fdf8068 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/reverb_noise_aug.yaml @@ -0,0 +1,35 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: scp:data/rirs_smallroom/rirs.scp + rir_norm: max + mediumroom: + weight: 1 + rir_path: scp:data/rirs_mediumroom/rirs.scp + rir_norm: max + realroom: + weight: 1 + rir_path: scp:data/rirs_real/rirs.scp + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/wav.scp + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/wav.scp + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/wav.scp + min_snr: 3 + max_snr: 18 + diff --git a/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml new file mode 100644 index 00000000..b8998830 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml @@ -0,0 +1,59 @@ +data: + train: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - class_id + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - class_id + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 128 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + data_loader: + num_workers: 8 +model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml +trainer: + optim: + opt_type: sgd + lr: 0.45 + momentum: 0.9 + weight_decay: 4e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + #decay_steps: 4200 + #hold_steps: 1500 + decay_steps: 16000 + hold_steps: 18000 + min_lr: 4e-4 + warmup_steps: 4000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 12 + eff_batch_size: 1024 + train_mode: hf-feats-frozen-nograd + + \ No newline at end of file 
diff --git a/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml b/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml new file mode 100644 index 00000000..d8193f59 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml @@ -0,0 +1,47 @@ +hf_feats: + pretrained_model_path: facebook/wav2vec2-xls-r-300m +xvector: + resnet_enc: + in_feats: 1024 + in_conv_channels: 1024 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 1024 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 8 + multilayer: true + multilayer_concat: true + endpoint_channels: 3072 + dropout_rate: 0.05 + norm_before: false + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 32.0 + margin: 0. + margin_warmup_epochs: 5 + intertop_margin: 0. + dropout_rate: 0.0 + norm_before: false + hid_act: swish +feat_fusion_method: weighted-avg +feat_fusion_start: 2 diff --git a/egs/lre22/open.v2.8k/datapath.sh b/egs/lre22/open.v2.8k/datapath.sh new file mode 100644 index 00000000..fec52329 --- /dev/null +++ b/egs/lre22/open.v2.8k/datapath.sh @@ -0,0 +1,87 @@ +# Copyright +# 2022 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + +#paths to databases + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + ldc_root3=/export/fs02/corpora3/LDC + ldc_root5=/export/corpora5/LDC + ldc_root=/export/corpora6/LDC + sre16_dev_root=$ldc_root/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=$ldc_root/LDC2019S20/data/eval/R149_0_1 + sre18_dev_root=$ldc_root5/LDC2018E46 + sre18_eval_root=$ldc_root3/LDC2018E51 + sre19cmn2_eval_root=$ldc_root3/LDC2019E58 + sre_superset_root=$ldc_root/LDC2021E08 + sre21_dev_root=$ldc_root/LDC2021E09 + sre21_eval_root=$ldc_root/LDC2021E10 + lre17_train_root=$ldc_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$ldc_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$ldc_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=/export/corpora6/lre22_test_data_v2 + voxlingua_root=/export/corpora6/voxlingua107 + musan_root=/export/corpora5/JHU/musan + babel_assamese_root=$ldc_root/LDC2016S06 + babel_bengali_root=$ldc_root/LDC2016S08 + babel_pashto_root=$ldc_root/LDC2016S09 + babel_turkish_root=$ldc_root/LDC2016S10 + babel_georgian_root=$ldc_root/LDC2016S12 + babel_vietnam_root=$ldc_root/LDC2017S01 + babel_haitian_root=$ldc_root/LDC2017S03 + babel_lao_root=$ldc_root/LDC2017S08 + babel_tamil_root=$ldc_root/LDC2017S13 + babel_zulu_root=$ldc_root/LDC2017S19 + babel_kurmanji_root=$ldc_root/LDC2017S22 + babel_tok_root=$ldc_root/LDC2018S02 + babel_kazakh_root=$ldc_root/LDC2018S13 + babel_telugu_root=$ldc_root/LDC2018S16 + babel_lithuanian_root=$ldc_root/LDC2019S03 + fleurs_root=/export/corpora6/LRE/FLEURS2022 + lwazi_root=/export/corpora6/LRE/Lwazi2009 + nchlt_root=/export/corpora6/LRE/NCHLT2014 + ammi_root=/export/corpora6/LRE/AMMI2020 + cv20_root=/export/corpora5/mozilla-common-voice/cv-corpus-5.1-2020-06-22 + cv22_root=/export/corpora6/LRE/CommonVoice2020/cv-corpus-11.0-2022-09-21 + adi_root=/export/corpora6/ADI17 + ast_root=/export/corpora6/LRE/AST2004 +elif [ "$(hostname --domain)" == "cm.gemini" ];then + 
ldc_root=/export/common/data/corpora/LDC + sre_root=/export/common/data/corpora/NIST/SRE + my_root=/exp/jvillalba/corpora + sre16_dev_root=/exp/jvillalba/corpora/LDC2019S20/data/dev/R148_0_0 + sre16_eval_root=/exp/jvillalba/corpora/LDC2019S20/data/eval/R149_0_1 + sre18_dev_root=$sre_root/SRE18/LDC2018E46_2018_NIST_Speaker_Recognition_Evaluation_Development_Set + sre18_eval_root=$sre_root/SRE18/Eval/LDC2018E51 + sre19cmn2_eval_root=/exp/jvillalba/corpora/LDC2019E58 + sre_superset_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E08 + sre21_dev_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E09 + sre21_eval_root=/exp/jvillalba/corpora/sre21/releases/LDC2021E10 + lre17_train_root=$my_root/LDC2022E16_2017_NIST_Language_Recognition_Evaluation_Training_and_Development_Sets + lre17_eval_root=$my_root/LDC2022E17_2017_NIST_Language_Recognition_Evaluation_Test_Set + lre22_dev_root=$my_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data + lre22_eval_root=$my_root/lre22_test_data_v2 + voxlingua_root=$my_root/voxlingua107 + musan_root=/expscratch/dgromero/corpora/musan + babel_assamese_root=$ldc_root/LDC2016S06 + babel_bengali_root=$ldc_root/LDC2016S08 + babel_pashto_root=$ldc_root/LDC2016S09 + babel_turkish_root=$my_root/LDC2016S10 + babel_georgian_root=$my_root/LDC2016S12 + babel_vietnam_root=$my_root/LDC2017S01 + babel_haitian_root=$my_root/LDC2017S03 + babel_lao_root=$ldc_root/LDC2017S08 + babel_tamil_root=$ldc_root/LDC2017S13 + babel_zulu_root=$ldc_root/LDC2017S19 + babel_kurmanji_root=$ldc_root/LDC2017S22 + babel_tok_root=$my_root/LDC2018S02 + babel_kazakh_root=$ldc_root/LDC2018S13 + babel_telugu_root=$ldc_root/LDC2018S16 + babel_lithuanian_root=$my_root/LDC2019S03 + adi_root=/exp/jvillalba/corpora/ADI17 + +else + echo "Put your database paths here" + exit 1 +fi diff --git a/egs/lre22/open.v2.8k/default_config.sh b/egs/lre22/open.v2.8k/default_config.sh new file mode 120000 index 00000000..94d038cf --- /dev/null +++ b/egs/lre22/open.v2.8k/default_config.sh @@ -0,0 +1 @@ +global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh b/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh new file mode 100644 index 00000000..b39d817b --- /dev/null +++ b/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh @@ -0,0 +1,36 @@ +# Wav2Vec2 XLS-R 300M (facebook/wav2vec2-xls-r-300m) + ECAPA-TDNN 1024x3 + +# hugging face model +hf_model_name=wav2vec2xlsr300m + +#vad +vad_config=conf/vad_8k.yaml + +# x-vector training +nnet_data=open + +# x-vector cfg + +nnet_type=hf_wav2vec2resnet1d + +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.2.yaml +nnet_s1_args="" + +nnet_name=${hf_model_name}_ecapatdnn1024x3_v2.2 +nnet_s1_name=$nnet_name.s1 + +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0011.pth + +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v2.2.yaml +nnet_s2_args="" +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0008.pth + +nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage3_v2.2.yaml +nnet_s3_args="" +nnet_s3_name=${nnet_name}.s3 +nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name +nnet_s3=$nnet_s3_dir/model_ep0002.pth +nnet_s3=$nnet_s3_dir/model_ep0005.pth diff --git a/egs/lre22/open.v2.8k/hyp_utils b/egs/lre22/open.v2.8k/hyp_utils new file mode 120000 
index 00000000..f6d1eb7a --- /dev/null +++ b/egs/lre22/open.v2.8k/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/local b/egs/lre22/open.v2.8k/local new file mode 120000 index 00000000..c2a3fdea --- /dev/null +++ b/egs/lre22/open.v2.8k/local @@ -0,0 +1 @@ +../fixed.v1.8k/local \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/path.sh b/egs/lre22/open.v2.8k/path.sh new file mode 100644 index 00000000..6994fdab --- /dev/null +++ b/egs/lre22/open.v2.8k/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/lre22/open.v2.8k/resources b/egs/lre22/open.v2.8k/resources new file mode 120000 index 00000000..113b3492 --- /dev/null +++ b/egs/lre22/open.v2.8k/resources @@ -0,0 +1 @@ +../fixed.v1.8k/resources \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/run_001_prepare_data.sh b/egs/lre22/open.v2.8k/run_001_prepare_data.sh new file mode 100755 index 00000000..99a72cab --- /dev/null +++ b/egs/lre22/open.v2.8k/run_001_prepare_data.sh @@ -0,0 +1,330 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +config_file=default_config.sh +stage=1 + +. parse_options.sh || exit 1; +. datapath.sh + + +if [ $stage -le 1 ];then + # Prepares voxlingua 107 for training + hyp_utils/conda_env.sh \ + local/prepare_voxlingua107.py \ + --corpus-dir $voxlingua_root \ + --output-dir data/voxlingua107 \ + --remove-langs en-en es-es ar-ar pt-pt \ + --map-langs-to-lre-codes \ + --target-fs 8000 + +fi + +if [ $stage -le 2 ];then + # Prepare LRE17 Training data + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_train \ + --subset train \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_dev_cts \ + --subset dev \ + --source mls14 \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_train_root \ + --output-dir data/lre17_dev_afv \ + --subset dev \ + --source vast \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_eval_root \ + --output-dir data/lre17_eval_cts \ + --subset eval \ + --source mls14 \ + --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_lre17.py \ + --corpus-dir $lre17_eval_root \ + --output-dir data/lre17_eval_afv \ + --subset eval \ + --source vast \ + --target-fs 8000 + +fi + +if [ $stage -le 3 ];then + hyp_utils/conda_env.sh \ + local/prepare_lre22_dev.py \ + --corpus-dir $lre22_dev_root \ + --output-dir data/lre22_dev \ + --target-fs 8000 + +fi + +if [ $stage -le 4 ];then + hyp_utils/conda_env.sh \ + local/prepare_lre22_eval.py \ + --corpus-dir $lre22_eval_root \ + --output-dir data/lre22_eval \ + --target-fs 8000 + +fi + +if [ $stage -le 5 ];then + local/make_sre16_train_dev.sh $sre16_dev_root 8 data + local/make_sre16_train_eval.sh $sre16_eval_root 8 data +fi + +if [ $stage -le 6 ];then + local/make_sre18_dev_unlabeled.sh $sre18_dev_root 8 data + local/make_sre18_train_dev.sh $sre18_dev_root 8 data + local/make_sre18_train_eval.sh $sre18_eval_root 8 data +fi + +if [ $stage -le 7 ];then + # Prepare sre19 + local/make_sre19cmn2_eval.sh $sre19cmn2_eval_root 8 data +fi + +if [ $stage -le 8 ];then + # Prepare SRE21 dev + hyp_utils/conda_env.sh \ + local/prepare_sre21av_dev_audio.py \ + --corpus-dir 
$sre21_dev_root \ + --target-fs 8000 \ + --output-path data/sre21_audio_dev \ + --av-output-path data/sre21_audio-visual_dev + # Prepare SRE21 eval + hyp_utils/conda_env.sh \ + local/prepare_sre21av_eval_audio.py \ + --corpus-dir $sre21_eval_root \ + --target-fs 8000 \ + --output-path data/sre21_audio_eval \ + --av-output-path data/sre21_audio-visual_eval + +fi + +if [ $stage -le 9 ];then + # Prepare SRE CTS superset + hyp_utils/conda_env.sh \ + local/prepare_sre_cts_superset.py \ + --corpus-dir $sre_superset_root \ + --target-fs 8000 \ + --output-dir data/sre_cts_superset +fi + +if [ $stage -le 10 ];then + # Prepare babel datasets + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_assamese_root \ + --target-fs 8000 \ + --lang-code as-as \ + --output-dir data/babel_assamese + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_bengali_root \ + --target-fs 8000 \ + --lang-code bn-bn \ + --output-dir data/babel_bengali + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_pashto_root \ + --target-fs 8000 \ + --lang-code ps-ps \ + --output-dir data/babel_pashto + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_turkish_root \ + --target-fs 8000 \ + --lang-code tr-tr \ + --output-dir data/babel_turkish + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_georgian_root \ + --target-fs 8000 \ + --lang-code ka-ka \ + --output-dir data/babel_georgian + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_vietnam_root \ + --target-fs 8000 \ + --lang-code vi-vi \ + --output-dir data/babel_vietnam + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_haitian_root \ + --target-fs 8000 \ + --lang-code ht-ht \ + --output-dir data/babel_haitian + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_lao_root \ + --target-fs 8000 \ + --lang-code lo-lo \ + --output-dir data/babel_lao + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_tamil_root \ + --target-fs 8000 \ + --lang-code ta-ta \ + --output-dir data/babel_tamil + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_zulu_root \ + --target-fs 8000 \ + --lang-code zul-zul \ + --output-dir data/babel_zulu + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_kurmanji_root \ + --target-fs 8000 \ + --lang-code kur-kur \ + --output-dir data/babel_kurmanji + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_tok_root \ + --target-fs 8000 \ + --lang-code tok-tok \ + --output-dir data/babel_tok + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_kazakh_root \ + --target-fs 8000 \ + --lang-code kk-kk \ + --output-dir data/babel_kazakh + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_telugu_root \ + --target-fs 8000 \ + --lang-code te-te \ + --output-dir data/babel_telugu + hyp_utils/conda_env.sh \ + local/prepare_babel.py \ + --corpus-dir $babel_lithuanian_root \ + --target-fs 8000 \ + --lang-code lt-lt \ + --output-dir data/babel_lithuanian + +fi + +if [ $stage -le 11 ];then + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre.py \ + --corpus-dir $fleurs_root \ + --output-dir data/fleurs22 \ + --map-langs-to-lre-codes --target-fs 8000 + + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre.py \ + --corpus-dir $lwazi_root \ + --output-dir data/lwazi09 \ + --map-langs-to-lre-codes --target-fs 8000 + hyp_utils/conda_env.sh \ + 
local/prepare_some_data_for_lre.py \ + --corpus-dir $nchlt_root \ + --output-dir data/nchlt14 \ + --map-langs-to-lre-codes --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_some_data_for_lre.py \ + --corpus-dir $ammi_root \ + --output-dir data/ammi20 \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 12 ];then + + hyp_utils/conda_env.sh \ + local/prepare_common_voice_cat.py \ + --corpus-dir $cv22_root \ + --output-dir data/cv22_tir \ + --keep-langs tir-tir \ + --map-langs-to-lre-codes --target-fs 8000 +fi + + +if [ $stage -le 13 ];then + hyp_utils/conda_env.sh \ + local/prepare_common_voice_accents_cat.py \ + --corpus-dir $cv20_root \ + --output-dir data/cv20_eng_ine \ + --lang en \ + --target-fs 8000 + hyp_utils/conda_env.sh \ + local/prepare_common_voice_accents_cat.py \ + --corpus-dir $cv20_root \ + --output-dir data/cv20_fra \ + --lang fr \ + --target-fs 8000 + +fi + +if [ $stage -le 14 ];then + hyp_utils/conda_env.sh \ + local/prepare_adi17.py \ + --corpus-dir $adi_root \ + --output-dir data/adi17 \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 15 ];then + hyp_utils/conda_env.sh \ + local/prepare_ast.py \ + --corpus-dir $ast_root \ + --output-dir data/ast \ + --map-langs-to-lre-codes --target-fs 8000 +fi + +if [ $stage -le 16 ];then + #combine data + utils/combine_data.sh \ + data/babel \ + data/babel_{a*,b*,g*,k*,l*,p*,t*,v*,zulu} + + utils/combine_data.sh \ + data/cv \ + data/cv20_eng_ine data/cv20_fra data/cv22_tir + + utils/combine_data.sh \ + data/sre16 \ + data/sre16_train_{dev*,eval*} + + utils/combine_data.sh \ + data/sre18 \ + data/sre18_train_{dev*,eval*} data/sre18_dev_unlabeled + + utils/combine_data.sh \ + data/sre19 \ + data/sre19_eval_{enroll,test}_cmn2 + + utils/combine_data.sh \ + data/sre21_cts \ + data/sre21_*_cts + + utils/combine_data.sh \ + data/sre21_afv \ + data/sre21_audio*_{dev*,eval*}_afv + + utils/combine_data.sh \ + data/sre16-21_cts \ + data/sre1{6,8,9} data/sre21_cts + +fi + diff --git a/egs/lre22/open.v2.8k/run_002_compute_evad.sh b/egs/lre22/open.v2.8k/run_002_compute_evad.sh new file mode 100755 index 00000000..f7ccdfa7 --- /dev/null +++ b/egs/lre22/open.v2.8k/run_002_compute_evad.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=b1 +storage_name=$(date +'%m_%d_%H_%M') +vaddir=`pwd`/exp/vad_e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file + + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $vaddir/storage ]; then + dir_name=$USER/hyp-data/lre22-fixed-v1.8k-$storage_name/vad/storage + if [ "$nodes" == "b0" ];then + # split the storage across the b04-b07 nodes + utils/create_split_dir.pl \ + /export/b{04,05,06,07}/$dir_name $vaddir/storage + elif [ "$nodes" == "b1" ];then + utils/create_split_dir.pl \ + /export/b1{0,1,2,3,4,5,6,7,8,9}/$dir_name $vaddir/storage + elif [ "$nodes" == "c0" ];then + utils/create_split_dir.pl \ + /export/c{06,07,08,09}/$dir_name $vaddir/storage + elif [ "$nodes" == "fs01" ];then + utils/create_split_dir.pl \ + /export/fs01/$dir_name $vaddir/storage + elif [ "$nodes" == "fs05" ];then + utils/create_split_dir.pl \ + /export/fs05/$dir_name $vaddir/storage + else + echo "we don't distribute data between multiple machines" + fi + fi +fi + +# VAD Train/Test Datasets +if [ $stage -le 2 ];then + for name in voxlingua107 \ + lre17_train \ + lre17_dev_cts lre17_dev_afv \ + lre17_eval_cts lre17_eval_afv \ + lre22_dev lre22_eval \ + babel sre16-21_cts sre21_afv sre_cts_superset \ + lwazi09 nchlt14 adi17 fleurs22 ammi20 \ + ast cv + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 40 ? $num_spk:40)) + hyp_utils/feats/make_evad.sh --write-utt2num-frames true \ + --vad-config $vad_config --nj $nj --cmd "$train_cmd" \ + data/${name} exp/make_vad/$name $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + diff --git a/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh b/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh new file mode 100755 index 00000000..08d4d910 --- /dev/null +++ b/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIRs for online speech augmentation +if [ $stage -le 1 ]; then + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root 16 data + + for name in musan_noise musan_music + do + steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ + --storage_name lre22-fixed-v2.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_proc_audio exp/${name}_proc_audio + utils/fix_data_dir.sh data/${name}_proc_audio + done + +fi + +if [ $stage -le 2 ]; then + + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ + --storage_name lre22-fixed-v2.8k-$(date +'%m_%d_%H_%M') \ + data/${name} data/${name}_babble exp/${name}_babble + # utils/fix_data_dir.sh data/${name}_babble + done +fi + +if [ $stage -le 3 ]; then + if [ !
-d "RIRS_NOISES" ]; then + if [ -d ../v1.8k/RIRS_NOISES ];then + ln -s ../v1.8k/RIRS_NOISES + else + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + fi + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/smallroom 16 data/rirs_smallroom + local/make_rirs_data.sh RIRS_NOISES/simulated_rirs/mediumroom 16 data/rirs_mediumroom + local/make_rirs_data.sh RIRS_NOISES/real_rirs_isotropic_noises 16 data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + #pack all rirs in h5 files + steps_xvec/pack_rirs_for_nnet_train.sh data/$rirs data/$rirs exp/rirs/$rirs + done + +fi + + diff --git a/egs/lre22/open.v2.8k/run_004_apply_codecs.sh b/egs/lre22/open.v2.8k/run_004_apply_codecs.sh new file mode 100755 index 00000000..6efc016b --- /dev/null +++ b/egs/lre22/open.v2.8k/run_004_apply_codecs.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file + +if [ $stage -le 1 ];then + + for data in voxlingua107 \ + lre17_dev_afv lre17_eval_afv \ + sre21_afv ast cv \ + lwazi09 nchlt14 adi17 fleurs22 ammi20 + do + hyp_utils/conda_env.sh \ + local/apply_tel_codecs_to_kaldi_datadir.py \ + --input-dir data/$data \ + --output-dir data/${data}_codecs + done + +fi diff --git a/egs/lre22/open.v2.8k/run_010_prepare_xvec_train_data.sh b/egs/lre22/open.v2.8k/run_010_prepare_xvec_train_data.sh new file mode 100755 index 00000000..98aa9a4c --- /dev/null +++ b/egs/lre22/open.v2.8k/run_010_prepare_xvec_train_data.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file + +if [ $stage -le 1 ]; then + # This script preprocesses the audio for x-vector training + for name in voxlingua107_codecs \ + lre17_train \ + lre17_{dev,eval}_{cts,afv,afv_codecs} \ + babel sre16-21_cts sre_cts_superset \ + sre21_afv_codecs cv_codecs adi17_codecs \ + lwazi09{,_codecs} nchlt14{,_codecs} fleurs22{,_codecs} ammi20{,_codecs} ast{,_codecs} + do + steps_xvec/preprocess_audios_for_nnet_train.sh \ + --nj 40 --cmd "$train_cmd" \ + --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') --use-bin-vad true \ + data/${name} data/${name}_proc_audio_no_sil exp/${name}_proc_audio_no_sil + utils/fix_data_dir.sh data/${name}_proc_audio_no_sil + done +fi + +if [ $stage -le 2 ];then + utils/combine_data.sh \ + data/lre17_proc_audio_no_sil \ + data/lre17_train_proc_audio_no_sil \ + data/lre17_{dev,eval}_{cts,afv,afv_codecs}_proc_audio_no_sil + + utils/combine_data.sh \ + data/babel_sre_proc_audio_no_sil \ + data/{babel,sre16-21_cts,sre21_afv_codecs,sre_cts_superset}_proc_audio_no_sil + + utils/combine_data.sh \ + data/others_afr_proc_audio_no_sil \ + data/adi17_proc_audio_no_sil \ + data/{lwazi09,nchlt14,fleurs22,ammi20,ast}{,_codecs}_proc_audio_no_sil +fi + +if [ $stage -le 3 ]; then + # Now, we remove files shorter than 3 seconds + hyp_utils/remove_short_audios.sh --min-len 3 data/voxlingua107_codecs_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/lre17_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/babel_sre_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/others_afr_proc_audio_no_sil + hyp_utils/remove_short_audios.sh --min-len 3 data/cv_codecs_proc_audio_no_sil +fi + +if [ $stage -le 4 ];then + # merge all data + utils/combine_data.sh \ + data/open_proc_audio_no_sil \ + data/{voxlingua107_codecs,lre17,babel_sre,cv_codecs,others_afr}_proc_audio_no_sil +fi + + +if [ $stage -le 5 ]; then + for name in open_proc_audio_no_sil + do + hyp_utils/conda_env.sh \ + local/split_segments_train_val.py \ + --segments-file data/$name/utt2lang \ + --recordings-file data/$name/wav.scp \ + --durations-file data/$name/utt2dur \ + --val-percent 2. \ + --remove-langs fra-mix ara-ary en-en es-es pt-pt ar-ar \ + --output-dir data/$name/train_val_split + done +fi diff --git a/egs/lre22/open.v2.8k/run_011_train_xvector.sh b/egs/lre22/open.v2.8k/run_011_train_xvector.sh new file mode 100755 index 00000000..3a7a47a4 --- /dev/null +++ b/egs/lre22/open.v2.8k/run_011_train_xvector.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +list_dir=data/${nnet_data}_proc_audio_no_sil + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project lre22-open-v2.8k --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" +fi + + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + train_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu + +fi + +if [ $stage -le 2 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir $args \ + --num-gpus $ngpu \ + +fi +exit +if [ $stage -le 3 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s3_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s3_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + --data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s2 \ + --trainer.exp-path $nnet_s3_dir $args \ + --num-gpus $ngpu \ + +fi + +if [ $stage -le 4 ]; then + + if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.wandb.name $nnet_s4_name.$(date -Iminutes)" + fi + + mkdir -p $nnet_s4_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s4_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + finetune_wav2vec2xvector.py $nnet_type \ + --cfg $nnet_s4_base_cfg $nnet_s4_args $extra_args \ + --data.train.dataset.recordings-file $list_dir/wav.scp \ + --data.train.dataset.segments-file $list_dir/train_val_split/train_segments.csv \ + 
--data.train.dataset.class-files $list_dir/train_val_split/class_file.csv \ + --data.val.dataset.recordings-file $list_dir/wav.scp \ + --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ + --in-model-file $nnet_s3 \ + --trainer.exp-path $nnet_s4_dir $args \ + --num-gpus $ngpu \ + +fi + diff --git a/egs/lre22/open.v2.8k/steps b/egs/lre22/open.v2.8k/steps new file mode 120000 index 00000000..aede39fe --- /dev/null +++ b/egs/lre22/open.v2.8k/steps @@ -0,0 +1 @@ +hyp_utils/kaldi/steps \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/steps_be b/egs/lre22/open.v2.8k/steps_be new file mode 120000 index 00000000..48aedc5a --- /dev/null +++ b/egs/lre22/open.v2.8k/steps_be @@ -0,0 +1 @@ +../fixed.v1.8k/steps_be \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/steps_xvec b/egs/lre22/open.v2.8k/steps_xvec new file mode 120000 index 00000000..af66a94d --- /dev/null +++ b/egs/lre22/open.v2.8k/steps_xvec @@ -0,0 +1 @@ +hyp_utils/xvectors \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/utils b/egs/lre22/open.v2.8k/utils new file mode 120000 index 00000000..3d590a1d --- /dev/null +++ b/egs/lre22/open.v2.8k/utils @@ -0,0 +1 @@ +hyp_utils/kaldi/utils \ No newline at end of file diff --git a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh index d7ea8ed0..1ffd35a8 100755 --- a/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh +++ b/egs/sre21-av-a/v1.8k/run_011_train_xvector.sh @@ -68,50 +68,3 @@ if [ $stage -le 2 ]; then --num-gpus $ngpu \ fi -exit - -# Network Training -if [ $stage -le 1 ]; then - - if [[ ${nnet_type} =~ resnet1d ]]; then - train_exec=torch-train-resnet1d-xvec-from-wav.py - elif [[ ${nnet_type} =~ resnet ]] || [[ ${nnet_type} =~ resnext ]] || [[ ${nnet_type} =~ res2net ]] || [[ ${nnet_type} =~ res2next ]]; then - train_exec=torch-train-resnet-xvec-from-wav.py - elif [[ ${nnet_type} =~ efficientnet ]]; then - train_exec=torch-train-efficientnet-xvec-from-wav.py - elif [[ ${nnet_type} =~ tdnn ]]; then - train_exec=torch-train-tdnn-xvec-from-wav.py - elif [[ ${nnet_type} =~ transformer ]]; then - train_exec=torch-train-transformer-xvec-v1-from-wav.py - else - echo "$nnet_type not supported" - exit 1 - fi - - mkdir -p $nnet_dir/log - $cuda_cmd \ - --gpu $ngpu $nnet_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - $train_exec --feats $feat_config $aug_opt \ - --audio-path $list_dir/wav.scp \ - --time-durs-file $list_dir/utt2dur \ - --train-list $list_dir/lists_xvec/train.scp \ - --val-list $list_dir/lists_xvec/val.scp \ - --class-file $list_dir/lists_xvec/class2int \ - --min-chunk-length $min_chunk --max-chunk-length $max_chunk \ - --iters-per-epoch $ipe \ - --batch-size $batch_size \ - --num-workers $num_workers \ - --grad-acc-steps $grad_acc_steps \ - --embed-dim $embed_dim $nnet_opt $opt_opt $lrs_opt \ - --epochs $nnet_num_epochs \ - --cos-scale $s --margin $margin --margin-warmup-epochs $margin_warmup \ - --dropout-rate $dropout \ - --num-gpus $ngpu \ - --log-interval $log_interval \ - --exp-path $nnet_dir $args - -fi - - -exit diff --git a/hyperion/np/transforms/skl_tsne.py b/hyperion/np/transforms/skl_tsne.py index ebabc6ec..fbff7df3 100644 --- a/hyperion/np/transforms/skl_tsne.py +++ b/hyperion/np/transforms/skl_tsne.py @@ -53,7 +53,8 @@ def __init__( super().__init__(**kwargs) self.rng_seed = rng_seed if rng is None: - rng = np.random.default_rng(seed=rng_seed) + #rng = np.random.default_rng(seed=rng_seed) + rng = 
np.random.RandomState(seed=rng_seed) self._tsne = TSNE( n_components=tsne_dim, diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index 57f3faf2..b94d9752 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -139,7 +139,7 @@ def load(cls, file_path, sep=None, name="class_id"): sep=" ", header=None, names=["id", name], - dtype={"id": np.str, name: np.str}, + dtype={"id": str, name: str}, ) else: if sep is None: diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 070e4f53..3d8b5e9d 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -36,7 +36,7 @@ def __init__(self, key, file_path, offset=None, range_spec=None): def validate(self): """Validates the attributes of the SCPList object.""" self.key = list2ndarray(self.key) - self.file_path = list2ndarray(self.file_path, dtype=np.object) + self.file_path = list2ndarray(self.file_path, dtype=object) assert len(self.key) == len(self.file_path) if self.offset is not None: if isinstance(self.offset, list): From af6e26e1a4f64a02cf8612e412649dec55ba3926 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 1 Nov 2023 12:45:13 -0400 Subject: [PATCH 116/154] new recipes voxceleb/v1.2 and v2.1 --- egs/librispeech/v0/cmd.sh | 28 + egs/librispeech/v0/conf/clsp.conf | 11 + egs/librispeech/{v1 => v0}/conf/infer.yaml | 0 .../{v1 => v0}/conf/reverb_noise20dB_aug.yaml | 0 .../{v1 => v0}/conf/reverb_noise_aug.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml | 0 ...base_conf_rnnt_k2_pruned_stage1_v3.10.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml | 0 ...ase_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml | 0 ...ase_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml | 0 ...2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml | 0 ...2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml | 0 ...v2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml | 0 ...v2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml | 0 ...v2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml | 0 ...rain_wav2vec2base_rnnt_k2_stage1_v1.0.yaml | 0 ...n_wav2vec2base_transducer_stage1_v5.0.yaml | 0 ...n_wav2vec2base_transducer_stage1_v6.1.yaml | 0 ...n_wav2vec2base_transducer_stage1_v7.1.yaml | 0 ...v2vec2xlsr300m_transducer_stage1_v1.0.yaml | 0 ...v2vec2xlsr300m_transducer_stage1_v2.0.yaml | 0 ...v2vec2xlsr300m_transducer_stage1_v3.0.yaml | 0 ...v2vec2xlsr300m_transducer_stage1_v3.1.yaml | 0 ...v2vec2xlsr300m_transducer_stage1_v3.2.yaml | 0 ...v2vec2xlsr300m_transducer_stage1_v3.3.yaml | 0 ...v2vec2xlsr300m_transducer_stage1_v4.3.yaml | 0 ...v2vec2xlsr300m_transducer_stage1_v4.4.yaml | 0 ...v2vec2xlsr300m_transducer_stage2_v3.2.yaml | 0 .../wav2vec2base_rnn_transducer_do0.4.yaml | 0 .../conf/wav2vec2base_rnnt_ta_do0.4.yaml | 0 .../conf/wav2vec2base_transducer_do0.4.yaml | 0 .../conf/wav2vec2xlsr300m_transducer.yaml | 0 .../conf/wav2vec2xlsr300m_transducer_do.yaml | 0 .../wav2vec2xlsr300m_transducer_do0.2.yaml | 0 .../wav2vec2xlsr300m_transducer_do0.3.yaml | 0 .../wav2vec2xlsr300m_transducer_do0.4.yaml | 0 .../wav2vec2xlsr300m_transducer_enclast.yaml | 0 egs/librispeech/{v1 => v0}/datapath.sh | 0 egs/librispeech/{v1 => v0}/default_config.sh | 0 egs/librispeech/{v1 => 
v0}/feats | 0 .../global_conf/config_transducer_v1.sh | 0 .../global_conf/config_transducer_v2.sh | 0 .../global_conf/config_transducer_v3.1.sh | 0 .../global_conf/config_transducer_v3.2.sh | 0 .../global_conf/config_transducer_v3.3.sh | 0 .../global_conf/config_transducer_v3.sh | 0 .../global_conf/config_transducer_v4.3.sh | 0 .../global_conf/config_transducer_v4.4.sh | 0 .../global_conf/config_transducer_v5.0.sh | 0 .../global_conf/config_transducer_v6.1.sh | 0 .../global_conf/config_transducer_v7.1.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh | 0 ...wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh | 0 ...wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh | 0 ...g_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh | 0 ...g_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh | 0 ...config_wav2vec2base_rnnt_k2_pruned_v1.0.sh | 0 ...config_wav2vec2base_rnnt_k2_pruned_v1.2.sh | 0 ...config_wav2vec2base_rnnt_k2_pruned_v1.3.sh | 0 .../config_wav2vec2base_rnnt_k2_v1.0.sh | 0 egs/librispeech/v0/hyp_utils | 1 + egs/librispeech/{v1 => v0}/local/data_prep.sh | 0 .../{v1 => v0}/local/download_lm.py | 0 .../{v1 => v0}/local/make_musan.py | 0 .../{v1 => v0}/local/make_musan.sh | 0 .../{v1 => v0}/local/make_rirs_data.sh | 0 .../{v1 => v0}/local/prepare_lang.py | 0 .../{v1 => v0}/local/prepare_lang_bpe.py | 0 .../{v1 => v0}/local/train_bpe_model.py | 0 .../{v1 => v0}/local/validate_bpe_lexicon.py | 0 egs/librispeech/v0/path.sh | 5 + egs/librispeech/v0/run_001_prepare_data.sh | 54 ++ .../{v1 => v0}/run_003_prepare_noises_rirs.sh | 0 .../{v1 => v0}/run_004_compute_bpe.sh | 0 .../{v1 => v0}/run_011_train_asr.sh | 0 .../{v1 => v0}/run_011_train_asr_old.sh | 0 .../{v1 => v0}/run_030_inference.sh | 0 .../{v1 => v0}/run_030_inference_old.sh | 0 .../{v1 => v0}/run_040_eval_wer.sh | 0 egs/librispeech/{v1 => v0}/steps | 0 egs/librispeech/{v1 => v0}/steps_be | 0 egs/librispeech/{v1 => v0}/steps_pyfe | 0 .../decode_wav2vec2rnn_transducer.sh | 0 .../decode_wav2vec2transducer.sh | 0 egs/librispeech/{v1 => v0}/steps_xvec | 0 egs/librispeech/{v1 => v0}/utils | 0 egs/librispeech/{v1 => v0}/xvectors | 0 egs/librispeech/v1/cmd.sh | 8 +- egs/librispeech/v1/conf/clsp.conf | 2 +- egs/librispeech/v1/conf/coe_gpu_bigmem.conf | 11 + egs/librispeech/v1/conf/coe_gpu_long.conf | 13 + egs/librispeech/v1/conf/coe_gpu_rtx.conf | 11 + egs/librispeech/v1/conf/coe_gpu_short.conf | 11 + egs/librispeech/v1/conf/coe_gpu_v100.conf | 11 + .../v1/conf/fbank80_specaug1_mn_16k.yaml | 25 + .../conf/speed_reverb_noise10-20dB_aug.yaml | 39 + ...mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml | 70 ++ ...nk80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh | 18 + egs/librispeech/v1/run_001_prepare_data.sh | 51 +- egs/voxceleb/ssl.v1/cmd.sh | 28 + egs/voxceleb/ssl.v1/conf/clsp.conf | 11 + egs/voxceleb/ssl.v1/conf/coe_gpu_bigmem.conf | 11 + egs/voxceleb/ssl.v1/conf/coe_gpu_long.conf | 13 + egs/voxceleb/ssl.v1/conf/coe_gpu_rtx.conf | 11 + egs/voxceleb/ssl.v1/conf/coe_gpu_short.conf | 11 + egs/voxceleb/ssl.v1/conf/coe_gpu_v100.conf | 11 + .../conf/fbank80_specaug1_stmn_16k.yaml | 24 + .../ssl.v1/conf/reverb_noise_aug.yaml | 34 + egs/voxceleb/ssl.v1/datapath.sh | 23 + egs/voxceleb/ssl.v1/hyp_utils | 1 + egs/voxceleb/ssl.v1/path.sh | 5 + 
egs/voxceleb/ssl.v1/run_001_prepare_data.sh | 46 ++ egs/voxceleb/ssl.v1/run_002_compute_evad.sh | 66 ++ .../ssl.v1/run_003_prepare_noises_rirs.sh | 102 +++ .../ssl.v1/run_004_prepare_xvec_train_data.sh | 75 ++ egs/voxceleb/ssl.v1/run_005_train_dino.sh | 99 +++ ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 1 - ...wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml | 69 ++ ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 5 +- ...wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml | 78 ++ egs/voxceleb/v2.1/run_007_eval_be.sh | 2 +- hyperion/bin/extract_wav2vec2xvectors.py | 9 +- .../generate_adv_attacks_xvector_classif.py | 14 +- .../bin/generate_adv_attacks_xvector_verif.py | 4 +- hyperion/bin/train_wav2vec2xvector.py | 6 + hyperion/bin/train_wav2xvector.py | 2 + hyperion/bin/train_xvector_from_feats.py | 2 + hyperion/bin/train_xvector_from_wav.py | 2 + hyperion/torch/data/audio_dataset.py | 235 ++++-- hyperion/torch/layer_blocks/__init__.py | 58 +- .../layer_blocks/conformer_decoder_v1.py | 213 ++++++ .../layer_blocks/conformer_encoder_v1.py | 108 +-- .../transformer_conv2d_subsampler.py | 61 -- .../torch/layer_blocks/transformer_input.py | 151 ++++ hyperion/torch/layers/__init__.py | 7 + hyperion/torch/layers/audio_feats.py | 12 +- hyperion/torch/layers/audio_feats_factory.py | 23 +- hyperion/torch/layers/feat_fuser_factory.py | 101 +++ hyperion/torch/layers/feat_fusers.py | 86 +++ hyperion/torch/layers/mvn.py | 112 ++- hyperion/torch/lr_schedulers/triangular_lr.py | 7 +- hyperion/torch/models/__init__.py | 6 +- .../transducer/conformer_v1_rnn_transducer.py | 12 +- .../torch/models/wav2xvectors/__init__.py | 6 + .../hf_hubert2conformer_v1_xvector.py | 93 +++ .../hf_wav2vec2conformer_v1_xvector.py | 96 +++ .../models/wav2xvectors/hf_wav2xvector.py | 1 + .../hf_wavlm2conformer_v1_xvector.py | 93 +++ .../wav2xvectors/wav2conformer_v1_xvector.py | 70 ++ hyperion/torch/models/xvectors/__init__.py | 1 + .../models/xvectors/conformer_v1_xvector.py | 168 ++++ hyperion/torch/narchs/audio_feats_mvn.py | 5 +- hyperion/torch/narchs/conformer_decoder_v1.py | 724 ++++++++++++++++++ hyperion/torch/narchs/conformer_encoder_v1.py | 159 ++-- hyperion/torch/narchs/feat_fuser_mvn.py | 107 +++ hyperion/torch/torch_model.py | 120 ++- hyperion/torch/tpm/hf/hf_wav2vec_base.py | 36 +- hyperion/torch/trainers/torch_trainer.py | 86 ++- hyperion/torch/trainers/xvector_trainer.py | 42 +- .../trainers/xvector_trainer_from_wav.py | 7 +- hyperion/torch/utils/__init__.py | 11 +- hyperion/torch/utils/masking.py | 21 +- hyperion/utils/misc.py | 7 +- hyperion/utils/scp_list.py | 2 +- 179 files changed, 3703 insertions(+), 478 deletions(-) create mode 100755 egs/librispeech/v0/cmd.sh create mode 100644 egs/librispeech/v0/conf/clsp.conf rename egs/librispeech/{v1 => v0}/conf/infer.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/reverb_noise20dB_aug.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/reverb_noise_aug.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml (100%) rename 
egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2base_rnn_transducer_do0.4.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2base_rnnt_ta_do0.4.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2base_transducer_do0.4.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2xlsr300m_transducer.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2xlsr300m_transducer_do.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2xlsr300m_transducer_do0.2.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2xlsr300m_transducer_do0.3.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2xlsr300m_transducer_do0.4.yaml (100%) rename egs/librispeech/{v1 => v0}/conf/wav2vec2xlsr300m_transducer_enclast.yaml (100%) rename egs/librispeech/{v1 => v0}/datapath.sh (100%) rename egs/librispeech/{v1 => v0}/default_config.sh (100%) rename egs/librispeech/{v1 => v0}/feats (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v1.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v2.sh (100%) rename egs/librispeech/{v1 => 
v0}/global_conf/config_transducer_v3.1.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v3.2.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v3.3.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v3.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v4.3.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v4.4.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v5.0.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v6.1.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_transducer_v7.1.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh (100%) rename egs/librispeech/{v1 => v0}/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh (100%) create mode 120000 egs/librispeech/v0/hyp_utils rename egs/librispeech/{v1 => v0}/local/data_prep.sh (100%) rename egs/librispeech/{v1 => v0}/local/download_lm.py (100%) rename egs/librispeech/{v1 => v0}/local/make_musan.py (100%) rename egs/librispeech/{v1 => v0}/local/make_musan.sh (100%) rename egs/librispeech/{v1 => v0}/local/make_rirs_data.sh (100%) rename egs/librispeech/{v1 => v0}/local/prepare_lang.py (100%) rename egs/librispeech/{v1 => v0}/local/prepare_lang_bpe.py (100%) rename egs/librispeech/{v1 => v0}/local/train_bpe_model.py (100%) rename egs/librispeech/{v1 => v0}/local/validate_bpe_lexicon.py (100%) create mode 100755 egs/librispeech/v0/path.sh create mode 100755 egs/librispeech/v0/run_001_prepare_data.sh rename egs/librispeech/{v1 => v0}/run_003_prepare_noises_rirs.sh (100%) rename egs/librispeech/{v1 => v0}/run_004_compute_bpe.sh (100%) rename egs/librispeech/{v1 => v0}/run_011_train_asr.sh (100%) rename egs/librispeech/{v1 => v0}/run_011_train_asr_old.sh (100%) rename egs/librispeech/{v1 => v0}/run_030_inference.sh (100%) rename egs/librispeech/{v1 => v0}/run_030_inference_old.sh (100%) rename egs/librispeech/{v1 => v0}/run_040_eval_wer.sh (100%) rename egs/librispeech/{v1 => v0}/steps (100%) rename egs/librispeech/{v1 => v0}/steps_be (100%) 
rename egs/librispeech/{v1 => v0}/steps_pyfe (100%) rename egs/librispeech/{v1 => v0}/steps_transducer/decode_wav2vec2rnn_transducer.sh (100%) rename egs/librispeech/{v1 => v0}/steps_transducer/decode_wav2vec2transducer.sh (100%) rename egs/librispeech/{v1 => v0}/steps_xvec (100%) rename egs/librispeech/{v1 => v0}/utils (100%) rename egs/librispeech/{v1 => v0}/xvectors (100%) create mode 100644 egs/librispeech/v1/conf/coe_gpu_bigmem.conf create mode 100644 egs/librispeech/v1/conf/coe_gpu_long.conf create mode 100644 egs/librispeech/v1/conf/coe_gpu_rtx.conf create mode 100644 egs/librispeech/v1/conf/coe_gpu_short.conf create mode 100644 egs/librispeech/v1/conf/coe_gpu_v100.conf create mode 100644 egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml create mode 100644 egs/librispeech/v1/conf/speed_reverb_noise10-20dB_aug.yaml create mode 100644 egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml create mode 100644 egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh create mode 100755 egs/voxceleb/ssl.v1/cmd.sh create mode 100644 egs/voxceleb/ssl.v1/conf/clsp.conf create mode 100644 egs/voxceleb/ssl.v1/conf/coe_gpu_bigmem.conf create mode 100644 egs/voxceleb/ssl.v1/conf/coe_gpu_long.conf create mode 100644 egs/voxceleb/ssl.v1/conf/coe_gpu_rtx.conf create mode 100644 egs/voxceleb/ssl.v1/conf/coe_gpu_short.conf create mode 100644 egs/voxceleb/ssl.v1/conf/coe_gpu_v100.conf create mode 100644 egs/voxceleb/ssl.v1/conf/fbank80_specaug1_stmn_16k.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/reverb_noise_aug.yaml create mode 100644 egs/voxceleb/ssl.v1/datapath.sh create mode 120000 egs/voxceleb/ssl.v1/hyp_utils create mode 100755 egs/voxceleb/ssl.v1/path.sh create mode 100755 egs/voxceleb/ssl.v1/run_001_prepare_data.sh create mode 100755 egs/voxceleb/ssl.v1/run_002_compute_evad.sh create mode 100755 egs/voxceleb/ssl.v1/run_003_prepare_noises_rirs.sh create mode 100755 egs/voxceleb/ssl.v1/run_004_prepare_xvec_train_data.sh create mode 100755 egs/voxceleb/ssl.v1/run_005_train_dino.sh create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml create mode 100644 egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml create mode 100644 hyperion/torch/layer_blocks/conformer_decoder_v1.py delete mode 100644 hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py create mode 100644 hyperion/torch/layer_blocks/transformer_input.py create mode 100644 hyperion/torch/layers/feat_fuser_factory.py create mode 100644 hyperion/torch/layers/feat_fusers.py create mode 100644 hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py create mode 100644 hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py create mode 100644 hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py create mode 100644 hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py create mode 100644 hyperion/torch/models/xvectors/conformer_v1_xvector.py create mode 100644 hyperion/torch/narchs/conformer_decoder_v1.py create mode 100644 hyperion/torch/narchs/feat_fuser_mvn.py diff --git a/egs/librispeech/v0/cmd.sh b/egs/librispeech/v0/cmd.sh new file mode 100755 index 00000000..89dbb7d8 --- /dev/null +++ b/egs/librispeech/v0/cmd.sh @@ -0,0 +1,28 @@ +# you can change cmd.sh depending on what type of queue you are using. 
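+# This recipe defaults to queue.pl on the JHU grids (see the if/else below); a minimal +# grid-less sketch would be: export train_cmd=run.pl; export cuda_cmd=run.pl.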
+# If you have no queueing system and want to run on a local machine, you +# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +if [ "$(hostname -d)" == "cm.gemini" ];then + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" + #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --config conf/clsp.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/librispeech/v0/conf/clsp.conf b/egs/librispeech/v0/conf/clsp.conf new file mode 100644 index 00000000..959c62a7 --- /dev/null +++ b/egs/librispeech/v0/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/librispeech/v1/conf/infer.yaml b/egs/librispeech/v0/conf/infer.yaml similarity index 100% rename from egs/librispeech/v1/conf/infer.yaml rename to egs/librispeech/v0/conf/infer.yaml diff --git a/egs/librispeech/v1/conf/reverb_noise20dB_aug.yaml b/egs/librispeech/v0/conf/reverb_noise20dB_aug.yaml similarity index 100% rename from egs/librispeech/v1/conf/reverb_noise20dB_aug.yaml rename to egs/librispeech/v0/conf/reverb_noise20dB_aug.yaml diff --git a/egs/librispeech/v1/conf/reverb_noise_aug.yaml b/egs/librispeech/v0/conf/reverb_noise_aug.yaml similarity index 100% rename from egs/librispeech/v1/conf/reverb_noise_aug.yaml rename to egs/librispeech/v0/conf/reverb_noise_aug.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v1.2.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml rename to
egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.0.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.1.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.10.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.2.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.3.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.1.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.2.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.4.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.5.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.6.yaml diff --git 
a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.7.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.8.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_conf_rnnt_k2_pruned_stage1_v3.9.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_lstm_rnnt_k2_pruned_stage1_v1.2.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.0.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.2.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_pruned_stage1_v1.3.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_rnnt_k2_stage1_v1.0.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v5.0.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml rename to 
egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v6.1.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml rename to egs/librispeech/v0/conf/train_wav2vec2base_transducer_stage1_v7.1.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v2.0.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.0.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.1.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.2.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v3.3.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.3.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml similarity index 100% rename from egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage1_v4.4.yaml diff --git a/egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml b/egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml similarity index 100% rename from 
egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml rename to egs/librispeech/v0/conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2base_rnn_transducer_do0.4.yaml b/egs/librispeech/v0/conf/wav2vec2base_rnn_transducer_do0.4.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2base_rnn_transducer_do0.4.yaml rename to egs/librispeech/v0/conf/wav2vec2base_rnn_transducer_do0.4.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2base_rnnt_ta_do0.4.yaml b/egs/librispeech/v0/conf/wav2vec2base_rnnt_ta_do0.4.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2base_rnnt_ta_do0.4.yaml rename to egs/librispeech/v0/conf/wav2vec2base_rnnt_ta_do0.4.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2base_transducer_do0.4.yaml b/egs/librispeech/v0/conf/wav2vec2base_transducer_do0.4.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2base_transducer_do0.4.yaml rename to egs/librispeech/v0/conf/wav2vec2base_transducer_do0.4.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml rename to egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do.yaml rename to egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.2.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.2.yaml rename to egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.2.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.3.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.3.yaml rename to egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.3.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.4.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_do0.4.yaml rename to egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_do0.4.yaml diff --git a/egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml b/egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_enclast.yaml similarity index 100% rename from egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer_enclast.yaml rename to egs/librispeech/v0/conf/wav2vec2xlsr300m_transducer_enclast.yaml diff --git a/egs/librispeech/v1/datapath.sh b/egs/librispeech/v0/datapath.sh similarity index 100% rename from egs/librispeech/v1/datapath.sh rename to egs/librispeech/v0/datapath.sh diff --git a/egs/librispeech/v1/default_config.sh b/egs/librispeech/v0/default_config.sh similarity index 100% rename from egs/librispeech/v1/default_config.sh rename to egs/librispeech/v0/default_config.sh diff --git a/egs/librispeech/v1/feats b/egs/librispeech/v0/feats similarity index 100% rename from egs/librispeech/v1/feats rename to egs/librispeech/v0/feats diff --git a/egs/librispeech/v1/global_conf/config_transducer_v1.sh 
b/egs/librispeech/v0/global_conf/config_transducer_v1.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v1.sh rename to egs/librispeech/v0/global_conf/config_transducer_v1.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v2.sh b/egs/librispeech/v0/global_conf/config_transducer_v2.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v2.sh rename to egs/librispeech/v0/global_conf/config_transducer_v2.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.1.sh b/egs/librispeech/v0/global_conf/config_transducer_v3.1.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v3.1.sh rename to egs/librispeech/v0/global_conf/config_transducer_v3.1.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.2.sh b/egs/librispeech/v0/global_conf/config_transducer_v3.2.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v3.2.sh rename to egs/librispeech/v0/global_conf/config_transducer_v3.2.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.3.sh b/egs/librispeech/v0/global_conf/config_transducer_v3.3.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v3.3.sh rename to egs/librispeech/v0/global_conf/config_transducer_v3.3.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v3.sh b/egs/librispeech/v0/global_conf/config_transducer_v3.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v3.sh rename to egs/librispeech/v0/global_conf/config_transducer_v3.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v4.3.sh b/egs/librispeech/v0/global_conf/config_transducer_v4.3.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v4.3.sh rename to egs/librispeech/v0/global_conf/config_transducer_v4.3.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v4.4.sh b/egs/librispeech/v0/global_conf/config_transducer_v4.4.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v4.4.sh rename to egs/librispeech/v0/global_conf/config_transducer_v4.4.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v5.0.sh b/egs/librispeech/v0/global_conf/config_transducer_v5.0.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v5.0.sh rename to egs/librispeech/v0/global_conf/config_transducer_v5.0.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v6.1.sh b/egs/librispeech/v0/global_conf/config_transducer_v6.1.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v6.1.sh rename to egs/librispeech/v0/global_conf/config_transducer_v6.1.sh diff --git a/egs/librispeech/v1/global_conf/config_transducer_v7.1.sh b/egs/librispeech/v0/global_conf/config_transducer_v7.1.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_transducer_v7.1.sh rename to egs/librispeech/v0/global_conf/config_transducer_v7.1.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v1.2.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh 
b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.0.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.2.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.3.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.1.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.2.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.4.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.5.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.6.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.7.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh similarity index 100% rename from 
egs/librispeech/v1/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_conf_rnnt_k2_pruned_v3.9.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_lstm_rnnt_k2_pruned_v1.2.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.0.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.2.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_pruned_v1.3.sh diff --git a/egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh b/egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh similarity index 100% rename from egs/librispeech/v1/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh rename to egs/librispeech/v0/global_conf/config_wav2vec2base_rnnt_k2_v1.0.sh diff --git a/egs/librispeech/v0/hyp_utils b/egs/librispeech/v0/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/librispeech/v0/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/librispeech/v1/local/data_prep.sh b/egs/librispeech/v0/local/data_prep.sh similarity index 100% rename from egs/librispeech/v1/local/data_prep.sh rename to egs/librispeech/v0/local/data_prep.sh diff --git a/egs/librispeech/v1/local/download_lm.py b/egs/librispeech/v0/local/download_lm.py similarity index 100% rename from egs/librispeech/v1/local/download_lm.py rename to egs/librispeech/v0/local/download_lm.py diff --git a/egs/librispeech/v1/local/make_musan.py b/egs/librispeech/v0/local/make_musan.py similarity index 100% rename from egs/librispeech/v1/local/make_musan.py rename to egs/librispeech/v0/local/make_musan.py diff --git a/egs/librispeech/v1/local/make_musan.sh b/egs/librispeech/v0/local/make_musan.sh similarity index 100% rename from egs/librispeech/v1/local/make_musan.sh rename to egs/librispeech/v0/local/make_musan.sh diff --git a/egs/librispeech/v1/local/make_rirs_data.sh b/egs/librispeech/v0/local/make_rirs_data.sh similarity index 100% rename from egs/librispeech/v1/local/make_rirs_data.sh rename to egs/librispeech/v0/local/make_rirs_data.sh diff --git a/egs/librispeech/v1/local/prepare_lang.py b/egs/librispeech/v0/local/prepare_lang.py similarity index 100% rename from egs/librispeech/v1/local/prepare_lang.py rename to egs/librispeech/v0/local/prepare_lang.py diff --git a/egs/librispeech/v1/local/prepare_lang_bpe.py 
b/egs/librispeech/v0/local/prepare_lang_bpe.py
similarity index 100%
rename from egs/librispeech/v1/local/prepare_lang_bpe.py
rename to egs/librispeech/v0/local/prepare_lang_bpe.py
diff --git a/egs/librispeech/v1/local/train_bpe_model.py b/egs/librispeech/v0/local/train_bpe_model.py
similarity index 100%
rename from egs/librispeech/v1/local/train_bpe_model.py
rename to egs/librispeech/v0/local/train_bpe_model.py
diff --git a/egs/librispeech/v1/local/validate_bpe_lexicon.py b/egs/librispeech/v0/local/validate_bpe_lexicon.py
similarity index 100%
rename from egs/librispeech/v1/local/validate_bpe_lexicon.py
rename to egs/librispeech/v0/local/validate_bpe_lexicon.py
diff --git a/egs/librispeech/v0/path.sh b/egs/librispeech/v0/path.sh
new file mode 100755
index 00000000..6994fdab
--- /dev/null
+++ b/egs/librispeech/v0/path.sh
@@ -0,0 +1,5 @@
+
+export HYP_ROOT=$(readlink -f `pwd -P`/../../..)
+export TOOLS_ROOT=$HYP_ROOT/tools
+
+. $TOOLS_ROOT/path.sh
diff --git a/egs/librispeech/v0/run_001_prepare_data.sh b/egs/librispeech/v0/run_001_prepare_data.sh
new file mode 100755
index 00000000..0708e667
--- /dev/null
+++ b/egs/librispeech/v0/run_001_prepare_data.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Copyright
+# 2018 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. ./datapath.sh
+
+
+nj=6
+
+mkdir -p data
+
+
+if [ ${stage} -le 1 ]; then
+  ### Task dependent. You have to prepare the data for this part yourself,
+  ### but in most cases you can reuse Kaldi recipes.
+  echo "stage 1: Data preparation"
+  for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500
+  do
+    # use underscore-separated names in data directories.
+    local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_}
+    steps_xvec/audio_to_duration.sh --cmd "$train_cmd" data/${part//-/_}
+  done
+fi
+
+# if [ $stage -le 1 ]; then
+#   echo "Stage 1: Prepare LibriSpeech manifest"
+#   # We assume that you have downloaded the LibriSpeech corpus
+#   # to $librispeech_root
+#   mkdir -p data/manifests
+#   if [ ! -e data/manifests/.librispeech.done ]; then
+#     lhotse prepare librispeech -j $nj $librispeech_root data/manifests
+#     touch data/manifests/.librispeech.done
+#   fi
+# fi
+
+# if [ $stage -le 2 ]; then
+#   echo "Stage 2: Prepare musan manifest"
+#   # We assume that you have downloaded the musan corpus
+#   # to $musan_root
+#   mkdir -p data/manifests
+#   if [ !
-e data/manifests/.musan.done ]; then +# lhotse prepare musan $musan_root data/manifests +# touch data/manifests/.musan.done +# fi +# fi diff --git a/egs/librispeech/v1/run_003_prepare_noises_rirs.sh b/egs/librispeech/v0/run_003_prepare_noises_rirs.sh similarity index 100% rename from egs/librispeech/v1/run_003_prepare_noises_rirs.sh rename to egs/librispeech/v0/run_003_prepare_noises_rirs.sh diff --git a/egs/librispeech/v1/run_004_compute_bpe.sh b/egs/librispeech/v0/run_004_compute_bpe.sh similarity index 100% rename from egs/librispeech/v1/run_004_compute_bpe.sh rename to egs/librispeech/v0/run_004_compute_bpe.sh diff --git a/egs/librispeech/v1/run_011_train_asr.sh b/egs/librispeech/v0/run_011_train_asr.sh similarity index 100% rename from egs/librispeech/v1/run_011_train_asr.sh rename to egs/librispeech/v0/run_011_train_asr.sh diff --git a/egs/librispeech/v1/run_011_train_asr_old.sh b/egs/librispeech/v0/run_011_train_asr_old.sh similarity index 100% rename from egs/librispeech/v1/run_011_train_asr_old.sh rename to egs/librispeech/v0/run_011_train_asr_old.sh diff --git a/egs/librispeech/v1/run_030_inference.sh b/egs/librispeech/v0/run_030_inference.sh similarity index 100% rename from egs/librispeech/v1/run_030_inference.sh rename to egs/librispeech/v0/run_030_inference.sh diff --git a/egs/librispeech/v1/run_030_inference_old.sh b/egs/librispeech/v0/run_030_inference_old.sh similarity index 100% rename from egs/librispeech/v1/run_030_inference_old.sh rename to egs/librispeech/v0/run_030_inference_old.sh diff --git a/egs/librispeech/v1/run_040_eval_wer.sh b/egs/librispeech/v0/run_040_eval_wer.sh similarity index 100% rename from egs/librispeech/v1/run_040_eval_wer.sh rename to egs/librispeech/v0/run_040_eval_wer.sh diff --git a/egs/librispeech/v1/steps b/egs/librispeech/v0/steps similarity index 100% rename from egs/librispeech/v1/steps rename to egs/librispeech/v0/steps diff --git a/egs/librispeech/v1/steps_be b/egs/librispeech/v0/steps_be similarity index 100% rename from egs/librispeech/v1/steps_be rename to egs/librispeech/v0/steps_be diff --git a/egs/librispeech/v1/steps_pyfe b/egs/librispeech/v0/steps_pyfe similarity index 100% rename from egs/librispeech/v1/steps_pyfe rename to egs/librispeech/v0/steps_pyfe diff --git a/egs/librispeech/v1/steps_transducer/decode_wav2vec2rnn_transducer.sh b/egs/librispeech/v0/steps_transducer/decode_wav2vec2rnn_transducer.sh similarity index 100% rename from egs/librispeech/v1/steps_transducer/decode_wav2vec2rnn_transducer.sh rename to egs/librispeech/v0/steps_transducer/decode_wav2vec2rnn_transducer.sh diff --git a/egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh b/egs/librispeech/v0/steps_transducer/decode_wav2vec2transducer.sh similarity index 100% rename from egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh rename to egs/librispeech/v0/steps_transducer/decode_wav2vec2transducer.sh diff --git a/egs/librispeech/v1/steps_xvec b/egs/librispeech/v0/steps_xvec similarity index 100% rename from egs/librispeech/v1/steps_xvec rename to egs/librispeech/v0/steps_xvec diff --git a/egs/librispeech/v1/utils b/egs/librispeech/v0/utils similarity index 100% rename from egs/librispeech/v1/utils rename to egs/librispeech/v0/utils diff --git a/egs/librispeech/v1/xvectors b/egs/librispeech/v0/xvectors similarity index 100% rename from egs/librispeech/v1/xvectors rename to egs/librispeech/v0/xvectors diff --git a/egs/librispeech/v1/cmd.sh b/egs/librispeech/v1/cmd.sh index 89dbb7d8..040f458b 100755 --- a/egs/librispeech/v1/cmd.sh +++ 
b/egs/librispeech/v1/cmd.sh @@ -11,16 +11,16 @@ # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" - #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G" - #export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else - export train_cmd="queue.pl --config conf/clsp.conf --mem 4G " - export cuda_cmd="queue.pl --config conf/clsp.conf --mem 20G" + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" export cuda_eval_cmd="$train_cmd" fi diff --git a/egs/librispeech/v1/conf/clsp.conf b/egs/librispeech/v1/conf/clsp.conf index 959c62a7..4ed38246 100644 --- a/egs/librispeech/v1/conf/clsp.conf +++ b/egs/librispeech/v1/conf/clsp.conf @@ -7,5 +7,5 @@ option num_threads=* -pe smp $0 option num_threads=1 # Do not add anything to qsub_opts option max_jobs_run=* -tc $0 default gpu=0 -option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[1345679]*|c2[12357]*' +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/librispeech/v1/conf/coe_gpu_bigmem.conf b/egs/librispeech/v1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/librispeech/v1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/librispeech/v1/conf/coe_gpu_long.conf b/egs/librispeech/v1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/librispeech/v1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/librispeech/v1/conf/coe_gpu_rtx.conf b/egs/librispeech/v1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/librispeech/v1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option 
max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/librispeech/v1/conf/coe_gpu_short.conf b/egs/librispeech/v1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- /dev/null +++ b/egs/librispeech/v1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/librispeech/v1/conf/coe_gpu_v100.conf b/egs/librispeech/v1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/librispeech/v1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml b/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml new file mode 100644 index 00000000..99f202bb --- /dev/null +++ b/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml @@ -0,0 +1,25 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +spec_augment: + time_warp_prob: 0.66 + time_warp_window: 5 + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 40 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 2 + freq_mask_prob: 1. 
+  freq_mask_min_width: 0
+  freq_mask_max_width: 30
+  freq_mask_min_num_masks: 1
+  freq_mask_max_num_masks: 2
+  mask_method: mean
+mvn:
+  norm_var: false
diff --git a/egs/librispeech/v1/conf/speed_reverb_noise10-20dB_aug.yaml b/egs/librispeech/v1/conf/speed_reverb_noise10-20dB_aug.yaml
new file mode 100644
index 00000000..f9ecdd33
--- /dev/null
+++ b/egs/librispeech/v1/conf/speed_reverb_noise10-20dB_aug.yaml
@@ -0,0 +1,39 @@
+speed_aug:
+  speed_prob: 0.5
+  speed_ratios:
+  - 0.9
+  - 1.1
+reverb_aug:
+  reverb_prob: 0.45
+  max_reverb_context: 0.5
+  rir_types:
+    smallroom:
+      weight: 1
+      rir_path: csv:data/rirs_smallroom/rirs.csv
+      rir_norm: max
+    mediumroom:
+      weight: 1
+      rir_path: csv:data/rirs_mediumroom/rirs.csv
+      rir_norm: max
+    realroom:
+      weight: 1
+      rir_path: csv:data/rirs_real/rirs.csv
+      rir_norm: max
+noise_aug:
+  noise_prob: 0.7
+  noise_types:
+    noise:
+      weight: 1
+      noise_path: data/musan_noise_proc_audio/recordings.csv
+      min_snr: 10
+      max_snr: 20
+    music:
+      weight: 1
+      noise_path: data/musan_music_proc_audio/recordings.csv
+      min_snr: 10
+      max_snr: 20
+    babble:
+      weight: 1
+      noise_path: data/musan_speech_babble/recordings.csv
+      min_snr: 10
+      max_snr: 20
diff --git a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml
new file mode 100644
index 00000000..ed622adb
--- /dev/null
+++ b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml
@@ -0,0 +1,70 @@
+data:
+  train:
+    dataset:
+      wav_scale: 1
+      aug_cfgs:
+      - conf/speed_reverb_noise10-20dB_aug.yaml
+      return_segment_info:
+      - text
+    sampler:
+      sampler_type: bucketing_seg_sampler
+      max_batch_length: 625.
+      min_batch_size: 1
+      drop_last: false
+    data_loader:
+      num_workers: 8
+  val:
+    dataset:
+      wav_scale: 1
+      return_segment_info:
+      - text
+    sampler:
+      sampler_type: bucketing_seg_sampler
+      max_batch_length: 625
+      min_batch_size: 1
+      drop_last: true
+    data_loader:
+      num_workers: 8
+model:
+  feats: fbank80_specaug1_mn_16k.yaml
+  transducer:
+    encoder:
+      att_type: local-scaled-dot-prod-v1
+      att_context: 32
+      d_model: 144
+      num_heads: 4
+      num_blocks: 16
+      d_ff: 576
+      in_layer_type: conv2d-sub
+    decoder:
+      rnnt_loss: k2_pruned
+      simple_loss_scale: 0.2
+      predictor:
+        embed_dim: 1024
+        num_layers: 2
+        hid_feats: 320
+        embed_dropout_rate: 0.1
+        rnn_dropout_rate: 0.1
+        rnn_type: lstm
+      joiner:
+        hid_feats: 320
+trainer:
+  optim:
+    opt_type: adamw
+    lr: 0.001
+    beta1: 0.9
+    beta2: 0.98
+    weight_decay: 1e-6
+  lrsched:
+    lrsch_type: noam_lr
+    d_model: 144
+    lr_factor: 8.0
+    min_lr: 1e-6
+    warmup_steps: 25000
+    update_lr_on_opt_step: true
+  grad_clip: 100
+  use_amp: true
+  log_interval: 1000
+  epochs: 120
+  eff_batch_size: 128
+  train_mode: full
diff --git a/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh b/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh
new file mode 100644
index 00000000..ee8c2b55
--- /dev/null
+++ b/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh
@@ -0,0 +1,18 @@
+# Conformer + RNN-T
+
+# training data
+nnet_train_data=train_960h
+nnet_val_data=dev_all
+
+# tokenizer
+bpe_model=data/lang_bpe_1000/bpe.model
+
+# rnn-t cfg
+nnet_type=conformer_v1_rnn_transducer
+nnet_name=fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p
+nnet_s1_base_cfg=conf/train_${nnet_name}.s1.yaml
+nnet_s1_args=""
+nnet_s1_name=$nnet_name.s1
+
+nnet_s1_dir=exp/asr_nnets/$nnet_s1_name
+nnet_s1=$nnet_s1_dir/model_ep0115.pth
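(For reference: the nnet_* variables defined in the global_conf file above are the glue between the recipe stages; the numbered run scripts source this config and pass the variables to the training tools. A minimal sketch of that pattern, assuming a hypothetical hyperion-train-asr entry point; the actual command lives in run_011_train_asr.sh, which is not shown in this patch:)

    #!/bin/bash
    # Sketch only: how a run script consumes the global_conf variables above.
    . ./cmd.sh
    . ./path.sh
    config_file=global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh
    . $config_file   # defines nnet_type, nnet_s1_base_cfg, nnet_s1_args, nnet_s1_dir, ...
    ngpu=4
    mkdir -p $nnet_s1_dir/log
    # hyperion-train-asr is a placeholder name, not a verified CLI entry point.
    $cuda_cmd --gpu $ngpu $nnet_s1_dir/log/train.log \
        hyp_utils/conda_env.sh --num-gpus $ngpu \
        hyperion-train-asr $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args \
        --trainer.exp-path $nnet_s1_dir --num-gpus $ngpu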
diff --git a/egs/librispeech/v1/run_001_prepare_data.sh b/egs/librispeech/v1/run_001_prepare_data.sh
index 0708e667..3a4ef221 100755
--- a/egs/librispeech/v1/run_001_prepare_data.sh
+++ b/egs/librispeech/v1/run_001_prepare_data.sh
@@ -19,36 +19,25 @@ nj=6
 
 mkdir -p data
 
-if [ ${stage} -le 1 ]; then
-  ### Task dependent. You have to make data the following preparation part by yourself.
-  ### But you can utilize Kaldi recipes in most cases
-  echo "stage 0: Data preparation"
-  for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500
-  do
-    # use underscore-separated names in data directories.
-    local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_}
-    steps_xvec/audio_to_duration.sh --cmd "$train_cmd" data/${part//-/_}
-  done
-fi
-
-# if [ $stage -le 1 ]; then
-#   echo "Stage 1: Prepare LibriSpeech manifest"
-#   # We assume that you have downloaded the LibriSpeech corpus
-#   # to $librispeech_root
-#   mkdir -p data/manifests
-#   if [ ! -e data/manifests/.librispeech.done ]; then
-#     lhotse prepare librispeech -j $nj $librispeech_root data/manifests
-#     touch data/manifests/.librispeech.done
-#   fi
+# if [ ${stage} -le 1 ]; then
+#   ### Task dependent. You have to make data the following preparation part by yourself.
+#   ### But you can utilize Kaldi recipes in most cases
+#   echo "stage 0: Data preparation"
+#   for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500
+#   do
+#     # use underscore-separated names in data directories.
+#     local/data_prep.sh ${librispeech_root}/${part} data/${part//-/_}
+#     steps_xvec/audio_to_duration.sh --cmd "$train_cmd" data/${part//-/_}
+#   done
 # fi
 
-# if [ $stage -le 2 ]; then
-#   echo "Stage 2: Prepare musan manifest"
-#   # We assume that you have downloaded the musan corpus
-#   # to $musan_root
-#   mkdir -p data/manifests
-#   if [ ! -e data/manifests/.musan.done ]; then
-#     lhotse prepare musan $musan_root data/manifests
-#     touch data/manifests/.musan.done
-#   fi
-# fi
+if [ $stage -le 1 ]; then
+  echo "Stage 1: Prepare lhotse LibriSpeech manifest"
+  # We assume that you have downloaded the LibriSpeech corpus
+  # to $librispeech_root
+  mkdir -p data/lhotse_librispeech
+  if [ ! -e data/lhotse_librispeech/.librispeech.done ]; then
+    lhotse prepare librispeech -j $nj $librispeech_root data/lhotse_librispeech
+    touch data/lhotse_librispeech/.librispeech.done
+  fi
+fi
diff --git a/egs/voxceleb/ssl.v1/cmd.sh b/egs/voxceleb/ssl.v1/cmd.sh
new file mode 100755
index 00000000..040f458b
--- /dev/null
+++ b/egs/voxceleb/ssl.v1/cmd.sh
@@ -0,0 +1,28 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
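+#
+# Illustrative example (not part of the original file): a minimal
+# conf/queue.conf for a GridEngine cluster maps the generic options
+# to qsub flags like this:
+#   command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+#   option mem=* -l mem_free=$0,ram_free=$0
+#   option mem=0          # Do not add anything to qsub_opts
+#   option num_threads=* -pe smp $0
+# On a machine without a queue, everything can run locally with:
+#   export train_cmd=run.pl
+#   export cuda_cmd=run.pl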
+ +if [ "$(hostname -d)" == "cm.gemini" ];then + #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" + export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" + # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" +else + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" + export cuda_eval_cmd="$train_cmd" +fi + + + diff --git a/egs/voxceleb/ssl.v1/conf/clsp.conf b/egs/voxceleb/ssl.v1/conf/clsp.conf new file mode 100644 index 00000000..4ed38246 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/clsp.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*' +option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0' diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_bigmem.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_bigmem.conf new file mode 100644 index 00000000..a7a2ce40 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_bigmem.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[2-7]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[237]n[01][0123456789]* diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_long.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_long.conf new file mode 100644 index 00000000..b31c167c --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_long.conf @@ -0,0 +1,13 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q -l hostname=r[1-9]* + + diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_rtx.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_rtx.conf new file mode 100644 index 00000000..ba6d9e56 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_rtx.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@rtx diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_short.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_short.conf new file mode 100644 index 00000000..81de5cb7 --- 
/dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_short.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 -l hostname=r[1-9]* +option gpu=* -l gpu=$0,h_rt=00:59:00 -q gpu_short.q -l hostname=r[17]* diff --git a/egs/voxceleb/ssl.v1/conf/coe_gpu_v100.conf b/egs/voxceleb/ssl.v1/conf/coe_gpu_v100.conf new file mode 100644 index 00000000..69326b82 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/coe_gpu_v100.conf @@ -0,0 +1,11 @@ + +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -sync y -l arch=*64* -V +option mem=* -l mem_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l num_proc=$0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l h_rt=100:00:00 +option gpu=* -l gpu=$0,h_rt=500:00:00 -q gpu.q@@v100 diff --git a/egs/voxceleb/ssl.v1/conf/fbank80_specaug1_stmn_16k.yaml b/egs/voxceleb/ssl.v1/conf/fbank80_specaug1_stmn_16k.yaml new file mode 100644 index 00000000..8df42fc6 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/fbank80_specaug1_stmn_16k.yaml @@ -0,0 +1,24 @@ +audio_feats: + audio_feat: logfb + sample_frequency: 16000 + frame_length: 25 + low_freq: 20 + high_freq: 7600 + num_filters: 80 + snip_edges: false + use_energy: false +spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 8 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean +mvn: + context: 150 + norm_var: false diff --git a/egs/voxceleb/ssl.v1/conf/reverb_noise_aug.yaml b/egs/voxceleb/ssl.v1/conf/reverb_noise_aug.yaml new file mode 100644 index 00000000..86f55073 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/reverb_noise_aug.yaml @@ -0,0 +1,34 @@ +reverb_aug: + reverb_prob: 0.45 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max + mediumroom: + weight: 1 + rir_path: csv:data/rirs_mediumroom/rirs.csv + rir_norm: max + realroom: + weight: 1 + rir_path: csv:data/rirs_real/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 0 + max_snr: 18 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 3 + max_snr: 18 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 3 + max_snr: 18 diff --git a/egs/voxceleb/ssl.v1/datapath.sh b/egs/voxceleb/ssl.v1/datapath.sh new file mode 100644 index 00000000..a7eb575c --- /dev/null +++ b/egs/voxceleb/ssl.v1/datapath.sh @@ -0,0 +1,23 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + # voxceleb1_root=/export/corpora5/VoxCeleb1_v1 #voxceleb1 v1 + voxceleb1_root=/export/corpora5/VoxCeleb1_v2 #voxceleb1 v2 + voxceleb2_root=/export/corpora5/VoxCeleb2 + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + # voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1 + 
voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2 + voxceleb2_root=/expscratch/dgromero/corpora-open/vox2 + voxsrc22_root=/exp/jvillalba/corpora/voxsrc22 + musan_root=/expscratch/dgromero/corpora-open/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/egs/voxceleb/ssl.v1/hyp_utils b/egs/voxceleb/ssl.v1/hyp_utils new file mode 120000 index 00000000..f6d1eb7a --- /dev/null +++ b/egs/voxceleb/ssl.v1/hyp_utils @@ -0,0 +1 @@ +../../../hyp_utils \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/path.sh b/egs/voxceleb/ssl.v1/path.sh new file mode 100755 index 00000000..6994fdab --- /dev/null +++ b/egs/voxceleb/ssl.v1/path.sh @@ -0,0 +1,5 @@ + +export HYP_ROOT=$(readlink -f `pwd -P`/../../..) +export TOOLS_ROOT=$HYP_ROOT/tools + +. $TOOLS_ROOT/path.sh diff --git a/egs/voxceleb/ssl.v1/run_001_prepare_data.sh b/egs/voxceleb/ssl.v1/run_001_prepare_data.sh new file mode 100755 index 00000000..563d3c2d --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_001_prepare_data.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. datapath.sh +. $config_file + +if [ $stage -le 1 ];then + # Prepare the VoxCeleb2 dataset for training. + hyperion-prepare-data voxceleb2 --subset dev --corpus-dir $voxceleb2_root \ + --cat-videos --use-kaldi-ids \ + --output-dir data/voxceleb2cat_train +fi + +if [ $stage -le 2 ];then + # prepare voxceleb1 for test + hyperion-prepare-data voxceleb1 --task test --corpus-dir $voxceleb1_root \ + --use-kaldi-ids \ + --output-dir data/voxceleb1_test +fi + +if [ $stage -le 3 ] && [ "$do_voxsrc22" == "true" ];then + hyperion-prepare-data voxsrc22 --subset dev --corpus-dir $voxsrc22_root \ + --vox1-corpus-dir $voxceleb1_root \ + --output-dir data/voxsrc22_dev +fi + +# if [ $stage -le 4 ] && [ "$do_voxsrc22" == "true" ];then + # hyperion-prepare-data voxsrc22 --subset test --corpus-dir $voxsrc22_root \ + # --vox1-corpus-dir $voxceleb1_root \ + # --output-dir data/voxsrc22_test +# fi + +if [ $stage -le 5 ] && [ "$do_qmf" == "true" ];then + # split vox2 into 2 parts, for cohort and qmf training + hyperion-split-dataset-into-trials-and-cohort --data-dir data/voxceleb2cat_train +fi diff --git a/egs/voxceleb/ssl.v1/run_002_compute_evad.sh b/egs/voxceleb/ssl.v1/run_002_compute_evad.sh new file mode 100755 index 00000000..acccace3 --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_002_compute_evad.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e +nodes=fs01 +vad_dir=`pwd`/exp/vad_e +vad_config=conf/vad_16k.yaml +nj=40 + +stage=1 +config_file=default_config.sh + +. parse_options.sh || exit 1; +. 
$config_file
+
+if [ -z "$vad_config" ];then
+  echo "We are not using VAD in this configuration"
+  exit 0
+fi
+
+if [ "$do_voxsrc22" == "true" ];then
+  extra_data="voxsrc22_dev"
+fi
+
+
+if [ $stage -le 1 ]; then
+  # Prepare to distribute data over multiple machines
+  # This only does something at CLSP grid
+  for name in voxceleb2cat_train voxceleb1_test $extra_data
+  do
+    hyp_utils/create_data_split_dirs.sh \
+      $vad_dir/$name \
+      $USER/hyp-data/voxceleb/v1.2/vad $nodes
+  done
+fi
+
+# Compute energy VAD for the train and test datasets
+if [ $stage -le 2 ];then
+  for name in voxceleb2cat_train voxceleb1_test $extra_data
+  do
+    # This creates links to distribute data in CLSP grid
+    # If you are not at CLSP grid, it does nothing and can be deleted
+    hyp_utils/create_data_split_links.sh $vad_dir/$name/vad.JOB.ark $nj
+    echo "compute vad for $name"
+    $train_cmd JOB=1:$nj $vad_dir/$name/log/vad.JOB.log \
+      hyp_utils/conda_env.sh \
+      hyperion-compute-energy-vad --cfg $vad_config \
+      --recordings-file data/$name/recordings.csv \
+      --output-spec ark,csv:$vad_dir/$name/vad.JOB.ark,$vad_dir/$name/vad.JOB.csv \
+      --part-idx JOB --num-parts $nj || exit 1
+
+    hyperion-tables cat \
+      --table-type features \
+      --output-file $vad_dir/$name/vad.csv --num-tables $nj
+    hyperion-dataset add_features \
+      --dataset data/$name \
+      --features-name vad \
+      --features-file $vad_dir/$name/vad.csv
+  done
+fi
+
+
diff --git a/egs/voxceleb/ssl.v1/run_003_prepare_noises_rirs.sh b/egs/voxceleb/ssl.v1/run_003_prepare_noises_rirs.sh
new file mode 100755
index 00000000..73c7ed82
--- /dev/null
+++ b/egs/voxceleb/ssl.v1/run_003_prepare_noises_rirs.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# Copyright
+# 2020 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+stage=1
+nj=10
+config_file=default_config.sh
+. parse_options.sh || exit 1;
+. $config_file
+. 
datapath.sh
+
+# We prepare the noise files and RIRs for online speech augmentation
+if [ $stage -le 1 ]; then
+  for name in noise music speech
+  do
+    hyperion-prepare-data musan \
+      --corpus-dir $musan_root \
+      --subset $name \
+      --output-dir data/musan_$name
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # # Prepare to distribute data over multiple machines
+  # # This only does something at CLSP grid
+  # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes
+
+  for name in musan_noise musan_music
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_proc_audio
+    output_dir=exp/proc_audio/$name
+    $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \
+      hyp_utils/conda_env.sh \
+      hyperion-preprocess-audio-files \
+      --audio-format flac \
+      --part-idx JOB --num-parts $nj \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_dir/recordings.JOB.csv
+
+    hyperion-tables cat \
+      --table-type recordings \
+      --output-file $output_dir/recordings.csv --num-tables $nj
+    hyperion-dataset set_recordings \
+      --dataset $input_data_dir \
+      --recordings-file $output_dir/recordings.csv \
+      --output-dataset $output_data_dir
+
+
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  # Create Babble noise from MUSAN speech files
+  for name in musan_speech
+  do
+    input_data_dir=data/$name
+    output_data_dir=data/${name}_babble
+    output_dir=exp/proc_audio/${name}_babble
+    $train_cmd $output_dir/log/make_babble_noise_${name}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-make-babble-noise-audio-files \
+      --audio-format flac \
+      --min-spks 3 --max-spks 10 --num-reuses 5 \
+      --recordings-file $input_data_dir/recordings.csv \
+      --output-path $output_dir \
+      --output-recordings-file $output_data_dir/recordings.csv
+    hyperion-dataset make_from_recordings \
+      --dataset $output_data_dir \
+      --recordings-file $output_data_dir/recordings.csv
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  if [ ! -d "RIRS_NOISES" ]; then
+    # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
+    wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
+    unzip rirs_noises.zip
+  fi
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom
+  hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real
+  for rirs in rirs_smallroom rirs_mediumroom rirs_real
+  do
+    output_dir=exp/rirs/$rirs
+    data_dir=data/$rirs
+    $train_cmd $output_dir/log/pack_rirs_${rirs}.log \
+      hyp_utils/conda_env.sh \
+      hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \
+      --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1;
+    hyperion-dataset add_features --dataset $data_dir \
+      --features-name rirs --features-file $output_dir/rirs.csv
+
+  done
+fi
+
diff --git a/egs/voxceleb/ssl.v1/run_004_prepare_xvec_train_data.sh b/egs/voxceleb/ssl.v1/run_004_prepare_xvec_train_data.sh
new file mode 100755
index 00000000..3b7b9083
--- /dev/null
+++ b/egs/voxceleb/ssl.v1/run_004_prepare_xvec_train_data.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Copyright
+# 2020 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+nodes=b1
+nj=40
+stage=1
+config_file=default_config.sh
+
+. parse_options.sh || exit 1;
+. 
$config_file + +if [ $stage -le 1 ]; then + # Prepare to distribute data over multiple machines + # This only does something at CLSP grid + hyp_utils/create_data_split_dirs.sh \ + exp/xvector_audios/$nnet_data \ + $USER/hyp-data/voxceleb/v1.2/xvector_audios/$nnet_data $nodes +fi + +if [ $stage -le 2 ];then + output_dir=exp/proc_audio/$nnet_data + # This creates links to distribute data in CLSP grid + # If you are not at CLSP grid, it does nothing and can be deleted + hyp_utils/create_audios_split_links.sh $output_dir data/$nnet_data/recordings.csv flac + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$nnet_data/vad.csv" + update_durs="--update-seg-durs" + fi + + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${nnet_data}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac --remove-dc-offset $vad_args \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$nnet_data/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + + hyperion-dataset set_recordings $update_durs \ + --dataset data/$nnet_data \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset data/${nnet_data}_proc_audio \ + --remove-features vad +fi + +if [ $stage -le 3 ];then + hyperion-dataset remove_short_segments \ + --dataset data/${nnet_data}_proc_audio \ + --output-dataset data/${nnet_data}_filtered \ + --length-name duration --min-length 2.0 + + hyperion-dataset remove_classes_few_segments \ + --dataset data/${nnet_data}_filtered \ + --class-name speaker --min-segs 4 +fi + +if [ $stage -le 4 ];then + hyperion-dataset split_train_val \ + --dataset data/${nnet_data}_filtered \ + --val-prob 0.03 \ + --seed 1123581321 \ + --train-dataset data/${nnet_data}_xvector_train \ + --val-dataset data/${nnet_data}_xvector_val +fi + diff --git a/egs/voxceleb/ssl.v1/run_005_train_dino.sh b/egs/voxceleb/ssl.v1/run_005_train_dino.sh new file mode 100755 index 00000000..eb1c591e --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_005_train_dino.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh
+
+train_data_dir=data/${nnet_data}_xvector_train
+val_data_dir=data/${nnet_data}_xvector_val
+
+# Add extra args from the command-line arguments
+if [ -n "$num_workers" ];then
+  extra_args="--data.train.data_loader.num-workers $num_workers"
+fi
+if [ "$use_tb" == "true" ];then
+  extra_args="$extra_args --trainer.use-tensorboard"
+fi
+if [ "$use_wandb" == "true" ];then
+  extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)"
+fi
+
+if [ "$interactive" == "true" ];then
+  export cuda_cmd=run.pl
+fi
+
+# Network Training
+if [ $stage -le 1 ]; then
+
+  mkdir -p $nnet_s1_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s1_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    hyperion-train-wav2vec2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \
+    --data.train.dataset.recordings-file $train_data_dir/recordings.csv \
+    --data.train.dataset.segments-file $train_data_dir/segments.csv \
+    --data.train.dataset.class-files $train_data_dir/speaker.csv \
+    --data.val.dataset.recordings-file $val_data_dir/recordings.csv \
+    --data.val.dataset.segments-file $val_data_dir/segments.csv \
+    --trainer.exp-path $nnet_s1_dir \
+    --num-gpus $ngpu \
+
+fi
+
+
+# Finetune full model, first round
+if [ $stage -le 2 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s2_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s2_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \
+    --data.train.dataset.recordings-file $train_data_dir/recordings.csv \
+    --data.train.dataset.segments-file $train_data_dir/segments.csv \
+    --data.train.dataset.class-files $train_data_dir/speaker.csv \
+    --data.val.dataset.recordings-file $val_data_dir/recordings.csv \
+    --data.val.dataset.segments-file $val_data_dir/segments.csv \
+    --in-model-file $nnet_s1 \
+    --trainer.exp-path $nnet_s2_dir \
+    --num-gpus $ngpu \
+
+fi
+
+# Finetune full model, second round
+if [ $stage -le 3 ]; then
+  if [ "$use_wandb" == "true" ];then
+    extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)"
+  fi
+  mkdir -p $nnet_s3_dir/log
+  $cuda_cmd \
+    --gpu $ngpu $nnet_s3_dir/log/train.log \
+    hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
+    hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \
+    --data.train.dataset.recordings-file $train_data_dir/recordings.csv \
+    --data.train.dataset.segments-file $train_data_dir/segments.csv \
+    --data.train.dataset.class-files $train_data_dir/speaker.csv \
+    --data.val.dataset.recordings-file $val_data_dir/recordings.csv \
+    --data.val.dataset.segments-file $val_data_dir/segments.csv \
+    --in-model-file $nnet_s2 \
+    --trainer.exp-path $nnet_s3_dir \
+    --num-gpus $ngpu \
+
+fi
diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml
index 8504db9e..fc964f84 100644
--- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml
+++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml
@@ -60,5 +60,4 @@ trainer:
   log_interval: 1000
   epochs: 8
   eff_batch_size: 512
-  target_key: speaker
   train_mode: full
diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml 
b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml new file mode 100644 index 00000000..ab6b3f4e --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml @@ -0,0 +1,69 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 3.0 + min_chunk_length: 3.0 + class_name: speaker + data_loader: + num_workers: 8 +model: + hf_feats: + encoder_lr: 1e-2 + feat_extract_lr: 1e-2 + xvector: + cos_scale: 32.0 + margin: 0.2 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + #lr: 5e-2 + lr: 1e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 7500 + hold_steps: 20000 + #min_lr: 5e-4 + min_lr: 1e-6 + warmup_steps: 10000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 14 + eff_batch_size: 512 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml index ad56e80d..928779f5 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -35,8 +35,8 @@ data: sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 16 - max_chunk_length: 3.0 - min_chunk_length: 3.0 + max_chunk_length: 6.0 + min_chunk_length: 6.0 num_chunks_per_seg_epoch: 6 class_name: speaker weight_exponent: 0.5 @@ -70,5 +70,4 @@ trainer: log_interval: 1000 epochs: 4 eff_batch_size: 256 - target_key: speaker train_mode: full diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml new file mode 100644 index 00000000..7ab8cea7 --- /dev/null +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + target_sample_freq: 16000 + wav_scale: 1 + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + weight_exponent: 0.5 + weight_mode: data-prior + seg_weight_mode: data-prior + num_hard_prototypes: 0 + data_loader: + num_workers: 8 +model: + #hf_feats: + # encoder_lr: 1e-2 + # 
feat_extract_lr: 1e-2 + xvector: + cos_scale: 32.0 + margin: 0.4 + margin_warmup_epochs: 0 + intertop_k: 5 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 1e-4 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 15000 + hold_steps: 10000 + min_lr: 1e-6 + warmup_steps: 5000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 14 + eff_batch_size: 256 + target_key: speaker + #train_mode: full + train_mode: hf-feats-frozen-nograd \ No newline at end of file diff --git a/egs/voxceleb/v2.1/run_007_eval_be.sh b/egs/voxceleb/v2.1/run_007_eval_be.sh index 53621488..a686b237 100755 --- a/egs/voxceleb/v2.1/run_007_eval_be.sh +++ b/egs/voxceleb/v2.1/run_007_eval_be.sh @@ -8,7 +8,7 @@ set -e stage=1 -nnet_stage=2 +nnet_stage=3 config_file=default_config.sh . parse_options.sh || exit 1; diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index f2df9581..02a3b68e 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -139,10 +139,12 @@ def extract_xvectors( aug_df = None num_augs = 1 + metadata_columns = ["speech_duration"] + ar_args = AR.filter_args(**kwargs) ar_args["wav_scale"] = 1.0 logging.info("opening output stream: %s", output_spec) - with DWF.create(output_spec) as writer: + with DWF.create(output_spec, metadata_columns=metadata_columns) as writer: logging.info(f"opening input stream: {recordings_file} with args={ar_args}") with AR(recordings_file, **ar_args) as reader: if vad_spec is not None: @@ -168,6 +170,7 @@ def extract_xvectors( logging.info("processing utt %s", key0) for aug_id in range(num_augs): + metadata = {} t3 = time.time() key, x = augment(key0, x0, augmenter, aug_df, aug_id) t4 = time.time() @@ -201,6 +204,8 @@ def extract_xvectors( key, x, fs, min_utt_length, max_utt_length, rng ) + metadata["speech_duration"] = x.shape[1] / fs + t6 = time.time() if x.shape[1] == 0: y = np.zeros((model.embed_dim,), dtype=float_cpu()) @@ -217,7 +222,7 @@ def extract_xvectors( ) t7 = time.time() - writer.write([key], [y]) + writer.write([key], [y], metadata=metadata) if write_speech_dur is not None: keys.append(key) info.append(str(x.shape[1] / fs)) diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 00452695..711c4194 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -54,7 +54,7 @@ def __init__(self, feat_extractor, xvector_model): self.vad = None def forward(self, s): - f = self.feat_extractor(s) + f, _ = self.feat_extractor(s) if self.vad is not None: n_vad_frames = len(self.vad) n_feat_frames = f.shape[1] @@ -161,16 +161,16 @@ def generate_attacks( model = init_model(model_path, **kwargs) model.to(device) - logging.info("opening audio read stream: %s" % (wav_file)) + logging.info("opening audio read stream: %s", wav_file) audio_args = AR.filter_args(**kwargs) - audio_reader = AR(wav_file**audio_args) + audio_reader = AR(wav_file, **audio_args) wav_scale = audio_reader.wav_scale - logging.info("opening audio write stream: %s" % (output_wav_dir)) + logging.info("opening audio write stream: %s", output_wav_dir) audio_writer = AW(output_wav_dir, audio_format="flac") if vad_spec is not None: - logging.info("opening VAD stream: %s" % (vad_spec)) + logging.info("opening VAD stream: %s", vad_spec) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) keys, 
class_names, class_ids = read_utt_list( @@ -190,9 +190,7 @@ def generate_attacks( s = s[0] fs = fs[0] - torch.manual_seed( - random_seed + int(s[0]) - ) # this is to make results reproducible + torch.manual_seed(random_seed + len(s)) # this is to make results reproducible p = torch.rand(1).item() if p > p_attack: logging.info("skipping attack for utt %s" % (key)) diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index ab7d907b..f858ea22 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -210,9 +210,7 @@ def generate_attacks( s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] - torch.manual_seed( - random_seed + int(s[0]) - ) # this is to make results reproducible + torch.manual_seed(random_seed + len(s)) # this is to make results reproducible s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) if vad_spec is not None: diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index e6dd3d3e..63ac34a9 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -26,8 +26,11 @@ from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy from hyperion.torch.models import ( + HFHubert2ConformerV1XVector, HFHubert2ResNet1dXVector, + HFWav2Vec2ConformerV1XVector, HFWav2Vec2ResNet1dXVector, + HFWavLM2ConformerV1XVector, HFWavLM2ResNet1dXVector, ) from hyperion.torch.trainers import XVectorTrainer as Trainer @@ -37,6 +40,9 @@ "hf_wav2vec2resnet1d": HFWav2Vec2ResNet1dXVector, "hf_hubert2resnet1d": HFHubert2ResNet1dXVector, "hf_wavlm2resnet1d": HFWavLM2ResNet1dXVector, + "hf_wav2vec2conformer": HFWav2Vec2ConformerV1XVector, + "hf_hubert2conformer": HFHubert2ConformerV1XVector, + "hf_wavlm2conformer": HFWavLM2ConformerV1XVector, } diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py index 7373a338..3138784d 100755 --- a/hyperion/bin/train_wav2xvector.py +++ b/hyperion/bin/train_wav2xvector.py @@ -22,6 +22,7 @@ from hyperion.torch.metrics import CategoricalAccuracy # from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ConformerV1XVector as CXVec from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec from hyperion.torch.models import Wav2ResNetXVector as RXVec @@ -34,6 +35,7 @@ xvec_dict = { "resnet": RXVec, "resnet1d": R1dXVec, + "conformer": CXVec, # "efficientnet": EXVec, # "tdnn": TDXVec, # "transformer": TFXVec, diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index a2acdf4c..699aa410 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -24,6 +24,7 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ConformerV1XVector as CXVec from hyperion.torch.models import EfficientNetXVector as EXVec from hyperion.torch.models import ResNet1dXVector as R1dXVec from hyperion.torch.models import ResNetXVector as RXVec @@ -40,6 +41,7 @@ "tdnn": TDXVec, "transformer": TFXVec, "spinenet": SpineXVec, + "conformer": CXVec, } diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index c3f6170d..67075a5d 100755 --- 
a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -20,6 +20,7 @@ from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory from hyperion.torch.metrics import CategoricalAccuracy +from hyperion.torch.models import ConformerV1XVector as CXVec from hyperion.torch.models import EfficientNetXVector as EXVec from hyperion.torch.models import ResNet1dXVector as R1dXVec from hyperion.torch.models import ResNetXVector as RXVec @@ -37,6 +38,7 @@ "tdnn": TDXVec, "transformer": TFXVec, "spinenet": SpineXVec, + "conformer": CXVec, } diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index f91d7d96..4644f141 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -6,22 +6,23 @@ import logging import math import time +from typing import Dict, List, Optional import numpy as np import pandas as pd # import k2 import sentencepiece as spm -import torchaudio.transforms as tat -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.distributed as dist +import torchaudio.transforms as tat +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.utils.data import Dataset from ...io import RandomAccessAudioReader as AR from ...np.augment import SpeechAugment from ...utils.class_info import ClassInfo +from ...utils.misc import filter_func_args from ...utils.segment_set import SegmentSet from ...utils.text import read_text from ..torch_defs import floatstr_torch @@ -30,22 +31,24 @@ class AudioDataset(Dataset): def __init__( self, - recordings_file, - segments_file, - class_names=None, - class_files=None, - bpe_model=None, - text_file=None, - time_durs_file=None, - aug_cfgs=None, - num_augs=1, - return_segment_info=None, - return_orig=False, - target_sample_freq=None, - wav_scale=2 ** 15 - 1, - is_val=False, + recordings_file: str, + segments_file: str, + class_names: Optional[List[str]] = None, + class_files: Optional[List[str]] = None, + bpe_model: Optional[str] = None, + text_file: Optional[str] = None, + time_durs_file: Optional[str] = None, + aug_cfgs: Optional[List[str]] = None, + num_augs: int = 1, + num_aug_mix: int = 0, + aug_mix_alpha: float = 0, + return_segment_info: Optional[List[str]] = None, + return_orig: bool = False, + target_sample_freq: Optional[float] = None, + wav_scale: float = 1, + is_val: bool = False, + seed: int = 112358, ): - super().__init__() try: rank = dist.get_rank() @@ -86,12 +89,17 @@ def __init__( if text_file is not None: logging.info("loading text files") self._load_text_infos(text_file, is_val) + self.return_segment_info = ( [] if return_segment_info is None else return_segment_info ) self.return_orig = return_orig self.num_augs = num_augs + self.num_aug_mix = num_aug_mix + self.aug_mix_alpha = aug_mix_alpha + self.seed = seed + self.rng = np.random.default_rng(seed + 1000 * rank) self._create_augmenters(aug_cfgs) self.target_sample_freq = target_sample_freq @@ -135,7 +143,7 @@ def _load_class_infos(self, class_names, class_files, is_val): name in self.seg_set ), f"class_name {name} not present in the segment set" if self.rank == 0: - logging.info("loading class-info file %s" % file) + logging.info("loading class-info file %s", file) table = ClassInfo.load(file) self.class_info[name] = table if not is_val: @@ -157,7 +165,7 @@ def _create_augmenters(self, aug_cfgs): for aug_cfg in aug_cfgs: logging.info(f"loading augmentation={aug_cfg}") augmenter = 
SpeechAugment.create(
-                aug_cfg, random_seed=112358 + 1000 * self.rank
+                aug_cfg, random_seed=self.seed + 1000 * self.rank
             )
             self.augmenters.append(augmenter)
             self.reverb_context = max(augmenter.max_reverb_context, self.reverb_context)
@@ -223,32 +231,64 @@ def _read_audio(self, seg_id, start, duration):
         x, fs = self.r.read([seg_id], time_offset=start, time_durs=read_duration)
         return x[0].astype(floatstr_torch(), copy=False), fs[0]

-    def _read_audio0(self, seg_id, start, duration):
-        # how much extra audio we need to load to
-        # calculate the reverb of the first part of the audio
-        reverb_context = min(self.reverb_context, start)
-        start -= reverb_context
-        read_duration = duration + reverb_context
-
-        # read audio
-        recording_id = self.seg_set.recording_ids(seg_id)
-        x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration)
-        return x[0].astype(floatstr_torch(), copy=False), fs[0]
+    # def _read_audio0(self, seg_id, start, duration):
+    #     # how much extra audio we need to load to
+    #     # calculate the reverb of the first part of the audio
+    #     reverb_context = min(self.reverb_context, start)
+    #     start -= reverb_context
+    #     read_duration = duration + reverb_context
+    #     # read audio
+    #     recording_id = self.seg_set.recording_ids(seg_id)
+    #     x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration)
+    #     return x[0].astype(floatstr_torch(), copy=False), fs[0]
+
+    def _apply_aug_mix(self, x, x_augs, aug_idx):
+        x_aug_mix = {}
+        alpha_d = (self.aug_mix_alpha,) * len(x_augs)
+        w = self.rng.dirichlet(alpha_d, self.num_aug_mix)
+        # m ~ Beta(alpha, alpha), one sample per mix (assumed AugMix semantics)
+        m = self.rng.beta(self.aug_mix_alpha, self.aug_mix_alpha, self.num_aug_mix)
+        for i in range(self.num_aug_mix):
+            x_mix = np.zeros_like(x)
+            for j, (_, x_aug_j) in enumerate(x_augs.items()):
+                x_mix += w[i, j] * x_aug_j
+
+            x_aug_mix[f"x_aug_{aug_idx}_{i}"] = m[i] * x + (1 - m[i]) * x_mix
+
+        return x_aug_mix
+
+    def _apply_augs(self, x, duration, fs):
+        if not self.augmenters:
+            return {"x": x}
+
+        if duration == 0:
+            num_samples = len(x)
+        else:
+            num_samples = int(duration * fs)

-    def _apply_augs(self, x, reverb_context_samples):
+        reverb_context_samples = len(x) - num_samples
+        x_orig = x[reverb_context_samples:]
         x_augs = {}
         # for each type of augmentation
         for i, augmenter in enumerate(self.augmenters):
             # we do n_augs per augmentation type
+            x_augs_i = {}
             for j in range(self.num_augs):
                 # augment x
                 x_aug, aug_info = augmenter(x)
                 # remove the extra left context used to compute the reverberation.
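+                # (x was read with reverb_context_samples of extra audio on the
+                # left, so the reverberation tail at the start of the segment is
+                # computed correctly; only the segment itself is kept)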
x_aug = x_aug[reverb_context_samples : len(x)] x_aug = x_aug.astype(floatstr_torch(), copy=False) - x_augs[f"x_aug_{i}_{j}"] = x_aug + x_augs_i[f"x_aug_{i}_{j}"] = x_aug + + if self.num_aug_mix > 0: + x_augs_i = self._apply_aug_mix(x_orig, x_augs_i, i) - if not self.return_orig and len(x_augs) == 1: + x_augs.update(x_augs_i) + + if self.return_orig: + x_augs["x"] = x_orig + elif len(x_augs) == 1: # if we just have one aug and we don't return the clean version, # we just call x to the aug version x_augs["x"] = x_augs.pop("x_aug_0_0") @@ -304,62 +344,70 @@ def __getitem__(self, segment): x, fs = self._read_audio(seg_id, start, duration) x, fs = self._resample(x, fs) data = {"seg_id": seg_id, "sample_freq": fs} + x_augs = self._apply_augs(x, duration, fs) + data.update(x_augs) - if self.augmenters: - # augmentations - if duration == 0: - num_samples = len(x) - else: - num_samples = int(duration * fs) - reverb_context_samples = len(x) - num_samples - x_augs = self._apply_augs(x, reverb_context_samples) - data.update(x_augs) - - # add original non augmented audio - if self.return_orig: - x_orig = x[reverb_context_samples:] - data["x"] = x_orig + # if self.augmenters: + # # augmentations + # if duration == 0: + # num_samples = len(x) + # else: + # num_samples = int(duration * fs) - else: - data["x"] = x + # reverb_context_samples = len(x) - num_samples + # x_augs = self._apply_augs(x, reverb_context_samples) + # data.update(x_augs) + + # # add original non augmented audio + # if self.return_orig: + # x_orig = x[reverb_context_samples:] + # data["x"] = x_orig + + # else: + # data["x"] = x seg_info = self._get_segment_info(seg_id) data.update(seg_info) - if np.any(~np.isfinite(data["x"])): - print( - "zzz", - x.max(), - x.min(), - x.mean(), - data["x"].max(), - data["x"].min(), - data["x"].mean(), - flush=True, - ) + # if np.any(~np.isfinite(data["x"])): + # print( + # "zzz", + # x.max(), + # x.min(), + # x.mean(), + # data["x"].max(), + # data["x"].min(), + # data["x"].mean(), + # flush=True, + # ) return data @staticmethod def filter_args(**kwargs): - - ar_args = AR.filter_args(**kwargs) - valid_args = ( - "recordings_file", - "segments_file", - "aug_cfgs", - "num_augs", - "class_names", - "class_files", - "bpe_model", - "text_file", - "return_segment_info", - "return_orig", - "time_durs_file", - "target_sample_freq", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - args.update(ar_args) + args = filter_func_args(AudioDataset.__init__, kwargs) return args + # @staticmethod + # def filter_args(**kwargs): + + # ar_args = AR.filter_args(**kwargs) + # valid_args = ( + # "recordings_file", + # "segments_file", + # "aug_cfgs", + # "num_augs", + # "class_names", + # "class_files", + # "bpe_model", + # "text_file", + # "return_segment_info", + # "return_orig", + # "time_durs_file", + # "target_sample_freq", + # ) + # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # args.update(ar_args) + # return args + @staticmethod def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: @@ -390,7 +438,10 @@ def add_class_args(parser, prefix=None, skip=set()): ) parser.add_argument( - "--class-files", default=None, nargs="+", help=("list of class info files"), + "--class-files", + default=None, + nargs="+", + help=("list of class info files"), ) parser.add_argument( @@ -402,7 +453,9 @@ def add_class_args(parser, prefix=None, skip=set()): ) parser.add_argument( - "--bpe-model", default=None, help=("bpe model for the text label"), + "--bpe-model", + default=None, 
+ help=("bpe model for the text label"), ) parser.add_argument( @@ -421,8 +474,21 @@ def add_class_args(parser, prefix=None, skip=set()): parser.add_argument( "--num-augs", default=1, + type=int, help=("number of augmentations per segment and augmentation type"), ) + parser.add_argument( + "--num-aug-mix", + default=0, + type=int, + help=("number of AugMix augmentations per segment"), + ) + parser.add_argument( + "--aug-mix-alpha", + default=0.5, + type=float, + help=("number of AugMix augmentations per segment"), + ) parser.add_argument( "--return-segment-info", default=None, @@ -449,6 +515,13 @@ def add_class_args(parser, prefix=None, skip=set()): ), ) + parser.add_argument( + "--seed", + default=11235811, + type=int, + help="random seed", + ) + AR.add_class_args(parser) if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/layer_blocks/__init__.py b/hyperion/torch/layer_blocks/__init__.py index 22cc629d..735df21d 100644 --- a/hyperion/torch/layer_blocks/__init__.py +++ b/hyperion/torch/layer_blocks/__init__.py @@ -4,6 +4,7 @@ """ from .conformer_conv import ConformerConvBlock +from .conformer_decoder_v1 import ConformerDecoderBlockV1 from .conformer_encoder_v1 import ConformerEncoderBlockV1 from .dc1d_blocks import DC1dDecBlock, DC1dEncBlock from .dc2d_blocks import DC2dDecBlock, DC2dEncBlock @@ -14,26 +15,47 @@ from .res2net2d_blocks import Res2Net2dBasicBlock, Res2Net2dBNBlock from .res2net_blocks import Res2NetBasicBlock, Res2NetBNBlock from .resetdnn_blocks import ResETDNNBlock -from .resnet1d_blocks import (ResNet1dBasicBlock, ResNet1dBasicDecBlock, - ResNet1dBNBlock, ResNet1dBNDecBlock, - ResNet1dEndpoint, SEResNet1dBasicBlock, - SEResNet1dBasicDecBlock, SEResNet1dBNBlock, - SEResNet1dBNDecBlock) -from .resnet2d_blocks import (ResNet2dBasicBlock, ResNet2dBasicDecBlock, - ResNet2dBNBlock, ResNet2dBNDecBlock, - SEResNet2dBasicBlock, SEResNet2dBasicDecBlock, - SEResNet2dBNBlock, SEResNet2dBNDecBlock) -from .resnet_blocks import (ResNetBasicBlock, ResNetBNBlock, - ResNetEndpointBlock, ResNetInputBlock) -from .se_blocks import (CFwSEBlock2d, FwSEBlock2d, SEBlock1d, SEBlock2D, - SEBlock2d, TSEBlock2D, TSEBlock2d) +from .resnet1d_blocks import ( + ResNet1dBasicBlock, + ResNet1dBasicDecBlock, + ResNet1dBNBlock, + ResNet1dBNDecBlock, + ResNet1dEndpoint, + SEResNet1dBasicBlock, + SEResNet1dBasicDecBlock, + SEResNet1dBNBlock, + SEResNet1dBNDecBlock, +) +from .resnet2d_blocks import ( + ResNet2dBasicBlock, + ResNet2dBasicDecBlock, + ResNet2dBNBlock, + ResNet2dBNDecBlock, + SEResNet2dBasicBlock, + SEResNet2dBasicDecBlock, + SEResNet2dBNBlock, + SEResNet2dBNDecBlock, +) +from .resnet_blocks import ( + ResNetBasicBlock, + ResNetBNBlock, + ResNetEndpointBlock, + ResNetInputBlock, +) +from .se_blocks import ( + CFwSEBlock2d, + FwSEBlock2d, + SEBlock1d, + SEBlock2D, + SEBlock2d, + TSEBlock2D, + TSEBlock2d, +) from .seresnet_blocks import SEResNetBasicBlock, SEResNetBNBlock from .spine_blocks import BlockSpec, SpineConv, SpineEndpoints, SpineResample from .tdnn_blocks import TDNNBlock from .transducer_joiner import TransducerJoiner -from .transducer_predictor import (TransducerConvPredictor, - TransducerRNNPredictor) -from .transformer_conv2d_subsampler import TransformerConv2dSubsampler +from .transducer_predictor import TransducerConvPredictor, TransducerRNNPredictor from .transformer_encoder_v1 import TransformerEncoderBlockV1 -from .transformer_feedforward import (Conv1dLinear, Conv1dx2, - PositionwiseFeedForward) +from 
+from .transformer_input import TransformerConv1dSubsampler, TransformerConv2dSubsampler
diff --git a/hyperion/torch/layer_blocks/conformer_decoder_v1.py b/hyperion/torch/layer_blocks/conformer_decoder_v1.py
new file mode 100644
index 00000000..e3d0893a
--- /dev/null
+++ b/hyperion/torch/layer_blocks/conformer_decoder_v1.py
@@ -0,0 +1,213 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+
+import torch
+import torch.nn as nn
+
+from ..layers.attention import *
+from .conformer_conv import ConformerConvBlock
+from .conformer_encoder_v1 import ConformerEncoderBlockV1
+from .transformer_feedforward import *
+
+
+class ConformerDecoderBlockV1(ConformerEncoderBlockV1):
+    """Building block for conformer decoder based on the conformer encoder introduced in
+    https://arxiv.org/pdf/2005.08100.pdf
+
+    This includes some optional extra features
+    not included in the original paper:
+       - Choose local-attention (attending only to close frames
+         instead of all the frames in the sequence)
+       - Choose number of conv blocks
+       - Squeeze-Excitation after depthwise-conv
+       - Allows downsampling in time dimension
+       - Allows choosing activation and layer normalization type
+    We call this Conformer+
+
+    Attributes:
+      num_feats: input/output feat. dimension (aka d_model)
+      self_attn: attention module in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1']
+      cross_attn: cross-attention module over the encoder output, same options as self_attn
+      num_heads: number of heads
+      conv_repeats: number of conv blocks
+      conv_kernel_size: kernel size for conv blocks
+      conv_stride: stride for depth-wise conv in first conv block
+      feed_forward: position-wise feed-forward string in ['linear', 'conv1dx2', 'conv1d-linear']
+      d_ff: dimension of middle layer in feed_forward block
+      ff_kernel_size: kernel size for convolutional versions of ff block
+      hid_act: ff and conv block hidden activation
+      dropout_rate: dropout rate for ff and conv blocks
+      att_context: maximum context range for local attention
+      att_dropout_rate: dropout rate for attention block
+      causal_pos_enc: if True, use causal positional encodings (when rel_pos_enc=True), it assumes
+        that query q_i only attends to key k_j when j<=i
+      conv_norm_layer: norm layer constructor for conv block,
+        if None it uses BatchNorm
+      se_r: Squeeze-Excitation compression ratio,
+        if None it doesn't use Squeeze-Excitation
+      ff_macaron: if True, it uses macaron-net style ff layers, otherwise transformer style.
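+      src_lnorm: if True, applies layer norm to the encoder output (source sequence)
+        before computing the cross-attention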
+      out_lnorm: if True, use LNorm layer at the output as in the conformer paper,
+        we think that this layer is redundant and put it to False by default
+      concat_after: if True, it concatenates the attention input and output and applies a linear transform, i.e.,
+        y = x + linear(concat(x, att(x)))
+        if False, y = x + att(x)
+
+    """
+
+    def __init__(
+        self,
+        num_feats,
+        self_attn,
+        cross_attn,
+        num_heads,
+        conv_repeats=0,
+        conv_kernel_size=31,
+        conv_stride=1,
+        feed_forward="linear",
+        d_ff=2048,
+        ff_kernel_size=3,
+        hid_act="swish",
+        dropout_rate=0,
+        att_context=25,
+        att_dropout_rate=0,
+        pos_enc_type="rel",
+        causal_pos_enc=False,
+        conv_norm_layer=None,
+        se_r=None,
+        ff_macaron=True,
+        src_lnorm=False,
+        out_lnorm=False,
+        concat_after=False,
+    ):
+        super().__init__(
+            num_feats,
+            self_attn,
+            num_heads,
+            conv_repeats=conv_repeats,
+            conv_kernel_size=conv_kernel_size,
+            conv_stride=conv_stride,
+            feed_forward=feed_forward,
+            d_ff=d_ff,
+            ff_kernel_size=ff_kernel_size,
+            hid_act=hid_act,
+            dropout_rate=dropout_rate,
+            att_context=att_context,
+            att_dropout_rate=att_dropout_rate,
+            pos_enc_type=pos_enc_type,
+            causal_pos_enc=causal_pos_enc,
+            conv_norm_layer=conv_norm_layer,
+            se_r=se_r,
+            ff_macaron=ff_macaron,
+            out_lnorm=out_lnorm,
+            concat_after=concat_after,
+        )
+
+        self.cross_attn = self._make_att(
+            cross_attn,
+            num_feats,
+            num_heads,
+            0,
+            att_dropout_rate,
+            "no",
+            False,
+        )
+
+        self.norm_cross_att = nn.LayerNorm(num_feats)
+        self.src_lnorm = src_lnorm
+        if src_lnorm:
+            self.norm_src = nn.LayerNorm(num_feats)
+
+        if self.concat_after:
+            self.cross_concat_linear = nn.Linear(num_feats + num_feats, num_feats)
+
+    def _forward_self_attn(self, x, pos_emb=None, mask=None, cache=None):
+        residual = x
+        x = self.norm_att(x)
+
+        if cache is None:
+            x_q = x
+            mask_q = mask
+        else:
+            # compute only the last frame query keeping dim: max_time_out -> 1
+            assert_cache_shape = (x.size(0), x.size(1) - 1, x.size(2))
+            assert (
+                cache.shape == assert_cache_shape
+            ), f"{cache.shape} != {assert_cache_shape}"
+            x_q = x[:, -1:, :]
+            residual = residual[:, -1:, :]
+            mask_q = None if mask is None else mask[:, -1:, :]
+
+        if pos_emb is None:
+            x_att = self.self_attn(x_q, x, x, mask=mask_q)
+        else:
+            x_att = self.self_attn(x_q, x, x, pos_emb=pos_emb, mask=mask_q)
+
+        if self.concat_after:
+            x = torch.cat((x_q, x_att), dim=-1)
+            x = self.concat_linear(x)
+        else:
+            x = x_att
+
+        if self.dropout_rate > 0:
+            x = self.dropout(x)
+
+        x = residual + x
+        return x
+
+    def _forward_cross_attn(self, x, x_src, pos_emb=None, mask=None):
+        residual = x
+        x = self.norm_cross_att(x)
+        if self.src_lnorm:
+            x_src = self.norm_src(x_src)
+
+        if pos_emb is None:
+            x_att = self.cross_attn(x, x_src, x_src, mask=mask)
+        else:
+            x_att = self.cross_attn(x, x_src, x_src, pos_emb=pos_emb, mask=mask)
+
+        if self.concat_after:
+            x = torch.cat((x, x_att), dim=-1)
+            x = self.cross_concat_linear(x)
+        else:
+            x = x_att
+
+        if self.dropout_rate > 0:
+            x = self.dropout(x)
+
+        x = residual + x
+        return x
+
+    def forward(self, x, x_src, pos_emb=None, mask=None, mask_src=None, cache=None):
+        """Forward pass function
+
+        Args:
+          x: input tensor with size=(batch, time, num_feats)
+          x_src: encoder output tensor attended by the cross-attention, size=(batch, time_src, num_feats)
+          pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0,
+            when using relative positional encoder, otherwise None
+          mask: mask to indicate valid time steps for x (batch, time)
+          mask_src: mask to indicate valid time steps for x_src (batch, time_src)
+          cache: cached outputs of previous decoding steps, so only the last frame query is computed
+
+        Returns:
+          Tensor with output features
+          Tensor with mask
+        """
+        # macaron feed forward
+        if self.ff_macaron:
+            x = self._forward_ff_macaron(x)
+
+        # multihead
attention + x = self._forward_self_attn(x, pos_emb, mask, cache=cache) + x = self._forward_cross_attn(x, x_src, mask=mask_src) + + # convolutional blocks + x = self._forward_convs(x) + + # feed-forward block + x = self._forward_ff(x) + + # output norm + if self.out_lnorm: + x = self.norm_out(x) + + return x, mask diff --git a/hyperion/torch/layer_blocks/conformer_encoder_v1.py b/hyperion/torch/layer_blocks/conformer_encoder_v1.py index 5764c85e..4f8e1b4d 100644 --- a/hyperion/torch/layer_blocks/conformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/conformer_encoder_v1.py @@ -78,7 +78,6 @@ def __init__( out_lnorm=False, concat_after=False, ): - super().__init__() self.self_attn = self._make_att( self_attn, @@ -94,14 +93,14 @@ def __init__( self.ff_macaron = ff_macaron if ff_macaron: self.ff_scale = 0.5 - self.feed_forward_macaron = self._make_ff(feed_forward, num_feats, - d_ff, ff_kernel_size, - hid_act, dropout_rate) + self.feed_forward_macaron = self._make_ff( + feed_forward, num_feats, d_ff, ff_kernel_size, hid_act, dropout_rate + ) self.norm_ff_macaron = nn.LayerNorm(num_feats) - self.feed_forward = self._make_ff(feed_forward, num_feats, d_ff, - ff_kernel_size, hid_act, - dropout_rate) + self.feed_forward = self._make_ff( + feed_forward, num_feats, d_ff, ff_kernel_size, hid_act, dropout_rate + ) conv_blocks = [] for i in range(conv_repeats): @@ -148,6 +147,7 @@ def _make_att( att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1', 'block-scaled-dot-prod-att-v1'] num_feats: input/output feat. dimension (aka d_model) num_heads: number of heads + context: block attention receptive field dropout_rate: dropout rate for attention block pos_enc_type: type of positional encoder causal_pos_enc: if True, use causal positional encodings (when rel_pos_enc=True), it assumes @@ -228,8 +228,7 @@ def _make_att( ) @staticmethod - def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, - dropout_rate): + def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rate): """Creates position-wise feed forward block from ff_type string Args: @@ -245,58 +244,38 @@ def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, """ if ff_type == "linear": - return PositionwiseFeedForward(num_feats, - hid_feats, - activation, - dropout_rate, - time_dim=1) + return PositionwiseFeedForward( + num_feats, hid_feats, activation, dropout_rate, time_dim=1 + ) if ff_type == "conv1dx2": - return Conv1dx2(num_feats, - hid_feats, - kernel_size, - activation, - dropout_rate, - time_dim=1) + return Conv1dx2( + num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 + ) if ff_type == "conv1d-linear": - return Conv1dLinear(num_feats, - hid_feats, - kernel_size, - activation, - dropout_rate, - time_dim=1) - - def forward(self, x, pos_emb=None, mask=None): - """Forward pass function - - Args: - x: input tensor with size=(batch, time, num_feats) - pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0, - when using relative postional encoder, otherwise None - mask: mask to indicate valid time steps for x (batch, time) + return Conv1dLinear( + num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 + ) - Returns: - Tensor with output features - Tensor with mask - """ + def _forward_ff_macaron(self, x): + residual = x + x = self.norm_ff_macaron(x) + x = self.feed_forward_macaron(x) + if self.dropout_rate > 0: + x = self.dropout(x) - # macaron feed forward - if self.ff_macaron: - residual = x - x = 
self.norm_ff_macaron(x) - x = self.feed_forward_macaron(x) - if self.dropout_rate > 0: - x = self.dropout(x) - x = residual + self.ff_scale * x + x = residual + self.ff_scale * x + return x - # multihead attention + def _forward_self_attn(self, x, pos_emb=None, mask=None): residual = x x = self.norm_att(x) if pos_emb is None: x_att = self.self_attn(x, x, x, mask=mask) else: x_att = self.self_attn(x, x, x, pos_emb=pos_emb, mask=mask) + if self.concat_after: x = torch.cat((x, x_att), dim=-1) x = self.concat_linear(x) @@ -307,15 +286,17 @@ def forward(self, x, pos_emb=None, mask=None): x = self.dropout(x) x = residual + x + return x - # convolutional blocks + def _forward_convs(self, x): x = x.transpose(1, 2) for block in range(len(self.conv_blocks)): x = self.conv_blocks[block](x) x = x.transpose(1, 2) + return x - # feed-forward block + def _forward_ff(self, x): residual = x x = self.norm_ff(x) x = self.feed_forward(x) @@ -323,6 +304,33 @@ def forward(self, x, pos_emb=None, mask=None): x = self.dropout(x) x = residual + self.ff_scale * x + return x + + def forward(self, x, pos_emb=None, mask=None): + """Forward pass function + + Args: + x: input tensor with size=(batch, time, num_feats) + pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0, + when using relative postional encoder, otherwise None + mask: mask to indicate valid time steps for x (batch, time) + + Returns: + Tensor with output features + Tensor with mask + """ + # macaron feed forward + if self.ff_macaron: + x = self._forward_ff_macaron(x) + + # multihead attention + x = self._forward_self_attn(x, pos_emb, mask) + + # convolutional blocks + x = self._forward_convs(x) + + # feed-forward block + x = self._forward_ff(x) # output norm if self.out_lnorm: diff --git a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py b/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py deleted file mode 100644 index 942e1313..00000000 --- a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py +++ /dev/null @@ -1,61 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import torch -import torch.nn as nn - -from ..layers import ActivationFactory as AF - - -class TransformerConv2dSubsampler(nn.Module): - """Convolutional 2D subsampling (to 1/4 length) Tor transformer - - Attributes: - in_feats: input feature dimension - out_feats: Transformer d_model - hid_act: activation layer object - pos_enc: positional encoder layer - time_dim: indicates which is the time dimension in the input tensor - """ - - def __init__(self, in_feats, out_feats, hid_act, pos_enc=None, time_dim=1): - super().__init__() - self.time_dim = time_dim - hid_act = AF.create(hid_act) - self.conv = nn.Sequential( - nn.Conv2d(1, out_feats, 3, 2, padding=(0, 1)), - hid_act, - nn.Conv2d(out_feats, out_feats, 3, 2, padding=(0, 1)), - hid_act, - ) - - linear = nn.Linear(out_feats * (((in_feats - 1) // 2 - 1) // 2), - out_feats) - if pos_enc is None: - self.out = linear - else: - self.out = nn.Sequential(linear, pos_enc) - - def forward(self, x, x_mask=None): - """Forward function. - - Args: - x: input tensor with size=(batch, time, in_feats) - x_mask: mask to indicate valid time steps for x (batch, time1, time2) - - Returns: - Tensor with output features with shape = (batch, time//4, out_feats) - Tensor with subsampled mask x4. 
- """ - if self.time_dim == 1: - x = x.transpose(1, 2) - - x = x.unsqueeze(1) # (b, c, f, t) - x = self.conv(x) - b, c, f, t = x.size() - x = self.out(x.contiguous().view(b, c * f, t).transpose(1, 2)) - if x_mask is None: - return x, None - return x, x_mask[:, :, :-2:2][:, :, :-2:2] diff --git a/hyperion/torch/layer_blocks/transformer_input.py b/hyperion/torch/layer_blocks/transformer_input.py new file mode 100644 index 00000000..6c5de188 --- /dev/null +++ b/hyperion/torch/layer_blocks/transformer_input.py @@ -0,0 +1,151 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import math + +import torch +import torch.nn as nn + +from ..layers import ActivationFactory as AF + + +class TransformerConv2dSubsampler(nn.Module): + """Convolutional 2D subsampling (to 1//stride length) Tor transformer + + Attributes: + in_feats: input feature dimension + out_feats: Transformer d_model + hid_act: activation layer object + stride: total stride of the subsampler + pos_enc: positional encoder layer + time_dim: indicates which is the time dimension in the input tensor + """ + + def __init__( + self, in_feats, out_feats, hid_act, stride=4, pos_enc=None, time_dim=1 + ): + super().__init__() + self.time_dim = time_dim + hid_act = AF.create(hid_act) + self.stride = stride + if stride == 4: + stride_1 = 2 + stride_2 = 2 + hid_feats = out_feats * (((in_feats - 1) // 2 - 1) // 2) + elif stride == 2: + stride_1 = 2 + stride_2 = 1 + hid_feats = out_feats * ((in_feats - 1) // 2 - 2) + elif stride == 1: + stride_1 = 1 + stride_2 = 1 + hid_feats = out_feats * (in_feats - 4) + else: + raise NotImplementedError( + "Valid TransformerConv2dSubsampler stride==1,2,4 !={stride}" + ) + + self.conv = nn.Sequential( + nn.Conv2d(1, out_feats, 3, stride_1, padding=(0, 1)), + hid_act, + nn.Conv2d(out_feats, out_feats, 3, stride_2, padding=(0, 1)), + hid_act, + ) + + linear = nn.Linear(hid_feats, out_feats) + if pos_enc is None: + self.out = linear + else: + self.out = nn.Sequential(linear, pos_enc) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with size=(batch, time, in_feats) + x_mask: mask to indicate valid time steps for x (batch, time1, time2) + + Returns: + Tensor with output features with shape = (batch, time//stride, out_feats) + Tensor with subsampled mask // stride. 
+ """ + if self.time_dim == 1: + x = x.transpose(1, 2) + + x = x.unsqueeze(1) # (b, c, f, t) + x = self.conv(x) + b, c, f, t = x.size() + x = self.out(x.contiguous().view(b, c * f, t).transpose(1, 2)) + if x_mask is None: + return x, None + + return x, x_mask[:, :, :: self.stride] + + +class TransformerConv1dSubsampler(nn.Module): + """Convolutional 1D subsampling (to 1//stride length) Tor transformer + + Attributes: + in_feats: input feature dimension + out_feats: Transformer d_model + hid_act: activation layer object + stride: total stride of the subsampler + pos_enc: positional encoder layer + time_dim: indicates which is the time dimension in the input tensor + """ + + def __init__( + self, in_feats, out_feats, hid_act, stride=4, pos_enc=None, time_dim=1 + ): + super().__init__() + self.time_dim = time_dim + hid_act = AF.create(hid_act) + self.stride = stride + if stride == 4: + stride_1 = 2 + stride_2 = 2 + elif stride == 2: + stride_1 = 2 + stride_2 = 1 + elif stride == 1: + stride_1 = 1 + stride_2 = 1 + else: + raise NotImplementedError( + "Valid TransformerConv1dSubsampler stride==1,2,4 !={stride}" + ) + + self.conv = nn.Sequential( + nn.Conv1d(in_feats, out_feats, 3, stride_1, padding=1), + hid_act, + nn.Conv1d(out_feats, out_feats, 3, stride_2, padding=1), + hid_act, + ) + + linear = nn.Linear(out_feats, out_feats) + if pos_enc is None: + self.out = linear + else: + self.out = nn.Sequential(linear, pos_enc) + + def forward(self, x, x_mask=None): + """Forward function. + + Args: + x: input tensor with size=(batch, time, in_feats) + x_mask: mask to indicate valid time steps for x (batch, time1, time2) + + Returns: + Tensor with output features with shape = (batch, time//stride, out_feats) + Tensor with subsampled mask // stride. + """ + if self.time_dim == 1: + x = x.transpose(1, 2) + + x = self.conv(x) + x = self.out(x.transpose(1, 2)) + if x_mask is None: + return x, None + + return x, x_mask[:, :, :: self.stride] diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index bea52c95..b0b607e2 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -14,6 +14,13 @@ from .audio_feats_factory import AudioFeatsFactory from .calibrators import LinBinCalibrator from .dropout import DropConnect1d, DropConnect2d, Dropout1d +from .feat_fuser_factory import FeatFuserFactory +from .feat_fusers import ( + CatFeatFuser, + LastFeatFuser, + LinearFeatFuser, + WeightedAvgFeatFuser, +) from .global_pool import * from .interpolate import Interpolate from .lora import LoRAFactory diff --git a/hyperion/torch/layers/audio_feats.py b/hyperion/torch/layers/audio_feats.py index 3bc4add9..ed26b576 100644 --- a/hyperion/torch/layers/audio_feats.py +++ b/hyperion/torch/layers/audio_feats.py @@ -2,7 +2,6 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# import logging import math @@ -158,7 +157,6 @@ def __init__( raw_energy=True, return_log_energy=False, ): - super().__init__() self.fs = fs self.frame_length = frame_length @@ -211,7 +209,6 @@ def __str__(self): return s def forward(self, x): - # Add dither if self.dither != 0.0: n = torch.randn(x.shape, device=x.device) @@ -308,13 +305,12 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__() N = int(math.floor(frame_length * fs / 1000)) if N > fft_length: k = math.ceil(math.log(N) / math.log(2)) - self.fft_length = int(2 ** k) + self.fft_length = int(2**k) self.wav2win = Wav2Win( fs, @@ -432,7 
+428,6 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__( fs, frame_length, @@ -526,7 +521,6 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__( fs, frame_length, @@ -634,7 +628,6 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__( fs, frame_length, @@ -768,7 +761,6 @@ def __init__( raw_energy=True, use_energy=True, ): - super().__init__( fs, frame_length, @@ -929,7 +921,6 @@ def __init__( snip_edges=False, center=True, ): - super().__init__( fs=fs, frame_length=frame_length, @@ -976,7 +967,6 @@ def __init__( num_filters=23, norm_filters=False, ): - super().__init__() self.fs = fs self.fft_length = fft_length diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py index 6d0b4df4..1694e84e 100644 --- a/hyperion/torch/layers/audio_feats_factory.py +++ b/hyperion/torch/layers/audio_feats_factory.py @@ -4,10 +4,9 @@ """ import re -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ...np.feats.filter_banks import FilterBankFactory as FBF -from ...utils.misc import str2bool from .audio_feats import * FFT = "fft" @@ -20,7 +19,7 @@ FEAT_TYPES = [FFT, SPEC, LOG_SPEC, LOG_FB, MFCC, KAN_BAYASHI] -class AudioFeatsFactory(object): +class AudioFeatsFactory: """Factory class to create acoustic features layers like FFT, Spectrogram, log-Spectrogram, log-filter-bank, MFCC. """ @@ -213,6 +212,8 @@ def create( snip_edges=snip_edges, ) + raise ValueError(f"unknown feature type {audio_feat}") + @staticmethod def filter_args(**kwargs): """Filters feature extractor args from arguments dictionary. @@ -284,7 +285,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--remove-dc-offset", default=True, - type=str2bool, + action=ActionYesNo, help="Subtract mean from waveform on each frame", ) @@ -315,7 +316,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--dither", type=float, - default=1.0 / 2 ** 15, + default=1.0 / 2**15, help="Dithering constant (0.0 means no dither)", ) @@ -331,7 +332,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--snip-edges", default=True, - type=str2bool, + action=ActionYesNo, help=( "If true, end effects will be handled by outputting only " "frames that completely fit in the file, and the number of " @@ -344,7 +345,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--center", default=False, - type=str2bool, + action=ActionYesNo, help=( "If true, puts the center of the frame at t*frame_shift, " "it over-wrides snip-edges and set it to false" @@ -361,13 +362,13 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--raw-energy", default=True, - type=str2bool, + action=ActionYesNo, help="If true, compute energy before preemphasis and windowing", ) parser.add_argument( "--use-energy", default=True, - type=str2bool, + action=ActionYesNo, help="Use energy (not C0) in MFCC computation", ) @@ -380,10 +381,10 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--audio-feat", - default="cepstrum", + default="logfb", choices=FEAT_TYPES, help=( - "It can return intermediate result: fft, spec, log_spec, " "logfb, mfcc" + "It can return intermediate result: fft, spec, log_spec, logfb, mfcc" ), ) diff --git a/hyperion/torch/layers/feat_fuser_factory.py b/hyperion/torch/layers/feat_fuser_factory.py new file mode 100644 index 00000000..edc4d933 --- /dev/null +++ b/hyperion/torch/layers/feat_fuser_factory.py @@ -0,0 +1,101 
@@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+from typing import Optional
+
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+from ...utils.misc import filter_func_args
+from .feat_fusers import (
+    CatFeatFuser,
+    LastFeatFuser,
+    LinearFeatFuser,
+    WeightedAvgFeatFuser,
+)
+
+LAST_FUSER = "last"
+WAVG_FUSER = "weighted-avg"
+LINEAR_FUSER = "linear"
+CAT_FUSER = "cat"
+
+FUSER_TYPES = [LAST_FUSER, WAVG_FUSER, LINEAR_FUSER, CAT_FUSER]
+
+
+class FeatFuserFactory:
+    """Factory class to create feature fusers for Wav2Vec style hidden features."""
+
+    @staticmethod
+    def create(
+        fuser_type: str = WAVG_FUSER,
+        num_feats: Optional[int] = None,
+        feat_dim: Optional[int] = None,
+        proj_dim: Optional[int] = None,
+        proj_bias: bool = True,
+    ):
+        if fuser_type == WAVG_FUSER:
+            return WeightedAvgFeatFuser(
+                num_feats, feat_dim=feat_dim, proj_dim=proj_dim, proj_bias=proj_bias
+            )
+        elif fuser_type == LAST_FUSER:
+            return LastFeatFuser(
+                feat_dim=feat_dim, proj_dim=proj_dim, proj_bias=proj_bias
+            )
+        elif fuser_type == LINEAR_FUSER:
+            return LinearFeatFuser(
+                num_feats, feat_dim=feat_dim, proj_dim=proj_dim, proj_bias=proj_bias
+            )
+        elif fuser_type == CAT_FUSER:
+            return CatFeatFuser(
+                num_feats, feat_dim=feat_dim, proj_dim=proj_dim, proj_bias=proj_bias
+            )
+        else:
+            raise ValueError(f"unknown feature fuser type {fuser_type}")
+
+    @staticmethod
+    def filter_args(**kwargs):
+        """Filters the arguments corresponding to the feature fuser
+        from the args dictionary
+
+        Args:
+          kwargs: args dictionary
+
+        Returns:
+          args dictionary
+        """
+        args = filter_func_args(FeatFuserFactory.create, kwargs)
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None):
+        """Adds feature fuser options to the parser.
+
+        Args:
+          parser: Arguments parser
+          prefix: Options prefix.
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--fuser-type", + default=WAVG_FUSER, + choices=FUSER_TYPES, + help=f"One of {FUSER_TYPES}", + ) + parser.add_argument( + "--proj-dim", + default=None, + type=int, + help="project features after fusion to proj_dim", + ) + parser.add_argument( + "--proj-bias", + default=True, + action=ActionYesNo, + help="linear projection has bias", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/layers/feat_fusers.py b/hyperion/torch/layers/feat_fusers.py new file mode 100644 index 00000000..44c72ffb --- /dev/null +++ b/hyperion/torch/layers/feat_fusers.py @@ -0,0 +1,86 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math + +import torch +import torch.nn as nn + + +class FeatFuser(nn.Module): + def __init__(self): + super().__init__() + + +class _ProjFeatFuser(FeatFuser): + def __init__(self, feat_dim=None, proj_dim=None, proj_bias=True): + super().__init__() + self.feat_dim = feat_dim + self.proj_dim = proj_dim + self.feat_proj = None + if feat_dim is not None and proj_dim is not None: + self.feat_proj = nn.Linear(feat_dim, proj_dim, bias=proj_bias) + + +class LastFeatFuser(_ProjFeatFuser): + def __init__(self, feat_dim=None, proj_dim=None, proj_bias=True): + super().__init__(feat_dim, proj_dim, proj_bias) + + def forward(self, feats): + feats = feats[-1] + if self.feat_proj is not None: + feats = self.feat_proj(feats) + + return feats + + +class WeightedAvgFeatFuser(_ProjFeatFuser): + def __init__(self, num_feats, feat_dim=None, proj_dim=None, proj_bias=True): + super().__init__(feat_dim, proj_dim, proj_bias) + self.num_feats = num_feats + self.feat_fuser = nn.Parameter(torch.zeros(num_feats)) + + def forward(self, feats): + feats = torch.stack(feats, dim=-1) + norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + feats = torch.sum(feats * norm_weights, dim=-1) + if self.feat_proj is not None: + feats = self.feat_proj(feats) + + return feats + + +class LinearFeatFuser(_ProjFeatFuser): + def __init__(self, num_feats, feat_dim=None, proj_dim=None, proj_bias=True): + super().__init__(feat_dim, proj_dim, proj_bias) + self.num_feats = num_feats + self.feat_fuser = nn.Linear(num_feats, 1, bias=False) + self.feat_fuser.weight.data = torch.ones(1, num_feats) / num_feats + + def forward(self, feats): + feats = torch.stack(feats, dim=-1) + feats = self.feat_fuser(feats).squeeze(dim=-1) + if self.feat_proj is not None: + feats = self.feat_proj(feats) + + return feats + + +class CatFeatFuser(FeatFuser): + def __init__(self, num_feats, feat_dim, proj_dim=None, proj_bias=True): + super().__init__() + self.num_feats = num_feats + self.feat_dim = feat_dim + if proj_dim is None: + proj_dim = feat_dim + self.proj_dim = proj_dim + self.proj_bias = proj_bias + self.feat_fuser = nn.Linear(num_feats * feat_dim, proj_dim, bias=proj_bias) + + def forward(self, feats): + feats = torch.cat(feats, dim=-1) + feats = self.feat_fuser(feats) + return feats diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py index 4b4c5927..736b69c6 100644 --- a/hyperion/torch/layers/mvn.py +++ b/hyperion/torch/layers/mvn.py @@ -2,28 +2,31 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, 
ArgumentParser
-
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
+
+from ..utils import seq_lengths_to_mask
+
+SQRT_EPS = 1e-5
 
 class MeanVarianceNorm(nn.Module):
     """Class to apply short-time mean-variance normalization to features.
-
+
     Attributes:
       norm_mean: if True, it normalizes the mean.
       norm_var: if True, it also normalizes the variance.
      left_context: left context for the window that computes the normalization stats.
      right_context: right context for the window that computes the normalization stats.
-      dim: normalization dimension (time dimension).
+      dim: normalization dimension (time dimension).
      If left_context = right_context = 0, it computes the stats on the whole utterance.
     """
+
     def __init__(
         self, norm_mean=True, norm_var=False, left_context=0, right_context=0, dim=1
     ):
-
         super().__init__()
         self.norm_mean = norm_mean
         self.norm_var = norm_var
@@ -45,62 +48,124 @@ def __str__(self):
         )
         return s
 
-    def forward(self, x):
+    def forward(self, x, x_lengths=None, x_mask=None):
         """Short-time mean-var normalizes feature tensor.
-
+
         Args:
           x: feature tensor.
 
         Returns:
           Normalized feature tensor.
         """
+        if not self.norm_mean and not self.norm_var:
+            return x
+
+        if self.dim != 1:
+            x = x.transpose(1, self.dim)
+
+        max_length = x.size(1)
+        if x_lengths is not None and x_mask is None:
+            x_mask = seq_lengths_to_mask(
+                x_lengths,
+                max_length,
+                dtype=x.dtype,
+                none_if_all_max=True,
+            )
 
-        T = x.shape[self.dim]
         if (self.left_context == 0 and self.right_context == 0) or (
-            T <= self.left_context + self.right_context + 1
+            max_length <= self.left_context + self.right_context + 1
         ):
-            return self.normalize_global(x)
+            x = self.normalize_global(x, x_mask)
+        else:
+            x = self.normalize_cumsum(x, x_mask)
+
+        if self.dim != 1:
+            x = x.transpose(1, self.dim).contiguous()
 
-        return self.normalize_cumsum(x)
+        return x
 
-    def normalize_global(self, x):
+    def _normalize_global_nomask(self, x):
         """Applies global mean-var normalization."""
         # Global mean/var norm.
+
         if self.norm_mean:
-            m_x = torch.mean(x, dim=self.dim, keepdim=True)
+            m_x = torch.mean(x, dim=1, keepdim=True)
             x = x - m_x
 
         if self.norm_var:
-            s_x = torch.std(x, dim=self.dim, keepdim=True).clamp(min=1e-5)
+            s_x = torch.std(x, dim=1, keepdim=True).clamp(min=1e-5)
             x = x / s_x
 
         return x
 
-    def normalize_cumsum(self, x):
-        """Applies short-time mean-var normalization using cumulative sums."""
+    def _normalize_global_mask(self, x, x_mask):
+        """Applies global mean-var normalization with masking."""
+        # Global mean/var norm.
+        den = torch.mean(x_mask, dim=1, keepdim=True)
+        x = x * x_mask
+        m_x = torch.mean(x, dim=1, keepdim=True) / den
         if self.norm_mean:
+            x = x - m_x
+            if self.norm_var:
+                s2_x = torch.mean(x**2, dim=1, keepdim=True) / den
+                s_x = torch.sqrt(s2_x.clamp(min=SQRT_EPS))
+                x = x / s_x
+        elif self.norm_var:
+            s2_x = torch.mean((x - m_x) ** 2, dim=1, keepdim=True) / den
+            s_x = torch.sqrt(s2_x.clamp(min=SQRT_EPS))
+            x = x / s_x
+
+        return x
+
+    def normalize_global(self, x, x_mask=None):
+        """Applies global mean-var normalization."""
+        # Global mean/var norm.
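+        # With a mask, statistics are computed over valid frames only, i.e.,
+        # m = sum(x * mask) / sum(mask); without a mask, plain mean/std are used.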
+        if x_mask is None:
+            return self._normalize_global_nomask(x)
+        else:
+            return self._normalize_global_mask(x, x_mask)
+
+    def _prenormalize_cumsum(self, x, x_mask):
+        """Subtracts the global mean first, which helps the numerical
+        stability of cumsum, and sets masked values to the global mean."""
+        if self.norm_mean or x_mask is not None:
             # subtract first global mean
             # it will help cumsum numerical stability
-            m_x = torch.mean(x, dim=self.dim, keepdim=True)
+            if x_mask is not None:
+                x = x * x_mask
+                den = torch.mean(x_mask, dim=1, keepdim=True)
+            else:
+                den = 1
+            m_x = torch.mean(x, dim=1, keepdim=True) / den
+
+        if self.norm_mean:
             x = x - m_x
+            if x_mask is not None:
+                x = x * x_mask
+        elif x_mask is not None:
+            x = x * x_mask + m_x * (1 - x_mask)
 
-        if self.dim != 1:
-            x = x.transpose(self.dim, 1)
+        return x
+
+    def normalize_cumsum(self, x, x_mask=None):
+        """Applies short-time mean-var normalization using cumulative sums."""
+        x = self._prenormalize_cumsum(x, x_mask)
 
         total_context = self.left_context + self.right_context + 1
 
         xx = nn.functional.pad(
             x.transpose(1, -1), (self.left_context, self.right_context), mode="reflect"
         ).transpose(1, -1)
 
-        if self.norm_mean:
+        if self.norm_mean or self.norm_var:
             c_x = torch.cumsum(xx, dim=1)
             m_x = (
                 c_x[:, total_context - 1 :] - c_x[:, : -total_context + 1]
             ) / total_context
 
         if self.norm_var:
-            c_x = torch.cumsum(xx ** 2, dim=1)
+            c_x = torch.cumsum(xx**2, dim=1)
             m_x2 = (
                 c_x[:, total_context - 1 :] - c_x[:, : -total_context + 1]
             ) / total_context
@@ -109,12 +174,9 @@ def normalize_cumsum(self, x):
             x = x - m_x
 
         if self.norm_var:
-            s_x = torch.sqrt((m_x2 - m_x ** 2).clamp(min=1e-5))
+            s_x = torch.sqrt((m_x2 - m_x**2).clamp(min=SQRT_EPS))
             x = x / s_x
 
-        if self.dim != 1:
-            x = x.transpose(self.dim, 1)
-
         return x.contiguous()
 
     @staticmethod
diff --git a/hyperion/torch/lr_schedulers/triangular_lr.py b/hyperion/torch/lr_schedulers/triangular_lr.py
index 10e3f83d..45704014 100644
--- a/hyperion/torch/lr_schedulers/triangular_lr.py
+++ b/hyperion/torch/lr_schedulers/triangular_lr.py
@@ -54,7 +54,6 @@ def __init__(
         step=0,
         update_lr_on_opt_step=False,
     ):
-
         super().__init__(optimizer, min_lr, 0, epoch, step, update_lr_on_opt_step)
         self.T = T
         self.T_mul = T_mul
@@ -68,7 +67,7 @@ def on_epoch_begin(self, epoch=None, epoch_updates=1, **kwargs):
         # T has to correspond to an integer number of epochs
         T = int(math.ceil(self.T / epoch_updates) * epoch_updates)
         if self.T != T:
-            logging.info("readjusting triangular_lr T %d -> %d" % (self.T, T))
+            logging.info("readjusting triangular_lr T %d -> %d", self.T, T)
             self.T = T
 
     def get_lr(self, step):
@@ -80,10 +79,10 @@ def get_lr(self, step):
             self.T *= self.T_mul
             self.num_restarts += 1
             logging.info(
-                "triangular_lr warm-restart=%d T=%d" % (self.num_restarts, self.T)
+                "triangular_lr warm-restart=%d T=%d", self.num_restarts, self.T
             )
 
-        alpha = self.gamma ** self.num_restarts
+        alpha = self.gamma**self.num_restarts
         x = abs(2 * x / self.T - 1)
         return [
diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py
index 29b6cdaa..fa4addcd 100644
--- a/hyperion/torch/models/__init__.py
+++ b/hyperion/torch/models/__init__.py
@@ -14,11 +14,15 @@
     HFWav2Vec2Transducer,
 )
 from .wav2xvectors import (
+    HFHubert2ConformerV1XVector,
     HFHubert2ResNet1dXVector,
+    HFWav2Vec2ConformerV1XVector,
     HFWav2Vec2ResNet1dXVector,
+    HFWavLM2ConformerV1XVector,
     HFWavLM2ResNet1dXVector,
-    Wav2ResNetXVector,
+    Wav2ConformerV1XVector,
     Wav2ResNet1dXVector,
+    Wav2ResNetXVector,
 )
 from .xvectors.efficient_net_xvector import EfficientNetXVector
from .xvectors.resnet1d_xvector import ResNet1dXVector diff --git a/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py b/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py index 05a82103..89173eff 100644 --- a/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py +++ b/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py @@ -32,7 +32,7 @@ def __init__(self, encoder, decoder): if isinstance(encoder, dict): encoder = ConformerEncoderV1(**encoder) else: - assert isinstance(encoder, RNNEncoder) + assert isinstance(encoder, ConformerEncoderV1) super().__init__(encoder, decoder) @@ -45,7 +45,6 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -53,8 +52,7 @@ def add_class_args(parser, prefix=None, skip=set()): ConformerEncoderV1.add_class_args(parser, prefix="encoder", skip=skip) RNNTransducer.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) def change_config( self, @@ -68,8 +66,7 @@ def change_config( @staticmethod def filter_finetune_args(**kwargs): args = RNNTransducer.filter_finetune_args(**kwargs) - encoder_args = ConformerEncoderV1.filter_finetune_args( - **kwargs["encoder"]) + encoder_args = ConformerEncoderV1.filter_finetune_args(**kwargs["encoder"]) args["encoder"] = encoder_args return args @@ -83,5 +80,4 @@ def add_finetune_args(parser, prefix=None): RNNTransducer.add_finetune_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/__init__.py b/hyperion/torch/models/wav2xvectors/__init__.py index 62123d13..6bafd26d 100644 --- a/hyperion/torch/models/wav2xvectors/__init__.py +++ b/hyperion/torch/models/wav2xvectors/__init__.py @@ -4,12 +4,18 @@ """ +from .hf_hubert2conformer_v1_xvector import HFHubert2ConformerV1XVector from .hf_hubert2resnet1d_xvector import HFHubert2ResNet1dXVector +from .hf_wav2vec2conformer_v1_xvector import HFWav2Vec2ConformerV1XVector from .hf_wav2vec2resnet1d_xvector import HFWav2Vec2ResNet1dXVector +from .hf_wavlm2conformer_v1_xvector import HFWavLM2ConformerV1XVector from .hf_wavlm2resnet1d_xvector import HFWavLM2ResNet1dXVector +from .wav2conformer_v1_xvector import Wav2ConformerV1XVector + # from .wav2efficient_net_xvector import Wav2EfficientNetXVector # from .wav2transformer_xvector_v1 import Wav2TransformerXVectorV1 # from .wav2spinenet_xvector import Wav2SpineNetXVector from .wav2resnet1d_xvector import Wav2ResNet1dXVector + # from .wav2tdnn_xvector import Wav2TDNNXVector from .wav2resnet_xvector import Wav2ResNetXVector diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py new file mode 100644 index 00000000..aeabd09e --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py @@ -0,0 +1,93 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFHubert +from 
..xvectors import ConformerV1XVector +from .hf_wav2xvector import HFWav2XVector + + +class HFHubert2ConformerV1XVector(HFWav2XVector): + """Class extracting Hubert + ConformerV1 x-vectors from waveform. + + Attributes: + hf_feats: HFHubert configuration dictionary or object. + This is a wrapper over the Hugging Face Hubert model. + xvector: ConformerV1XVector configuration dictionary or object. + feat_fusion_start: the input to the x-vector model fuses the Hubert layers from "feat_fusion_start" to + the Hubert "num_layers". + feat_fusion_method: method to fuse the hidden layers from the Hubert model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFHubert], + xvector: Union[Dict, ConformerV1XVector], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + if isinstance(hf_feats, dict): + hf_feats = HFHubert(**hf_feats) + else: + assert isinstance(hf_feats, HFHubert) + + if isinstance(xvector, dict): + xvector["encoder"]["in_feats"] = hf_feats.hidden_size + xvector = ConformerV1XVector(**xvector) + else: + assert isinstance(xvector, ConformerV1XVector) + assert xvector.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2XVector.filter_args(**kwargs) + child_args = HFHubert.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1XVector.filter_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFHubert.add_class_args(parser, prefix="hf_feats") + ConformerV1XVector.add_class_args(parser, prefix="xvector") + HFWav2XVector.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFHubert.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1XVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFHubert.add_finetune_args(parser, prefix="hf_feats") + ConformerV1XVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py new file mode 100644 index 00000000..3a670d1c --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py @@ -0,0 +1,96 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..xvectors import ConformerV1XVector +from .hf_wav2xvector import HFWav2XVector + + +class HFWav2Vec2ConformerV1XVector(HFWav2XVector): + """Class extracting Wav2Vec2 + ConformerV1 x-vectors from waveform.
+ + Attributes: + hf_feats: HFWav2Vec configuration dictionary or object. + This is a wrapper over the Hugging Face Wav2Vec model. + xvector: ConformerV1XVector configuration dictionary or object. + feat_fusion_start: the input to the x-vector model fuses the wav2vec layers from "feat_fusion_start" to + the wav2vec "num_layers". + feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWav2Vec2], + xvector: Union[Dict, ConformerV1XVector], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + if isinstance(hf_feats, dict): + if "class_name" in hf_feats: + del hf_feats["class_name"] + hf_feats = HFWav2Vec2(**hf_feats) + else: + assert isinstance(hf_feats, HFWav2Vec2) + + if isinstance(xvector, dict): + xvector["encoder"]["in_feats"] = hf_feats.hidden_size + if "class_name" in xvector: + del xvector["class_name"] + xvector = ConformerV1XVector(**xvector) + else: + assert isinstance(xvector, ConformerV1XVector) + assert xvector.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2XVector.filter_args(**kwargs) + child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1XVector.filter_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_class_args(parser, prefix="hf_feats") + ConformerV1XVector.add_class_args(parser, prefix="xvector") + HFWav2XVector.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWav2Vec2.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1XVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWav2Vec2.add_finetune_args(parser, prefix="hf_feats") + ConformerV1XVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 925f1172..d6be544a 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -9,6 +9,7 @@ import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser +from ...layers import MeanVarianceNorm from ...torch_model import TorchModel from ...utils import remove_silence diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py new file mode 100644 index 00000000..30e450eb --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py @@ -0,0 +1,93 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +from
typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWavLM +from ..xvectors import ConformerV1XVector +from .hf_wav2xvector import HFWav2XVector + + +class HFWavLM2ConformerV1XVector(HFWav2XVector): + """Class extracting WavLM + ConformerV1 x-vectors from waveform. + + Attributes: + hf_feats: HFWavLM configuration dictionary or object. + This is a wrapper over the Hugging Face WavLM model. + xvector: ConformerV1XVector configuration dictionary or object. + feat_fusion_start: the input to the x-vector model fuses the WavLM layers from "feat_fusion_start" to + the WavLM "num_layers". + feat_fusion_method: method to fuse the hidden layers from the WavLM model, when more + than one layer is used. + """ + + def __init__( + self, + hf_feats: Union[Dict, HFWavLM], + xvector: Union[Dict, ConformerV1XVector], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): + if isinstance(hf_feats, dict): + hf_feats = HFWavLM(**hf_feats) + else: + assert isinstance(hf_feats, HFWavLM) + + if isinstance(xvector, dict): + xvector["encoder"]["in_feats"] = hf_feats.hidden_size + xvector = ConformerV1XVector(**xvector) + else: + assert isinstance(xvector, ConformerV1XVector) + assert xvector.encoder_net.in_feats == hf_feats.hidden_size + + super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + + @staticmethod + def filter_args(**kwargs): + base_args = HFWav2XVector.filter_args(**kwargs) + child_args = HFWavLM.filter_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1XVector.filter_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWavLM.add_class_args(parser, prefix="hf_feats") + ConformerV1XVector.add_class_args(parser, prefix="xvector") + HFWav2XVector.add_class_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = HFWavLM.filter_finetune_args(**kwargs["hf_feats"]) + base_args["hf_feats"] = child_args + child_args = ConformerV1XVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + HFWavLM.add_finetune_args(parser, prefix="hf_feats") + ConformerV1XVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py new file mode 100644 index 00000000..ad6ae4c7 --- /dev/null +++ b/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py @@ -0,0 +1,70 @@ +""" + Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ..xvectors import ConformerV1XVector +from .wav2xvector import Wav2XVector + + +class Wav2ConformerV1XVector(Wav2XVector): + """Class extracting ConformerV1
x-vectors from waveform. + It contains acoustic feature extraction, feature normalization and + ConformerV1XVector extractor. + + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + xvector: ConformerV1XVector configuration dictionary or object. + """ + + def __init__(self, feats, xvector): + if isinstance(xvector, dict): + xvector = ConformerV1XVector.filter_args(**xvector) + xvector = ConformerV1XVector(**xvector) + else: + assert isinstance(xvector, ConformerV1XVector) + + super().__init__(feats, xvector) + + @staticmethod + def add_class_args(parser, prefix=None): + """Adds Wav2ConformerV1XVector options to parser. + + Args: + parser: Arguments parser + prefix: Options prefix. + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2XVector.add_class_args(parser) + ConformerV1XVector.add_class_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ConformerV1XVector.filter_finetune_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ConformerV1XVector.add_finetune_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/__init__.py b/hyperion/torch/models/xvectors/__init__.py index 57819128..92e69a5d 100644 --- a/hyperion/torch/models/xvectors/__init__.py +++ b/hyperion/torch/models/xvectors/__init__.py @@ -4,6 +4,7 @@ """ +from .conformer_v1_xvector import ConformerV1XVector from .efficient_net_xvector import EfficientNetXVector from .resnet1d_xvector import ResNet1dXVector from .resnet_xvector import ResNetXVector diff --git a/hyperion/torch/models/xvectors/conformer_v1_xvector.py b/hyperion/torch/models/xvectors/conformer_v1_xvector.py new file mode 100644 index 00000000..323c22a9 --- /dev/null +++ b/hyperion/torch/models/xvectors/conformer_v1_xvector.py @@ -0,0 +1,168 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging + +from jsonargparse import ActionParser, ArgumentParser + +import torch +import torch.nn as nn + +from ...narchs import ConformerEncoderV1 as Encoder +from .xvector import XVector + + +class ConformerV1XVector(XVector): + def __init__( + self, + encoder, + num_classes, + pool_net="mean+stddev", + embed_dim=256, + num_embed_layers=1, + hid_act={"name": "relu", "inplace": True}, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=0, + intertop_k=5, + intertop_margin=0.0, + num_subcenters=2, + dropout_rate=0, + norm_layer=None, + head_norm_layer=None, + use_norm=True, + norm_before=True, + head_use_in_norm=False, + embed_layer=0, + proj_feats=None, + ): + if isinstance(encoder, dict): + logging.info("making conformer encoder network") + encoder = Encoder(**encoder) + + super().__init__( + encoder, + num_classes, + pool_net=pool_net, + embed_dim=embed_dim, + num_embed_layers=num_embed_layers, + hid_act=hid_act, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, +
intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + norm_layer=norm_layer, + head_norm_layer=head_norm_layer, + use_norm=use_norm, + norm_before=norm_before, + head_use_in_norm=head_use_in_norm, + dropout_rate=dropout_rate, + embed_layer=embed_layer, + proj_feats=proj_feats, + ) + + def get_config(self): + base_config = super().get_config() + del base_config["encoder_cfg"] + del base_config["in_feats"] + + encoder_cfg = self.encoder_net.get_config() + del encoder_cfg["class_name"] + config = { + "encoder": encoder_cfg, + } + + config.update(base_config) + return config + + def change_config( + self, + encoder, + override_dropouts=False, + dropout_rate=0, + num_classes=None, + loss_type="arc-softmax", + cos_scale=64, + margin=0.3, + margin_warmup_epochs=10, + intertop_k=5, + intertop_margin=0, + num_subcenters=2, + ): + super().change_config( + False, + dropout_rate, + num_classes, + loss_type, + cos_scale, + margin, + margin_warmup_epochs, + intertop_k, + intertop_margin, + num_subcenters, + ) + if override_dropouts: + logging.info("changing x-vector head dropouts") + self.classif_net.change_dropouts(dropout_rate) + + self.encoder_net.change_config(**encoder) + + @classmethod + def load(cls, file_path=None, cfg=None, state_dict=None): + cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) + try: + del cfg["in_feats"] + except KeyError: + pass + + model = cls(**cfg) + if state_dict is not None: + model.load_state_dict(state_dict) + + return model + + @staticmethod + def filter_args(**kwargs): + base_args = XVector.filter_args(**kwargs) + child_args = Encoder.filter_args(**kwargs["encoder"]) + + base_args["encoder"] = child_args + return base_args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_class_args(parser, skip=set(["in_feats"])) + Encoder.add_class_args(parser, prefix="encoder", skip=set()) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + add_argparse_args = add_class_args + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = XVector.filter_finetune_args(**kwargs) + child_args = Encoder.filter_finetune_args(**kwargs["encoder"]) + base_args["encoder"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_finetune_args(parser) + Encoder.add_finetune_args(parser, prefix="encoder", skip=set()) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index 440c22b6..b42f48f1 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -2,10 +2,9 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ..layers import AudioFeatsFactory as AFF from ..layers import MeanVarianceNorm as MVN @@ -80,7 +79,7 @@ def forward(self, x, x_lengths=None): f = self.spec_augment(f, f_lengths) if self.mvn is not None: - f = self.mvn(f) + f = self.mvn(f, f_lengths) if self.spec_augment is not None and self.aug_after_mvn: f =
self.spec_augment(f, f_lengths) diff --git a/hyperion/torch/narchs/conformer_decoder_v1.py b/hyperion/torch/narchs/conformer_decoder_v1.py new file mode 100644 index 00000000..ef55d6c3 --- /dev/null +++ b/hyperion/torch/narchs/conformer_decoder_v1.py @@ -0,0 +1,724 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from ..layer_blocks import ConformerDecoderBlockV1 as DBlock +from ..layer_blocks import TransformerConv1dSubsampler as Conv1dSubsampler +from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler +from ..layers import ActivationFactory as AF +from ..layers import ConvPosEncoder, NoPosEncoder +from ..layers import NormLayer1dFactory as NLF +from ..layers import PosEncoder, RelPosEncoder +from ..utils import make_attn_mask_causal, scale_seq_lengths, seq_lengths_to_mask +from .net_arch import NetArch + + +class ConformerDecoderV1(NetArch): + """Conformer decoder mixing Transformer Decoder with Conformer Encoder Conv blocks + + This becomes a standard Transformer Decoder by setting conv_repeats=0, pos_enc_type='abs', ff_macaron=False. + + Attributes: + in_feats: input features dimension + d_model: encoder blocks feature dimension + num_heads: number of heads + num_blocks: number of self attn blocks + self_att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1', 'block-scaled-dot-prod-att-v1'] + self_att_context: maximum context range for local attention + cross_att_type: string in ['scaled-dot-prod-att-v1', 'local-scaled-dot-prod-att-v1', 'block-scaled-dot-prod-att-v1'] + conv_repeats: number of conv blocks in each conformer block + conv_kernel_sizes: kernel size for conv blocks + conv_strides: stride for depth-wise conv in the first conv block of each conformer block + ff_type: string in ['linear', 'conv1dx2', 'conv1d-linear'] + d_ff: dimension of middle layer in feed_forward block + ff_kernel_size: kernel size for convolutional versions of ff block + dropout_rate: dropout rate for ff and conv blocks + pos_dropout_rate: dropout rate for positional encoder + att_dropout_rate: dropout rate for attention block + in_layer_type: input layer block type in ['linear','conv2d-sub', 'embed', None] + pos_enc_type: type of positional encoder ['no', 'abs', 'rel', 'conv'] + + causal_pos_enc: if True, use causal positional encodings (when rel_pos_enc=True), it assumes + that query q_i only attents to key k_j when j<=i + hid_act: hidden activations in ff and input blocks + conv_norm_layer: norm layer constructor or str for conv block, + if None it uses BatchNorm1d + se_r: Squeeze-Excitation compression ratio, + if None it doesn't use Squeeze-Excitation + ff_macaron: if True, it uses macaron-net style ff layers, otherwise transformer style. 
+ red_lnorms: it True, use redundant LNorm layers at the output of the conformer blocks as + in the paper + concat_after: if True, if concats attention input and output and apply linear transform, i.e., + y = x + linear(concat(x, att(x))) + if False, y = x + att(x) + padding_idx: padding idx for embed layer + in_time_dim: time dimension in the input Tensor + out_time_dim: dimension that we want to be time in the output tensor + """ + + def __init__( + self, + num_classes, + d_model=256, + num_heads=4, + num_blocks=6, + self_att_type="scaled-dot-prod-v1", + att_context=25, + cross_att_type="scaled-dot-prod-v1", + conv_repeats=0, + conv_kernel_sizes=31, + conv_strides=1, + ff_type="linear", + d_ff=2048, + ff_kernel_size=1, + dropout_rate=0.1, + pos_dropout_rate=0.1, + att_dropout_rate=0.0, + in_layer_type="embed", + in_stride=4, + pos_enc_type="abs", + causal_pos_enc=False, + pos_kernel_size=128, + pos_num_groups=16, + hid_act="swish", + conv_norm_layer=None, + se_r=None, + ff_macaron=True, + red_lnorms=True, + concat_after=False, + padding_idx=-1, + in_time_dim=1, + src_time_dim=1, + out_time_dim=1, + in_feats=None, + with_output=True, + ): + super().__init__() + self.num_classes = num_classes + self.with_output = with_output + if in_feats is None: + in_feats = num_classes + self.in_feats = in_feats + self.d_model = d_model + self.num_heads = num_heads + self.num_blocks = num_blocks + + self.self_att_type = self_att_type + self.cross_att_type = cross_att_type + self.att_context = att_context + + self.conv_repeats = self._standarize_cblocks_param( + conv_repeats, num_blocks, "conv_repeats" + ) + self.conv_kernel_sizes = self._standarize_cblocks_param( + conv_kernel_sizes, num_blocks, "conv_kernel_sizes" + ) + self.conv_strides = self._standarize_cblocks_param( + conv_strides, num_blocks, "conv_strides" + ) + + self.ff_type = ff_type + self.d_ff = d_ff + self.ff_kernel_size = ff_kernel_size + self.dropout_rate = dropout_rate + self.pos_enc_type = pos_enc_type + self.causal_pos_enc = causal_pos_enc + self.att_dropout_rate = att_dropout_rate + self.pos_dropout_rate = pos_dropout_rate + self.in_layer_type = in_layer_type + self.in_stride = in_stride + self.se_r = se_r + self.ff_macaron = ff_macaron + self.red_lnorms = red_lnorms + self.concat_after = concat_after + self.padding_idx = padding_idx + self.in_time_dim = in_time_dim + self.src_time_dim = src_time_dim + self.out_time_dim = out_time_dim + self.hid_act = hid_act + self.pos_kernel_size = pos_kernel_size + self.pos_num_groups = pos_num_groups + + self.conv_norm_layer = conv_norm_layer + norm_groups = None + if conv_norm_layer == "group-norm": + norm_groups = min(d_model // 2, 32) + self._conv_norm_layer = NLF.create(conv_norm_layer, norm_groups) + + self._make_in_layer() + + blocks = [] + for i in range(num_blocks): + blocks.append( + DBlock( + d_model, + self_att_type, + cross_att_type, + num_heads, + self.conv_repeats[i], + self.conv_kernel_sizes[i], + self.conv_strides[i], + ff_type, + d_ff, + ff_kernel_size, + hid_act=hid_act, + dropout_rate=dropout_rate, + att_context=att_context, + att_dropout_rate=att_dropout_rate, + pos_enc_type=pos_enc_type, + causal_pos_enc=causal_pos_enc, + conv_norm_layer=self._conv_norm_layer, + se_r=se_r, + ff_macaron=ff_macaron, + out_lnorm=self.red_lnorms, + concat_after=concat_after, + ) + ) + + self.blocks = nn.ModuleList(blocks) + if not self.red_lnorms: + self.norm_out = nn.LayerNorm(d_model) + + if with_output: + self.output_layer = nn.Linear(d_model, num_classes) + + @staticmethod + def 
_standarize_cblocks_param(p, num_blocks, p_name): + if isinstance(p, int): + p = [p] * num_blocks + elif isinstance(p, list): + if len(p) == 1: + p = p * num_blocks + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) + else: + raise TypeError("wrong type for param {}={}".format(p_name, p)) + + return p + + def _make_in_layer(self): + in_feats = self.in_feats + d_model = self.d_model + dropout_rate = self.dropout_rate + if self.pos_enc_type == "no": + pos_enc = NoPosEncoder() + elif self.pos_enc_type == "rel": + pos_enc = RelPosEncoder(d_model, self.pos_dropout_rate) + elif self.pos_enc_type == "abs": + pos_enc = PosEncoder(d_model, self.pos_dropout_rate) + elif self.pos_enc_type == "conv": + pos_enc = ConvPosEncoder( + d_model, self.pos_kernel_size, self.pos_num_groups, self.hid_act + ) + else: + raise Exception("wrong pos-enc-type={}".format(self.pos_enc_type)) + + hid_act = AF.create(self.hid_act) + + if self.in_layer_type == "linear": + self.in_layer = nn.Sequential( + nn.Linear(in_feats, d_model), + nn.LayerNorm(d_model), + nn.Dropout(dropout_rate), + hid_act, + pos_enc, + ) + elif self.in_layer_type == "conv2d-sub": + self.in_layer = Conv2dSubsampler( + in_feats, + d_model, + hid_act, + self.in_stride, + pos_enc, + time_dim=self.in_time_dim, + ) + elif self.in_layer_type == "conv1d-sub": + self.in_layer = Conv1dSubsampler( + in_feats, + d_model, + hid_act, + self.in_stride, + pos_enc, + time_dim=self.in_time_dim, + ) + elif self.in_layer_type == "embed": + self.in_layer = nn.Sequential( + nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc + ) + elif isinstance(self.in_layer_type, nn.Module): + self.in_layer = nn.Sequential(self.in_layer_type, pos_enc) + elif self.in_layer_type is None: + self.in_layer = pos_enc + else: + raise ValueError(f"unknown in_layer_type: {self.in_layer_type}") + + def _make_masks( + self, + max_in_length, + x_lengths, + x_mask, + max_src_length, + x_src_lengths, + x_src_mask, + causal_mask, + ): + if x_mask is None: + if x_lengths is not None: + x_mask = seq_lengths_to_mask(x_lengths, max_in_length, time_dim=1) + if causal_mask: + x_mask = make_attn_mask_causal(x_mask) + + if x_src_mask is None and x_src_lengths is not None: + x_src_mask = seq_lengths_to_mask(x_src_lengths, max_src_length, time_dim=1) + + return x_mask, x_src_mask + + def _forward_input(self, x, x_mask): + if isinstance(self.in_layer, (Conv2dSubsampler, Conv1dSubsampler)): + x, x_mask = self.in_layer(x, x_mask) + else: + if self.in_time_dim != 1: + x = x.transpose(1, self.in_time_dim).contiguous() + x = self.in_layer(x) + + return x, x_mask + + def forward( + self, + x, + x_src, + x_lengths=None, + x_src_lengths=None, + x_mask=None, + x_src_mask=None, + causal_mask=True, + ): + """Forward pass function + + Args: + x: input tensor with size=(batch, time_out, num_feats) or (batch, time_out) + x_src: source tensor with size=(batch, time_in, num_feats) + x_lengths: lengths of the input sequences. + x_src_lengths: lengths of the source sequences + x_mask: mask to indicate valid time steps for x (batch, time_out). + It overwrites the mask of x_lengths. + x_src_mask: mask to indicate valid time steps for x_src (batch, time_in). + It overwrites the mask of x_src_lengths. 
+ + Returns: + Tensor with output logits + Tensor with output lengths + """ + if self.src_time_dim != 1: + x_src = x_src.transpose(1, 2) + + max_in_length = x.size(self.in_time_dim) + max_src_length = x_src.size(1) + x_mask, x_src_mask = self._make_masks( + max_in_length, + x_lengths, + x_mask, + max_src_length, + x_src_lengths, + x_src_mask, + causal_mask, + ) + x, x_mask = self._forward_input(x, x_mask) + + if isinstance(x, tuple): + x, pos_emb = x + b_args = {"pos_emb": pos_emb} + else: + b_args = {} + + for i in range(len(self.blocks)): + x, x_mask = self.blocks[i]( + x, x_src, mask=x_mask, mask_src=x_src_mask, **b_args + ) + + if not self.red_lnorms: + x = self.norm_out(x) + + if self.with_output: + x = self.output_layer(x) + + if self.out_time_dim != 1: + x = x.transpose(1, self.out_time_dim) + + return x, x_lengths + + def forward_1step( + self, + x, + x_src, + x_lengths=None, + x_mask=None, + cache=None, + ): + """Forward pass function for a single decoding step + + Args: + x: input tensor with size=(batch, time, num_feats) + x_lengths: lengths of the input sequences. + x_mask: mask to indicate valid time steps for x (batch, time). + It overwrites the mask of x_lengths. + cache: list of per-block cached states from the previous decoding step. + + Returns: + Tensor with output logits for the last time step + List with the updated per-block cache + """ + max_in_length = x.size(self.in_time_dim) + if x_mask is None and x_lengths is not None: + x_mask = seq_lengths_to_mask(x_lengths, max_in_length, time_dim=1) + + if self.src_time_dim != 1: + x_src = x_src.transpose(1, 2) + + max_src_length = x_src.size(1) + x, x_mask = self._forward_input(x, x_mask) + + if isinstance(x, tuple): + x, pos_emb = x + b_args = {"pos_emb": pos_emb} + else: + b_args = {} + + if cache is None: + cache = [None] * len(self.blocks) + + next_cache = [] + for i in range(len(self.blocks)): + x, x_mask = self.blocks[i](x, x_src, mask=x_mask, cache=cache[i], **b_args) + next_cache.append(x) + + if not self.red_lnorms: + x = self.norm_out(x[:, -1]) + else: + x = x[:, -1] + + if self.with_output: + x = self.output_layer(x) + + return x, next_cache + + def get_config(self): + """Gets network config + Returns: + dictionary with config params + """ + config = { + "num_classes": self.num_classes, + "in_feats": self.in_feats, + "d_model": self.d_model, + "num_heads": self.num_heads, + "num_blocks": self.num_blocks, + "self_att_type": self.self_att_type, + "cross_att_type": self.cross_att_type, + "att_context": self.att_context, + "conv_repeats": self.conv_repeats, + "conv_kernel_sizes": self.conv_kernel_sizes, + "conv_strides": self.conv_strides, + "ff_type": self.ff_type, + "d_ff": self.d_ff, + "ff_kernel_size": self.ff_kernel_size, + "dropout_rate": self.dropout_rate, + "att_dropout_rate": self.att_dropout_rate, + "pos_dropout_rate": self.pos_dropout_rate, + "in_layer_type": self.in_layer_type, + "in_stride": self.in_stride, + "pos_enc_type": self.pos_enc_type, + "causal_pos_enc": self.causal_pos_enc, + "pos_kernel_size": self.pos_kernel_size, + "pos_num_groups": self.pos_num_groups, + "hid_act": self.hid_act, + "se_r": self.se_r, + "ff_macaron": self.ff_macaron, + "red_lnorms": self.red_lnorms, + "conv_norm_layer": self.conv_norm_layer, + "concat_after": self.concat_after, + "padding_idx": self.padding_idx, + "in_time_dim": self.in_time_dim, + "out_time_dim": self.out_time_dim, + "with_output": self.with_output, + } + + base_config = super().get_config() + return
dict(list(base_config.items()) + list(config.items())) + + def in_context(self): + return (self.att_context, self.att_context) + + def in_shape(self): + """Input shape for network + + Returns: + Tuple describing input shape + """ + if self.in_time_dim == 1: + return (None, None, self.in_feats) + else: + return (None, self.in_feats, None) + + def out_shape(self, in_shape=None): + """Infers the network output shape given the input shape + + Args: + in_shape: input shape tuple + + Returns: + Tuple with the output shape + """ + if in_shape is None: + out_t = None + batch_size = None + else: + assert len(in_shape) == 3 + batch_size = in_shape[0] + in_t = in_shape[self.in_time_dim] + if in_t is None: + out_t = None + else: + if isinstance(self.in_layer, Conv2dSubsampler): + # out_t = in_t//4 + out_t = ((in_t - 1) // 2 - 1) // 2 + else: + out_t = in_t + + if self.out_time_dim == 1: + return (batch_size, out_t, self.d_model) + else: + return (batch_size, self.d_model, out_t) + + @staticmethod + def filter_args(**kwargs): + """Filters arguments corresponding to ConformerDecoder + from args dictionary + + Args: + kwargs: args dictionary + + Returns: + args dictionary + """ + args = filter_func_args(ConformerDecoderV1.__init__, kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None, skip=set()): + """Adds Conformer config parameters to argparser + + Args: + parser: argparse object + prefix: prefix string to add to the argument names + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + if "in_feats" not in skip: + parser.add_argument( + "--in-feats", type=int, default=None, help=("input feature dimension") + ) + + parser.add_argument( + "--num-blocks", default=6, type=int, help=("number of transformer blocks") + ) + + parser.add_argument( + "--d-model", default=512, type=int, help=("encoder layer sizes") + ) + + parser.add_argument( + "--num-heads", + default=4, + type=int, + help=("number of heads in self-attention layers"), + ) + + parser.add_argument( + "--self-att-type", + default="scaled-dot-prod-v1", + choices=[ + "scaled-dot-prod-v1", + "local-scaled-dot-prod-v1", + "block-scaled-dot-prod-v1", + ], + help=("type of self-attention"), + ) + + parser.add_argument( + "--cross-att-type", + default="scaled-dot-prod-v1", + choices=[ + "scaled-dot-prod-v1", + "local-scaled-dot-prod-v1", + "block-scaled-dot-prod-v1", + ], + help=("type of cross-attention"), + ) + + parser.add_argument( + "--att-context", + default=25, + type=int, + help=("context size when using local attention"), + ) + + parser.add_argument( + "--conv-repeats", + default=[0], + type=int, + nargs="+", + help=("number of conv blocks in each conformer block"), + ) + + parser.add_argument( + "--conv-kernel-sizes", + default=[31], + nargs="+", + type=int, + help=("kernel sizes for the depth-wise convs of each conformer block"), + ) + + parser.add_argument( + "--conv-strides", + default=[1], + nargs="+", + type=int, + help=("resb-blocks strides for each encoder stage"), + ) + + parser.add_argument( + "--ff-type", + default="linear", + choices=["linear", "conv1dx2", "conv1dlinear"], + help=("type of feed forward layers in transformer block"), + ) + + parser.add_argument( + "--d-ff", + default=2048, + type=int, + help=("size of middle layer in feed forward block"), + ) + + parser.add_argument( + "--ff-kernel-size", + default=3, + type=int, + help=("kernel size in convolutional feed forward block"), + ) + + parser.add_argument("--hid-act", default="swish", help="hidden activation") +
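+ # Usage sketch (illustrative assumption, not asserted by this patch): when + # add_class_args is called with, e.g., prefix="decoder", jsonargparse exposes + # the options above as --decoder.num-blocks, --decoder.d-model, etc., via the + # ActionParser registered at the end of this method; the prefix name "decoder" + # here is only an example.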
+ parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + parser.add_argument( + "--dropout-rate", default=0.1, type=float, help="feed-forward layer dropout" + ) + + parser.add_argument( + "--in-layer-type", + default="linear", + choices=["embed", "linear", "conv2d-sub", "conv1d-sub"], + help=("type of input layer"), + ) + + parser.add_argument( + "--in-stride", + default=4, + type=int, + choices=[1, 2, 4], + help="stride of conformer input layer", + ) + + parser.add_argument( + "--pos-enc-type", + default="rel", + choices=["no", "rel", "abs", "conv"], + help=("type of positional encoder"), + ) + + parser.add_argument( + "--causal-pos-enc", + default=False, + action=ActionYesNo, + help="relative positional encodings are zero when attending to the future", + ) + parser.add_argument( + "--pos-kernel-size", + default=128, + type=int, + help="kernel size for conv positional encoder", + ) + parser.add_argument( + "--pos-num-groups", + default=16, + type=int, + help="number of conv groups for conv positional encoder", + ) + + parser.add_argument( + "--conv-norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer for conv block in conformer", + ) + + parser.add_argument( + "--se-r", + default=None, + type=int, + help=("squeeze-excitation compression ratio"), + ) + + parser.add_argument( + "--ff-macaron", + default=True, + action=ActionYesNo, + help="do not use macaron style ff layers ", + ) + + parser.add_argument( + "--red-lnorms", + default=True, + action=ActionYesNo, + help="use redundant Lnorm at conformer blocks' outputs", + ) + + parser.add_argument( + "--concat-after", + default=False, + action=ActionYesNo, + help="concatenate attention input and output instead of adding", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index 97cb6d5b..54c2f400 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -3,13 +3,13 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ...utils.misc import filter_func_args from ..layer_blocks import ConformerEncoderBlockV1 as EBlock +from ..layer_blocks import TransformerConv1dSubsampler as Conv1dSubsampler from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from ..layers import ActivationFactory as AF from ..layers import ConvPosEncoder, NoPosEncoder @@ -90,6 +90,7 @@ def __init__( pos_dropout_rate=0.1, att_dropout_rate=0.0, in_layer_type="conv2d-sub", + in_stride=4, pos_enc_type="rel", causal_pos_enc=False, pos_kernel_size=128, @@ -104,7 +105,6 @@ def __init__( in_time_dim=1, out_time_dim=1, ): - super().__init__() self.in_feats = in_feats self.d_model = d_model @@ -115,11 +115,14 @@ def __init__( self.att_context = att_context self.conv_repeats = self._standarize_cblocks_param( - conv_repeats, num_blocks, "conv_repeats") + conv_repeats, num_blocks, "conv_repeats" + ) self.conv_kernel_sizes = self._standarize_cblocks_param( - conv_kernel_sizes, num_blocks, 
"conv_kernel_sizes") + conv_kernel_sizes, num_blocks, "conv_kernel_sizes" + ) self.conv_strides = self._standarize_cblocks_param( - conv_strides, num_blocks, "conv_strides") + conv_strides, num_blocks, "conv_strides" + ) self.ff_type = ff_type self.d_ff = d_ff @@ -130,6 +133,7 @@ def __init__( self.att_dropout_rate = att_dropout_rate self.pos_dropout_rate = pos_dropout_rate self.in_layer_type = in_layer_type + self.in_stride = in_stride self.se_r = se_r self.ff_macaron = ff_macaron self.red_lnorms = red_lnorms @@ -173,7 +177,8 @@ def __init__( ff_macaron=ff_macaron, out_lnorm=self.red_lnorms, concat_after=concat_after, - )) + ) + ) self.blocks = nn.ModuleList(blocks) if not self.red_lnorms: @@ -198,7 +203,6 @@ def _standarize_cblocks_param(p, num_blocks, p_name): return p def _make_in_layer(self): - in_feats = self.in_feats d_model = self.d_model dropout_rate = self.dropout_rate @@ -209,8 +213,9 @@ def _make_in_layer(self): elif self.pos_enc_type == "abs": pos_enc = PosEncoder(d_model, self.pos_dropout_rate) elif self.pos_enc_type == "conv": - pos_enc = ConvPosEncoder(d_model, self.pos_kernel_size, - self.pos_num_groups, self.hid_act) + pos_enc = ConvPosEncoder( + d_model, self.pos_kernel_size, self.pos_num_groups, self.hid_act + ) else: raise Exception("wrong pos-enc-type={}".format(self.pos_enc_type)) @@ -225,28 +230,53 @@ def _make_in_layer(self): pos_enc, ) elif self.in_layer_type == "conv2d-sub": - self.in_layer = Conv2dSubsampler(in_feats, - d_model, - hid_act, - pos_enc, - time_dim=self.in_time_dim) + self.in_layer = Conv2dSubsampler( + in_feats, + d_model, + hid_act, + self.in_stride, + pos_enc, + time_dim=self.in_time_dim, + ) + elif self.in_layer_type == "conv1d-sub": + self.in_layer = Conv1dSubsampler( + in_feats, + d_model, + hid_act, + self.in_stride, + pos_enc, + time_dim=self.in_time_dim, + ) elif self.in_layer_type == "embed": self.in_layer = nn.Sequential( - nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), - pos_enc) + nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc + ) elif isinstance(self.in_layer_type, nn.Module): self.in_layer = nn.Sequential(self.in_layer_type, pos_enc) elif self.in_layer_type is None: self.in_layer = pos_enc else: - raise ValueError("unknown in_layer_type: " + self.in_layer_type) - - def forward(self, - x, - x_lengths=None, - x_mask=None, - return_mask=False, - target_shape=None): + raise ValueError(f"unknown in_layer_type: {self.in_layer_type}") + + def _make_masks(self, max_in_length, x_lengths=None, x_mask=None): + if x_mask is None and x_lengths is not None: + x_mask = seq_lengths_to_mask(x_lengths, max_in_length, time_dim=1) + + return x_mask + + def _forward_input(self, x, x_mask): + if isinstance(self.in_layer, (Conv2dSubsampler, Conv1dSubsampler)): + x, x_mask = self.in_layer(x, x_mask) + else: + if self.in_time_dim != 1: + x = x.transpose(1, self.in_time_dim).contiguous() + x = self.in_layer(x) + + return x, x_mask + + def forward( + self, x, x_lengths=None, x_mask=None, return_mask=False, target_shape=None + ): """Forward pass function Args: @@ -263,16 +293,8 @@ def forward(self, Tensor with mask if return_mask is True """ max_in_length = x.size(self.in_time_dim) - if x_mask is None and x_lengths is not None: - x_mask = seq_lengths_to_mask(x_lengths, max_in_length, time_dim=1) - - if isinstance(self.in_layer, Conv2dSubsampler): - x, x_mask = self.in_layer(x, x_mask) - else: - if self.in_time_dim != 1: - x = x.transpose(1, self.in_time_dim).contiguous() - x = self.in_layer(x) - + x_mask = 
self._make_masks(max_in_length, x_lengths, x_mask) + x, x_mask = self._forward_input(x, x_mask) if isinstance(x, tuple): x, pos_emb = x b_args = {"pos_emb": pos_emb} @@ -318,6 +340,7 @@ def get_config(self): "att_dropout_rate": self.att_dropout_rate, "pos_dropout_rate": self.pos_dropout_rate, "in_layer_type": self.in_layer_type, + "in_stride": self.in_stride, "pos_enc_type": self.pos_enc_type, "causal_pos_enc": self.causal_pos_enc, "pos_kernel_size": self.pos_kernel_size, @@ -382,7 +405,7 @@ def out_shape(self, in_shape=None): @staticmethod def filter_args(**kwargs): - """Filters arguments correspondin to TransformerXVector + """Filters arguments corresponding to Conformer Encoder from args dictionary Args: @@ -407,20 +430,17 @@ def add_class_args(parser, prefix=None, skip=set()): parser = ArgumentParser(prog="") if "in_feats" not in skip: - parser.add_argument("--in-feats", - type=int, - default=80, - help=("input feature dimension")) + parser.add_argument( + "--in-feats", type=int, default=80, help=("input feature dimension") + ) - parser.add_argument("--num-blocks", - default=6, - type=int, - help=("number of tranformer blocks")) + parser.add_argument( + "--num-blocks", default=6, type=int, help=("number of transformer blocks") + ) - parser.add_argument("--d-model", - default=512, - type=int, - help=("encoder layer sizes")) + parser.add_argument( + "--d-model", default=512, type=int, help=("encoder layer sizes") + ) parser.add_argument( "--num-heads", @@ -433,8 +453,9 @@ def add_class_args(parser, prefix=None, skip=set()): "--att-type", default="scaled-dot-prod-v1", choices=[ - "scaled-dot-prod-v1", "local-scaled-dot-prod-v1", - "block-scaled-dot-prod-v1" + "scaled-dot-prod-v1", + "local-scaled-dot-prod-v1", + "block-scaled-dot-prod-v1", ], help=("type of self-attention"), ) @@ -459,9 +480,7 @@ def add_class_args(parser, prefix=None, skip=set()): default=[31], nargs="+", type=int, - help=( - "kernels sizes for the depth-wise convs of each conformer block" - ), + help=("kernel sizes for the depth-wise convs of each conformer block"), ) parser.add_argument( @@ -493,9 +512,7 @@ def add_class_args(parser, prefix=None, skip=set()): help=("kernel size in convolutional feed forward block"), ) - parser.add_argument("--hid-act", - default="swish", - help="hidden activation") + parser.add_argument("--hid-act", default="swish", help="hidden activation") parser.add_argument( "--pos-dropout-rate", @@ -503,22 +520,28 @@ def add_class_args(parser, prefix=None, skip=set()): help="positional encoder dropout", ) - parser.add_argument("--att-dropout-rate", - default=0, - type=float, - help="self-att dropout") - parser.add_argument("--dropout-rate", - default=0.1, - type=float, - help="feed-forward layer dropout") + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + parser.add_argument( + "--dropout-rate", default=0.1, type=float, help="feed-forward layer dropout" + ) parser.add_argument( "--in-layer-type", default="linear", - choices=["linear", "conv2d-sub"], + choices=["linear", "conv2d-sub", "conv1d-sub"], help=("type of input layer"), ) + parser.add_argument( + "--in-stride", + default=4, + type=int, + choices=[1, 2, 4], + help="stride of conformer input layer", + ) + parser.add_argument( "--pos-enc-type", default="rel", @@ -530,8 +553,7 @@ def add_class_args(parser, prefix=None, skip=set()): "--causal-pos-enc", default=False, action=ActionYesNo, - help= - "relative positional encodings are zero when attending to the future", + help="relative positional
encodings are zero when attending to the future", ) parser.add_argument( "--pos-kernel-size", @@ -588,5 +610,4 @@ def add_class_args(parser, prefix=None, skip=set()): ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/feat_fuser_mvn.py b/hyperion/torch/narchs/feat_fuser_mvn.py new file mode 100644 index 00000000..17b396bc --- /dev/null +++ b/hyperion/torch/narchs/feat_fuser_mvn.py @@ -0,0 +1,107 @@ +""" + Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +from typing import Dict, Optional + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ..layers import FeatFuserFactory as FFF +from ..layers import MeanVarianceNorm as MVN +from ..layers import SpecAugment +from .net_arch import NetArch + + +class FeatFuserMVN(NetArch): + """FeatureFuser for Wav2Vec style hidden features + ST-MVN + Optional SpecAugment + """ + + def __init__( + self, + feat_fuser: Dict, + mvn: Optional[Dict] = None, + spec_augment: Optional[Dict] = None, + trans: bool = False, + aug_after_mvn: bool = False, + ): + super().__init__() + + feat_fuser = FFF.filter_args(**feat_fuser) + self.feat_fuser_cfg = feat_fuser + self.feat_fuser = FFF.create(**feat_fuser) + + self.mvn = None + self.mvn_cfg = None + if mvn is not None: + mvn = MVN.filter_args(**mvn) + self.mvn_cfg = mvn + if ("norm_mean" in mvn and mvn["norm_mean"]) or ( + "norm_var" in mvn and mvn["norm_var"] + ): + self.mvn = MVN(**mvn) + + self.spec_augment = None + self.spec_augment_cfg = None + if spec_augment is not None: + spec_augment = SpecAugment.filter_args(**spec_augment) + self.spec_augment_cfg = spec_augment + self.spec_augment = SpecAugment(**spec_augment) + + self.trans = trans + self.aug_after_mvn = aug_after_mvn + + def forward(self, feats, feats_lengths=None): + feats = self.feat_fuser(feats) + if self.spec_augment is not None and not self.aug_after_mvn: + feats = self.spec_augment(feats, feats_lengths) + + if self.mvn is not None: + feats = self.mvn(feats, feats_lengths) + + if self.spec_augment is not None and self.aug_after_mvn: + feats = self.spec_augment(feats, feats_lengths) + + if self.trans: + feats = feats.transpose(1, 2).contiguous() + + return feats, feats_lengths + + def get_config(self): + config = { + "feat_fuser": self.feat_fuser_cfg, + "mvn": self.mvn_cfg, + "spec_augment": self.spec_augment_cfg, + "trans": self.trans, + "aug_after_mvn": self.aug_after_mvn, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + valid_args = ("feat_fuser", "mvn", "spec_augment", "trans", "aug_after_mvn") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + FFF.add_class_args(parser, prefix="feat_fuser") + MVN.add_class_args(parser, prefix="mvn") + SpecAugment.add_class_args(parser, prefix="spec_augment") + parser.add_argument( + "--aug-after-mvn", + default=False, + action="store_true", + help=("do spec augment after st-mvn, instead of before"), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/torch_model.py
b/hyperion/torch/torch_model.py index e7020e1d..912c2640 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -2,15 +2,17 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging from collections import OrderedDict as ODict from copy import deepcopy -from enum import Enum -from typing import Optional from pathlib import Path +from typing import Callable, Dict, Optional, Union import torch import torch.nn as nn +from ..utils.misc import PathLike + class TorchModel(nn.Module): """Base class for all Pytorch Models and NNet architectures""" @@ -45,6 +47,49 @@ def non_trainable_parameters(self, recurse: bool = True): if not param.requires_grad: yield param + def trainable_named_parameters(self, recurse: bool = True): + for name, param in self.named_parameters(recurse=recurse): + if param.requires_grad: + yield name, param + + def non_trainable_named_parameters(self, recurse: bool = True): + for name, param in self.named_parameters(recurse=recurse): + if not param.requires_grad: + yield name, param + + def parameter_summary(self, verbose: bool = False): + trainable_params = sum(p.numel() for p in self.trainable_parameters()) + non_trainable_params = sum(p.numel() for p in self.non_trainable_parameters()) + buffer_params = sum(p.numel() for p in self.buffers()) + non_trainable_total = non_trainable_params + buffer_params + total_params = trainable_params + non_trainable_total + if verbose: + logging.info( + "total-params=%d, trainable-params=%d, non-trainable-params+buffers=%d, non-trainable-params=%d, buffer-params=%d", + total_params, + trainable_params, + non_trainable_total, + non_trainable_params, + buffer_params, + ) + return ( + total_params, + trainable_params, + non_trainable_total, + non_trainable_params, + buffer_params, + ) + + def print_parameter_list(self): + for n, p in self.trainable_named_parameters(): + logging.info("trainable: %s", n) + + for n, p in self.non_trainable_named_parameters(): + logging.info("non_trainable: %s", n) + + for n, p in self.named_buffers(): + logging.info("buffers: %s", n) + def has_param_groups(self): return False @@ -65,7 +110,7 @@ def change_dropouts(self, dropout_rate): if isinstance(module, nn.modules.dropout._DropoutNd): module.p = dropout_rate if isinstance(module, nn.RNNBase): - module.dropout = dropout + module.dropout = dropout_rate if hasattr(self, "dropout_rate"): assert dropout_rate == 0 or self.dropout_rate > 0 @@ -184,7 +229,73 @@ def _fix_cfg_compatibility(class_obj, cfg): return cfg @staticmethod - def auto_load(file_path, extra_objs={}, map_location=None): + def _is_hf_path(file_path: Path): + # hf path can have only 2 dir levels + return len(file_path.parents) == 2 + + @staticmethod + def _get_from_hf( + file_path: Path, cache_dir: PathLike = None, local_dir: PathLike = None + ): + from huggingface_hub import hf_hub_download + + return hf_hub_download( + repo_id=str(file_path.parent), + filename=file_path.name, + cache_dir=cache_dir, + local_dir=local_dir, + ) + + @staticmethod + def _try_to_get_from_hf( + file_path: Path, cache_dir: PathLike = None, local_dir: PathLike = None + ): + if str(file_path)[:3] == "hf:": + # hf: prefix indicates to download from hub + file_path = Path(str(file_path)[3:]) + assert TorchModel._is_hf_path( + file_path + ), f"{file_path} is not a valid HF path" + file_path = TorchModel._get_from_hf( + file_path, cache_dir=cache_dir, local_dir=local_dir + ) + return Path(file_path) + elif not
file_path.is_file(): + # if no prefix but file not in local dir try to get it from hub + if not TorchModel._is_hf_path(file_path): + return file_path + + try: + file_path = TorchModel._get_from_hf(file_path) + return Path(file_path) + except: + return file_path + + else: + # file is local + return file_path + + @staticmethod + def auto_load( + file_path: PathLike, + extra_objs: dict = {}, + map_location: Optional[ + Union[ + Callable[[torch.Tensor, str], torch.Tensor], + torch.device, + str, + Dict[str, str], + ] + ] = None, + cache_dir: PathLike = None, + local_dir: PathLike = None, + ): + file_path = Path(file_path) + file_path = TorchModel._try_to_get_from_hf( + file_path, cache_dir=cache_dir, local_dir=local_dir + ) + + assert file_path.is_file(), f"TorchModel file: {file_path} not found" if map_location is None: map_location = torch.device("cpu") @@ -193,7 +304,6 @@ def auto_load(file_path, extra_objs={}, map_location=None): cfg = model_data["model_cfg"] class_name = cfg["class_name"] del cfg["class_name"] - print(TorchModel.registry) if class_name in TorchModel.registry: class_obj = TorchModel.registry[class_name] elif class_name in extra_objs: diff --git a/hyperion/torch/tpm/hf/hf_wav2vec_base.py b/hyperion/torch/tpm/hf/hf_wav2vec_base.py index e0bcee1c..2cb95a53 100644 --- a/hyperion/torch/tpm/hf/hf_wav2vec_base.py +++ b/hyperion/torch/tpm/hf/hf_wav2vec_base.py @@ -245,7 +245,7 @@ def change_config( self, override_dropouts: bool, override_spec_augment: bool, - override_lora: bool, + override_lora: bool = False, feat_extract_lr: Optional[float] = None, encoder_lr: Optional[float] = None, use_lora: bool = False, @@ -538,22 +538,22 @@ def forward_impl( x, x_mask = self._preprocess(x, x_lengths) # if ddp_get_rank() == 0: # lora_layer = self.hf_model.encoder.layers[0].attention.v_proj - # print( - # "lora\nw=", - # lora_layer.weight[:3, :3], - # "\na=", - # lora_layer.lora_A[:3, :3], - # "\nb=", - # lora_layer.lora_B[:3, :3], - # "\n", - # "merged=", - # lora_layer.merged, - # "training=", - # lora_layer.training, - # flush=True, - # ) - # assert self.training == lora_layer.training - # assert self.training == (not lora_layer.merged) + # print( + # "lora\nw=", + # lora_layer.weight[:3, :3], + # "\na=", + # lora_layer.lora_A[:3, :3], + # "\nb=", + # lora_layer.lora_B[:3, :3], + # "\n", + # "merged=", + # lora_layer.merged, + # "training=", + # lora_layer.training, + # flush=True, + # ) + # assert self.training == lora_layer.training + # assert self.training == (not lora_layer.merged) output = self.hf_model( x, x_mask, @@ -760,7 +760,7 @@ def filter_args(**kwargs): @staticmethod def _add_lr_args(parser): parser.add_argument( - "--feat-extractor-lr", + "--feat-extract-lr", default=None, type=float, help=( diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 5e41747c..7260595c 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -11,13 +11,12 @@ from enum import Enum from pathlib import Path -from fairscale.optim.grad_scaler import ShardedGradScaler -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.distributed as dist import torch.nn as nn +from fairscale.optim.grad_scaler import ShardedGradScaler +from jsonargparse import ActionParser, ArgumentParser from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args @@ -108,7 +107,6 @@ def __init__( input_key="x", target_key="class_id", ): - 
self.model = model self.loss = loss self.epochs = epochs @@ -139,6 +137,13 @@ def __init__( self.amp_args = {} self.input_key = input_key self.target_key = target_key + self.ddp = ddp + self.ddp_type = ddp_type + self.rank = 0 + self.world_size = 1 + if ddp: + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() self.set_train_mode() @@ -147,13 +152,7 @@ def __init__( if loss is not None: self.loss.to(device) - self.ddp = ddp - self.ddp_type = ddp_type - self.rank = 0 - self.world_size = 1 if ddp: - self.rank = dist.get_rank() - self.world_size = dist.get_world_size() if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) if self.rank == 0: @@ -288,6 +287,9 @@ def fit(self, train_data, val_data=None): def set_train_mode(self): self.model.set_train_mode(self.train_mode) + if self.rank == 0: + self.model.parameter_summary(verbose=True) + self.model.print_parameter_list() def train_epoch(self, data_loader): """Training epoch loop @@ -465,6 +467,20 @@ def _get_lr(self): lrs = [param_group["lr"] for param_group in self.optimizer.param_groups] return max(lrs) + def _get_lrs(self): + """Returns the current learning rates of all param groups to show in the loggers""" + lrs = [param_group["lr"] for param_group in self.optimizer.param_groups] + all_eq = True + for lr in lrs: + if lr != lrs[0]: + all_eq = False + break + + if all_eq: + return {"lr": lrs[0]} + + return {f"lr_{i}": lr for i, lr in enumerate(lrs)} + def _compute_grad_acc_steps(self, data_loader): if self.eff_batch_size is None: return @@ -505,6 +521,7 @@ def checkpoint(self, logs=None): Args: logs: logs containing the current value of the metrics. """ + self.model.train() checkpoint = { "epoch": self.cur_epoch, "rng_state": torch.get_rng_state(), @@ -545,6 +562,7 @@ def save_checkpoint(self, logs=None): if self.rank != 0: return + checkpoint = self.checkpoint(logs) file_path = "%s/model_ep%04d.pth" % (self.exp_path, self.cur_epoch) @@ -629,32 +647,34 @@ def load_last_checkpoint(self): return None + @staticmethod + def get_augs_keys(batch, base_key, skip={}): + keys = [] + if base_key in batch and base_key not in skip: + keys.append(base_key) + + aug_idx_1 = 0 + while True: + aug_idx_2 = 0 + while True: + aug_key = f"{base_key}_aug_{aug_idx_1}_{aug_idx_2}" + if aug_key in batch: + if aug_key not in skip: + keys.append(aug_key) + aug_idx_2 += 1 + else: + break + + if aug_idx_2 == 0: + break + + aug_idx_1 += 1 + + return keys + @staticmethod def filter_args(**kwargs): args = filter_func_args(TorchTrainer.__init__, kwargs) - - # valid_args = ( - # "grad_acc_steps", - # "eff_batch_size", - # "epochs", - # "log_interval", - # "use_amp", - # "ddp_type", - # "grad_clip", - # "grad_clip_norm", - # "swa_start", - # "swa_lr", - # "swa_anneal_epochs", - # "exp_path", - # "optim", - # "lrsched", - # "cpu_offload", - # "use_tensorboard", - # "use_wandb", - # "wandb", - # "train_mode", - # ) - # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @staticmethod diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index a59cbe14..aedd5be0 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -81,7 +81,6 @@ def __init__( input_key="x", target_key="class_id", ): - if loss is None: loss = nn.CrossEntropyLoss() @@ -101,38 +100,53 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() 
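+ # Batch layout sketch (illustrative assumption): besides the base input key "x", + # a batch may carry augmented views named "x_aug_0_0", "x_aug_0_1", ..., which is + # the naming scheme get_augs_keys() scans for; each such view then contributes one + # forward/backward pass to the gradient accumulated in the loop below, with the + # loss rescaled by grad_acc_steps times the number of views.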
self.model.train() + for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) + # try: + # l1 = self.model.hf_feats.hf_model.encoder.layers[0].attention.v_proj + # # print(f"lora train {l1.training}") + # print(f"loraA {l1.lora_A}") + # print(f"loraB {l1.lora_B}", flush=True) + # except: + # pass + if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - x, target = tensors_subset(data, batch_keys, self.device) - batch_size = x.size(0) - with amp.autocast(enabled=self.use_amp): - output = self.model(x, y=target) - loss = self.loss(output, target).mean() / self.grad_acc_steps - - if self.use_amp: - self.grad_scaler.scale(loss).backward() - else: - loss.backward() + input_keys = self.get_augs_keys(data, self.input_key) + loss_scale = self.grad_acc_steps * len(input_keys) + for aug_key in input_keys: + batch_keys = [aug_key, self.target_key] + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) + with amp.autocast(enabled=self.use_amp): + output = self.model(x, y=target) + loss = self.loss(output, target) / loss_scale + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() if (batch + 1) % self.grad_acc_steps == 0: if self.lr_scheduler is not None and not self.in_swa: self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics["loss"] = loss.item() * self.grad_acc_steps + batch_metrics["loss"] = loss.item() * loss_scale for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 0f6ccd9b..6d00806a 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -81,7 +81,6 @@ def __init__( input_key="x", target_key="class_id", ): - super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) self.feat_extractor = feat_extractor @@ -131,12 +130,14 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index 0fee1bdb..610a43e9 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -6,9 +6,14 @@ from .collation import collate_seq_1d, collate_seq_2d, collate_seq_nd from .data_parallel import TorchDataParallel from .ddp import FairFullyShardedDDP, FairShardedDDP, TorchDDP -from .devices import (open_device, tensors_subset, tensors_to_cpu, - tensors_to_device, tensors_to_numpy) +from .devices import ( + open_device, + tensors_subset, + tensors_to_cpu, + tensors_to_device, + tensors_to_numpy, +) from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add -from .masking import 
scale_seq_lengths, seq_lengths_to_mask +from .masking import make_attn_mask_causal, scale_seq_lengths, seq_lengths_to_mask from .metric_acc import MetricAcc from .vad_utils import remove_silence diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py index 934b4b90..c7095b31 100644 --- a/hyperion/torch/utils/masking.py +++ b/hyperion/torch/utils/masking.py @@ -20,7 +20,9 @@ def scale_seq_lengths(lengths, max_out_length, max_in_length=None): return torch.div(lengths * max_out_length, max_in_length, rounding_mode="floor") -def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): +def seq_lengths_to_mask( + lengths, max_length=None, dtype=None, time_dim=1, none_if_all_max=False +): """Creates a binary masks indicating the valid values in a sequence. Args: @@ -43,6 +45,10 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): if max_length is None: max_length = lengths.max() + + if none_if_all_max and torch.all(lengths == max_length): + return None + idx = torch.arange(max_length, dtype=lengths.dtype, device=lengths.device) # compute mask shape=(batch, max_length) @@ -60,3 +66,16 @@ def seq_lengths_to_mask(lengths, max_length=None, dtype=None, time_dim=1): mask = mask.to(dtype) return mask + + +def make_attn_mask_causal(mask: torch.Tensor): + """Make causal mask for decoder self-attention.""" + size = mask.size(-1) + causal_mask = torch.ones(size, size, device=mask.device, dtype=torch.bool) + torch.tril(causal_mask, out=causal_mask) + return mask & causal_mask + + +def make_dec_causal_att_mask(y: torch.Tensor, padding_idx: int): + mask = (y != padding_idx).unsqueeze(-2) + return make_attn_mask_causal(mask) diff --git a/hyperion/utils/misc.py b/hyperion/utils/misc.py index 6fb7d24b..6afd4a88 100644 --- a/hyperion/utils/misc.py +++ b/hyperion/utils/misc.py @@ -10,7 +10,7 @@ import numpy as np -PathLike = TypeVar("PathLike", str, Path, None) +PathLike = TypeVar("PathLike", str, Path, type(None)) def generate_data(g): @@ -77,9 +77,8 @@ def energy_vad(P): def compute_snr(x, n, axis=-1): - - P_x = 10 * np.log10(np.mean(x ** 2, axis=axis)) - P_n = 10 * np.log10(np.mean(n ** 2, axis=axis)) + P_x = 10 * np.log10(np.mean(x**2, axis=axis)) + P_n = 10 * np.log10(np.mean(n**2, axis=axis)) return P_x - P_n diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 070e4f53..3d8b5e9d 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -36,7 +36,7 @@ def __init__(self, key, file_path, offset=None, range_spec=None): def validate(self): """Validates the attributes of the SCPList object.""" self.key = list2ndarray(self.key) - self.file_path = list2ndarray(self.file_path, dtype=np.object) + self.file_path = list2ndarray(self.file_path, dtype=object) assert len(self.key) == len(self.file_path) if self.offset is not None: if isinstance(self.offset, list): From c977186e3729441dcb52bf6a874d49a9d90ae338 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 3 Nov 2023 21:27:02 -0400 Subject: [PATCH 117/154] added feature fuser to hf w2v xvector --- ...v2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml | 18 +++- ...wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml | 18 +++- .../wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml | 18 +++- .../wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml | 18 +++- .../wavlmbaseplus_ecapatdnn512x3_v2.0.yaml | 18 +++- .../wavlmlarge12l_ecapatdnn512x3_v2.0.yaml | 18 +++- .../conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml | 18 +++- hyperion/bin/extract_wav2vec2xvectors.py | 7 +- 
.../generate_adv_attacks_xvector_classif.py | 8 +- hyperion/torch/layers/mvn.py | 12 +-- hyperion/torch/lr_schedulers/lr_scheduler.py | 4 +- hyperion/torch/lr_schedulers/triangular_lr.py | 6 ++ .../hf_hubert2conformer_v1_xvector.py | 7 +- .../hf_hubert2resnet1d_xvector.py | 9 +- .../hf_wav2vec2conformer_v1_xvector.py | 7 +- .../hf_wav2vec2resnet1d_xvector.py | 18 ++-- .../models/wav2xvectors/hf_wav2xvector.py | 99 ++++++++++++++----- .../hf_wavlm2conformer_v1_xvector.py | 7 +- .../wav2xvectors/hf_wavlm2resnet1d_xvector.py | 10 +- .../models/xvectors/conformer_v1_xvector.py | 12 ++- hyperion/torch/models/xvectors/xvector.py | 36 +------ hyperion/torch/narchs/__init__.py | 1 + hyperion/torch/narchs/feat_fuser_mvn.py | 14 ++- hyperion/torch/torch_model.py | 80 +++++++++++---- hyperion/torch/utils/eval_utils.py | 6 +- 25 files changed, 327 insertions(+), 142 deletions(-) diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml index c3466259..5d27b093 100644 --- a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.yaml @@ -1,6 +1,23 @@ hf_feats: pretrained_model_path: facebook/wav2vec2-xls-r-300m drop_layers_gt: 12 +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean xvector: resnet_enc: in_feats: 765 @@ -41,5 +58,4 @@ xvector: dropout_rate: 0.0 norm_before: false hid_act: swish -feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml index d9c9b782..fe89d2fc 100644 --- a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn1024x3_v2.0.yaml @@ -1,5 +1,22 @@ hf_feats: pretrained_model_path: facebook/wav2vec2-xls-r-300m +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean xvector: resnet_enc: in_feats: 1024 @@ -40,5 +57,4 @@ xvector: dropout_rate: 0.0 norm_before: false hid_act: swish -feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml index dc3737e3..63afdb58 100644 --- a/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/wav2vec2xlsr300m_ecapatdnn512x3_v2.0.yaml @@ -1,5 +1,22 @@ hf_feats: pretrained_model_path: facebook/wav2vec2-xls-r-300m +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
+ freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean xvector: resnet_enc: in_feats: 765 @@ -40,5 +57,4 @@ xvector: dropout_rate: 0.0 norm_before: false hid_act: swish -feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml index d7e3388f..4de306e4 100644 --- a/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus9l_ecapatdnn512x3_v2.0.yaml @@ -1,6 +1,23 @@ hf_feats: pretrained_model_path: microsoft/wavlm-base-plus drop_layers_gt: 9 +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean xvector: resnet_enc: in_feats: 765 @@ -41,5 +58,4 @@ xvector: dropout_rate: 0.0 norm_before: false hid_act: swish -feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml index b2430d97..2c2c6db3 100644 --- a/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/wavlmbaseplus_ecapatdnn512x3_v2.0.yaml @@ -1,5 +1,22 @@ hf_feats: pretrained_model_path: microsoft/wavlm-base-plus +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean xvector: resnet_enc: in_feats: 765 @@ -40,5 +57,4 @@ xvector: dropout_rate: 0.0 norm_before: false hid_act: swish -feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml index 5025f047..52246639 100644 --- a/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/wavlmlarge12l_ecapatdnn512x3_v2.0.yaml @@ -1,6 +1,23 @@ hf_feats: pretrained_model_path: microsoft/wavlm-large drop_layers_gt: 12 +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
+ freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean xvector: resnet_enc: in_feats: 765 @@ -41,5 +58,4 @@ xvector: dropout_rate: 0.0 norm_before: false hid_act: swish -feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml index 0a6303f5..a05e82e1 100644 --- a/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/wavlmlarge_ecapatdnn512x3_v2.0.yaml @@ -1,5 +1,22 @@ hf_feats: pretrained_model_path: microsoft/wavlm-large +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. + freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean xvector: resnet_enc: in_feats: 765 @@ -40,5 +57,4 @@ xvector: dropout_rate: 0.0 norm_before: false hid_act: swish -feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/hyperion/bin/extract_wav2vec2xvectors.py b/hyperion/bin/extract_wav2vec2xvectors.py index 02a3b68e..336ec818 100755 --- a/hyperion/bin/extract_wav2vec2xvectors.py +++ b/hyperion/bin/extract_wav2vec2xvectors.py @@ -25,7 +25,9 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.np.augment import SpeechAugment -from hyperion.torch import TorchModelLoader as TML + +# from hyperion.torch import TorchModelLoader as TML +from hyperion.torch import TorchModel from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info @@ -59,7 +61,8 @@ def init_device(use_gpu): def load_model(model_path, device): logging.info("loading model {}".format(model_path)) - model = TML.load(model_path) + # model = TML.load(model_path) + model = TorchModel.auto_load(model_path) logging.info("xvector-model={}".format(model)) model.to(device) model.eval() diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 711c4194..4d0e762a 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -34,10 +34,10 @@ def read_utt_list(list_file, class2int_file, part_idx, num_parts): - logging.info("reading utt list %s" % (list_file)) + logging.info("reading utt list %s", list_file) utt_list = Utt2Info.load(list_file) utt_list = utt_list.split(part_idx, num_parts) - logging.info("reading class2int-file %s" % (class2int_file)) + logging.info("reading class2int-file %s", class2int_file) class_info = pd.read_csv(class2int_file, header=None, sep=" ") class2idx = {str(k): i for i, k in enumerate(class_info[0])} class_idx = np.array([class2idx[k] for k in utt_list.info], dtype=int) @@ -193,7 +193,7 @@ def generate_attacks( torch.manual_seed(random_seed + len(s)) # this is to make results reproducible p = torch.rand(1).item() if p > p_attack: - logging.info("skipping attack for utt %s" % (key)) + logging.info("skipping attack for utt %s", key) continue if random_utt_length: @@ -228,7 +228,7 @@ def generate_attacks( _, pred = torch.max(score_benign, dim=1) if pred[0] != class_id: - logging.info("utt %s failed benign classification, skipping..." 
% (key)) + logging.info("utt %s failed benign classification, skipping...", key) continue t3 = time.time() diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py index 736b69c6..a46ce20d 100644 --- a/hyperion/torch/layers/mvn.py +++ b/hyperion/torch/layers/mvn.py @@ -4,7 +4,7 @@ """ import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..utils import seq_lengths_to_mask @@ -225,16 +225,16 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") parser.add_argument( - "--no-norm-mean", - default=False, - action="store_true", - help="don't center the features", + "--norm-mean", + default=True, + action=ActionYesNo, + help="center the features", ) parser.add_argument( "--norm-var", default=False, - action="store_true", + action=ActionYesNo, help="normalize the variance of the features", ) diff --git a/hyperion/torch/lr_schedulers/lr_scheduler.py b/hyperion/torch/lr_schedulers/lr_scheduler.py index 5cbb3ff1..5008e1be 100644 --- a/hyperion/torch/lr_schedulers/lr_scheduler.py +++ b/hyperion/torch/lr_schedulers/lr_scheduler.py @@ -3,12 +3,11 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import torch import torch.optim as optim -class LRScheduler(object): +class LRScheduler: """Base class for learning rate schedulers. Attributes: @@ -114,7 +113,6 @@ def on_epoch_end(self, metrics=None): self.epoch += 1 def on_opt_step(self): - if self.in_warmup: for param_group, lr in zip( self.optimizer.param_groups, self.get_warmup_lr() diff --git a/hyperion/torch/lr_schedulers/triangular_lr.py b/hyperion/torch/lr_schedulers/triangular_lr.py index 45704014..0a5efd38 100644 --- a/hyperion/torch/lr_schedulers/triangular_lr.py +++ b/hyperion/torch/lr_schedulers/triangular_lr.py @@ -61,6 +61,12 @@ def __init__( self.num_restarts = num_restarts self.gamma = gamma + def load_state_dict(self, state_dict): + # we want to be able to change gamma and T_mul in the middle of training + del state_dict["gamma"] + del state_dict["T_mul"] + super().load_state_dict(state_dict) + def on_epoch_begin(self, epoch=None, epoch_updates=1, **kwargs): super().on_epoch_begin(epoch) if self.update_lr_on_opt_step: diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py index aeabd09e..2dc37052 100644 --- a/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2conformer_v1_xvector.py @@ -9,6 +9,7 @@ import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser +from ...narchs import FeatFuserMVN from ...tpm import HFHubert from ..xvectors import ConformerV1XVector from .hf_wav2xvector import HFWav2XVector @@ -31,9 +32,9 @@ class HFHubert2ConformerV1XVector(HFWav2XVector): def __init__( self, hf_feats: Union[Dict, HFHubert], + feat_fuser: Union[Dict, FeatFuserMVN], xvector: Union[Dict, ConformerV1XVector], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", ): if isinstance(hf_feats, dict): hf_feats = HFHubert(**hf_feats) @@ -41,13 +42,13 @@ def __init__( assert isinstance(hf_feats, HFHubert) if isinstance(xvector, dict): - xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size + xvector["encoder"]["in_feats"] = hf_feats.hidden_size xvector = ConformerV1XVector(**xvector) else: assert isinstance(xvector, ConformerV1XVector) assert xvector.encoder_net.in_feats == hf_feats.hidden_size - 
super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start) @staticmethod def filter_args(**kwargs): diff --git a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py index b75ac53f..a9495ba5 100644 --- a/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_hubert2resnet1d_xvector.py @@ -5,11 +5,11 @@ import logging from typing import Dict, Optional, Union -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...narchs import FeatFuserMVN from ...tpm import HFHubert from ..xvectors import ResNet1dXVector from .hf_wav2xvector import HFWav2XVector @@ -32,11 +32,10 @@ class HFHubert2ResNet1dXVector(HFWav2XVector): def __init__( self, hf_feats: Union[Dict, HFHubert], + feat_fuser: Union[Dict, FeatFuserMVN], xvector: Union[Dict, ResNet1dXVector], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", ): - if isinstance(hf_feats, dict): hf_feats = HFHubert(**hf_feats) else: @@ -49,7 +48,7 @@ def __init__( assert isinstance(xvector, ResNet1dXVector) assert xvector.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start) @staticmethod def filter_args(**kwargs): diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py index 3a670d1c..1526c467 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2conformer_v1_xvector.py @@ -9,6 +9,7 @@ import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser +from ...narchs import FeatFuserMVN from ...tpm import HFWav2Vec2 from ..xvectors import ConformerV1XVector from .hf_wav2xvector import HFWav2XVector @@ -30,9 +31,9 @@ class HFWav2Vec2ConformerV1XVector(HFWav2XVector): def __init__( self, hf_feats: Union[Dict, HFWav2Vec2], + feat_fuser: Union[Dict, FeatFuserMVN], xvector: Union[Dict, ConformerV1XVector], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", ): if isinstance(hf_feats, dict): if "class_name" in hf_feats: @@ -42,7 +43,7 @@ def __init__( assert isinstance(hf_feats, HFWav2Vec2) if isinstance(xvector, dict): - xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size + xvector["encoder"]["in_feats"] = hf_feats.hidden_size if "class_name" in xvector: del xvector["class_name"] xvector = ConformerV1XVector(**xvector) @@ -50,7 +51,7 @@ def __init__( assert isinstance(xvector, ConformerV1XVector) assert xvector.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start) @staticmethod def filter_args(**kwargs): diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py index 8a17379c..3709e980 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2vec2resnet1d_xvector.py @@ -5,11 +5,11 @@ import logging from typing import Dict, Optional, Union -from jsonargparse import ActionParser, ArgumentParser - 
import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...narchs import FeatFuserMVN from ...tpm import HFWav2Vec2 from ..xvectors import ResNet1dXVector from .hf_wav2xvector import HFWav2XVector @@ -31,11 +31,10 @@ class HFWav2Vec2ResNet1dXVector(HFWav2XVector): def __init__( self, hf_feats: Union[Dict, HFWav2Vec2], + feat_fuser: Union[Dict, FeatFuserMVN], xvector: Union[Dict, ResNet1dXVector], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", ): - if isinstance(hf_feats, dict): if "class_name" in hf_feats: del hf_feats["class_name"] @@ -52,12 +51,11 @@ def __init__( assert isinstance(xvector, ResNet1dXVector) assert xvector.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, xvector, feat_fusion_start, - feat_fusion_method) + super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start) + # feat_fusion_method) @staticmethod def filter_args(**kwargs): - base_args = HFWav2XVector.filter_args(**kwargs) child_args = HFWav2Vec2.filter_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args @@ -76,8 +74,7 @@ def add_class_args(parser, prefix=None): HFWav2XVector.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def filter_finetune_args(**kwargs): @@ -98,5 +95,4 @@ def add_finetune_args(parser, prefix=None): ResNet1dXVector.add_finetune_args(parser, prefix="xvector") if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index d6be544a..2b4ef876 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -9,7 +9,7 @@ import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser -from ...layers import MeanVarianceNorm +from ...narchs import FeatFuserMVN from ...torch_model import TorchModel from ...utils import remove_silence @@ -19,25 +19,38 @@ class HFWav2XVector(TorchModel): Attributes: hf_feats: hugging face model wrapper object. + feat_fuser: Dictionary to build feature fuser object. xvector: x-vector model object. feat_fusion_start: the input to x-vector model will fuse the wav2vec layers from "feat_fusion_start" to the wav2vec "num_layers". feat_fusion_method: method to fuse the hidden layers from the wav2vec model, when more - than one layer is used. + than one layer is used (deprecated). 
""" def __init__( - self, hf_feats, xvector, feat_fusion_start=0, feat_fusion_method="weighted-avg" + self, + hf_feats, + feat_fuser, + xvector, + feat_fusion_start=0, + # feat_fusion_method="weighted-avg", ): super().__init__() self.hf_feats = hf_feats self.xvector = xvector self.feat_fusion_start = feat_fusion_start - self.feat_fusion_method = feat_fusion_method + # self.feat_fusion_method = feat_fusion_method self._hf_context = contextlib.nullcontext() - self._make_fuser() + self._make_fuser(feat_fuser) + + def _make_fuser(self, feat_fuser): + num_feats = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + feat_dim = self.hf_feats.hidden_size + feat_fuser["feat_fuser"]["num_feats"] = num_feats + feat_fuser["feat_fuser"]["feat_dim"] = feat_dim + self.feat_fuser = FeatFuserMVN(**feat_fuser) - def _make_fuser(self): + def _make_fuser_legacy(self): if self.feat_fusion_method == "last": self.feat_fuser = None return @@ -52,7 +65,7 @@ def _make_fuser(self): elif self.feat_fusion_method == "cat": self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) - def _fuse_hid_feats(self, hid_feats): + def _fuse_hid_feats_legacy(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. Args: @@ -121,6 +134,44 @@ def rebuild_output_layer( def forward_feats( self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fuser.fuser_type == "last" + else True + ) + with self._hf_context: + hf_output = self.hf_feats( + x, + x_lengths, + return_hid_states=return_hid_states, + chunk_length=chunk_length, + detach_chunks=detach_chunks, + ) + feat_lengths = hf_output["hidden_states_lengths"] + if return_hid_states: + hid_feats = hf_output["hidden_states"] + hid_feats = hid_feats[self.feat_fusion_start :] + else: + hid_feats = [hf_output["last_hidden_state"]] + + feats, feat_lengths = self.feat_fuser(hid_feats, feat_lengths) + feats = feats.transpose(1, 2) + if return_feat_layers is not None: + # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) + # as the hidden features of the x-vector encoder. 
+ hid_feats = [ + f.transpose(1, 2) + for i, f in enumerate(hid_feats) + if i in return_feat_layers + ] + else: + hid_feats = None + + return feats, hid_feats, feat_lengths + + def forward_feats_legacy( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False ): return_hid_states = ( False @@ -360,23 +411,27 @@ def valid_train_modes(): def filter_args(**kwargs): valid_args = ( "hf_feats", + "feat_fuser", "xvector", "feat_fusion_start", - "feat_fusion_method", + # "feat_fusion_method", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args def get_config(self): hf_cfg = self.hf_feats.get_config() + fuser_cfg = self.feat_fuser.get_config() xvec_cfg = self.xvector.get_config() del hf_cfg["class_name"] + del fuser_cfg["class_name"] del xvec_cfg["class_name"] config = { "hf_feats": hf_cfg, + "feat_fuser": fuser_cfg, "xvector": xvec_cfg, "feat_fusion_start": self.feat_fusion_start, - "feat_fusion_method": self.feat_fusion_method, + # "feat_fusion_method": self.feat_fusion_method, } base_config = super().get_config() @@ -393,6 +448,8 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") + FeatFuserMVN.add_class_args(parser, prefix="feat_fuser") + parser.add_argument( "--feat-fusion-start", default=0, @@ -402,19 +459,15 @@ def add_class_args(parser, prefix=None, skip=set()): "the wav2vec num_layers" ), ) - parser.add_argument( - "--feat-fusion-method", - default="weighted-avg", - choices=["weighted-avg", "linear", "cat", "last"], - help=( - "method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, cat]" - ), - ) + # parser.add_argument( + # "--feat-fusion-method", + # default="weighted-avg", + # choices=["weighted-avg", "linear", "cat", "last"], + # help=( + # "method to fuse the hidden layers from the wav2vec model " + # "in [weighted-avg, cat]" + # ), + # ) if prefix is not None: - outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), - help="xvector options", - ) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py index 30e450eb..bcf82bba 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2conformer_v1_xvector.py @@ -9,6 +9,7 @@ import torch.nn as nn from jsonargparse import ActionParser, ArgumentParser +from ...narchs import FeatFuserMVN from ...tpm import HFWavLM from ..xvectors import ConformerV1XVector from .hf_wav2xvector import HFWav2XVector @@ -31,9 +32,9 @@ class HFWavLM2ConformerV1XVector(HFWav2XVector): def __init__( self, hf_feats: Union[Dict, HFWavLM], + feat_fuser: Union[Dict, FeatFuserMVN], xvector: Union[Dict, ConformerV1XVector], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", ): if isinstance(hf_feats, dict): hf_feats = HFWavLM(**hf_feats) @@ -41,13 +42,13 @@ def __init__( assert isinstance(hf_feats, HFWavLM) if isinstance(xvector, dict): - xvector["resnet_enc"]["in_feats"] = hf_feats.hidden_size + xvector["encoder"]["in_feats"] = hf_feats.hidden_size xvector = ConformerV1XVector(**xvector) else: assert isinstance(xvector, ConformerV1XVector) assert xvector.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start) 
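    # (Editor's note, illustrative only -- not part of the patch.) The new
    # feat_fuser argument replaces the old feat_fusion_method string and takes
    # a nested config like the ones added to the recipe YAMLs above;
    # HFWav2XVector._make_fuser fills in num_feats/feat_dim before building
    # the FeatFuserMVN. A hypothetical sketch (xvector_cfg is a placeholder
    # ConformerV1XVector config dict):
    #
    #     model = HFWavLM2ConformerV1XVector(
    #         hf_feats={"pretrained_model_path": "microsoft/wavlm-base-plus"},
    #         feat_fuser={
    #             "feat_fuser": {"fuser_type": "weighted-avg"},
    #             "mvn": {"norm_mean": False},
    #             "spec_augment": None,
    #         },
    #         xvector=xvector_cfg,
    #         feat_fusion_start=2,
    #     )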
@staticmethod def filter_args(**kwargs): diff --git a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py index 56a19130..30ace453 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wavlm2resnet1d_xvector.py @@ -5,11 +5,11 @@ import logging from typing import Dict, Optional, Union -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from ...narchs import FeatFuserMVN from ...tpm import HFWavLM from ..xvectors import ResNet1dXVector from .hf_wav2xvector import HFWav2XVector @@ -32,11 +32,10 @@ class HFWavLM2ResNet1dXVector(HFWav2XVector): def __init__( self, hf_feats: Union[Dict, HFWavLM], + feat_fuser: Union[Dict, FeatFuserMVN], xvector: Union[Dict, ResNet1dXVector], feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg", ): - if isinstance(hf_feats, dict): hf_feats = HFWavLM(**hf_feats) else: @@ -49,11 +48,10 @@ def __init__( assert isinstance(xvector, ResNet1dXVector) assert xvector.encoder_net.in_feats == hf_feats.hidden_size - super().__init__(hf_feats, xvector, feat_fusion_start, feat_fusion_method) + super().__init__(hf_feats, feat_fuser, xvector, feat_fusion_start) @staticmethod def filter_args(**kwargs): - base_args = HFWav2XVector.filter_args(**kwargs) child_args = HFWavLM.filter_args(**kwargs["hf_feats"]) base_args["hf_feats"] = child_args diff --git a/hyperion/torch/models/xvectors/conformer_v1_xvector.py b/hyperion/torch/models/xvectors/conformer_v1_xvector.py index 323c22a9..f52b8700 100644 --- a/hyperion/torch/models/xvectors/conformer_v1_xvector.py +++ b/hyperion/torch/models/xvectors/conformer_v1_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import ConformerEncoderV1 as Encoder from .xvector import XVector @@ -40,8 +39,13 @@ def __init__( proj_feats=None, ): if isinstance(encoder, dict): - logging.info("making %s conformer encoder network") + logging.info(f"making conformer encoder network={encoder}") + encoder["in_time_dim"] = 2 + encoder["out_time_dim"] = 2 encoder = Encoder(**encoder) + else: + encoder.in_time_dim = 2 + encoder.out_time_dim = 2 super().__init__( encoder, @@ -75,7 +79,7 @@ def get_config(self): encoder_cfg = self.encoder_net.get_config() del encoder_cfg["class_name"] config = { - "resnet_enc": encoder_cfg, + "encoder": encoder_cfg, } config.update(base_config) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index 9ccd0d31..de28ccae 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -283,46 +283,14 @@ def forward_logits(self, x, x_lengths=None, y=None): Returns: class logits tensor with shape=(batch, num_classes). 
""" - f = x max_in_length = x.size(-1) x = self._pre_enc(x) x = self.encoder_net(x) + if isinstance(x, tuple): + x = x[0] x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) y = self.classif_net(p, y) - # if not self.training: - # fnf = ( - # torch.any(torch.any(torch.logical_not(torch.isfinite(f)), dim=1), dim=1) - # .sum() - # .cpu() - # .item() - # ) - # xnf = ( - # torch.any(torch.any(torch.logical_not(torch.isfinite(x)), dim=1), dim=1) - # .sum() - # .cpu() - # .item() - # ) - # pnf = ( - # torch.any(torch.logical_not(torch.isfinite(p)), dim=1) - # .sum() - # .cpu() - # .item() - # ) - # ynf = ( - # torch.any(torch.logical_not(torch.isfinite(y)), dim=1) - # .sum() - # .cpu() - # .item() - # ) - # # if xnf + pnf + ynf > 0: - # logging.warning("ff %d xnf %d pnf %d ynf %d", fnf, xnf, pnf, ynf) - # if xnf > 0: - # ii = torch.any( - # torch.any(torch.logical_not(torch.isfinite(x)), dim=1), dim=1 - # ) - # xx = x[ii] - # logging.info(f"xx={xx}") return y diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index 4fe8b4ed..c46c87fa 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -13,6 +13,7 @@ from .efficient_net import EfficientNet from .etdnn import ETDNNV1 from .fcnet import FCNetV1, FCNetV2 +from .feat_fuser_mvn import FeatFuserMVN from .resetdnn import ResETDNNV1 from .resnet import * from .resnet1d_decoder import ResNet1dDecoder diff --git a/hyperion/torch/narchs/feat_fuser_mvn.py b/hyperion/torch/narchs/feat_fuser_mvn.py index 17b396bc..6fa4c6c0 100644 --- a/hyperion/torch/narchs/feat_fuser_mvn.py +++ b/hyperion/torch/narchs/feat_fuser_mvn.py @@ -2,7 +2,7 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from typing import Dict, Optional +from typing import Any, Dict, Optional import torch import torch.nn as nn @@ -21,9 +21,9 @@ class FeatFuserMVN(NetArch): def __init__( self, - feat_fuser: Dict[str], - mvn: Optional[Dict[str]] = None, - spec_augment: Optional[Dict[str]] = None, + feat_fuser: Dict[str, Any], + mvn: Optional[Dict[str, Any]] = None, + spec_augment: Optional[Dict[str, Any]] = None, trans: bool = False, aug_after_mvn: bool = False, ): @@ -56,6 +56,10 @@ def __init__( self.trans = trans self.aug_after_mvn = aug_after_mvn + @property + def fuser_type(self): + return self.feat_fuser_cfg["fuser_type"] + def forward(self, feats, feats_lengths=None): feats = self.feat_fuser(feats) if self.spec_augment is not None and not self.aug_after_mvn: @@ -74,7 +78,7 @@ def forward(self, feats, feats_lengths=None): def get_config(self): config = { - "feat_fuser": self.feat_feats_cfg, + "feat_fuser": self.feat_fuser_cfg, "mvn": self.mvn_cfg, "spec_augment": self.spec_augment_cfg, "trans": self.trans, diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 912c2640..97be320c 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -208,7 +208,42 @@ def device(self): return next(iter(devices)) @staticmethod - def _fix_cfg_compatibility(class_obj, cfg): + def _remove_module_prefix(state_dict): + import re + + p = re.compile("^(module\.)+") + if p.match(list(state_dict.keys())[0]) is not None: + state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items()) + + return state_dict + + @staticmethod + def _fix_xvector_cfg(cfg): + # We renamed AM-softmax scale parameer s to cos_scale + if "s" in cfg: + cfg["cos_scale"] = cfg.pop("s") + + return 
cfg
+
+    @staticmethod
+    def _fix_hf_wav2xvector(cfg, state_dict):
+        key = "feat_fusion_method"
+        if key in cfg:
+            fuser_type = cfg.pop(key)
+            feat_fuser = {
+                "feat_fuser": {"fuser_type": fuser_type},
+                "mvn": None,
+                "spec_augment": None,
+            }
+            cfg["feat_fuser"] = feat_fuser
+            state_dict["feat_fuser.feat_fuser.feat_fuser"] = state_dict.pop(
+                "feat_fuser"
+            )
+
+        return cfg, state_dict
+
+    @staticmethod
+    def _fix_model_compatibility(class_obj, cfg, state_dict):
         """Function that fixes compatibility issues with deprecated models
 
         Args:
@@ -221,12 +256,14 @@ def _fix_cfg_compatibility(class_obj, cfg):
         # for compatibility with older x-vector models
         XVector = TorchModel.registry["XVector"]
         if issubclass(class_obj, XVector):
-            # We renamed AM-softmax scale parameer s to cos_scale
-            if "s" in cfg:
-                cfg["cos_scale"] = cfg["s"]
-                del cfg["s"]
+            cfg = TorchModel._fix_xvector_cfg(cfg)
 
-        return cfg
+        # switch old feature fuser to new feature fuser in w2v x-vectors
+        HFWav2XVector = TorchModel.registry["HFWav2XVector"]
+        if issubclass(class_obj, HFWav2XVector):
+            cfg, state_dict = TorchModel._fix_hf_wav2xvector(cfg, state_dict)
+
+        return cfg, state_dict
 
     @staticmethod
     def _is_hf_path(file_path: Path):
@@ -316,19 +353,20 @@ def auto_load(
         if "n_averaged" in state_dict:
             del state_dict["n_averaged"]
 
-        cfg = TorchModel._fix_cfg_compatibility(class_obj, cfg)
-
-        import re
+        state_dict = TorchModel._remove_module_prefix(state_dict)
+        cfg, state_dict = TorchModel._fix_model_compatibility(
+            class_obj, cfg, state_dict
+        )
 
-        p = re.compile("^module\.")
-        num_tries = 3
-        for tries in range(num_tries):
-            try:
-                return class_obj.load(cfg=cfg, state_dict=state_dict)
-            except RuntimeError as err:
-                # remove module prefix when is trained with dataparallel
-                if tries == num_tries - 1:
-                    # if it failed the 3 trials raise exception
-                    raise err
-                # remove module prefix when is trained with dataparallel
-                state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items())
+        return class_obj.load(cfg=cfg, state_dict=state_dict)
+        # num_tries = 3
+        # for tries in range(num_tries):
+        #     try:
+        #         return class_obj.load(cfg=cfg, state_dict=state_dict)
+        #     except RuntimeError as err:
+        #         # remove module prefix when is trained with dataparallel
+        #         if tries == num_tries - 1:
+        #             # if it failed the 3 trials raise exception
+        #             raise err
+        #         # remove module prefix when is trained with dataparallel
+        #         state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items())
diff --git a/hyperion/torch/utils/eval_utils.py b/hyperion/torch/utils/eval_utils.py
index d74835f6..d6a9063a 100644
--- a/hyperion/torch/utils/eval_utils.py
+++ b/hyperion/torch/utils/eval_utils.py
@@ -9,7 +9,6 @@
 
 
 def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1):
-
     device = None if nnet.device == x.device else nnet.device
     T = x.shape[time_dim]
     if T <= chunk_length or chunk_length == 0:
         x = x.to(device)
 
         y = nnet(x)
+        if isinstance(y, tuple):
+            y = y[0]
         if detach_chunks:
             y = y.detach()
         return y
@@ -50,6 +51,8 @@
 
             x_i = x_i.to(device)
 
             y_i = nnet(x_i)
+            if isinstance(y_i, tuple):
+                y_i = y_i[0]
             if detach_chunks:
                 y_i = y_i.detach()
 
@@ -99,7 +102,6 @@
 def eval_nnet_overlap_add(
     x, nnet, chunk_length=0, chunk_overlap=None, detach_chunks=True, time_dim=-1
 ):
-
     device = None if 
nnet.device == x.device else nnet.device
 
     # assume time is the last dimension
 
From 4c5c4fbfc335993f5598f793a522dc2fed6fd234 Mon Sep 17 00:00:00 2001
From: System User
Date: Mon, 6 Nov 2023 18:43:55 -0500
Subject: [PATCH 118/154] started lre22/open.v2.8k

---
 egs/lre22/open.v1.8k/README.md                | 26 ++++++++++++++-----
 .../open.v1.8k/run_003_prepare_noises_rirs.sh |  8 +++---
 .../run_010_prepare_xvec_train_data.sh        | 26 ++++++++++---------
 egs/lre22/open.v1.8k/run_011_train_xvector.sh |  3 +--
 egs/lre22/open.v2.8k/cmd.sh                   |  4 +--
 egs/lre22/open.v2.8k/datapath.sh              |  2 +-
 egs/lre22/open.v2.8k/run_001_prepare_data.sh  | 20 +++++++++++---
 .../open.v2.8k/run_003_prepare_noises_rirs.sh |  8 +++---
 egs/lre22/open.v2.8k/run_011_train_xvector.sh |  9 +++----
 .../preprocess_audios_for_nnet_train.sh       |  3 +--
 10 files changed, 65 insertions(+), 44 deletions(-)

diff --git a/egs/lre22/open.v1.8k/README.md b/egs/lre22/open.v1.8k/README.md
index 877f99ca..9ad41229 100644
--- a/egs/lre22/open.v1.8k/README.md
+++ b/egs/lre22/open.v1.8k/README.md
@@ -1,6 +1,6 @@
 # LRE22 Fixed Condition V1
 
-Recipe for the NIST LRE22 fixed condition based to the JHU-MIT Submission.
+Recipe for the NIST LRE22 open condition based on the JHU-MIT Submission.
 
 ## Citing
 ```
@@ -18,14 +18,27 @@ Recipe for the NIST LRE22 fixed condition based to the JHU-MIT Submission.
 
 - x-Vector networks trained on:
   - VoxLingua107
-  - NIST LRE17 Train + Dev + Eval / CTS + AfV
+  - NIST LRE17 Train + Dev + Eval / CTS + AfV without Maghrebi Arabic
+  - NIST SRE16
+  - NIST SRE18
+  - NIST SRE19 CMN2
+  - NIST SRE21
+  - NIST SRE CTS Superset
+  - IARPA Babel
+  - Fleurs
+  - LWAZI 2009
+  - NCHLT 2014
+  - AMMI 2020
+  - CommonVoice Tigrinya, Indian English, French
+  - ADI 2017
+  - AST
 
 - Gaussian back-end trained on:
   - NIST LRE22 dev with 2-fold cross-val + x10 augmentations
 
 ## Usage
 
 - Run the run_0*.sh scripts in sequence
-   - By default it uses ECAPA-TDNN 4 layers of 2048 dim. 
+ - By default it uses Res2Net50 - To change the default network run scripts with the config-file argument: ```bash run_011_train_xvector.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh @@ -37,7 +50,6 @@ run_040_be_final.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s | Config | Model Type | Model Details | Back-end | Dev MinCp | Dev ActCp | Eval MinCp | Eval ActCp | | ------ | ---------- | ------------- | -------- | :-------: | :-------: | :--------: | :--------: | -| config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-2 | GBE | 0.207 | 0.209 | 0.198 | 0.199 | -| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-2 | GBE | 0.227 | 0.229 | 0.213 | 0.215 | -| Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal | 0.182 | 0.183 | 0.180 | 0.181 | - +| config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-1 | GBE | 0.100 | 0.101 | 0.105 | 0.106 | +| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-1 | GBE | +| Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal | diff --git a/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh b/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh index 638143f0..09f01f4d 100755 --- a/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh +++ b/egs/lre22/open.v1.8k/run_003_prepare_noises_rirs.sh @@ -23,7 +23,7 @@ if [ $stage -le 1 ]; then for name in musan_noise musan_music do steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ - --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') \ + --storage_name lre22-open-v1.8k-$(date +'%m_%d_%H_%M') \ data/${name} data/${name}_proc_audio exp/${name}_proc_audio utils/fix_data_dir.sh data/${name}_proc_audio done @@ -36,7 +36,7 @@ if [ $stage -le 2 ]; then for name in musan_speech do steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ - --storage_name lre22-fixed-v1.8k-$(date +'%m_%d_%H_%M') \ + --storage_name lre22-open-v1.8k-$(date +'%m_%d_%H_%M') \ data/${name} data/${name}_babble exp/${name}_babble # utils/fix_data_dir.sh data/${name}_babble done @@ -44,8 +44,8 @@ fi if [ $stage -le 3 ]; then if [ ! 
-d "RIRS_NOISES" ]; then - if [ -d ../v1.16k/RIRS_NOISES ];then - ln -s ../v1.16k/RIRS_NOISES + if [ -d ../fixed.v1.8k/RIRS_NOISES ];then + ln -s ../fixed.v1.8k/RIRS_NOISES else # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip diff --git a/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh b/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh index d261a287..9f3eff6c 100755 --- a/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh +++ b/egs/lre22/open.v1.8k/run_010_prepare_xvec_train_data.sh @@ -77,15 +77,17 @@ if [ $stage -le 5 ]; then done fi -if [ $stage -le 6 ]; then - awk 'BEGIN{ -adapt_langs_list="ara-acm ara-aeb ara-apc ara-arq ara-ary ara-arz ara-ayl ara-jor ara-ksa ara-kuw ara-leb ara-mau ara-mor ara-oma ara-pal ara-qat ara-sud ara-syr ara-uae ara-yem fra-can fra-fra fra-ntf eng-ens eng-gbr eng-iaf eng-ine eng-usg eng-zho afr-afr nbl-nbl orm-orm tir-tir tso-tso ven-ven xho-xho zul-zul"; -nf=split(adapt_langs_list, f, " "); -for(i=1;i<=nf;i++){ adapt_langs[f[i]]=1;}; -FS=","; OFS=","; -getline; print $0; -} -{ if ($1 in adapt_langs) { $3="1."} else{ $3="0.01"}; print $0}' \ - data/open_proc_audio_no_sil/train_val_split/class_file.csv > \ - data/open_proc_audio_no_sil/train_val_split/class_file_adapt_1.csv -fi +exit + +# if [ $stage -le 6 ]; then +# awk 'BEGIN{ +# adapt_langs_list="ara-acm ara-aeb ara-apc ara-arq ara-ary ara-arz ara-ayl ara-jor ara-ksa ara-kuw ara-leb ara-mau ara-mor ara-oma ara-pal ara-qat ara-sud ara-syr ara-uae ara-yem fra-can fra-fra fra-ntf eng-ens eng-gbr eng-iaf eng-ine eng-usg eng-zho afr-afr nbl-nbl orm-orm tir-tir tso-tso ven-ven xho-xho zul-zul"; +# nf=split(adapt_langs_list, f, " "); +# for(i=1;i<=nf;i++){ adapt_langs[f[i]]=1;}; +# FS=","; OFS=","; +# getline; print $0; +# } +# { if ($1 in adapt_langs) { $3="1."} else{ $3="0.01"}; print $0}' \ +# data/open_proc_audio_no_sil/train_val_split/class_file.csv > \ +# data/open_proc_audio_no_sil/train_val_split/class_file_adapt_1.csv +# fi diff --git a/egs/lre22/open.v1.8k/run_011_train_xvector.sh b/egs/lre22/open.v1.8k/run_011_train_xvector.sh index 056a9754..4b3f9642 100755 --- a/egs/lre22/open.v1.8k/run_011_train_xvector.sh +++ b/egs/lre22/open.v1.8k/run_011_train_xvector.sh @@ -31,7 +31,6 @@ fi if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project lre22-fixed-v1.8k --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" fi - if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi @@ -50,7 +49,7 @@ if [ $stage -le 1 ]; then --data.val.dataset.recordings-file $list_dir/wav.scp \ --data.val.dataset.segments-file $list_dir/train_val_split/val_segments.csv \ --trainer.exp-path $nnet_s1_dir \ - --num-gpus $ngpu --master-port 3456 + --num-gpus $ngpu #--master-port 3456 else $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ diff --git a/egs/lre22/open.v2.8k/cmd.sh b/egs/lre22/open.v2.8k/cmd.sh index 4efc96e1..f22c66b4 100755 --- a/egs/lre22/open.v2.8k/cmd.sh +++ b/egs/lre22/open.v2.8k/cmd.sh @@ -15,11 +15,11 @@ if [ "$(hostname -d)" == "cm.gemini" ];then export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" - export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" export 
cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else - export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V" + export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01][234589]*\" -V" export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V" export cuda_eval_cmd="$train_cmd" fi diff --git a/egs/lre22/open.v2.8k/datapath.sh b/egs/lre22/open.v2.8k/datapath.sh index fec52329..02e2ddd4 100644 --- a/egs/lre22/open.v2.8k/datapath.sh +++ b/egs/lre22/open.v2.8k/datapath.sh @@ -63,7 +63,7 @@ elif [ "$(hostname --domain)" == "cm.gemini" ];then lre22_dev_root=$my_root/LDC2022E14_2022_NIST_Language_Recognition_Evaluation_Development_Data lre22_eval_root=$my_root/lre22_test_data_v2 voxlingua_root=$my_root/voxlingua107 - musan_root=/expscratch/dgromero/corpora/musan + musan_root=/export/common/data/corpora/MUSAN/musan babel_assamese_root=$ldc_root/LDC2016S06 babel_bengali_root=$ldc_root/LDC2016S08 babel_pashto_root=$ldc_root/LDC2016S09 diff --git a/egs/lre22/open.v2.8k/run_001_prepare_data.sh b/egs/lre22/open.v2.8k/run_001_prepare_data.sh index 99a72cab..bb64cdbe 100755 --- a/egs/lre22/open.v2.8k/run_001_prepare_data.sh +++ b/egs/lre22/open.v2.8k/run_001_prepare_data.sh @@ -233,17 +233,17 @@ if [ $stage -le 11 ];then --map-langs-to-lre-codes --target-fs 8000 hyp_utils/conda_env.sh \ - local/prepare_some_data_for_lre.py \ + local/prepare_some_data_for_lre_cat.py \ --corpus-dir $lwazi_root \ --output-dir data/lwazi09 \ --map-langs-to-lre-codes --target-fs 8000 hyp_utils/conda_env.sh \ - local/prepare_some_data_for_lre.py \ + local/prepare_some_data_for_lre_cat.py \ --corpus-dir $nchlt_root \ --output-dir data/nchlt14 \ --map-langs-to-lre-codes --target-fs 8000 hyp_utils/conda_env.sh \ - local/prepare_some_data_for_lre.py \ + local/prepare_some_data_for_lre_cat.py \ --corpus-dir $ammi_root \ --output-dir data/ammi20 \ --map-langs-to-lre-codes --target-fs 8000 @@ -286,7 +286,7 @@ fi if [ $stage -le 15 ];then hyp_utils/conda_env.sh \ - local/prepare_ast.py \ + local/prepare_ast_cat.py \ --corpus-dir $ast_root \ --output-dir data/ast \ --map-langs-to-lre-codes --target-fs 8000 @@ -328,3 +328,15 @@ if [ $stage -le 16 ];then fi +if [ $stage -le 5 ];then + if [ -d ../fixed.v1.8k/lre-scorer ];then + ln -s ../fixed.v1.8k/lre-scorer + else + local/download_lre22_scorer.sh + fi + if [ -d ../fixed.v1.8k/focal_multiclass ];then + ln -s ../fixed.v1.8k/focal_multiclass + else + local/download_focal.sh + fi +fi diff --git a/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh b/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh index 08d4d910..55da7f2a 100755 --- a/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh +++ b/egs/lre22/open.v2.8k/run_003_prepare_noises_rirs.sh @@ -23,7 +23,7 @@ if [ $stage -le 1 ]; then for name in musan_noise musan_music do steps_xvec/preprocess_audios_for_nnet_train.sh --nj 10 --cmd "$train_cmd" \ - --storage_name lre22-fixed-v2.8k-$(date +'%m_%d_%H_%M') \ + --storage_name lre22-open-v2.8k-$(date +'%m_%d_%H_%M') \ data/${name} data/${name}_proc_audio exp/${name}_proc_audio utils/fix_data_dir.sh data/${name}_proc_audio done @@ -36,7 +36,7 @@ if [ $stage -le 2 ]; then for name in musan_speech do steps_xvec/make_babble_noise_for_nnet_train.sh --cmd "$train_cmd" \ - --storage_name lre22-fixed-v2.8k-$(date +'%m_%d_%H_%M') \ + --storage_name lre22-open-v2.8k-$(date +'%m_%d_%H_%M') \ data/${name} data/${name}_babble exp/${name}_babble # utils/fix_data_dir.sh data/${name}_babble done @@ 
-44,8 +44,8 @@ fi if [ $stage -le 3 ]; then if [ ! -d "RIRS_NOISES" ]; then - if [ -d ../v1.8k/RIRS_NOISES ];then - ln -s ../v1.8k/RIRS_NOISES + if [ -d ../fixed.v1.8k/RIRS_NOISES ];then + ln -s ../fixed.v1.8k/RIRS_NOISES else # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip diff --git a/egs/lre22/open.v2.8k/run_011_train_xvector.sh b/egs/lre22/open.v2.8k/run_011_train_xvector.sh index 3a7a47a4..611a33ca 100755 --- a/egs/lre22/open.v2.8k/run_011_train_xvector.sh +++ b/egs/lre22/open.v2.8k/run_011_train_xvector.sh @@ -28,15 +28,12 @@ fi if [ "$use_tb" == "true" ];then extra_args="$extra_args --trainer.use-tensorboard" fi - -if [ "$interactive" == "true" ];then - export cuda_cmd=run.pl -fi - if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project lre22-open-v2.8k --trainer.wandb.name $nnet_s1_name.$(date -Iminutes)" fi - +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi # Network Training if [ $stage -le 1 ]; then diff --git a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh index aed40672..afd13d74 100755 --- a/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh +++ b/hyp_utils/xvectors/preprocess_audios_for_nnet_train.sh @@ -95,11 +95,10 @@ $cmd JOB=1:$nj $dir/log/preproc_audios_${name}.JOB.log \ preprocess_audio_files.py ${args} --audio-format $file_format $args $proc_opts \ --write-time-durs $output_dir/utt2dur.${name}.JOB \ --part-idx JOB --num-parts $nj \ - # --input $data_in/wav.scp \ --recordings-file $data_in/wav.scp \ --output-path $output_dir \ --output-recordings-file $output_dir/wav.${name}.JOB.scp - #--output-script $output_dir/wav.${name}.JOB.scp + for n in $(seq $nj); do cat $output_dir/wav.${name}.$n.scp || exit 1; From c2f0602f625f2e37dc4769a9953f2d113f31b625 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 8 Nov 2023 10:29:46 -0500 Subject: [PATCH 119/154] fix bug missing conformer xvector in __init__ --- hyperion/torch/models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index fa4addcd..7292dbad 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -24,6 +24,7 @@ Wav2ResNet1dXVector, Wav2ResNetXVector, ) +from .xvectors.conformer_v1_xvector import ConformerV1XVector from .xvectors.efficient_net_xvector import EfficientNetXVector from .xvectors.resnet1d_xvector import ResNet1dXVector from .xvectors.resnet_xvector import ResNetXVector From 9fd2141bbdb956b28705dafd69f22a0605224361 Mon Sep 17 00:00:00 2001 From: System User Date: Wed, 8 Nov 2023 10:32:30 -0500 Subject: [PATCH 120/154] started to clean lre22/open.v2.8k --- egs/lre22/open.v2.8k/cmd.sh | 2 +- ...c2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml | 9 +++------ egs/lre22/open.v2.8k/conf/vad_8k.yaml | 9 +++++++++ ...vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml | 18 +++++++++++++++++- ...fig_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh | 6 +++--- 5 files changed, 33 insertions(+), 11 deletions(-) create mode 100644 egs/lre22/open.v2.8k/conf/vad_8k.yaml diff --git a/egs/lre22/open.v2.8k/cmd.sh b/egs/lre22/open.v2.8k/cmd.sh index f22c66b4..15e4a015 100755 --- a/egs/lre22/open.v2.8k/cmd.sh +++ b/egs/lre22/open.v2.8k/cmd.sh @@ -15,7 +15,7 @@ if [ "$(hostname -d)" == "cm.gemini" ];then export train_cmd="queue.pl --config 
conf/coe_gpu_long.conf --mem 4G" export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" - #export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else diff --git a/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml index b8998830..a7f3b111 100644 --- a/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml +++ b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml @@ -37,17 +37,16 @@ model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml trainer: optim: opt_type: sgd - lr: 0.45 + lr: 0.4 momentum: 0.9 weight_decay: 4e-4 lrsched: lrsch_type: exp_lr decay_rate: 0.5 - #decay_steps: 4200 - #hold_steps: 1500 decay_steps: 16000 hold_steps: 18000 - min_lr: 4e-4 + #min_lr: 4e-4 + min_lr: 1e-6 warmup_steps: 4000 update_lr_on_opt_step: true use_amp: true @@ -55,5 +54,3 @@ trainer: epochs: 12 eff_batch_size: 1024 train_mode: hf-feats-frozen-nograd - - \ No newline at end of file diff --git a/egs/lre22/open.v2.8k/conf/vad_8k.yaml b/egs/lre22/open.v2.8k/conf/vad_8k.yaml new file mode 100644 index 00000000..1cfe34b0 --- /dev/null +++ b/egs/lre22/open.v2.8k/conf/vad_8k.yaml @@ -0,0 +1,9 @@ +sample_frequency: 8000 +frame_shift: 10 +frame_length: 25 +snip_edges: false +vad_energy_threshold: -4.89 +vad_energy_mean_scale: 0.5 +vad_proportion_threshold: 0.12 +vad_frames_context: 2 +wav_scale: 1 diff --git a/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml b/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml index d8193f59..beb687d2 100644 --- a/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml +++ b/egs/lre22/open.v2.8k/conf/wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml @@ -1,5 +1,22 @@ hf_feats: pretrained_model_path: facebook/wav2vec2-xls-r-300m +feat_fuser: + feat_fuser: + fuser_type: weighted-avg + mvn: + norm_mean: false + spec_augment: + time_mask_prob: 1. + time_mask_min_width: 0 + time_mask_max_width: 5 + time_mask_min_num_masks: 1 + time_mask_max_num_masks: 1 + freq_mask_prob: 1. 
+ freq_mask_min_width: 0 + freq_mask_max_width: 10 + freq_mask_min_num_masks: 1 + freq_mask_max_num_masks: 1 + mask_method: mean xvector: resnet_enc: in_feats: 1024 @@ -43,5 +60,4 @@ xvector: dropout_rate: 0.0 norm_before: false hid_act: swish -feat_fusion_method: weighted-avg feat_fusion_start: 2 diff --git a/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh b/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh index b39d817b..bf6c3528 100644 --- a/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh +++ b/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh @@ -13,14 +13,14 @@ nnet_data=open nnet_type=hf_wav2vec2resnet1d -nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v2.2.yaml +nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml nnet_s1_args="" -nnet_name=${hf_model_name}_ecapatdnn1024x3_v2.2 +nnet_name=${hf_model_name}_ecapatdnn1024x3_v1.0 nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/model_ep0011.pth +nnet_s1=$nnet_s1_dir/model_ep0012.pth nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v2.2.yaml nnet_s2_args="" From 8446f7819284ea32c18ed34b6530f961f57a90b0 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 14 Nov 2023 12:13:22 -0500 Subject: [PATCH 121/154] fix param groups in hf w2vec2xvec --- egs/voxceleb/v2.1/README.md | 182 +++++++++++++++++ .../models/wav2xvectors/hf_wav2xvector.py | 186 +++++++++--------- 2 files changed, 276 insertions(+), 92 deletions(-) create mode 100644 egs/voxceleb/v2.1/README.md diff --git a/egs/voxceleb/v2.1/README.md b/egs/voxceleb/v2.1/README.md new file mode 100644 index 00000000..cb5b5368 --- /dev/null +++ b/egs/voxceleb/v2.1/README.md @@ -0,0 +1,182 @@ +# VoxCeleb V2.1 + +Recipe for the VoxCeleb Speaker Verification Task using Wav2Vec2, WavLM or Hubert models from HuggingFace as feature extractors + +## Differences w.r.t VoxCeleb V2 recipe + + - Kaldi format is replaced by new format based on pandas tables + - Kaldi style bash scripts are removed and replaced by python scripts + - Most python scripts are called using Hyperion entry points + +## Citing + +## Training Data + + - x-Vector network is trained on Voxceleb2 dev + test with augmentations + - MUSAN noise + - RIR reverberation + +## Test data + + - Test data is VoxCeleb 1 + - We evaluate the 3 conditions (with cleaned lists): + - VoxCeleb-O (Original): Original Voxceleb test set with 40 speakers + - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 + - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials. + +## Usage + + - Run the run_0*.sh scripts in sequence + - By default it will use config global_conf/config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh + - To use other configs: +```bash +run_005_train_xvector.sh --config-file global_conf/other_config.sh +run_006_extract_xvectors.sh --config-file global_conf/other_config.sh --use-gpu true +run_007_eval_be.sh --config-file global_conf/other_config.sh +``` + + +## Recipe Steps: + + - `run_001_prepare_data.sh` + - Data preparation script to generate Kaldi style data directories for + - VoxCeleb2 train+test + - VoxCeleb1 O/E/H eval sets + + - `run_002_compute_evad.sh` + - Computes Energy VAD for all datasets + + - `run_003_prepare_noises_rirs.sh` + - Prepares MUSAN noises, music to be used by SpeechAugment class. 
+ - Creates Babble noise from MUSAN speech to be used by SpeechAugment class.
+ - Prepares RIRs by compacting them into HDF5 files, to be used by SpeechAugment class.
+
+ - `run_004_prepare_xvec_train_data.sh`
+ - Transforms all the audios that we are going to use to train the x-vector into a common format, e.g., .flac.
+ - Removes silence from the audios
+ - Removes utterances shorter than 4 secs and speakers with fewer than 8 utterances.
+ - Creates training and validation lists for x-vector training
+
+ - `run_005_train_xvector.sh`
+ - Trains the x-vector model on frozen wav2vec features
+ - Finetunes wav2vec+x-vector model
+ - Large margin finetuning of wav2vec+x-vector model
+
+ - `run_006_extract_xvectors.sh`
+ - Extracts x-vectors for VoxCeleb2 or VoxCeleb2+augmentation for PLDA training
+ - Extracts x-vectors for VoxCeleb1 test sets
+
+ - `run_007_eval_be.sh`
+ - Trains PLDA and evals PLDA and cosine scoring back-ends
+
+
+## Results
+
+
+
+
+
+### VoxCeleb 1 Original-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.060 | 0.116 |
+| | | | Cosine + AS-Norm | 0.81 | 0.058 | 0.108 |
+| | | | Cosine + QMF | 0.75 | 0.054 | 0.086 |
+| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.069 | 0.108 |
+| | | | Cosine + AS-Norm | 0.86 | 0.067 | 0.108 |
+| | | | Cosine + QMF | 0.77 | 0.066 | 0.105 |
+| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.057 | 0.085 |
+| | | | Cosine + AS-Norm | 0.73 | 0.055 | 0.093 |
+| | | | Cosine + QMF | 0.66 | 0.051 | 0.094 |
+| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.74 | 0.053 | 0.080 |
+| | | | Cosine + AS-Norm | 0.71 | 0.050 | 0.087 |
+| | | | Cosine + QMF | 0.64 | 0.045 | 0.087 |
+| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.84 | 0.063 | 0.111 |
+| | | | Cosine + AS-Norm | 0.68 | 0.053 | 0.090 |
+| | | | Cosine + QMF | 0.63 | 0.048 | 0.071 |
+| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.14 | 0.074 | 0.107 |
+| | | | Cosine + AS-Norm | 0.94 | 0.060 | 0.089 |
+| | | | Cosine + QMF | 0.89 | 0.054 | 0.076 |
+| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.1.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.69 | 0.048 | 0.094 |
+| | | | Cosine + AS-Norm | 0.63 | 0.046 | 0.082 |
+| | | | Cosine + QMF | 0.57 | 0.041 | 0.076 |
+
+### VoxCeleb 1 Entire-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.81 | 0.051 | 0.087 |
+| | | | Cosine + AS-Norm | 0.78 | 0.047 | 0.083 |
+| | | | Cosine + QMF | 0.75 | 0.046 | 0.076 |
+| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | 
WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.89 | 0.056 | 0.099 |
+| | | | Cosine + AS-Norm | 0.86 | 0.053 | 0.090 |
+| | | | Cosine + QMF | 0.82 | 0.050 | 0.085 |
+| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.049 | 0.088 |
+| | | | Cosine + AS-Norm | 0.76 | 0.045 | 0.080 |
+| | | | Cosine + QMF | 0.73 | 0.043 | 0.078 |
+| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.91 | 0.056 | 0.094 |
+| | | | Cosine + AS-Norm | 0.87 | 0.053 | 0.090 |
+| | | | Cosine + QMF | 0.83 | 0.050 | 0.086 |
+| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.80 | 0.050 | 0.086 |
+| | | | Cosine + AS-Norm | 0.73 | 0.045 | 0.074 |
+| | | | Cosine + QMF | 0.69 | 0.042 | 0.069 |
+| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.99 | 0.058 | 0.103 |
+| | | | Cosine + AS-Norm | 0.87 | 0.052 | 0.090 |
+| | | | Cosine + QMF | 0.83 | 0.050 | 0.085 |
+| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.1.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.72 | 0.044 | 0.079 |
+| | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.068 |
+| | | | Cosine + QMF | 0.64 | 0.037 | 0.065 |
+
+### VoxCeleb 1 Hard-Clean trial list
+
+| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) |
+| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: |
+| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.73 | 0.113 | 0.182 |
+| | | | Cosine + AS-Norm | 1.63 | 0.100 | 0.160 |
+| | | | Cosine + QMF | 1.56 | 0.096 | 0.155 |
+| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.88 | 0.122 | 0.200 |
+| | | | Cosine + AS-Norm | 1.77 | 0.110 | 0.175 |
+| | | | Cosine + QMF | 1.66 | 0.104 | 0.168 |
+| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.67 | 0.103 | 0.165 |
+| | | | Cosine + AS-Norm | 1.54 | 0.093 | 0.152 |
+| | | | Cosine + QMF | 1.45 | 0.089 | 0.145 |
+| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.78 | 0.106 | 0.174 |
+| | | | Cosine + AS-Norm | 1.70 | 0.099 | 0.162 |
+| | | | Cosine + QMF | 1.61 | 0.094 | 0.153 |
+| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.49 | 0.087 | 0.137 |
+| | | | Cosine + AS-Norm | 1.29 | 0.074 | 0.117 |
+| | | | Cosine + QMF | 1.22 | 0.069 | 0.111 |
+| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.84 | 0.107 | 0.172 |
+| | | | Cosine + AS-Norm | 1.47 | 0.083 | 0.128 |
+| | | | Cosine + QMF | 1.39 | 0.079 | 0.123 |
+| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.1.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.24 | 0.076 | 0.121 |
+| | | | Cosine + AS-Norm | 1.15 | 0.068 | 0.109 |
+| | | | Cosine + QMF | 1.09 | 0.065 | 0.107 |
+
+### 
VoxSRC2022 dev + +| Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | -------- | :----: | :------------: | :------------: | +| config_wavlmbaseplus_ecapatdnn512x3_v2.0.sh | WavLM+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.60 | 0.163 | 0.257 | +| | | | Cosine + AS-Norm | 2.43 | 0.150 | 0.244 | +| | | | Cosine + QMF | 2.31 | 0.143 | 0.232 | +| config_wavlmbaseplus9l_ecapatdnn512x3_v2.0.sh | WavLM(layer=2-9)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.82 | 0.183 | 0.286 | +| | | | Cosine + AS-Norm | 2.69 | 0.168 | 0.265 | +| | | | Cosine + QMF | 2.52 | 0.158 | 0.252 | +| config_wavlmlarge_ecapatdnn512x3_v2.0.sh | WavLM-Large+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.65 | 0.176 | 0.289 | +| | | | Cosine + AS-Norm | 2.55 | 0.171 | 0.292 | +| | | | Cosine + QMF | 2.38 | 0.159 | 0.266 | +| config_wavlmlarge12l_ecapatdnn512x3_v2.0.sh | WavLM-Large(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.62 | 0.153 | 0.251 | +| | | | Cosine + AS-Norm | 2.53 | 0.149 | 0.247 | +| | | | Cosine + QMF | 2.42 | 0.144 | 0.231 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.225 | +| | | | Cosine + AS-Norm | 2.01 | 0.125 | 0.209 | +| | | | Cosine + QMF | 1.92 | 0.117 | 0.200 | +| config_wav2vec2xlsr300m12l_ecapatdnn512x3_v2.0.sh | Wav2Vec2-XLSR300M(layer=2-12)+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.83 | 0.175 | 0.276 | +| | | | Cosine + AS-Norm | 2.31 | 0.149 | 0.244 | +| | | | Cosine + QMF | 2.22 | 0.137 | 0.229 | +| config_wav2vec2xlsr300m_ecapatdnn512x3_v2.1.sh | Wav2Vec2-XLSR300M+ECAPA-TDNN 512x3 | Stage3: ArcFace m=0.4/intertop_m=0.1 | Cosine | 2.06 | 0.124 | 0.206 | +| | | | Cosine + AS-Norm | 1.97 | 0.125 | 0.212 | +| | | | Cosine + QMF | 1.87 | 0.120 | 0.204 | + diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index 2b4ef876..fc10f810 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -50,49 +50,49 @@ def _make_fuser(self, feat_fuser): feat_fuser["feat_fuser"]["feat_dim"] = feat_dim self.feat_fuser = FeatFuserMVN(**feat_fuser) - def _make_fuser_legacy(self): - if self.feat_fusion_method == "last": - self.feat_fuser = None - return - - num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start - layer_dim = self.hf_feats.hidden_size - if self.feat_fusion_method == "weighted-avg": - self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) - elif self.feat_fusion_method == "linear": - self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers - elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) - - def _fuse_hid_feats_legacy(self, hid_feats): - """Fuses the hidden features from the Wav2Vec model. - - Args: - hid_feats: list of hidden features Tensors from Wav2Vec model. 
- - Returns: - Tensor of fused features (batch, channels, time) - """ - if len(hid_feats) == 1: - # There is only one layer of features - return hid_feats[0] - - hid_feats = hid_feats[self.feat_fusion_start :] - if self.feat_fusion_method == "weighted-avg": - hid_feats = torch.stack(hid_feats, dim=-1) - norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) - feats = torch.sum(hid_feats * norm_weights, dim=-1) - elif self.feat_fusion_method == "linear": - hid_feats = torch.stack(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats).squeeze(dim=-1) - elif self.feat_fusion_method == "cat": - hid_feats = torch.cat(hid_feats, dim=-1) - feats = self.feat_fuser(hid_feats) - elif self.feat_fusion_method == "last": - feats = hid_feats[-1] - - return feats + # def _make_fuser_legacy(self): + # if self.feat_fusion_method == "last": + # self.feat_fuser = None + # return + + # num_layers = self.hf_feats.num_encoder_layers + 1 - self.feat_fusion_start + # layer_dim = self.hf_feats.hidden_size + # if self.feat_fusion_method == "weighted-avg": + # self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) + # elif self.feat_fusion_method == "linear": + # self.feat_fuser = nn.Linear(num_layers, 1, bias=False) + # self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers + # elif self.feat_fusion_method == "cat": + # self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) + + # def _fuse_hid_feats_legacy(self, hid_feats): + # """Fuses the hidden features from the Wav2Vec model. + + # Args: + # hid_feats: list of hidden features Tensors from Wav2Vec model. + + # Returns: + # Tensor of fused features (batch, channels, time) + # """ + # if len(hid_feats) == 1: + # # There is only one layer of features + # return hid_feats[0] + + # hid_feats = hid_feats[self.feat_fusion_start :] + # if self.feat_fusion_method == "weighted-avg": + # hid_feats = torch.stack(hid_feats, dim=-1) + # norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) + # feats = torch.sum(hid_feats * norm_weights, dim=-1) + # elif self.feat_fusion_method == "linear": + # hid_feats = torch.stack(hid_feats, dim=-1) + # feats = self.feat_fuser(hid_feats).squeeze(dim=-1) + # elif self.feat_fusion_method == "cat": + # hid_feats = torch.cat(hid_feats, dim=-1) + # feats = self.feat_fuser(hid_feats) + # elif self.feat_fusion_method == "last": + # feats = hid_feats[-1] + + # return feats @property def sample_frequency(self): @@ -170,43 +170,43 @@ def forward_feats( return feats, hid_feats, feat_lengths - def forward_feats_legacy( - self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False - ): - return_hid_states = ( - False - if return_feat_layers is None and self.feat_fusion_method == "last" - else True - ) - with self._hf_context: - hf_output = self.hf_feats( - x, - x_lengths, - return_hid_states=return_hid_states, - chunk_length=chunk_length, - detach_chunks=detach_chunks, - ) - feat_lengths = hf_output["hidden_states_lengths"] - if return_hid_states: - hid_feats = hf_output["hidden_states"] - feats = self._fuse_hid_feats(hid_feats) - else: - hid_feats = None - feats = hf_output["last_hidden_state"] - - feats = feats.transpose(1, 2) - if return_feat_layers is not None: - # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) - # as the hidden features of the x-vector encoder. 
- hid_feats = [
- f.transpose(1, 2)
- for i, f in enumerate(hid_feats)
- if i in return_feat_layers
- ]
- else:
- hid_feats = None
-
- return feats, hid_feats, feat_lengths
+ # def forward_feats_legacy(
+ # self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False
+ # ):
+ # return_hid_states = (
+ # False
+ # if return_feat_layers is None and self.feat_fusion_method == "last"
+ # else True
+ # )
+ # with self._hf_context:
+ # hf_output = self.hf_feats(
+ # x,
+ # x_lengths,
+ # return_hid_states=return_hid_states,
+ # chunk_length=chunk_length,
+ # detach_chunks=detach_chunks,
+ # )
+ # feat_lengths = hf_output["hidden_states_lengths"]
+ # if return_hid_states:
+ # hid_feats = hf_output["hidden_states"]
+ # feats = self._fuse_hid_feats(hid_feats)
+ # else:
+ # hid_feats = None
+ # feats = hf_output["last_hidden_state"]
+
+ # feats = feats.transpose(1, 2)
+ # if return_feat_layers is not None:
+ # # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time)
+ # # as the hidden features of the x-vector encoder.
+ # hid_feats = [
+ # f.transpose(1, 2)
+ # for i, f in enumerate(hid_feats)
+ # if i in return_feat_layers
+ # ]
+ # else:
+ # hid_feats = None
+
+ # return feats, hid_feats, feat_lengths
 def forward(
 self,
@@ -289,15 +289,16 @@ def extract_embed(
 )
 def freeze_feat_fuser(self):
- if self.feat_fuser is None:
- return
+ self.feat_fuser.freeze()
+ # if self.feat_fuser is None:
+ # return
- if self.feat_fusion_method == "weighted-avg":
- self.feat_fuser.requires_grad = False
- return
+ # if self.feat_fusion_method == "weighted-avg":
+ # self.feat_fuser.requires_grad = False
+ # return
- for param in self.feat_fuser.parameters():
- param.requires_grad = False
+ # for param in self.feat_fuser.parameters():
+ # param.requires_grad = False
 def freeze_hf_feats(self):
 self.hf_feats.freeze()
@@ -316,11 +317,12 @@ def trainable_param_groups(self):
 return self.trainable_parameters()
 param_groups = self.hf_feats.trainable_param_groups()
- if self.feat_fusion_method == "weighted-avg":
- if self.feat_fuser.requires_grad:
- param_groups.append({"params": self.feat_fuser})
- else:
- param_groups.append({"params": self.feat_fuser.parameters()})
+ param_groups.append({"params": self.feat_fuser.trainable_parameters()})
+ # if self.feat_fusion_method == "weighted-avg":
+ # if self.feat_fuser.requires_grad:
+ # param_groups.append({"params": self.feat_fuser})
+ # else:
+ # param_groups.append({"params": self.feat_fuser.parameters()})
 param_groups.append({"params": self.xvector.trainable_parameters()})
 return param_groups
From 398bf36b9afb6e8caaf369e27802db6d9061d36e Mon Sep 17 00:00:00 2001
From: System User
Date: Tue, 14 Nov 2023 12:18:10 -0500
Subject: [PATCH 122/154] update lre22 open.v1.8k fwseres2net50s8 configs, add open.v2.8k run_040_be_final.sh
---
 ...rain_fwseres2net50s8_xvec_stage1_v1.0.yaml | 2 +-
 ...onfig_fbank64_stmn_fwseres2net50s8_v1.0.sh | 31 +-
 .../open.v1.8k/run_030_extract_xvectors.sh | 4 +-
 egs/lre22/open.v2.8k/run_040_be_final.sh | 434 ++++++++++++++++++
 4 files changed, 440 insertions(+), 31 deletions(-)
 create mode 100755 egs/lre22/open.v2.8k/run_040_be_final.sh
diff --git a/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml b/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml
index 13ce9445..e501abdb 100644
--- a/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml
+++ b/egs/lre22/open.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml
@@ -50,7 +50,7 @@ model:
 margin: 0.0
 intertop_margin: 0.0
 margin_warmup_epochs: 3.0
- dropout_rate: 0.0
+ dropout_rate: 0.05 
norm_before: false hid_act: swish trainer: diff --git a/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh index 6a735e4c..6f6bc98b 100644 --- a/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh +++ b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh @@ -9,37 +9,12 @@ vad_config=conf/vad_8k.yaml nnet_data=open # x-vector cfg - nnet_type=resnet -nnet_stages=2 +nnet_stages=1 nnet_s1_base_cfg=conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml -nnet_name=${feat_type}_fwseres2net50s8_v1.0 +nnet_name=${feat_type}_fwseres2net50s8_v1.2 nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/swa_model_ep0012.pth -#nnet_s1=$nnet_s1_dir/model_ep0001.pth -nnet_s1=$nnet_s1_dir/model_ep0008.pth -nnet_s1=$nnet_s1_dir/model_ep0011.pth -nnet_s1=$nnet_s1_dir/model_ep0015.pth -nnet_s1=$nnet_s1_dir/swa_model_ep0016.pth - -nnet_s2_base_cfg=conf/train_tseres2net50s8_xvec_stage2_v1.0.yaml -nnet_s2_name=${nnet_name}.s2 -nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name -#nnet_s2=$nnet_s2_dir/swa_model_ep0013.pth -nnet_s2=$nnet_s2_dir/model_ep0001.pth -nnet_s2=$nnet_s2_dir/model_ep0002.pth -nnet_s2=$nnet_s2_dir/model_ep0004.pth -# nnet_s2=$nnet_s2_dir/model_ep0008.pth -# nnet_s2=$nnet_s2_dir/swa_model_ep0012.pth - -nnet_s3_base_cfg=conf/train_tseres2net50s8_xvec_stage3_v2.1.yaml -nnet_s3_name=${nnet_name}.s3 -nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name -#nnet_s3=$nnet_s3_dir/swa_model_ep0013.pth -#nnet_s3=$nnet_s3_dir/model_ep0007.pth -nnet_s3=$nnet_s3_dir/model_ep0001.pth -nnet_s3=$nnet_s3_dir/model_ep0004.pth -nnet_s3=$nnet_s3_dir/model_ep0008.pth +nnet_s1=$nnet_s1_dir/model_ep0012.pth diff --git a/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh b/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh index ea2c59f6..227d1047 100755 --- a/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh +++ b/egs/lre22/open.v1.8k/run_030_extract_xvectors.sh @@ -8,10 +8,10 @@ set -e stage=2 -nnet_stage=2 +nnet_stage=1 config_file=default_config.sh use_gpu=false -do_tsne=true +do_tsne=false split_dev=false xvec_chunk_length=12800 . parse_options.sh || exit 1; diff --git a/egs/lre22/open.v2.8k/run_040_be_final.sh b/egs/lre22/open.v2.8k/run_040_be_final.sh new file mode 100755 index 00000000..fe5b6f18 --- /dev/null +++ b/egs/lre22/open.v2.8k/run_040_be_final.sh @@ -0,0 +1,434 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +. parse_options.sh || exit 1; +. 
$config_file + +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +fi + +xvector_dir=exp/xvectors/$nnet_name +be_base_dir=exp/be/$nnet_name +score_base_dir=exp/scores/$nnet_name + +if [ $stage -le 1 ];then + for r in 1 #0.9999 0.999 #0.99 0.975 0.95 + do + be_name=pca${r}_cw_lnorm_lgbe_lre22_aug + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + + + ) & + + done + + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --pca.pca-var-r $r \ + --do-lnorm --whiten \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}_p12/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v1.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + + + done + wait + +fi + +exit +# Back-ends 
below over-fitted + +if [ $stage -le 2 ];then + for r in 1 + do + for penalty in l2 #l1 + do + for c in 1 #0.1 1 + do + for ary_thr in 0.975 #0.85 0.7 #0.99 0.95 0.9 #15 ##1 5 10 20 + do + be_name=pca${r}_cw_lnorm_lsvm_${penalty}_c${c}_sqhinge_lre22_aug_lre17_aryt${ary_thr} + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd \ + $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p + + $train_cmd \ + ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd \ + $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v3.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.penalty $penalty --svm.c $c --svm.dual false \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir + + $train_cmd \ + ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd \ + ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v2.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + ) & + done + done + done + done + wait + +fi + +if [ 
$stage -le 3 ];then + for r in 1 # 0.9999 0.99 0.975 0.95 0.9 0.8 + do + for shrinking in true #false + do + for c in 1 10 #0.1 1 10 #0.01 0.1 1 10 # 0.0001 + do + for vl in false #true #false + do + if [ "$vl" == "true" ];then + do_vl="--do-vl" + else + do_vl="--no_do-vl" + fi + ary_thr=0.975 + be_name=pca${r}_cw_lnorm_gsvm_shrinking_${shrinking}_c${c}_lre17_aryt${ary_thr}_vl${vl}_aug_clean + be_dir=$be_base_dir/$be_name + score_dir=$score_base_dir/$be_name + #score_dir=$score_base_dir/${be_name}_logpost + ( + for p_trn in p1 p2 + do + + if [ "$p_trn" == "p1" ];then + p_test="p2" + else + p_test="p1" + fi + + be_dir_p=${be_dir}_$p_trn + ( + $train_cmd $be_dir_p/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean_$p_trn/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500\ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir_p \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}_p12/test_${p_test}.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev_$p_test/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir_p \ + --score-file ${score_dir}_p12/nocal/lre22_dev_${p_test}_scores.tsv + ) & + done + ( + $train_cmd $be_dir/train.log \ + hyp_utils/conda_env.sh \ + steps_be/train_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev_aug_clean/xvector.scp \ + --train-list data/lre22_dev_aug_clean/utt2lang \ + --lre17-v-file scp:$xvector_dir/lre17_proc_audio_no_sil/xvector.scp \ + --lre17-list data/lre17_proc_audio_no_sil/utt2lang \ + --voxlingua-v-file scp:$xvector_dir/voxlingua107_codecs_proc_audio_no_sil/xvector.scp \ + --voxlingua-list data/voxlingua107_codecs_proc_audio_no_sil/utt2lang \ + --pca.pca-var-r $r \ + --svm.shrinking $shrinking --svm.c $c --svm.break_ties false --svm.max-iter 500 \ + --do-lnorm --whiten --ary-thr $ary_thr \ + --output-dir $be_dir \ + --do-lre17 $do_vl + + $train_cmd ${score_dir}/test_dev.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_dev/xvector.scp \ + --trial-list data/lre22_dev/utt2lang \ + --svm.eval-type cat-log-post \ + --has-labels \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_dev_scores.tsv + + $train_cmd ${score_dir}/test_eval.log \ + hyp_utils/conda_env.sh \ + steps_be/eval_be_v5.py \ + --v-file scp:$xvector_dir/lre22_eval/xvector.scp \ + --trial-list data/lre22_eval/utt2spk \ + --svm.eval-type cat-log-post \ + --model-dir $be_dir \ + --score-file ${score_dir}/nocal/lre22_eval_scores.tsv + + ) & + + wait + hyp_utils/conda_env.sh \ + local/merge_scores.py \ + --in-score-files ${score_dir}_p12/nocal/lre22_dev_p{1,2}_scores.tsv \ + --out-score-file ${score_dir}_p12/nocal/lre22_dev_scores.tsv + + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/nocal/lre22_dev_scores.tsv \ + ${score_dir}_p12/nocal/lre22_dev_results + + local/train_calibration_lre22.sh ${score_dir}_p12 + local/score_lre22.sh \ + dev \ + ${score_dir}_p12/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}_p12/cal_v1/lre22_dev_results + + 
local/score_lre22.sh \ + dev \ + ${score_dir}/nocal/lre22_dev_scores.tsv \ + ${score_dir}/nocal/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/nocal/lre22_eval_scores.tsv \ + ${score_dir}/nocal/lre22_eval_results + + local/eval_calibration_lre22.sh $score_dir ${score_dir}_p12/cal_v1/cal.mat + local/score_lre22.sh \ + dev \ + ${score_dir}/cal_v1/lre22_dev_scores.tsv \ + ${score_dir}/cal_v1/lre22_dev_results + local/score_lre22.sh \ + eval \ + ${score_dir}/cal_v1/lre22_eval_scores.tsv \ + ${score_dir}/cal_v1/lre22_eval_results + + # local/validate_lre22.sh \ + # ${score_dir}/cal_v1/lre22_eval_scores.tsv + + + ) & + done + done + done + done + wait + +fi From e49157dfa1420091e603bf832abfe4e5acf1d158 Mon Sep 17 00:00:00 2001 From: System User Date: Thu, 30 Nov 2023 13:11:57 -0500 Subject: [PATCH 123/154] clean up recipe lre22/open.v2.8k --- egs/lre22/open.v1.8k/README.md | 6 +- ...onfig_fbank64_stmn_fwseres2net50s8_v1.0.sh | 2 +- egs/lre22/open.v1.8k/run_050_fusion_v1.sh | 43 +++++++++++++ egs/lre22/open.v2.8k/README.md | 58 +++++++++++++++++ ...2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml | 2 +- ...2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml | 62 +++++++++++++++++++ ...ig_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh | 15 ++--- egs/lre22/open.v2.8k/run_050_fusion_v1.sh | 46 ++++++++++++++ 8 files changed, 219 insertions(+), 15 deletions(-) create mode 100755 egs/lre22/open.v1.8k/run_050_fusion_v1.sh create mode 100644 egs/lre22/open.v2.8k/README.md create mode 100644 egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml create mode 100755 egs/lre22/open.v2.8k/run_050_fusion_v1.sh diff --git a/egs/lre22/open.v1.8k/README.md b/egs/lre22/open.v1.8k/README.md index 9ad41229..d55ced4e 100644 --- a/egs/lre22/open.v1.8k/README.md +++ b/egs/lre22/open.v1.8k/README.md @@ -1,4 +1,4 @@ -# LRE22 Fixed Condition V1 +# LRE22 Open Condition V1 Recipe for the NIST LRE22 open condition based to the JHU-MIT Submission. 
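All three back-end stages in run_040_be_final.sh above follow the same two-fold scheme: a back-end trained on dev partition p1 scores p2 and vice versa, local/merge_scores.py merges the two held-out score sets, and the calibration fitted on that merge is applied to the eval scores produced by the back-end retrained on the full dev set. Below is a minimal NumPy/scikit-learn sketch of the held-out scoring step; LogisticRegression is only a stand-in for the recipe's Gaussian and SVM back-ends, the function name is illustrative rather than a Hyperion API, and it assumes every language occurs in both halves.

```python
# Sketch (not Hyperion code) of the 2-fold held-out scoring in run_040_be_final.sh:
# each half of lre22_dev is scored by a back-end trained on the other half, so the
# merged dev scores are unbiased when the calibration is trained on them.
import numpy as np
from sklearn.linear_model import LogisticRegression

def two_fold_dev_scores(x, y, is_p1):
    """x: (n, d) x-vectors, y: (n,) language labels, is_p1: (n,) boolean mask."""
    scores = np.zeros((x.shape[0], len(np.unique(y))))
    for train_mask in (is_p1, ~is_p1):
        # train on one half, score the complementary half it never saw
        backend = LogisticRegression(max_iter=1000).fit(x[train_mask], y[train_mask])
        scores[~train_mask] = backend.predict_log_proba(x[~train_mask])
    return scores  # merged held-out scores; the calibration is fitted on these
```

Fitting the calibration on these held-out scores, instead of on scores the back-end produced for its own training data, avoids an optimistically biased calibration; the final eval scores then come from the back-end retrained on the full dev set.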
@@ -51,5 +51,5 @@ run_040_be_final.sh --config-file global_conf/config_fbank64_stmn_fwseres2net50s
 | Config | Model Type | Model Details | Back-end | Dev MinCp | Dev ActCp | Eval MinCp | Eval ActCp |
 | ------ | ---------- | ------------- | -------- | :-------: | :-------: | :--------: | :--------: |
 | config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-1 | GBE | 0.100 | 0.101 | 0.105 | 0.106 |
-| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-1 | GBE |
-| Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal |
+| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-1 | GBE | 0.092 | 0.093 | 0.103 | 0.104 |
+| Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal | 0.082 | 0.083 | 0.089 | 0.090 |
diff --git a/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh
index 6f6bc98b..352cd1a6 100644
--- a/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh
+++ b/egs/lre22/open.v1.8k/global_conf/config_fbank64_stmn_fwseres2net50s8_v1.0.sh
@@ -13,7 +13,7 @@ nnet_type=resnet
 nnet_stages=1
 nnet_s1_base_cfg=conf/train_fwseres2net50s8_xvec_stage1_v1.0.yaml
-nnet_name=${feat_type}_fwseres2net50s8_v1.2
+nnet_name=${feat_type}_fwseres2net50s8_v1.0
 nnet_s1_name=$nnet_name.s1
 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
 nnet_s1=$nnet_s1_dir/model_ep0012.pth
diff --git a/egs/lre22/open.v1.8k/run_050_fusion_v1.sh b/egs/lre22/open.v1.8k/run_050_fusion_v1.sh
new file mode 100755
index 00000000..5f9a1624
--- /dev/null
+++ b/egs/lre22/open.v1.8k/run_050_fusion_v1.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright
+# 2020 Johns Hopkins University (Author: Jesus Villalba)
+# Apache 2.0.
+#
+. ./cmd.sh
+. ./path.sh
+set -e
+
+score_dir_0=exp/scores
+nnet_1=fbank64_stmn_ecapatdnn2048x4_v1.0.s1
+nnet_2=fbank64_stmn_fwseres2net50s8_v1.0.s1
+be_1=pca1_cw_lnorm_lgbe_lre22_aug
+score_dirs="$score_dir_0/$nnet_1/$be_1
+$score_dir_0/$nnet_2/$be_1"
+
+train_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"_p12/cal_v1" }; print $0}')
+test_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"/cal_v1" }; print $0}')
+
+output_dir=exp/fusion/fus_v1.0
+
+local/train_fusion_lre22.sh "$train_score_dirs" $output_dir/train
+local/score_lre22.sh \
+ dev \
+ ${output_dir}/train/lre22_dev_scores.tsv \
+ ${output_dir}/train/lre22_dev_results
+
+local/eval_fusion_lre22.sh "$test_score_dirs" $output_dir/train/fus.mat $output_dir/test
+
+local/score_lre22.sh \
+ dev \
+ ${output_dir}/test/lre22_dev_scores.tsv \
+ ${output_dir}/test/lre22_dev_results
+
+local/score_lre22.sh eval \
+ ${output_dir}/test/lre22_eval_scores.tsv \
+ ${output_dir}/test/lre22_eval_results
+
+
+
+
+
diff --git a/egs/lre22/open.v2.8k/README.md b/egs/lre22/open.v2.8k/README.md
new file mode 100644
index 00000000..c500d811
--- /dev/null
+++ b/egs/lre22/open.v2.8k/README.md
@@ -0,0 +1,58 @@
+# LRE22 Open Condition V2
+
+Recipe for the NIST LRE22 open condition based on the JHU-MIT Submission, using Hugging Face Wav2Vec + x-vector model.
+
+## Citing
+```
+@inproceedings{villalba23_interspeech,
+ author={Jesús Villalba and Jonas Borgstrom and Maliha Jahan and Saurabh Kataria and Leibny Paola Garcia and Pedro Torres-Carrasquillo and Najim Dehak},
+ title={{Advances in Language Recognition in Low Resource African Languages: The JHU-MIT Submission for NIST LRE22}},
+ year=2023,
+ booktitle={Proc. 
INTERSPEECH 2023},
+ pages={521--525},
+ doi={10.21437/Interspeech.2023-1094}
+}
+```
+
+## Training Data
+
+ - x-Vector networks trained on:
+ - VoxLingua107
+ - NIST LRE17 Train + Dev + Eval / CTS + AfV without Maghrebi Arabic
+ - NIST SRE16
+ - NIST SRE18
+ - NIST SRE19 CMN2
+ - NIST SRE21
+ - NIST SRE CTS Superset
+ - IARPA Babel
+ - Fleurs
+ - LWAZI 2009
+ - NCHLT 2014
+ - AMMI 2020
+ - CommonVoice Tigrinya, Indian English, French
+ - ADI 2017
+ - AST
+ - Gaussian back-end trained on:
+ - NIST LRE22 dev with 2-fold cross-val + x10 augmentations
+
+## Usage
+
+ - Run the run_0*.sh scripts in sequence
+ - By default it uses Wav2Vec2 XLSR 300M
+ - To change the default network, run the scripts with the config-file argument:
+```bash
+run_011_train_xvector.sh --config-file global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh
+run_030_extract_xvectors.sh --config-file global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh --use-gpu true
+run_040_be_final.sh --config-file global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh
+```
+
+## Results
+
+| Config | Model Type | Model Details | Back-end | Dev MinCp | Dev ActCp | Eval MinCp | Eval ActCp |
+| ------ | ---------- | ------------- | -------- | :-------: | :-------: | :--------: | :--------: |
+| config_fbank64_stmn_ecapatdnn2048x4_v1.0.sh | ECAPA-TDNN 2048x4 | Stage-1 | GBE | 0.100 | 0.101 | 0.105 | 0.106 |
+| config_fbank64_stmn_fwseres2net50s8_v1.0.sh | fw-SE Res2Net50 scale=8 | Stage-1 | GBE | 0.092 | 0.093 | 0.103 | 0.104 |
+| Fusion ECAPA-TDNN + FwSE Res2Net50 | | | FoCal | 0.082 | 0.083 | 0.089 | 0.090 |
+| config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh | Wav2Vec2 XLSR 300M + ECAPA-TDNN 1024x3 | Stage-1 | GBE | 0.088 | 0.089 | 0.106 | 0.107 |
+| " | " | Stage-2 | GBE | 0.083 | 0.085 | 0.089 | 0.090 |
+| Fusion ECAPA-TDNN + FwSE Res2Net50 + Wav2Vec2 | | | FoCal | 0.069 | 0.072 | 0.076 | 0.077 |
diff --git a/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml
index a7f3b111..d33e30f4 100644
--- a/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml
+++ b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml
@@ -37,7 +37,7 @@ model: wav2vec2xlsr300m_ecapatdnn1024x3_subcenter.yaml
 trainer:
 optim:
 opt_type: sgd
- lr: 0.4
+ lr: 0.04
 momentum: 0.9
 weight_decay: 4e-4
 lrsched:
diff --git a/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml
new file mode 100644
index 00000000..090093b3
--- /dev/null
+++ b/egs/lre22/open.v2.8k/conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml
@@ -0,0 +1,62 @@
+data:
+ train:
+ dataset:
+ class_names:
+ - class_id
+ aug_cfgs:
+ - conf/reverb_noise_aug.yaml
+ return_segment_info:
+ - class_id
+ target_sample_freq: 16000
+ wav_scale: 1
+ sampler:
+ sampler_type: seg_chunk_sampler
+ min_batch_size: 32
+ max_chunk_length: 3.0
+ min_chunk_length: 3.0
+ data_loader:
+ num_workers: 8
+ val:
+ dataset:
+ class_names:
+ - class_id
+ aug_cfgs:
+ - conf/reverb_noise_aug.yaml
+ return_segment_info:
+ - class_id
+ target_sample_freq: 16000
+ wav_scale: 1
+ sampler:
+ sampler_type: seg_chunk_sampler
+ min_batch_size: 32
+ max_chunk_length: 3.0
+ min_chunk_length: 3.0
+ data_loader:
+ num_workers: 8
+model:
+ xvector:
+ loss_type: subcenter-arc-softmax
+ num_subcenters: 2
+ cos_scale: 32.0
+ margin: 0.0
+ 
margin_warmup_epochs: 0 + intertop_margin: 0. +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 1e-4 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 20000 + min_lr: 1e-6 + warmup_steps: 10000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 6 + eff_batch_size: 512 + train_mode: full diff --git a/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh b/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh index bf6c3528..910b4bad 100644 --- a/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh +++ b/egs/lre22/open.v2.8k/global_conf/config_wav2vec2xlr300m_ecapatdnn1024x3_v1.0.sh @@ -10,7 +10,7 @@ vad_config=conf/vad_8k.yaml nnet_data=open # x-vector cfg - +nnet_stages=2 nnet_type=hf_wav2vec2resnet1d nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage1_v1.0.yaml @@ -22,15 +22,10 @@ nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/model_ep0012.pth -nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v2.2.yaml +nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage2_v1.0.yaml nnet_s2_args="" +nnet_name=${hf_model_name}_ecapatdnn1024x3_v1.0 nnet_s2_name=${nnet_name}.s2 nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/model_ep0008.pth - -nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_ecapatdnn1024x3_stage3_v2.2.yaml -nnet_s3_args="" -nnet_s3_name=${nnet_name}.s3 -nnet_s3_dir=exp/xvector_nnets/$nnet_s3_name -nnet_s3=$nnet_s3_dir/model_ep0002.pth -nnet_s3=$nnet_s3_dir/model_ep0005.pth +nnet_s2=$nnet_s2_dir/model_ep0006.pth + diff --git a/egs/lre22/open.v2.8k/run_050_fusion_v1.sh b/egs/lre22/open.v2.8k/run_050_fusion_v1.sh new file mode 100755 index 00000000..056c2f0b --- /dev/null +++ b/egs/lre22/open.v2.8k/run_050_fusion_v1.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. 
./path.sh +set -e + +score_dir_fixed=../open.v1.8k/exp/scores +score_dir_0=exp/scores +nnet_1=fbank64_stmn_ecapatdnn2048x4_v1.0.s1 +nnet_2=fbank64_stmn_fwseres2net50s8_v1.0.s1 +nnet_3=wav2vec2xlsr300m_ecapatdnn1024x3_v1.0.s2 +be_1=pca1_cw_lnorm_lgbe_lre22_aug +score_dirs="$score_dir_fixed/$nnet_1/$be_1 +$score_dir_fixed/$nnet_2/$be_1 +$score_dir_0/$nnet_3/$be_1" + +train_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"_p12/cal_v1" }; print $0}') +test_score_dirs=$(echo $score_dirs | awk '{ for(i=1;i<=NF;i++){ $i=$i"/cal_v1" }; print $0}') + +output_dir=exp/fusion/fus_v1.0 + +local/train_fusion_lre22.sh "$train_score_dirs" $output_dir/train +local/score_lre22.sh \ + dev \ + ${output_dir}/train/lre22_dev_scores.tsv \ + ${output_dir}/train/lre22_dev_results + +local/eval_fusion_lre22.sh "$test_score_dirs" $output_dir/train/fus.mat $output_dir/test + +local/score_lre22.sh \ + dev \ + ${output_dir}/test/lre22_dev_scores.tsv \ + ${output_dir}/test/lre22_dev_results + +local/score_lre22.sh eval \ + ${output_dir}/test/lre22_eval_scores.tsv \ + ${output_dir}/test/lre22_eval_results + + + + + + From ae47ce6a5b51e64012743c62f4cd28582cd5b711 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 8 Dec 2023 19:48:38 -0500 Subject: [PATCH 124/154] dino seems to be working --- ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 1 + ...train_res2net50w26s4_xvec_stage2_v1.0.yaml | 1 + ...train_res2net50w26s8_xvec_stage2_v1.0.yaml | 1 + ...in_tseres2net50w26s4_xvec_stage2_v1.0.yaml | 1 + ...rain_ecapatdnn2048x4_xvec_stage2_v1.0.yaml | 1 + ...train_res2net50w26s8_xvec_stage2_v1.0.yaml | 1 + ...in_tseres2net50w26s4_xvec_stage2_v1.0.yaml | 1 + egs/voxceleb/ssl.v1/run_005_train_dino.sh | 81 ++-- .../ssl.v1/run_006_extract_dino_embeds.sh | 139 ++++++ .../train_cfwseresnet34_xvec_stage2_v3.0.yaml | 1 + .../train_cwseresnet34_xvec_stage2_v3.0.yaml | 1 + ...rain_ecapatdnn2048x4_xvec_stage2_v2.0.yaml | 6 +- ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 1 + ...train_ecapatdnn512x3_xvec_stage2_v2.0.yaml | 1 + ...train_ecapatdnn512x3_xvec_stage2_v3.0.yaml | 1 + .../train_fwseresnet34_xvec_stage2_v3.0.yaml | 1 + ...rain_idrnd_resnet100_xvec_stage2_v2.0.yaml | 1 + ...rain_idrnd_resnet100_xvec_stage2_v3.0.yaml | 1 + ...rain_idrnd_resnet202_xvec_stage2_v2.0.yaml | 1 + ...train_res2net50w26s4_xvec_stage2_v3.0.yaml | 1 + ...train_res2net50w26s8_xvec_stage2_v3.0.yaml | 1 + .../train_tseresnet34_xvec_stage2_v3.0.yaml | 1 + .../train_cfwseresnet34_xvec_stage2_v3.0.yaml | 1 + .../train_cwseresnet34_xvec_stage2_v3.0.yaml | 1 + ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 1 + ...train_ecapatdnn512x3_xvec_stage2_v3.0.yaml | 1 + .../train_fwseresnet34_xvec_stage2_v3.0.yaml | 1 + ...rain_idrnd_resnet100_xvec_stage2_v3.0.yaml | 1 + .../conf/train_resnet34_xvec_stage2_v3.0.yaml | 1 + .../train_tseresnet34_xvec_stage2_v3.0.yaml | 1 + .../config_fbank80_stmn_resnet34.v3.0.sh | 2 +- egs/voxceleb/v2.1/cmd.sh | 3 +- ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 1 + ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 1 + ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 1 + ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 1 + ...avlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml | 1 + ...avlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml | 1 + ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 1 + ...wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml | 1 + ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 1 + ...wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml | 4 +- ...rge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml | 1 + 
...rge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml | 1 + ...c2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml | 1 + ...c2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml | 1 + ...vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml | 1 + ...vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml | 1 + ...lmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml | 1 + ...lmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml | 1 + ...lmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml | 1 + ...lmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml | 1 + ...lmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml | 1 + ...lmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml | 1 + ...lmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml | 1 + ...wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml | 1 + ...wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml | 1 + ...wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml | 1 + ...wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml | 1 + hyperion/bin/extract_wav2xvectors.py | 6 +- hyperion/bin/train_dino_wav2xvector.py | 232 ++++++++++ hyperion/bin/train_wav2rnn_transducer.py | 4 +- hyperion/bin/train_wav2vec2rnn_transducer.py | 4 +- hyperion/bin/train_wav2vec2xvector.py | 4 +- hyperion/bin/train_wav2xvector.py | 4 +- hyperion/bin/train_xvector_from_feats.py | 4 +- hyperion/bin/train_xvector_from_wav.py | 4 +- hyperion/torch/data/__init__.py | 4 + hyperion/torch/data/audio_dataset.py | 51 ++- hyperion/torch/data/dino_audio_dataset.py | 352 +++++++++++++++ hyperion/torch/data/embed_dataset.py | 9 +- hyperion/torch/data/hyp_sampler.py | 12 +- hyperion/torch/data/seg_chunk_sampler.py | 17 +- .../layer_blocks/conformer_encoder_v1.py | 6 + hyperion/torch/layers/__init__.py | 8 +- hyperion/torch/layers/activation_factory.py | 4 + hyperion/torch/layers/global_pool.py | 49 ++- hyperion/torch/layers/norm_layer_factory.py | 2 +- hyperion/torch/layers/pos_encoder.py | 56 ++- hyperion/torch/layers/swish.py | 23 +- hyperion/torch/losses/__init__.py | 1 + hyperion/torch/losses/dino_loss.py | 164 +++++++ hyperion/torch/lr_schedulers/factory.py | 94 ++--- .../torch/models/transducer/rnn_transducer.py | 89 ++-- .../models/wav2xvectors/hf_wav2xvector.py | 17 +- .../wav2xvectors/wav2conformer_v1_xvector.py | 18 + .../wav2xvectors/wav2resnet1d_xvector.py | 22 +- .../models/wav2xvectors/wav2resnet_xvector.py | 22 +- .../torch/models/wav2xvectors/wav2xvector.py | 23 +- .../models/xvectors/conformer_v1_xvector.py | 35 ++ .../models/xvectors/efficient_net_xvector.py | 48 ++- .../torch/models/xvectors/resnet1d_xvector.py | 69 ++- .../torch/models/xvectors/resnet_xvector.py | 45 +- .../torch/models/xvectors/spinenet_xvector.py | 40 +- .../torch/models/xvectors/tdnn_xvector.py | 39 +- .../models/xvectors/transformer_xvector_v1.py | 66 ++- hyperion/torch/models/xvectors/xvector.py | 318 +++++++++++--- hyperion/torch/narchs/__init__.py | 2 + hyperion/torch/narchs/audio_feats_mvn.py | 4 +- hyperion/torch/narchs/classif_head.py | 25 +- hyperion/torch/narchs/conformer_encoder_v1.py | 87 +++- hyperion/torch/narchs/dino_head.py | 337 +++++++++++++++ hyperion/torch/narchs/feat_fuser_mvn.py | 4 +- hyperion/torch/narchs/proj_head.py | 149 +++++++ .../torch/narchs/rnn_transducer_decoder.py | 48 ++- hyperion/torch/optim/__init__.py | 1 + hyperion/torch/optim/factory.py | 54 ++- hyperion/torch/optim/radam.py | 3 - hyperion/torch/torch_model.py | 28 +- hyperion/torch/trainers/__init__.py | 4 +- hyperion/torch/trainers/ae_trainer.py | 50 +-- .../torch/trainers/dino_xvector_trainer.py | 385 +++++++++++++++++ hyperion/torch/trainers/dvae_trainer.py | 18 +- hyperion/torch/trainers/plda_trainer.py | 44 +- hyperion/torch/trainers/torch_trainer.py | 399 
++++++++++++++---- hyperion/torch/trainers/transducer_trainer.py | 58 ++- hyperion/torch/trainers/vae_trainer.py | 19 +- hyperion/torch/trainers/vq_dvae_trainer.py | 51 +-- hyperion/torch/trainers/vq_vae_trainer.py | 20 +- .../torch/trainers/xvector_adv_trainer.py | 50 +-- .../trainers/xvector_adv_trainer_from_wav.py | 49 +-- hyperion/torch/trainers/xvector_trainer.py | 16 +- .../trainers/xvector_trainer_deep_feat_reg.py | 47 +-- .../xvector_trainer_deep_feat_reg_from_wav.py | 48 +-- .../trainers/xvector_trainer_from_wav.py | 8 +- hyperion/torch/utils/ddp.py | 4 +- hyperion/utils/dataset.py | 24 +- 127 files changed, 3394 insertions(+), 895 deletions(-) create mode 100755 egs/voxceleb/ssl.v1/run_006_extract_dino_embeds.sh create mode 100755 hyperion/bin/train_dino_wav2xvector.py create mode 100644 hyperion/torch/data/dino_audio_dataset.py create mode 100644 hyperion/torch/losses/dino_loss.py create mode 100644 hyperion/torch/narchs/dino_head.py create mode 100644 hyperion/torch/narchs/proj_head.py create mode 100644 hyperion/torch/trainers/dino_xvector_trainer.py diff --git a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml index e7f9969b..30483a8b 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.5 margin_warmup_epochs: 3 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml index 9884bb4c..49f84a6a 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s4_xvec_stage2_v1.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.5 margin_warmup_epochs: 3 diff --git a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml index f34b4896..c85c0e7b 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.5 margin_warmup_epochs: 3 diff --git a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml index f34b4896..c85c0e7b 100644 --- a/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.16k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.5 margin_warmup_epochs: 3 diff --git a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml index 031e9ca3..72dec1b7 100644 --- a/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.8k/conf/train_ecapatdnn2048x4_xvec_stage2_v1.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank64_stmn_8k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.5 margin_warmup_epochs: 3 diff 
--git a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml index 16203033..1b917e2c 100644 --- a/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.8k/conf/train_res2net50w26s8_xvec_stage2_v1.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank64_stmn_8k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.5 margin_warmup_epochs: 3 diff --git a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml index f34b4896..c85c0e7b 100644 --- a/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml +++ b/egs/sre21-av-a/v1.8k/conf/train_tseres2net50w26s4_xvec_stage2_v1.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.5 margin_warmup_epochs: 3 diff --git a/egs/voxceleb/ssl.v1/run_005_train_dino.sh b/egs/voxceleb/ssl.v1/run_005_train_dino.sh index eb1c591e..58399159 100755 --- a/egs/voxceleb/ssl.v1/run_005_train_dino.sh +++ b/egs/voxceleb/ssl.v1/run_005_train_dino.sh @@ -37,17 +37,16 @@ if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi -# Network Training +# # Network Training if [ $stage -le 1 ]; then mkdir -p $nnet_s1_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-train-wav2vec2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + hyperion-train-dino-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ - --data.train.dataset.class-files $train_data_dir/speaker.csv \ --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ --data.val.dataset.segments-file $val_data_dir/segments.csv \ --trainer.exp-path $nnet_s1_dir \ @@ -56,44 +55,44 @@ if [ $stage -le 1 ]; then fi -# Finetune full model -if [ $stage -le 2 ]; then - if [ "$use_wandb" == "true" ];then - extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" - fi - mkdir -p $nnet_s2_dir/log - $cuda_cmd \ - --gpu $ngpu $nnet_s2_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ - --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ - --data.train.dataset.segments-file $train_data_dir/segments.csv \ - --data.train.dataset.class-files $train_data_dir/speaker.csv \ - --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ - --data.val.dataset.segments-file $val_data_dir/segments.csv \ - --in-model-file $nnet_s1 \ - --trainer.exp-path $nnet_s2_dir \ - --num-gpus $ngpu \ +# # Finetune full model +# if [ $stage -le 2 ]; then +# if [ "$use_wandb" == "true" ];then +# extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" +# fi +# mkdir -p $nnet_s2_dir/log +# $cuda_cmd \ +# --gpu $ngpu $nnet_s2_dir/log/train.log \ +# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ +# hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ +# --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ +# --data.train.dataset.segments-file $train_data_dir/segments.csv \ +# 
--data.train.dataset.class-files $train_data_dir/speaker.csv \ +# --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ +# --data.val.dataset.segments-file $val_data_dir/segments.csv \ +# --in-model-file $nnet_s1 \ +# --trainer.exp-path $nnet_s2_dir \ +# --num-gpus $ngpu \ -fi +# fi -# Finetune full model -if [ $stage -le 3 ]; then - if [ "$use_wandb" == "true" ];then - extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" - fi - mkdir -p $nnet_s3_dir/log - $cuda_cmd \ - --gpu $ngpu $nnet_s3_dir/log/train.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ - hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ - --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ - --data.train.dataset.segments-file $train_data_dir/segments.csv \ - --data.train.dataset.class-files $train_data_dir/speaker.csv \ - --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ - --data.val.dataset.segments-file $val_data_dir/segments.csv \ - --in-model-file $nnet_s2 \ - --trainer.exp-path $nnet_s3_dir \ - --num-gpus $ngpu \ +# # Finetune full model +# if [ $stage -le 3 ]; then +# if [ "$use_wandb" == "true" ];then +# extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)" +# fi +# mkdir -p $nnet_s3_dir/log +# $cuda_cmd \ +# --gpu $ngpu $nnet_s3_dir/log/train.log \ +# hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ +# hyperion-finetune-wav2vec2xvector $nnet_type --cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \ +# --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ +# --data.train.dataset.segments-file $train_data_dir/segments.csv \ +# --data.train.dataset.class-files $train_data_dir/speaker.csv \ +# --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ +# --data.val.dataset.segments-file $val_data_dir/segments.csv \ +# --in-model-file $nnet_s2 \ +# --trainer.exp-path $nnet_s3_dir \ +# --num-gpus $ngpu \ -fi +# fi diff --git a/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds.sh b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds.sh new file mode 100755 index 00000000..36ccd294 --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=2 +nnet_stage=1 +config_file=default_config.sh +use_gpu=false +xvec_chunk_length=120.0 +. parse_options.sh || exit 1; +. 
$config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu --chunk-length $xvec_chunk_length" + xvec_cmd="$cuda_eval_cmd --gpu 1 --mem 6G" + num_gpus=1 +else + xvec_cmd="$train_cmd --mem 12G" + num_gpus=0 +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name +score_dir=exp/scores/$nnet_name +score_cosine_dir=$score_dir/cosine + +if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 2 ]; then + # Extracts x-vectors for evaluation + nj=100 + if [ "$do_voxsrc22" == "true" ];then + extra_data="voxsrc22_dev" + fi + for name in voxceleb1_test $extra_data + do + num_segs=$(wc -l data/$name/segments.csv | awk '{ print $1-1}') + nj=$(($num_segs < 100 ? 
$num_segs:100)) + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + +if [ $stage -le 3 ];then + + echo "Eval Voxceleb 1 with Cosine scoring" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd $score_cosine_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-cosine-scoring-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_cosine_dir/voxceleb1_scores.csv \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + hyperion-merge-scores --output-file $score_cosine_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_cosine_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_cosine_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_cosine_dir/voxceleb1_results.csv + + cat $score_cosine_dir/voxceleb1_results.csv +fi + diff --git a/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml index 04665cac..9a9dfc06 100644 --- a/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml index 04665cac..9a9dfc06 100644 --- a/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml index e7a94225..a2e63b54 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v2.0.yaml @@ -41,15 +41,11 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.4 margin_warmup_epochs: 0 intertop_margin: 0.1 - # override_dropouts: false - # dropout_rate: 0.1 - # resnet_enc: - # override_dropouts: true - # dropout_rate: 0.1 trainer: optim: opt_type: sgd diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml 
b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml index 877736b3..f3573b4a 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml index b6163f14..bb9c8c79 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v2.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml index 45e55d97..13f9cd9a 100644 --- a/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml index 04665cac..9a9dfc06 100644 --- a/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml index 2311b07b..65cd737c 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v2.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml index 469e166b..c7437e94 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml index e4e6d97a..09a5345f 100644 --- a/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_idrnd_resnet202_xvec_stage2_v2.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml index 469e166b..c7437e94 100644 --- a/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml +++ 
b/egs/voxceleb/v1.1/conf/train_res2net50w26s4_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml index 5c9af011..63a5cb25 100644 --- a/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_res2net50w26s8_xvec_stage2_v3.0.yaml @@ -47,6 +47,7 @@ model: # dropout_rate: 0.0 dropout_rate: 0.2 + override_output: true trainer: optim: opt_type: sgd lr: 1e-3 diff --git a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml index 04665cac..9a9dfc06 100644 --- a/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.1/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 feats: fbank80_stmn_16k.yaml model: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml index 0923a608..c0bd44e5 100644 --- a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml index 0923a608..c0bd44e5 100644 --- a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml index 21f0db8b..c348e7c5 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml index 9788bb7c..9008a04c 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml index 0923a608..c0bd44e5 100644 --- a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml index 
11d33ae2..b14cfc75 100644 --- a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml index 0923a608..c0bd44e5 100644 --- a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml index 0923a608..c0bd44e5 100644 --- a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -39,6 +39,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 30.0 margin: 0.3 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh index bb5d990c..cb1a172d 100644 --- a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.0.sh @@ -16,7 +16,7 @@ nnet_name=${feat_type}_resnet34.v3.0 nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.0.yaml nnet_s1_name=$nnet_name.s1 -nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name.kk2 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/model_ep0035.pth nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml diff --git a/egs/voxceleb/v2.1/cmd.sh b/egs/voxceleb/v2.1/cmd.sh index 040f458b..c95884ec 100755 --- a/egs/voxceleb/v2.1/cmd.sh +++ b/egs/voxceleb/v2.1/cmd.sh @@ -15,7 +15,8 @@ if [ "$(hostname -d)" == "cm.gemini" ];then export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" - export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_a100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 30G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml index 8504db9e..07bf8e5a 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -37,6 +37,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml index dda0c632..c58797cf 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -47,6 +47,7 @@ data: num_workers: 8 model: xvector: + override_output: 
true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml index db36f8ee..5703104e 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -37,6 +37,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml index dda0c632..c58797cf 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -47,6 +47,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml index 8504db9e..07bf8e5a 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage2_v2.0.yaml @@ -37,6 +37,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml index ad56e80d..e9638704 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn1024x3_stage3_v2.0.yaml @@ -47,6 +47,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml index fc964f84..d1af05d8 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -37,6 +37,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml index ab6b3f4e..99002b45 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.1.yaml @@ -40,6 +40,7 @@ model: encoder_lr: 1e-2 feat_extract_lr: 1e-2 xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml index 928779f5..4a8c53d7 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -47,6 +47,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml 
b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml index 7ab8cea7..9c7652ce 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.1.yaml @@ -46,10 +46,8 @@ data: data_loader: num_workers: 8 model: - #hf_feats: - # encoder_lr: 1e-2 - # feat_extract_lr: 1e-2 xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml index b5b9b6b6..d1ed9300 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage2_v2.0.yaml @@ -44,6 +44,7 @@ model: - q_proj - v_proj xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml index a39445ff..fbea3f0f 100644 --- a/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2.1/conf/train_wavlmlarge_loraqv_ecapatdnn512x3_stage3_v2.0.yaml @@ -47,6 +47,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml index 90e3b14f..84ecfc04 100644 --- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage2_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml index 7a2f7bba..fdaff633 100644 --- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr300m_ecapatdnn512x3_stage3_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml index 90e3b14f..84ecfc04 100644 --- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage2_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml index 69bcc097..58fe1d49 100644 --- a/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wav2vec2xlsr53_ecapatdnn512x3_stage3_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml 
b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml index 90e3b14f..84ecfc04 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml index 69a8322b..f8e620c1 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage2_v2.0.yaml @@ -37,6 +37,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml index 69bcc097..58fe1d49 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml index 3443591a..5013e5af 100644 --- a/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmbaseplus_ecapatdnn512x3_stage3_v2.0.yaml @@ -47,6 +47,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml index 7287188c..9fec8986 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage2_v2.0.yaml @@ -37,6 +37,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml index 69bcc097..58fe1d49 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml index 3443591a..5013e5af 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge12l_ecapatdnn512x3_stage3_v2.0.yaml @@ -47,6 +47,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml index 90e3b14f..84ecfc04 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml +++ 
b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml index 69a8322b..f8e620c1 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage2_v2.0.yaml @@ -37,6 +37,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.2 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml index 69bcc097..58fe1d49 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v1.0.yaml @@ -43,6 +43,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml index 5e1260ad..2ea1589d 100644 --- a/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml +++ b/egs/voxceleb/v2/conf/train_wavlmlarge_ecapatdnn512x3_stage3_v2.0.yaml @@ -47,6 +47,7 @@ data: num_workers: 8 model: xvector: + override_output: true cos_scale: 32.0 margin: 0.4 margin_warmup_epochs: 0 diff --git a/hyperion/bin/extract_wav2xvectors.py b/hyperion/bin/extract_wav2xvectors.py index 763df3fc..3cca3ede 100755 --- a/hyperion/bin/extract_wav2xvectors.py +++ b/hyperion/bin/extract_wav2xvectors.py @@ -25,7 +25,9 @@ from hyperion.io import SequentialAudioReader as AR from hyperion.io import VADReaderFactory as VRF from hyperion.np.augment import SpeechAugment -from hyperion.torch import TorchModelLoader as TML + +# from hyperion.torch import TorchModelLoader as TML +from hyperion.torch import TorchModel from hyperion.torch.utils import open_device from hyperion.utils import Utt2Info @@ -59,7 +61,7 @@ def init_device(use_gpu): def load_model(model_path, device): logging.info("loading model %s", model_path) - model = TML.load(model_path) + model = TorchModel.auto_load(model_path) logging.info(f"xvector-model={model}") model.to(device) model.eval() diff --git a/hyperion/bin/train_dino_wav2xvector.py b/hyperion/bin/train_dino_wav2xvector.py new file mode 100755 index 00000000..d1cd108e --- /dev/null +++ b/hyperion/bin/train_dino_wav2xvector.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import multiprocessing +import os +from pathlib import Path + +import torch +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger, set_float_cpu +from hyperion.torch.data import DINOAudioDataset as AD +from hyperion.torch.data import SegSamplerFactory +from hyperion.torch.losses import DINOLoss +from hyperion.torch.metrics import CategoricalAccuracy + +# from hyperion.torch.models import EfficientNetXVector as EXVec +from hyperion.torch.models import Wav2ConformerV1XVector as CXVec +from hyperion.torch.models import Wav2ResNet1dXVector as R1dXVec +from hyperion.torch.models 
import Wav2ResNetXVector as RXVec + +# from hyperion.torch.models import SpineNetXVector as SpineXVec +# from hyperion.torch.models import TDNNXVector as TDXVec +# from hyperion.torch.models import TransformerXVectorV1 as TFXVec +from hyperion.torch.trainers import DINOXVectorTrainer as Trainer +from hyperion.torch.utils import ddp + +xvec_dict = { + "resnet": RXVec, + "resnet1d": R1dXVec, + "conformer": CXVec, + # "efficientnet": EXVec, + # "tdnn": TDXVec, + # "transformer": TFXVec, + # "spinenet": SpineXVec, +} + + +def init_data(partition, rank, num_gpus, **kwargs): + kwargs = kwargs["data"][partition] + ad_args = AD.filter_args(**kwargs["dataset"]) + sampler_args = kwargs["sampler"] + if rank == 0: + logging.info("{} audio dataset args={}".format(partition, ad_args)) + logging.info("{} sampler args={}".format(partition, sampler_args)) + logging.info("init %s dataset", partition) + + is_val = partition == "val" + ad_args["is_val"] = is_val + sampler_args["shuffle"] = not is_val + dataset = AD(**ad_args) + + if rank == 0: + logging.info("init %s samplers", partition) + + sampler = SegSamplerFactory.create(dataset, **sampler_args) + + if rank == 0: + logging.info("init %s dataloader", partition) + + num_workers = kwargs["data_loader"]["num_workers"] + # guard against division by zero on CPU-only runs (num_gpus == 0) + num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) if num_gpus > 0 else num_workers + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) + data_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **largs) + return data_loader + + +def init_student_xvector(num_classes, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["student_model"]) + if rank == 0: + logging.info(f"student xvector network args={xvec_args}") + xvec_args["xvector"]["num_classes"] = num_classes + model = xvec_class(**xvec_args) + if rank == 0: + logging.info(f"student-model={model}") + return model + + +def init_teacher_xvector(student_model, rank, xvec_class, **kwargs): + xvec_args = xvec_class.filter_args(**kwargs["teacher_model"]) + if rank == 0: + logging.info(f"teacher xvector network args={xvec_args}") + # xvec_args["xvector"]["num_classes"] = num_classes + model = student_model.clone() + model.change_config(**xvec_args) + if rank == 0: + logging.info(f"teacher-model={model}") + return model + + +def init_dino_loss(rank, **kwargs): + loss_args = kwargs["dino_loss"] + if rank == 0: + logging.info(f"dino loss args={loss_args}") + loss = DINOLoss(**loss_args) + if rank == 0: + logging.info(f"dino-loss={loss}") + + return loss + + +def train_xvec(gpu_id, args): + config_logger(args.verbose) + del args.verbose + logging.debug(args) + + kwargs = namespace_to_dict(args) + torch.manual_seed(args.seed) + set_float_cpu("float32") + + ddp_args = ddp.filter_ddp_args(**kwargs) + device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) + kwargs["rank"] = rank + + train_loader = init_data(partition="train", **kwargs) + val_loader = init_data(partition="val", **kwargs) + + dino_loss = init_dino_loss(**kwargs) + student_model = init_student_xvector(num_classes=dino_loss.num_classes, **kwargs) + kwargs["student_model"] = student_model + teacher_model = init_teacher_xvector(**kwargs) + + trn_args = Trainer.filter_args(**kwargs["trainer"]) + if rank == 0: + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + student_model, + teacher_model, + dino_loss, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args, + ) + 
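+ # Editor's note (assumption, not stated in this patch): following the standard + # DINO recipe, only the student receives gradients from DINOLoss, and + # DINOXVectorTrainer is expected to update the teacher as an exponential moving + # average of the student weights; this is why init_teacher_xvector() builds the + # teacher by cloning the student rather than training it directly.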
trainer.load_last_checkpoint() + trainer.fit(train_loader, val_loader) + + ddp.ddp_cleanup() + + +def make_parser(xvec_class): + parser = ArgumentParser() + + parser.add_argument("--cfg", action=ActionConfigFile) + + train_parser = ArgumentParser(prog="") + + AD.add_class_args(train_parser, prefix="dataset") + SegSamplerFactory.add_class_args(train_parser, prefix="sampler") + train_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + + val_parser = ArgumentParser(prog="") + AD.add_class_args(val_parser, prefix="dataset") + SegSamplerFactory.add_class_args(val_parser, prefix="sampler") + val_parser.add_argument( + "--data_loader.num-workers", + type=int, + default=5, + help="num_workers of data loader", + ) + data_parser = ArgumentParser(prog="") + data_parser.add_argument("--train", action=ActionParser(parser=train_parser)) + data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) + parser.add_argument("--data", action=ActionParser(parser=data_parser)) + parser.link_arguments( + "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" + ) + + xvec_class.add_class_args(parser, prefix="student_model") + xvec_class.add_dino_teacher_args(parser, prefix="teacher_model") + DINOLoss.add_class_args(parser, prefix="dino_loss") + Trainer.add_class_args( + parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() + ) + ddp.add_ddp_args(parser) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + return parser + + +def main(): + parser = ArgumentParser(description="Train DINO Wav2XVector from audio files") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for k, v in xvec_dict.items(): + parser_k = make_parser(v) + subcommands.add_subcommand(k, parser_k) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except Exception: + gpu_id = 0 + + xvec_type = args.subcommand + args_sc = vars(args)[xvec_type] + + if gpu_id == 0: + try: + config_file = Path(args_sc.trainer.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except Exception: + pass + + args_sc.xvec_class = xvec_dict[xvec_type] + # torch docs recommend using forkserver + multiprocessing.set_start_method("forkserver") + train_xvec(gpu_id, args_sc) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index c00c4633..6d947d24 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -145,7 +145,7 @@ def make_parser(model_class): parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") - AD.add_class_args(train_parser, prefix="dataset", skip={}) + AD.add_class_args(train_parser, prefix="dataset") SegSamplerFactory.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", @@ -155,7 +155,7 @@ def make_parser(model_class): ) val_parser = ArgumentParser(prog="") - AD.add_class_args(val_parser, prefix="dataset", skip={}) + AD.add_class_args(val_parser, prefix="dataset") SegSamplerFactory.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", diff --git a/hyperion/bin/train_wav2vec2rnn_transducer.py b/hyperion/bin/train_wav2vec2rnn_transducer.py index 
5b802454..fd94f19d 100755 --- a/hyperion/bin/train_wav2vec2rnn_transducer.py +++ b/hyperion/bin/train_wav2vec2rnn_transducer.py @@ -168,7 +168,7 @@ def make_parser(model_class): parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") - AD.add_class_args(train_parser, prefix="dataset", skip={}) + AD.add_class_args(train_parser, prefix="dataset") SegSamplerFactory.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", @@ -178,7 +178,7 @@ def make_parser(model_class): ) val_parser = ArgumentParser(prog="") - AD.add_class_args(val_parser, prefix="dataset", skip={}) + AD.add_class_args(val_parser, prefix="dataset") SegSamplerFactory.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", diff --git a/hyperion/bin/train_wav2vec2xvector.py b/hyperion/bin/train_wav2vec2xvector.py index 63ac34a9..c772fe3c 100755 --- a/hyperion/bin/train_wav2vec2xvector.py +++ b/hyperion/bin/train_wav2vec2xvector.py @@ -128,7 +128,7 @@ def make_parser(model_class): parser.add_argument("--cfg", action=ActionConfigFile) train_parser = ArgumentParser(prog="") - AD.add_class_args(train_parser, prefix="dataset", skip={}) + AD.add_class_args(train_parser, prefix="dataset") SegSamplerFactory.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", @@ -138,7 +138,7 @@ def make_parser(model_class): ) val_parser = ArgumentParser(prog="") - AD.add_class_args(val_parser, prefix="dataset", skip={}) + AD.add_class_args(val_parser, prefix="dataset") SegSamplerFactory.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py index 3138784d..2c4684c3 100755 --- a/hyperion/bin/train_wav2xvector.py +++ b/hyperion/bin/train_wav2xvector.py @@ -127,7 +127,7 @@ def make_parser(xvec_class): train_parser = ArgumentParser(prog="") - AD.add_class_args(train_parser, prefix="dataset", skip={}) + AD.add_class_args(train_parser, prefix="dataset") SegSamplerFactory.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", @@ -137,7 +137,7 @@ def make_parser(xvec_class): ) val_parser = ArgumentParser(prog="") - AD.add_class_args(val_parser, prefix="dataset", skip={}) + AD.add_class_args(val_parser, prefix="dataset") SegSamplerFactory.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", diff --git a/hyperion/bin/train_xvector_from_feats.py b/hyperion/bin/train_xvector_from_feats.py index 699aa410..c79e444f 100755 --- a/hyperion/bin/train_xvector_from_feats.py +++ b/hyperion/bin/train_xvector_from_feats.py @@ -126,7 +126,7 @@ def make_parser(xvec_class): train_parser = ArgumentParser(prog="") - SD.add_class_args(train_parser, prefix="dataset", skip={}) + SD.add_class_args(train_parser, prefix="dataset") Sampler.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", @@ -136,7 +136,7 @@ def make_parser(xvec_class): ) val_parser = ArgumentParser(prog="") - SD.add_class_args(val_parser, prefix="dataset", skip={}) + SD.add_class_args(val_parser, prefix="dataset") Sampler.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", diff --git a/hyperion/bin/train_xvector_from_wav.py b/hyperion/bin/train_xvector_from_wav.py index 67075a5d..eb251ad9 100755 --- 
a/hyperion/bin/train_xvector_from_wav.py +++ b/hyperion/bin/train_xvector_from_wav.py @@ -138,7 +138,7 @@ def make_parser(xvec_class): train_parser = ArgumentParser(prog="") - AD.add_class_args(train_parser, prefix="dataset", skip={}) + AD.add_class_args(train_parser, prefix="dataset") SegSamplerFactory.add_class_args(train_parser, prefix="sampler") train_parser.add_argument( "--data_loader.num-workers", @@ -148,7 +148,7 @@ ) val_parser = ArgumentParser(prog="") - AD.add_class_args(val_parser, prefix="dataset", skip={}) + AD.add_class_args(val_parser, prefix="dataset") SegSamplerFactory.add_class_args(val_parser, prefix="sampler") val_parser.add_argument( "--data_loader.num-workers", diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index 959a635a..ee5a661d 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -4,11 +4,15 @@ """ from .audio_dataset import AudioDataset + # samplers from .bucketing_seg_sampler import BucketingSegSampler +from .dino_audio_dataset import DINOAudioDataset from .embed_sampler_factory import EmbedSamplerFactory + # datasets from .feat_seq_dataset import FeatSeqDataset from .paired_feat_seq_dataset import PairedFeatSeqDataset + # from .weighted_seq_sampler import ClassWeightedSeqSampler from .seg_sampler_factory import SegSamplerFactory diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 4644f141..2329d0b1 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -29,6 +29,28 @@ class AudioDataset(Dataset): + """AudioDataset class + + Args: + recordings_file: recordings manifest file (kaldi .scp or pandas .csv) + segments_file: segments manifest file (kaldi .scp or pandas .csv) + class_names: list with the names of the types of classes in the datasets, e.g., speaker, language + class_files: list of class info files + time_durs_file: (deprecated) segment to duration in secs file, if durations are not in segments_file + bpe_model: bpe model for the text label + text_file: text file with word labels for each utterance + aug_cfgs: list of augmentation configuration files + num_augs: number of augmentations per segment and augmentation type + num_aug_mix: number of AugMix augmentations per segment + aug_mix_alpha: AugMix Dirichlet distribution parameter + return_segment_info: list of columns of the segment file which should be returned as supervisions + return_orig: when using augmentation, whether to also return the original audio + target_sample_freq: target sampling frequency, if not None all audios are converted to this sample freq + wav_scale: scale waveforms to be in [-wav_scale, wav_scale] + is_val: whether this is a validation dataset 
+ seed: random seed", + """ + def __init__( self, recordings_file: str, @@ -418,14 +440,14 @@ def add_class_args(parser, prefix=None, skip=set()): parser.add_argument( "--recordings-file", required=True, - help=("recordings manifest file (kaldi .scp or pandas .csv)"), + help="recordings manifest file (kaldi .scp or pandas .csv)", ) if "segments_file" not in skip: parser.add_argument( "--segments-file", required=True, - help=("segments manifest file (kaldi .scp or pandas .csv)"), + help="segments manifest file (kaldi .scp or pandas .csv)", ) parser.add_argument( @@ -441,7 +463,7 @@ def add_class_args(parser, prefix=None, skip=set()): "--class-files", default=None, nargs="+", - help=("list of class info files"), + help="list of class info files", ) parser.add_argument( @@ -455,39 +477,40 @@ def add_class_args(parser, prefix=None, skip=set()): parser.add_argument( "--bpe-model", default=None, - help=("bpe model for the text label"), + help="bpe model for the text label", ) parser.add_argument( "--text-file", default=None, - help=("text file with words labels for each utterances"), + help="text file with words labels for each utterances", ) - parser.add_argument( - "--aug-cfgs", - default=None, - nargs="+", - help=("augmentation configuration file."), - ) + if "aug_cfgs" not in skip: + parser.add_argument( + "--aug-cfgs", + default=None, + nargs="+", + help="augmentation configuration file.", + ) parser.add_argument( "--num-augs", default=1, type=int, - help=("number of augmentations per segment and augmentation type"), + help="number of augmentations per segment and augmentation type", ) parser.add_argument( "--num-aug-mix", default=0, type=int, - help=("number of AugMix augmentations per segment"), + help="number of AugMix augmentations per segment", ) parser.add_argument( "--aug-mix-alpha", default=0.5, type=float, - help=("number of AugMix augmentations per segment"), + help="number of AugMix augmentations per segment", ) parser.add_argument( "--return-segment-info", diff --git a/hyperion/torch/data/dino_audio_dataset.py b/hyperion/torch/data/dino_audio_dataset.py new file mode 100644 index 00000000..bb0a93a5 --- /dev/null +++ b/hyperion/torch/data/dino_audio_dataset.py @@ -0,0 +1,352 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +import math +import time +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd + +# import k2 +import sentencepiece as spm +import torch +import torch.distributed as dist +import torchaudio.transforms as tat +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...io import RandomAccessAudioReader as AR +from ...np.augment import SpeechAugment +from ...utils.class_info import ClassInfo +from ...utils.misc import filter_func_args +from ...utils.segment_set import SegmentSet +from ...utils.text import read_text +from ..torch_defs import floatstr_torch +from .audio_dataset import AudioDataset + + +class DINOAudioDataset(AudioDataset): + """AudioDataset class to train DINO for speech + + Args: + recordings_file: recordings manifest file (kaldi .scp or pandas .csv) + segments_file: segments manifest file (kaldi .scp or pandas .csv) + class_names: list with the names of the types of classes in the datasets, e.g., speaker, language + class_files: list of class info files + time_durs_file: (deprecated) segment to duration in secs file, if durations are not in segments_file + bpe_model: bpe model for the text 
label + text_file: text file with word labels for each utterance + teacher_aug_cfg: configuration for teacher augmentations + student_aug_cfg: configuration for student augmentations + aug_cfgs: list of augmentation configuration files + num_augs: number of augmentations per segment and augmentation type + num_aug_mix: number of AugMix augmentations per segment + aug_mix_alpha: AugMix Dirichlet distribution parameter + return_segment_info: list of columns of the segment file which should be returned as supervisions + return_orig: when using augmentation, whether to also return the original audio + target_sample_freq: target sampling frequency, if not None all audios are converted to this sample freq + wav_scale: scale waveforms to be in [-wav_scale, wav_scale] + is_val: whether this is a validation dataset + seed: random seed + teacher_chunk_length: chunk length for the teacher model + num_teacher_chunks: num teacher chunks in each batch + student_chunk_length: chunk length for the student model + num_student_chunks: num student chunks in each batch + same_teacher_student_chunks: True if teacher and student chunks overlap, False if they are disjoint + """ + + def __init__( + self, + recordings_file: str, + segments_file: str, + class_names: Optional[List[str]] = None, + class_files: Optional[List[str]] = None, + bpe_model: Optional[str] = None, + text_file: Optional[str] = None, + time_durs_file: Optional[str] = None, + teacher_aug_cfg: Optional[str] = None, + student_aug_cfg: Optional[str] = None, + num_augs: int = 1, + num_aug_mix: int = 0, + aug_mix_alpha: float = 0, + return_segment_info: Optional[List[str]] = None, + return_orig: bool = False, + target_sample_freq: Optional[float] = None, + wav_scale: float = 1, + is_val: bool = False, + seed: int = 112358, + teacher_chunk_length: float = 4, + num_teacher_chunks: int = 2, + student_chunk_length: float = 2, + num_student_chunks: int = 4, + same_teacher_student_chunks: bool = False, + ): + aug_cfgs = [] + student_aug_idx = -1 + teacher_aug_idx = -1 + if student_aug_cfg is not None: + aug_cfgs.append(student_aug_cfg) + student_aug_idx = 0 + if teacher_aug_cfg is not None: + # the index is -1 when unset, never None + assert student_aug_idx != -1, "teacher_aug_cfg requires student_aug_cfg" + if teacher_aug_cfg != student_aug_cfg: + aug_cfgs.append(teacher_aug_cfg) + teacher_aug_idx = 1 + else: + teacher_aug_idx = 0 + + super_args = filter_func_args(super().__init__, locals()) + super().__init__(**super_args) + self.teacher_chunk_length = teacher_chunk_length + self.num_teacher_chunks = num_teacher_chunks + self.student_chunk_length = student_chunk_length + self.num_student_chunks = num_student_chunks + self.same_teacher_student_chunks = same_teacher_student_chunks + if student_aug_idx != -1: + self.student_augmenter = self.augmenters[student_aug_idx] + if teacher_aug_idx != -1: + self.teacher_augmenter = self.augmenters[teacher_aug_idx] + + def _apply_chunk_augs(self, x, duration, fs, augmenter, tag): + if not augmenter: + return {f"x_{tag}": x} + + if duration == 0: + num_samples = len(x) + else: + num_samples = int(duration * fs) + + reverb_context_samples = len(x) - num_samples + x_orig = x[reverb_context_samples:] + x_augs = {} + for j in range(self.num_augs): + # augment x + x_aug, aug_info = augmenter(x) + # remove the extra left context used to compute the reverberation. 
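+ # Editor's note (inferred from this class, an assumption about the reader): + # _read_audio() is assumed to prepend reverb_context_samples of extra audio + # before the nominal chunk start so that reverberation augmentation has left + # context to convolve with; the slice below crops that context away so every + # augmented view aligns sample-by-sample with x_orig.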
+ x_aug = x_aug[reverb_context_samples : len(x)] + x_aug = x_aug.astype(floatstr_torch(), copy=False) + x_augs[f"x_{tag}_aug_{j}"] = x_aug + + if self.num_aug_mix > 0: + x_augs = self._apply_aug_mix(x_orig, x_augs, 0) + + if self.return_orig: + x_augs[f"x_{tag}"] = x_orig + elif len(x_augs) == 1: + # if we just have one aug and we don't return the clean version, + # we just call x to the aug version + x_augs[f"x_{tag}"] = x_augs.pop(f"x_{tag}_aug_0") + + return x_augs + + def _apply_augs(self, xs, duration, fs, augmenter, tag): + x_augs = {} + for i, x in enumerate(xs): + x_augs_i = self._apply_chunk_augs(x, duration, fs, augmenter, f"{tag}_{i}") + x_augs.update(x_augs_i) + + return x_augs + + def _split_audio_into_chunks(self, x, x_samples, chunk_samples, num_chunks): + reverb_context = len(x) - x_samples + chunk_shift = (x_samples - chunk_samples) // num_chunks + xs = [] + for i in range(num_chunks): + x_start = i * chunk_shift + x_end = x_start + chunk_samples + reverb_context + xs.append(x[x_start:x_end]) + + return xs + + def _split_audio_into_teacher_student_disjoint(self, x, duration, fs): + total_samples = int(duration * fs) + teacher_chunk_samples = int(fs * self.teacher_chunk_length) + student_chunk_samples = int(fs * self.student_chunk_length) + sum_chunk = teacher_chunk_samples + student_chunk_samples + assert total_samples >= sum_chunk, f"signal samples = {len(x)} < {sum_chunk}" + + teacher_crops_x_chunk = self.num_teacher_chunks * teacher_chunk_samples + student_crops_x_chunk = self.num_student_chunks * student_chunk_samples + sum_crops_x_chunk = teacher_crops_x_chunk + student_crops_x_chunk + teacher_samples = max( + teacher_crops_x_chunk * total_samples // sum_crops_x_chunk, + teacher_chunk_samples, + ) + student_samples = total_samples - teacher_samples + # here we decide if we split the audio in [teacher, student] or [student, teacher] + teacher_first = self.rng.random() < 0.5 + + if teacher_first: + x1_samples = teacher_samples + # x2_samples = student_samples + else: + x1_samples = student_samples + # x2_samples = teacher_samples + + max_reverb_context = int(self.reverb_context * fs) + x1_reverb_context = len(x) - total_samples + x1_end_sample = x1_reverb_context + x1_samples + x1 = x[:x1_end_sample] + if x1_end_sample >= max_reverb_context: + x2_reverb_context = max_reverb_context + else: + x2_reverb_context = x1_end_sample + + # print( + # "xxx", + # len(x), + # total_samples, + # teacher_first, + # teacher_samples, + # student_samples, + # x1_reverb_context, + # x1_end_sample, + # x2_reverb_context, + # flush=True, + # ) + x2 = x[x1_end_sample - x2_reverb_context :] + if teacher_first: + x_teacher = x1 + x_student = x2 + else: + x_teacher = x2 + x_student = x1 + + return x_teacher, teacher_samples, x_student, student_samples + + def _split_audio_into_teacher_student_same(self, x, duration, fs): + total_samples = int(duration * fs) + return x, total_samples, x, total_samples + + def _split_audio_into_teacher_student_chunks(self, x, duration, fs): + if self.same_teacher_student_chunks: + ( + x_teacher, + teacher_samples, + x_student, + student_samples, + ) = self._split_audio_into_teacher_student_same(x, duration, fs) + else: + ( + x_teacher, + teacher_samples, + x_student, + student_samples, + ) = self._split_audio_into_teacher_student_disjoint(x, duration, fs) + assert ( + len(x_teacher) >= 64000 and len(x_teacher) <= 136000 + ), f"{len(x_teacher)}, {len(x_student)} {len(x)} {duration*fs}, {teacher_samples}, {student_samples}" + assert ( + len(x_student) >= 32000 and 
len(x_student) <= 136000
+        ), f"{len(x_teacher)}, {len(x_student)}, {len(x)} {duration*fs}, {teacher_samples}, {student_samples}"
+        xs_teacher = self._split_audio_into_chunks(
+            x_teacher,
+            teacher_samples,
+            int(fs * self.teacher_chunk_length),
+            self.num_teacher_chunks,
+        )
+        xs_student = self._split_audio_into_chunks(
+            x_student,
+            student_samples,
+            int(fs * self.student_chunk_length),
+            self.num_student_chunks,
+        )
+        for xx in xs_teacher:
+            assert (
+                len(xx) >= 64000 and len(xx) <= 72000
+            ), f"{[len(t) for t in xs_teacher]} {len(x_teacher)} {len(x)}"
+        for xx in xs_student:
+            assert (
+                len(xx) >= 32000 and len(xx) <= 40000
+            ), f"{[len(t) for t in xs_student]} {len(x_student)} {len(x)}"
+
+        return xs_teacher, xs_student
+
+    def __getitem__(self, segment):
+        seg_id, start, duration = self._parse_segment_item(segment)
+        x, fs = self._read_audio(seg_id, start, duration)
+        x, fs = self._resample(x, fs)
+        assert len(x) >= int(
+            duration * fs
+        ), f"getitem {self.seg_set.loc[seg_id].duration}, {start}, {duration}, {len(x)}"
+        data = {"seg_id": seg_id, "sample_freq": fs}
+        xs_teacher, xs_student = self._split_audio_into_teacher_student_chunks(
+            x, duration, fs
+        )
+        x_augs_teacher = self._apply_augs(
+            xs_teacher, self.teacher_chunk_length, fs, self.teacher_augmenter, "teacher"
+        )
+        x_augs_student = self._apply_augs(
+            xs_student, self.student_chunk_length, fs, self.student_augmenter, "student"
+        )
+        data.update(x_augs_teacher)
+        data.update(x_augs_student)
+        # print(data, flush=True)
+        # for ll in [
+        #     "x_teacher_0",
+        #     "x_teacher_1",
+        #     "x_student_0",
+        #     "x_student_1",
+        #     "x_student_2",
+        #     "x_student_3",
+        # ]:
+        #     print("zzz ", ll, data[ll].shape, flush=True)
+        seg_info = self._get_segment_info(seg_id)
+        data.update(seg_info)
+        return data
+
+    @staticmethod
+    def filter_args(**kwargs):
+        args = filter_func_args(DINOAudioDataset.__init__, kwargs)
+        return args
+
+    @staticmethod
+    def add_class_args(parser, prefix=None, skip=set()):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        skip.add("aug_cfgs")
+        AudioDataset.add_class_args(parser, skip=skip)
+        parser.add_argument(
+            "--teacher-aug-cfg", default=None, help="config for teacher augmentations"
+        )
+        parser.add_argument(
+            "--student-aug-cfg", default=None, help="config for student augmentations"
+        )
+        parser.add_argument(
+            "--teacher-chunk-length",
+            default=4.0,
+            type=float,
+            help="chunk length for the teacher model",
+        )
+        parser.add_argument(
+            "--student-chunk-length",
+            default=4.0,
+            type=float,
+            help="chunk length for the student model",
+        )
+        parser.add_argument(
+            "--num-teacher-chunks",
+            default=2,
+            type=int,
+            help="num teacher chunks in each batch",
+        )
+        parser.add_argument(
+            "--num-student-chunks",
+            default=4,
+            type=int,
+            help="num student chunks in each batch",
+        )
+        parser.add_argument(
+            "--same-teacher-student-chunks",
+            default=False,
+            action=ActionYesNo,
+            help="teacher and student chunks overlap",
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
diff --git a/hyperion/torch/data/embed_dataset.py b/hyperion/torch/data/embed_dataset.py
index 519f498d..3c4433af 100644
--- a/hyperion/torch/data/embed_dataset.py
+++ b/hyperion/torch/data/embed_dataset.py
@@ -10,10 +10,9 @@
 import numpy as np
 import pandas as pd
-from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
-
 import torch
 import torch.distributed as dist
+from jsonargparse import ActionParser, ActionYesNo,
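For intuition, the disjoint split above divides the utterance between teacher and student in proportion to how many samples each side's crops will consume, never giving the teacher less than one chunk; each region is then cut into evenly shifted, possibly overlapping crops by _split_audio_into_chunks. A self-contained sketch of that arithmetic with hypothetical values (4 s teacher chunks, 2 s student chunks, a 10 s signal; none of these numbers come from the patch):

fs = 16000
total = int(10.0 * fs)                               # 160000 samples read
teacher_chunk, num_teacher = int(4.0 * fs), 2
student_chunk, num_student = int(2.0 * fs), 4

teacher_crops = num_teacher * teacher_chunk          # 128000 samples of teacher crops
student_crops = num_student * student_chunk          # 128000 samples of student crops
teacher_samples = max(teacher_crops * total // (teacher_crops + student_crops),
                      teacher_chunk)                 # 80000
student_samples = total - teacher_samples            # 80000
# evenly shifted crops inside each region, as in _split_audio_into_chunks
teacher_shift = (teacher_samples - teacher_chunk) // num_teacher   # 8000
student_shift = (student_samples - student_chunk) // num_student   # 12000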
ArgumentParser from torch.utils.data import Dataset from ...io import RandomAccessDataReaderFactory as RF @@ -38,7 +37,6 @@ def __init__( preload_embeds=False, is_val=False, ): - assert embeds is not None or embed_file is not None assert embed_info is not None or embed_info is not None assert class_info is not None or class_files is not None @@ -60,8 +58,8 @@ def __init__( logging.info("dataset contains %d embeddings", len(self.embed_info)) if embeds is None: - if rank == 0: - logging.info("opening dataset %s", rspecifier) + # if rank == 0: + # logging.info("opening dataset %s", rspecifier) self.r = RF.create(embed_file, path_prefix=path_prefix, scp_sep=" ") if self.preload_embeds: self.embeds = self.r.load(embed_info["id"], squeeze=True).astype( @@ -143,7 +141,6 @@ def _get_embed_info(self, embed_id): return embed_info def __getitem__(self, embed_id): - x = self._read_embed(embed_id) data = {"embed_id": embed_id, "x": x} diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py index d1bcb0a8..f8d0862b 100644 --- a/hyperion/torch/data/hyp_sampler.py +++ b/hyperion/torch/data/hyp_sampler.py @@ -2,10 +2,9 @@ import math import numpy as np -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.distributed as dist +from jsonargparse import ActionParser, ArgumentParser from torch.utils.data import Sampler @@ -14,6 +13,7 @@ def __init__(self, shuffle=False, seed=1234): super().__init__(None) self.epoch = 0 self.batch = 0 + self.init_batch = 0 self.shuffle = shuffle self.seed = seed @@ -28,16 +28,18 @@ def __init__(self, shuffle=False, seed=1234): self.world_size = world_size self.rng = torch.Generator() - def set_epoch(self, epoch): + def set_epoch(self, epoch, batch=0): self.epoch = epoch + self.init_batch = batch def _set_seed(self): if self.shuffle: - self.rng.manual_seed(self.seed + 10 * self.epoch) + self.rng.manual_seed(self.seed + 10 * self.epoch + 100 * self.init_batch) else: self.rng.manual_seed(self.seed) def __iter__(self): - self.batch = 0 + self.batch = self.init_batch + self.init_batch = 0 self._set_seed() return self diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py index 2933dcc6..da47c8ac 100644 --- a/hyperion/torch/data/seg_chunk_sampler.py +++ b/hyperion/torch/data/seg_chunk_sampler.py @@ -8,10 +8,9 @@ import numpy as np import pandas as pd -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.distributed as dist +from jsonargparse import ActionParser, ArgumentParser from ...utils.segment_set import SegmentSet from .hyp_sampler import HypSampler @@ -30,7 +29,6 @@ def __init__( seed=1234, **base_kwargs, ): - super().__init__(shuffle=shuffle, seed=seed) self.seg_set = seg_set self.min_chunk_length = min_chunk_length @@ -80,7 +78,6 @@ def get_random_duration(self): return self.min_chunk_length def _create_chunks(self): - chunks = [] for id, len in zip(self.seg_set["id"], self.seg_set[self.length_name]): if len < self.min_chunk_length: @@ -91,7 +88,16 @@ def _create_chunks(self): num_chunks = math.ceil(len / self.avg_chunk_length) start = 0 for i in range(num_chunks - 1): - dur = self.get_random_duration() + remainder = len - start + if remainder < self.min_chunk_length: + remainder = self.min_chunk_length + dur = remainder + start = len - dur + else: + dur = self.get_random_duration() + if dur > remainder: + dur = remainder + chunk = (f"{id}-{i}", id, start, dur) chunks.append(chunk) start += dur @@ -135,7 +141,6 @@ def __next__(self): @staticmethod 
def filter_args(**kwargs): - valid_args = ( "min_chunk_length", "max_chunk_length", diff --git a/hyperion/torch/layer_blocks/conformer_encoder_v1.py b/hyperion/torch/layer_blocks/conformer_encoder_v1.py index 4f8e1b4d..349bef4b 100644 --- a/hyperion/torch/layer_blocks/conformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/conformer_encoder_v1.py @@ -131,6 +131,12 @@ def __init__( if self.concat_after: self.concat_linear = nn.Linear(num_feats + num_feats, num_feats) + def change_attn_dropout(self, att_dropout_rate): + attn = self.self_attn + if hasattr(attn, "dropout_rate"): + attn.dropout_rate = att_dropout_rate + attn.dropout.p = att_dropout_rate + @staticmethod def _make_att( att_type, diff --git a/hyperion/torch/layers/__init__.py b/hyperion/torch/layers/__init__.py index b0b607e2..d53646ed 100644 --- a/hyperion/torch/layers/__init__.py +++ b/hyperion/torch/layers/__init__.py @@ -28,6 +28,12 @@ from .mvn import MeanVarianceNorm from .norm_layer_factory import NormLayer1dFactory, NormLayer2dFactory from .pool_factory import GlobalPool1dFactory -from .pos_encoder import ConvPosEncoder, NoPosEncoder, PosEncoder, RelPosEncoder +from .pos_encoder import ( + ConvPosEncoder, + NoPosEncoder, + PosEncoder, + PosEncoderBase, + RelPosEncoder, +) from .spec_augment import AxisMasker, SpecAugment, SpecWarper from .subpixel_convs import ICNR1d, ICNR2d, SubPixelConv1d, SubPixelConv2d diff --git a/hyperion/torch/layers/activation_factory.py b/hyperion/torch/layers/activation_factory.py index f2467962..e656eff5 100644 --- a/hyperion/torch/layers/activation_factory.py +++ b/hyperion/torch/layers/activation_factory.py @@ -36,6 +36,7 @@ "double_swish": DoubleSwish, "swish6": Swish6, "double_swish6": DoubleSwish6, + "gelu": nn.GELU, } @@ -194,3 +195,6 @@ def get_config(activation): return {"name": "swish6"} if isinstance(activation, DoubleSwish6): return {"name": "double_swish6"} + + if isinstance(activation, nn.GELU): + return {"name": "gelu"} diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 8fe67792..4587fbd2 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -6,7 +6,6 @@ import math import numpy as np - import torch import torch.nn as nn import torch.nn.functional as nnf @@ -64,7 +63,6 @@ def forward_slidwin(self, x, win_length, win_shift): raise NotImplementedError() def _slidwin_pad(self, x, win_length, win_shift, snip_edges): - if snip_edges: num_frames = int( math.floor((x.size(-1) - win_length + win_shift) / win_shift) @@ -145,7 +143,6 @@ def _post_slidwin(self, m_x, x_shape): return m_x def _forward_slidwin_int(self, x, win_length, win_shift, snip_edges): - c_x, out_shape = self._pre_slidwin(x, win_length, win_shift, snip_edges) m_x = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length @@ -204,7 +201,7 @@ def forward(self, x, x_lengths=None, weights=None): # this can produce slightly negative variance when relu6 saturates in all time steps # add 1e-5 for stability s = torch.sqrt( - torch.mean(delta ** 2, dim=self.dim, keepdim=False).clamp(min=SQRT_EPS) + torch.mean(delta**2, dim=self.dim, keepdim=False).clamp(min=SQRT_EPS) ) mus = torch.cat((mu, s), dim=1) @@ -217,7 +214,7 @@ def forward(self, x, x_lengths=None, weights=None): wbar = torch.mean(weights, dim=self.dim, keepdim=True) mu = xbar / wbar delta = x - mu - var = torch.mean(weights * delta ** 2, dim=self.dim, keepdim=True) / wbar + var = torch.mean(weights * delta**2, dim=self.dim, keepdim=True) / wbar s = torch.sqrt(var.clamp(min=SQRT_EPS)) mu 
= mu.squeeze(self.dim) s = s.squeeze(self.dim) @@ -257,19 +254,18 @@ def _forward_slidwin_int(self, x, win_length, win_shift, snip_edges): c_x = torch.cumsum(x, dim=-1).view(-1, x.shape[-1]) m_x = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length - c_x = torch.cumsum(x ** 2, dim=-1).view(-1, x.shape[-1]) + c_x = torch.cumsum(x**2, dim=-1).view(-1, x.shape[-1]) m_x2 = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length - s_x = torch.sqrt(m_x2 - m_x ** 2).clamp(min=SQRT_EPS) + s_x = torch.sqrt(m_x2 - m_x**2).clamp(min=SQRT_EPS) mus = self._post_slidwin(m_x, s_x, out_shape) return mus def _forward_slidwin_float(self, x, win_length, win_shift, snip_edges): - x, out_shape = self._pre_slidwin(x, win_length, win_shift, snip_edges) num_frames = out_shape[-1] c_x = torch.cumsum(x, dim=-1).view(-1, x.shape[-1]) - c_x2 = torch.cumsum(x ** 2, dim=-1).view(-1, x.shape[-1]) + c_x2 = torch.cumsum(x**2, dim=-1).view(-1, x.shape[-1]) # xx = x.view(-1, x.shape[-1]) # print(xx.shape[1]) @@ -313,7 +309,7 @@ def _forward_slidwin_float(self, x, win_length, win_shift, snip_edges): k += win_shift - var_x = (m_x2 - m_x ** 2).clamp(min=SQRT_EPS) + var_x = (m_x2 - m_x**2).clamp(min=SQRT_EPS) s_x = torch.sqrt(var_x) # idx = torch.isnan(s_x) #.any(dim=1) # if torch.sum(idx) > 0: @@ -404,14 +400,14 @@ def forward(self, x, x_lengths=None, weights=None): weights = self._standardize_weights(x, x_lengths, weights) if weights is None: mu = torch.mean(x, dim=self.dim, keepdim=self.keepdim) - x2bar = torch.mean(x ** 2, dim=self.dim, keepdim=self.keepdim) + x2bar = torch.mean(x**2, dim=self.dim, keepdim=self.keepdim) logvar = torch.log(x2bar - mu * mu + 1e-5) # for stability in case var=0 return torch.cat((mu, logvar), dim=-1) xbar = torch.mean(weights * x, dim=self.dim, keepdim=self.keepdim) wbar = torch.mean(weights, dim=self.dim, keepdim=self.keepdim) mu = xbar / wbar - x2bar = torch.mean(weights * x ** 2, dim=self.dim, keepdim=self.keepdim) / wbar + x2bar = torch.mean(weights * x**2, dim=self.dim, keepdim=self.keepdim) / wbar var = (x2bar - mu * mu).clamp(min=1e-5) logvar = torch.log(var) @@ -448,7 +444,7 @@ def __init__( if dist_pow == 1: self.dist_f = lambda x: torch.norm(x, p=2, dim=-1) else: - self.dist_f = lambda x: torch.sum(x ** 2, dim=-1) + self.dist_f = lambda x: torch.sum(x**2, dim=-1) self.size_multiplier = num_comp @@ -507,7 +503,7 @@ def forward(self, x, x_lengths=None, weights=None): delta = x - self.mu # (batch, time, num_comp, feat_dim) dist = self.dist_f(delta) # (batch, time, num_comp) - llk = -self.prec ** 2 * dist + self.bias + llk = -self.prec**2 * dist + self.bias r = nnf.softmax(llk, dim=-1) # (batch, time, num_comp) if weights is not None: r *= weights @@ -527,7 +523,6 @@ def forward(self, x, x_lengths=None, weights=None): return pool def get_config(self): - config = { "in_feats": self.in_feats, "num_comp": self.num_comp, @@ -783,10 +778,22 @@ def forward(self, x, x_lengths=None, weights=None): # x = (batch, feat_dim, time) weights = self._standardize_weights(x, x_lengths, weights) # (batch, 1, time) x_inner = self.conv1(x) # (batch, inner_dim, time) + assert not torch.any( + torch.isnan(x_inner) + ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(x))} {torch.mean(x)} {torch.sum(torch.isinf(x))} {x.size()}" + assert not torch.any( + torch.isinf(x_inner) + ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(x))}" # logging.info('x_inner1={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) if 
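The sliding-window statistics in _forward_slidwin_int above come from the cumulative-sum identity: the sum over any window is the difference of two cumsum entries, so windowed means and standard deviations need no explicit loop. A standalone sketch of the identity (the patch's padding and win_shift handling differ; this shows the core trick only):

import torch

x = torch.randn(2, 100)                         # (batch, time)
win = 10
# prepend a zero so every window sum is a simple difference of cumsums
c1 = torch.nn.functional.pad(x, (1, 0)).cumsum(dim=-1)
c2 = torch.nn.functional.pad(x**2, (1, 0)).cumsum(dim=-1)
mean = (c1[:, win:] - c1[:, :-win]) / win       # mean of every length-10 window
mean2 = (c2[:, win:] - c2[:, :-win]) / win
std = (mean2 - mean**2).clamp(min=1e-5).sqrt()  # clamped for stability, like SQRT_EPS above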
self.use_global_context: global_mus = self.stats_pool(x, weights=weights) x_inner = x_inner + self.lin_global(global_mus).unsqueeze(-1) + assert not torch.any( + torch.isnan(x_inner) + ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(global_mus))}" + assert not torch.any( + torch.isinf(x_inner) + ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(global_mus))}" # logging.info('x_inner2={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) attn = self.conv2( self.activation(self.norm_layer(x_inner)) @@ -807,11 +814,23 @@ def forward(self, x, x_lengths=None, weights=None): if weights is not None: attn = attn * weights + assert not torch.any( + torch.isnan(attn) + ), f"attn is nan {torch.sum(torch.isnan(attn))}" + assert not torch.any( + torch.isinf(attn) + ), f"attn is inf {torch.sum(torch.isinf(attn))}" mus = self.stats_pool(x, weights=attn) # logging.info('mus={} {}'.format(torch.sum(torch.isnan(mus)), torch.sum(torch.isinf(mus)))) if self.keepdim: mus = mus.unsqueeze(self.dim) + assert not torch.any( + torch.isnan(mus) + ), f"mus is nan {torch.sum(torch.isnan(mus))}" + assert not torch.any( + torch.isinf(mus) + ), f"mus is inf {torch.sum(torch.isinf(mus))}" return mus def get_config(self): diff --git a/hyperion/torch/layers/norm_layer_factory.py b/hyperion/torch/layers/norm_layer_factory.py index 8543b31b..8c0ebdeb 100644 --- a/hyperion/torch/layers/norm_layer_factory.py +++ b/hyperion/torch/layers/norm_layer_factory.py @@ -105,4 +105,4 @@ def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): if norm_name == "layer-norm": # it is equivalent to groupnorm with 1 group - return lambda x, momentum=momentum, eps=eps: nn.GroupNorm(1, x, eps=eps) + return lambda x, momentum=momentum, eps=eps: nn.LayerNorm(x, eps=eps) diff --git a/hyperion/torch/layers/pos_encoder.py b/hyperion/torch/layers/pos_encoder.py index b6f3672e..f18eb51f 100644 --- a/hyperion/torch/layers/pos_encoder.py +++ b/hyperion/torch/layers/pos_encoder.py @@ -11,7 +11,11 @@ from .activation_factory import ActivationFactory as AF -class PosEncoder(nn.Module): +class PosEncoderBase(nn.Module): + pass + + +class PosEncoder(PosEncoderBase): """Positional encoding. 
Attributes: @@ -32,9 +36,9 @@ def __repr__(self): return self.__str__() def __str__(self): - s = "{}(num_feats={}, dropout_rate={})".format(self.__class__.__name__, - self.num_feats, - self.dropout_rate) + s = "{}(num_feats={}, dropout_rate={})".format( + self.__class__.__name__, self.num_feats, self.dropout_rate + ) return s def _pe(self, x, relative=False): @@ -48,14 +52,15 @@ def _pe(self, x, relative=False): pe = torch.zeros(x.size(1), self.num_feats) if relative: # this is for relative positional encoders - position = torch.arange(x.size(1) - 1, -1, -1, - dtype=torch.float32).unsqueeze(1) + position = torch.arange( + x.size(1) - 1, -1, -1, dtype=torch.float32 + ).unsqueeze(1) else: - position = torch.arange(0, x.size(1), - dtype=torch.float32).unsqueeze(1) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) div_term = torch.exp( - torch.arange(0, self.num_feats, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.num_feats)) + torch.arange(0, self.num_feats, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.num_feats) + ) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0) @@ -72,7 +77,7 @@ def forward(self, x: torch.Tensor): x-scaled + pos-encoder """ pe = self._pe(x) - x = x * self.xscale + pe[:, :x.size(1)] + x = x * self.xscale + pe[:, : x.size(1)] if self.dropout_rate > 0: return self.dropout(x) return x @@ -107,7 +112,7 @@ def forward(self, x: torch.Tensor): x = x * self.xscale # we want embedding [R_L,..., R_0] # while in non relative we want [R_0, ..., R_L] - pos_emb = self.pe[:, -x.size(1):] + pos_emb = self.pe[:, -x.size(1) :] # this pos_emb is matrix Q in # https://arxiv.org/pdf/1901.02860.pdf Appendix B # I think it should have been denoted as R, @@ -119,7 +124,7 @@ def forward(self, x: torch.Tensor): return x, pos_emb -class NoPosEncoder(nn.Module): +class NoPosEncoder(PosEncoderBase): """This is a dummy class for the case where we deactivate the positional encoder @@ -140,7 +145,7 @@ def forward(self, x: torch.Tensor): return x -class ConvPosEncoder(nn.Module): +class ConvPosEncoder(PosEncoderBase): """Convolutional positional encoder like the one used in wav2vec2 Attributes: @@ -150,14 +155,21 @@ class ConvPosEncoder(nn.Module): activation: hidden activation """ - def __init__(self, num_feats: int, kernel_size: int, num_groups: int, - activation: Union[str, nn.Module]): + def __init__( + self, + num_feats: int, + kernel_size: int, + num_groups: int, + activation: Union[str, nn.Module], + ): super().__init__() - self.conv = nn.Conv1d(num_feats, - num_feats, - kernel_size=kernel_size, - padding=kernel_size // 2, - groups=num_groups) + self.conv = nn.Conv1d( + num_feats, + num_feats, + kernel_size=kernel_size, + padding=kernel_size // 2, + groups=num_groups, + ) self.activation = AF.create(activation) self.num_pad_remove = 1 if kernel_size % 2 == 0 else 0 @@ -165,7 +177,7 @@ def forward(self, x: torch.Tensor): x = x.transpose(1, 2) x = self.conv(x) if self.num_pad_remove > 0: - x = x[:, :, :-self.num_pad_remove] + x = x[:, :, : -self.num_pad_remove] x = self.activation(x).transpose(1, 2) diff --git a/hyperion/torch/layers/swish.py b/hyperion/torch/layers/swish.py index 62225ad9..9ba0a896 100644 --- a/hyperion/torch/layers/swish.py +++ b/hyperion/torch/layers/swish.py @@ -55,16 +55,16 @@ def __str__(self): class DoubleSwishImplementation(torch.autograd.Function): - """ Implementation for DoubleSwish Activation from - 
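The _pe table reflowed above is the standard sinusoidal encoding, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)); the relative variant only reverses the position order. A minimal standalone version of the same computation:

import math
import torch

def sinusoidal_pe(seq_len: int, num_feats: int, relative: bool = False) -> torch.Tensor:
    pe = torch.zeros(seq_len, num_feats)
    if relative:  # positions [L-1, ..., 0] so the table reads [R_{L-1}, ..., R_0]
        position = torch.arange(seq_len - 1, -1, -1, dtype=torch.float32).unsqueeze(1)
    else:
        position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, num_feats, 2, dtype=torch.float32)
        * -(math.log(10000.0) / num_feats)
    )
    pe[:, 0::2] = torch.sin(position * div_term)  # even dims: sine
    pe[:, 1::2] = torch.cos(position * div_term)  # odd dims: cosine
    return pe.unsqueeze(0)                        # (1, seq_len, num_feats)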
https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+    """Implementation for DoubleSwish Activation from
+    https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
 
-    f(x) = x * torch.sigmoid(x-1) = swish(swish(x)),
+    f(x) = x * torch.sigmoid(x-1) = swish(swish(x)),
     where swish(x) = x * sigmoid(x).
 
     Memory-efficient derivative computation:
     f'(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x).
     where s(x) = sigmoid(x-1), and s'(x) = s(x) * (1-s(x)).
-
+
     f'(x) = x * s(x) * (1-s(x)) + s(x) = f(x) * (1-s(x)) + s(x)
     """
@@ -108,15 +108,14 @@ def backward(ctx, y_grad: torch.Tensor) -> torch.Tensor:
 
 
 class DoubleSwish(torch.nn.Module):
-    """ DoubleSwish activation
-       f(x) = x * torch.sigmoid(x-1) = swish(swish(x)),
-       where swish(x) = x * sigmoid(x).
+    """DoubleSwish activation
+    f(x) = x * torch.sigmoid(x-1) = swish(swish(x)),
+    where swish(x) = x * sigmoid(x).
     """
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-
         if torch.jit.is_scripting() or torch.jit.is_tracing():
-            return (x * torch.sigmoid(x - 1.0)).clamp(max=6)
+            return x * torch.sigmoid(x - 1.0)
         return DoubleSwishImplementation.apply(x)
@@ -129,10 +128,10 @@ def __str__(self):
 
 
 class DoubleSwish6(torch.nn.Module):
-    """ DoubleSwish activation clamped to 6
+    """DoubleSwish activation clamped to 6
        x = min(x, 6)
-       f(x) = x * torch.sigmoid(x-1) = swish(swish(x)),
-       where swish(x) = x * sigmoid(x).
+    f(x) = x * torch.sigmoid(x-1) = swish(swish(x)),
+    where swish(x) = x * sigmoid(x).
     """
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
diff --git a/hyperion/torch/losses/__init__.py b/hyperion/torch/losses/__init__.py
index bf3ce279..6f68ad45 100644
--- a/hyperion/torch/losses/__init__.py
+++ b/hyperion/torch/losses/__init__.py
@@ -4,3 +4,4 @@
 """
 
 from .bce_with_llr import BCEWithLLR
+from .dino_loss import DINOLoss
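DoubleSwishImplementation's backward above exploits that the derivative can be rebuilt from the forward output: with s = sigmoid(x - 1) and f(x) = x * s, f'(x) = f(x) * (1 - s) + s, so only small quantized tensors need to be saved for backward. A quick numerical check of that identity against autograd (illustrative only, not part of the patch):

import torch

x = torch.randn(1000, dtype=torch.float64, requires_grad=True)
s = torch.sigmoid(x - 1.0)
f = x * s                                        # DoubleSwish forward
(g_autograd,) = torch.autograd.grad(f.sum(), x)
g_manual = f.detach() * (1 - s.detach()) + s.detach()   # f'(x) = f(x)*(1-s) + s
assert torch.allclose(g_autograd, g_manual)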
diff --git a/hyperion/torch/losses/dino_loss.py b/hyperion/torch/losses/dino_loss.py
new file mode 100644
index 00000000..b22489a3
--- /dev/null
+++ b/hyperion/torch/losses/dino_loss.py
@@ -0,0 +1,164 @@
+"""
+ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from jsonargparse import ActionParser, ActionYesNo, ArgumentParser
+
+from ...utils.misc import filter_func_args
+
+
+class DINOLoss(nn.Module):
+    """Loss for Training DIstillation with NO labels.
+
+    Args:
+      num_classes: number of DINO classes
+      student_temp: temperature of student distribution
+      teacher_temp: final temperature of teacher distribution
+      teacher_warmup_temp: initial temperature of teacher distribution
+      temp_warmup_epochs: warmup epochs for the teacher temperature
+      center_momentum: momentum for centering of the teacher distribution
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        student_temp: float = 0.1,
+        teacher_temp: float = 0.04,
+        teacher_warmup_temp: float = 0.04,
+        temp_warmup_epochs: int = 30,
+        center_momentum: float = 0.9,
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.student_temp = student_temp
+        self.teacher_temp = teacher_temp
+        self.teacher_warmup_temp = teacher_warmup_temp
+        self.temp_warmup_epochs = temp_warmup_epochs
+        self.center_momentum = center_momentum
+        self.cur_teacher_temp = teacher_warmup_temp
+        self.register_buffer("center", torch.zeros(1, num_classes))
+
+    def update_temp(self, epoch: int):
+        if epoch < self.temp_warmup_epochs:
+            self.cur_teacher_temp = (
+                self.teacher_warmup_temp
+                + (self.teacher_temp - self.teacher_warmup_temp)
+                * epoch
+                / self.temp_warmup_epochs
+            )
+            logging.info("updating dino-loss teacher temp=%.2f", self.cur_teacher_temp)
+        else:
+            self.cur_teacher_temp = self.teacher_temp
+
+    def forward(
+        self,
+        student_pred: torch.Tensor,
+        teacher_pred: torch.Tensor,
+        num_student_crops: int,
+        num_teacher_crops: int,
+    ):
+        """
+        Cross-entropy between softmax outputs of the teacher and student networks.
+        """
+        assert not torch.any(torch.isnan(student_pred)), f"loss/student is nan"
+        student_pred = student_pred / self.student_temp
+        assert not torch.any(torch.isnan(student_pred)), f"loss/p is nan"
+        student_pred = student_pred.chunk(num_student_crops)
+        teacher_pred = teacher_pred.detach()
+        center = self.center  # we take the center before updating it
+        if self.training:
+            self.update_center(teacher_pred)
+        assert not torch.any(torch.isnan(teacher_pred)), f"loss/teacher is nan"
+        teacher_pred = nn.functional.softmax(
+            (teacher_pred - center) / self.cur_teacher_temp, dim=-1
+        )
+        assert not torch.any(torch.isnan(teacher_pred)), f"loss/q is nan {center}"
+        teacher_pred = teacher_pred.chunk(num_teacher_crops)
+
+        total_loss = 0
+        n_loss_terms = 0
+        for iq, q in enumerate(teacher_pred):
+            for ip, p in enumerate(student_pred):
+                if ip == iq and num_teacher_crops > 1:
+                    # we skip cases where student and teacher operate on the same view
+                    continue
+                loss = torch.sum(-q * nn.functional.log_softmax(p, dim=-1), dim=-1)
+                assert not torch.any(
+                    torch.isnan(loss)
+                ), f"loss is nan {iq} {ip} {torch.mean(q)} {torch.mean(p)} {torch.mean(center)}"
+                total_loss += loss.mean()
+                n_loss_terms += 1
+        total_loss /= n_loss_terms
+        return total_loss
+
+    @torch.no_grad()
+    def update_center(self, teacher_pred: torch.Tensor):
+        """
+        Update center used for teacher output.
+        """
+        batch_acc = torch.sum(teacher_pred, dim=0, keepdim=True)
+        batch_size = torch.as_tensor(teacher_pred.size(0), device=batch_acc.device)
+        if dist.is_initialized():
+            dist.all_reduce(batch_size, op=dist.ReduceOp.SUM)
+            dist.all_reduce(batch_acc, op=dist.ReduceOp.SUM)
+
+        batch_center = batch_acc / batch_size
+        assert not torch.any(
+            torch.isnan(batch_center)
+        ), f"bc is nan {torch.mean(batch_acc)} {batch_size}"
+        # ema update
+        self.center = self.center * self.center_momentum + batch_center * (
+            1 - self.center_momentum
+        )
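Taken together, forward() and update_center() implement the usual DINO recipe: the teacher output is centered (an EMA over batches, all-reduced across workers) and sharpened with a low temperature, which jointly discourage collapse to a single class or to the uniform distribution, and the student is trained by cross-entropy against it. A condensed standalone sketch of the per-pair term (hypothetical sizes, outside the patch):

import torch
import torch.nn.functional as F

def dino_ce(student_pred, teacher_pred, center, t_s=0.1, t_t=0.04):
    # teacher: detach, center, sharpen with the low teacher temperature
    q = F.softmax((teacher_pred.detach() - center) / t_t, dim=-1)
    # student: temperature-scaled log-softmax
    log_p = F.log_softmax(student_pred / t_s, dim=-1)
    return torch.sum(-q * log_p, dim=-1).mean()

center = torch.zeros(1, 16)
loss = dino_ce(torch.randn(8, 16), torch.randn(8, 16), center)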
+ """ + batch_acc = torch.sum(teacher_pred, dim=0, keepdim=True) + batch_size = torch.as_tensor(teacher_pred.size(0), device=batch_acc.device) + if dist.is_initialized(): + dist.all_reduce(batch_size, op=dist.ReduceOp.SUM) + dist.all_reduce(batch_acc, op=dist.ReduceOp.SUM) + + batch_center = batch_acc / batch_size + assert not torch.any( + torch.isnan(batch_center) + ), f"bc is nan {torch.mean(batch_acc)} {batch_size}" + # ema update + self.center = self.center * self.center_momentum + batch_center * ( + 1 - self.center_momentum + ) + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(DINOLoss.__init__, kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--num-classes", default=65536, type=int, help="number of DINO classes" + ) + parser.add_argument( + "--student-temp", + default=0.1, + type=float, + help="temperature of student distribution", + ) + parser.add_argument( + "--teacher-temp", + default=0.07, + type=float, + help="final temperature of teacher distribution", + ) + parser.add_argument( + "--teacher-warmup-temp", + default=0.04, + type=float, + help="initial temperature of teacher distribution", + ) + parser.add_argument( + "--temp-warmup-epochs", + default=30, + type=int, + help="warmup epochs for the teacher temperature", + ) + parser.add_argument( + "--center-momentum", + default=0.9, + type=float, + help="momumntum for centering of the teacher distribution", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py index cf003ca7..f2886203 100644 --- a/hyperion/torch/lr_schedulers/factory.py +++ b/hyperion/torch/lr_schedulers/factory.py @@ -2,9 +2,8 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ActionParser, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from .cos_lr import AdamCosineLR, CosineLR from .exp_lr import ExponentialLR @@ -14,8 +13,7 @@ from .triangular_lr import TriangularLR -class LRSchedulerFactory(object): - +class LRSchedulerFactory: def create( optimizer, lrsch_type, @@ -174,7 +172,6 @@ def create( @staticmethod def filter_args(**kwargs): - valid_args = ( "lrsch_type", "decay_rate", @@ -222,9 +219,11 @@ def add_class_args(parser, prefix=None): "noam_lr", "triangular_lr", ], - help=("Learning rate schedulers: None, Exponential," - "Cosine Annealing, Cosine Annealing for Adam," - "Reduce on Plateau"), + help=( + "Learning rate schedulers: None, Exponential," + "Cosine Annealing, Cosine Annealing for Adam," + "Reduce on Plateau" + ), ) parser.add_argument( @@ -233,29 +232,22 @@ def add_class_args(parser, prefix=None): type=float, help=("LR decay rate in exp lr"), ) - parser.add_argument("--decay-steps", - default=100, - type=int, - help=("LR decay steps in exp lr")) - parser.add_argument("--power", - default=0.5, - type=float, - help=("power in inverse power lr")) - - parser.add_argument("--hold-steps", - default=10, - type=int, - help=("LR hold steps in exp lr")) - parser.add_argument("--t", - default=10, - type=int, - help=("Period in cos lr")) + parser.add_argument( + "--decay-steps", default=100, type=int, help=("LR decay steps in exp lr") + ) + parser.add_argument( + "--power", default=0.5, type=float, 
help=("power in inverse power lr") + ) + + parser.add_argument( + "--hold-steps", default=10, type=int, help=("LR hold steps in exp lr") + ) + parser.add_argument("--t", default=10, type=int, help=("Period in cos lr")) parser.add_argument( "--t-mul", default=1, type=int, - help=( - "Period multiplicator for each restart in cos/triangular lr"), + help=("Period multiplicator for each restart in cos/triangular lr"), ) parser.add_argument( "--gamma", @@ -267,13 +259,13 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--warm-restarts", default=False, - action="store_true", + action=ActionYesNo, help=("Do warm restarts in cos lr"), ) - parser.add_argument("--monitor", - default="val_loss", - help=("Monitor metric to reduce lr")) + parser.add_argument( + "--monitor", default="val_loss", help=("Monitor metric to reduce lr") + ) parser.add_argument( "--mode", default="min", @@ -285,24 +277,21 @@ def add_class_args(parser, prefix=None): "--factor", default=0.1, type=float, - help=( - "Factor by which the learning rate will be reduced on plateau" - ), + help=("Factor by which the learning rate will be reduced on plateau"), ) parser.add_argument( "--patience", default=10, type=int, - help= - ("Number of epochs with no improvement after which learning rate will be reduced" - ), + help=( + "Number of epochs with no improvement after which learning rate will be reduced" + ), ) - parser.add_argument("--threshold", - default=1e-4, - type=float, - help=("Minimum metric improvement")) + parser.add_argument( + "--threshold", default=1e-4, type=float, help=("Minimum metric improvement") + ) parser.add_argument( "--threshold_mode", @@ -315,20 +304,16 @@ def add_class_args(parser, prefix=None): "--cooldown", default=0, type=int, - help= - ("Number of epochs to wait before resuming normal operation after lr has been reduced" - ), + help=( + "Number of epochs to wait before resuming normal operation after lr has been reduced" + ), ) - parser.add_argument("--eps", - default=1e-8, - type=float, - help=("Minimum decay applied to lr")) + parser.add_argument( + "--eps", default=1e-8, type=float, help=("Minimum decay applied to lr") + ) - parser.add_argument("--min-lr", - default=0, - type=float, - help=("Minimum lr")) + parser.add_argument("--min-lr", default=0, type=float, help=("Minimum lr")) parser.add_argument( "--warmup-steps", @@ -352,13 +337,12 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--update-lr-on-opt-step", default=False, - action="store_true", + action=ActionYesNo, help=("Update lr based on batch number instead of epoch number"), ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='learning rate scheduler options') add_argparse_args = add_class_args diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py index 3326ef81..b8e7fe74 100644 --- a/hyperion/torch/models/transducer/rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_transducer.py @@ -24,7 +24,6 @@ @dataclass class RNNTransducerOutput(HypDataClass): - loss: torch.Tensor loss_simple: Optional[torch.Tensor] = None loss_pruned: Optional[torch.Tensor] = None @@ -32,7 +31,7 @@ class RNNTransducerOutput(HypDataClass): class RNNTransducer(TorchModel): - """ Base-class for RNN-T in + """Base-class for RNN-T in "Sequence Transduction with Recurrent Neural Networks" https://arxiv.org/pdf/1211.3711.pdf @@ -92,13 
+91,15 @@ def forward(
         output = RNNTransducerOutput(*dec_output)
         return output
 
-    def infer(self,
-              x: torch.Tensor,
-              x_lengths: torch.Tensor,
-              decoding_method="time_sync_beam_search",
-              beam_width: int = 5,
-              max_sym_per_frame: int = 3,
-              max_sym_per_utt: int = 1000) -> List[List[int]]:
+    def infer(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        decoding_method="time_sync_beam_search",
+        beam_width: int = 5,
+        max_sym_per_frame: int = 3,
+        max_sym_per_utt: int = 1000,
+    ) -> List[List[int]]:
         """
         ASR tokens inference
         Args:
@@ -121,12 +122,14 @@ def infer(self,
         batch_size = x.size(0)
         y = []
         for i in range(batch_size):
-            x_i = x[i:i + 1, :x_lengths[i]]
-            y_i = self.decoder.decode(x_i,
-                                      method=decoding_method,
-                                      beam_width=beam_width,
-                                      max_sym_per_frame=max_sym_per_frame,
-                                      max_sym_per_utt=max_sym_per_utt)
+            x_i = x[i : i + 1, : x_lengths[i]]
+            y_i = self.decoder.decode(
+                x_i,
+                method=decoding_method,
+                beam_width=beam_width,
+                max_sym_per_frame=max_sym_per_frame,
+                max_sym_per_utt=max_sym_per_utt,
+            )
             y.append(y_i)
         return y
 
@@ -180,7 +183,6 @@ def filter_args(**kwargs):
 
     @staticmethod
     def add_class_args(parser, prefix=None, skip=set()):
-
         if prefix is not None:
             outer_parser = parser
             parser = ArgumentParser(prog="")
@@ -188,8 +190,7 @@ def add_class_args(parser, prefix=None, skip=set()):
         RNNTransducerDecoder.add_class_args(parser, prefix="decoder")
 
         if prefix is not None:
-            outer_parser.add_argument("--" + prefix,
-                                      action=ActionParser(parser=parser))
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
 
     def change_config(
         self,
@@ -201,7 +202,7 @@ def change_config(
 
     @staticmethod
     def filter_finetune_args(**kwargs):
         args = {}
-        decoder_args = Decoder.filter_finetune_args(**kwargs["decoder"])
+        decoder_args = RNNTransducerDecoder.filter_finetune_args(**kwargs["decoder"])
         args["decoder"] = decoder_args
         return args
 
@@ -214,8 +215,7 @@ def add_finetune_args(parser, prefix=None):
         RNNTransducerDecoder.add_finetune_args(parser, prefix="decoder")
 
         if prefix is not None:
-            outer_parser.add_argument("--" + prefix,
-                                      action=ActionParser(parser=parser))
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
 
     @staticmethod
     def add_infer_args(parser, prefix=None):
@@ -223,29 +223,34 @@ def add_infer_args(parser, prefix=None):
             outer_parser = parser
             parser = ArgumentParser(prog="")
 
-        parser.add_argument("--decoding-method",
-                            default="time_sync_beam_search",
-                            choices=[
-                                "greedy", "time_sync_beam_search",
-                                "align_length_sync_beam_search"
-                            ])
-
-        parser.add_argument("--beam-width",
-                            default=5,
-                            type=int,
-                            help="beam width for beam search")
-        parser.add_argument("--max-sym-per-frame",
-                            default=3,
-                            type=int,
-                            help="max symbols RNN-T can emit in 1 frame")
-        parser.add_argument("--max-sym-per-utt",
-                            default=1000,
-                            type=int,
-                            help="max symbols RNN-T can emit in 1 frame")
+        parser.add_argument(
+            "--decoding-method",
+            default="time_sync_beam_search",
+            choices=[
+                "greedy",
+                "time_sync_beam_search",
+                "align_length_sync_beam_search",
+            ],
+        )
+
+        parser.add_argument(
+            "--beam-width", default=5, type=int, help="beam width for beam search"
+        )
+        parser.add_argument(
+            "--max-sym-per-frame",
+            default=3,
+            type=int,
+            help="max symbols RNN-T can emit in 1 frame",
+        )
+        parser.add_argument(
+            "--max-sym-per-utt",
+            default=1000,
+            type=int,
+            help="max symbols RNN-T can emit in 1 utterance",
+        )
 
         if prefix is not None:
-            outer_parser.add_argument("--" + prefix,
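The infer() loop above decodes one utterance at a time, slicing the padded batch down to each sequence's true length before handing it to the decoder. A hypothetical call (variable names assumed) matching the flags registered by add_infer_args:

tokens = model.infer(
    x,                                        # (batch, time, feat_dim)
    x_lengths,
    decoding_method="time_sync_beam_search",  # or "greedy", "align_length_sync_beam_search"
    beam_width=5,
    max_sym_per_frame=3,
    max_sym_per_utt=1000,
)                                             # -> one list of token ids per utterance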
action=ActionParser(parser=parser)) @staticmethod def filter_infer_args(**kwargs): diff --git a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py index fc10f810..9a939346 100644 --- a/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/hf_wav2xvector.py @@ -310,21 +310,15 @@ def freeze_hf_except_lora(self, bias=None): self.hf_feats.freeze_except_lora(bias) def has_param_groups(self): - return self.hf_feats.has_param_groups() + return self.hf_feats.has_param_groups() or self.xvector.has_param_groups() def trainable_param_groups(self): if not self.has_param_groups(): - return self.trainable_parameters() + return [{"params": self.trainable_parameters()}] param_groups = self.hf_feats.trainable_param_groups() param_groups.append({"params": self.feat_fuser.trainable_parameters()}) - # if self.feat_fusion_method == "weighted-avg": - # if self.feat_fuser.requires_grad: - # param_groups.append({"params": self.feat_fuser}) - # else: - # param_groups.append({"params": self.feat_fuser.parameters()}) - - param_groups.append({"params": self.xvector.trainable_parameters()}) + param_groups.extend(self.xvector.trainable_param_groups()) return param_groups def set_train_mode(self, mode): @@ -362,6 +356,9 @@ def set_train_mode(self, mode): else: raise ValueError(f"invalid train_mode={mode}") + if self.xvector.head_type == "dino": + self.xvector.classif_net.freeze_output_g() + logging.info("train mode set to %s", mode) if "nograd" in mode or mode == "ft-embed-affine": @@ -377,6 +374,7 @@ def _train(self, train_mode: str): super()._train(train_mode) elif train_mode == "ft-embed-affine": self.hf_feats.train() + self.feat_fuser.train() self.xvector._train("ft-embed_affine") elif train_mode in [ "ft-xvector", @@ -389,6 +387,7 @@ def _train(self, train_mode: str): "hf-lora-with-bias", ]: self.hf_feats.train() + self.feat_fuser.train() self.xvector._train("full") else: raise ValueError(f"invalid train_mode={train_mode}") diff --git a/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py b/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py index ad6ae4c7..3f6acf02 100644 --- a/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2conformer_v1_xvector.py @@ -68,3 +68,21 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = {} + child_args = ConformerV1XVector.filter_dino_teacher_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ConformerV1XVector.add_dino_teacher_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index 0e4faded..aa01850f 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ..xvectors import 
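trainable_param_groups() above returns the list-of-dicts format that torch optimizers accept, so each component (pretrained hf_feats, feature fuser, x-vector head) can carry its own options, such as a smaller learning rate for the pretrained front-end. A sketch of how such groups are consumed (hypothetical values, not from the patch):

import torch

param_groups = model.trainable_param_groups()
# e.g. [{"params": <hf_feats params>, "lr": 1e-5}, {"params": <fuser>}, {"params": <xvector>}]
optimizer = torch.optim.AdamW(param_groups, lr=1e-4, weight_decay=1e-2)
# groups without their own "lr" fall back to the optimizer-level default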
ResNet1dXVector from .wav2xvector import Wav2XVector @@ -26,7 +25,6 @@ class Wav2ResNet1dXVector(Wav2XVector): """ def __init__(self, feats, xvector): - if isinstance(xvector, dict): xvector = ResNet1dXVector.filter_args(**xvector) xvector = ResNet1dXVector(**xvector) @@ -70,3 +68,21 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = {} + child_args = ResNet1dXVector.filter_dino_teacher_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNet1dXVector.add_dino_teacher_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py index 11d643af..642c282d 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ..xvectors import ResNetXVector from .wav2xvector import Wav2XVector @@ -26,7 +25,6 @@ class Wav2ResNetXVector(Wav2XVector): """ def __init__(self, feats, xvector): - if isinstance(xvector, dict): xvector = ResNetXVector.filter_args(**xvector) xvector = ResNetXVector(**xvector) @@ -70,3 +68,21 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = {} + child_args = ResNetXVector.filter_dino_teacher_args(**kwargs["xvector"]) + base_args["xvector"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ResNetXVector.add_dino_teacher_args(parser, prefix="xvector") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 4bbc0c4c..501fa7f8 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -5,10 +5,9 @@ import contextlib import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import AudioFeatsMVN from ...torch_model import TorchModel @@ -24,7 +23,6 @@ class Wav2XVector(TorchModel): """ def __init__(self, feats, xvector): - super().__init__() if isinstance(feats, dict): @@ -42,6 +40,15 @@ def __init__(self, feats, xvector): def sample_frequency(self): return self.feats.sample_frequency + # def clone(self): + # # weight normalized layers cannot be copied with deepcopy, + # # we remove them to clone and put them back later + # modules, cloned_modules = self.xvector.before_cloning() + # new_self = super().clone() + # self.xvector.after_cloning(*modules) + # new_self.xvector.after_cloning(*cloned_modules) + # return new_self + def 
compute_prototype_affinity(self): return self.xvector.compute_prototype_affinity() @@ -80,6 +87,9 @@ def change_config(self, xvector): logging.info("changing wav2xvector config") self.xvector.change_config(**xvector) + def cancel_output_layer_grads(self): + self.xvector.cancel_output_layer_grads() + def forward( self, x, @@ -91,7 +101,6 @@ def forward( classif_layers=None, return_output=True, ): - with self._feats_context: if vad_samples is not None: x, x_lengths = remove_silence(x, vad_samples, x_lengths) @@ -125,7 +134,6 @@ def extract_embed( embed_layer=None, detach_chunks=False, ): - with self._feats_context: if vad_samples is not None: x, x_lengths = remove_silence(x, vad_samples, x_lengths) @@ -140,6 +148,10 @@ def extract_embed( feats, feat_lengths, chunk_length, embed_layer, detach_chunks ) + def trainable_param_groups(self): + param_groups = self.xvector.trainable_param_groups() + return param_groups + def set_train_mode(self, mode): if mode == self._train_mode: return @@ -155,7 +167,6 @@ def set_train_mode(self, mode): self._train_mode = mode def _train(self, train_mode: str): - self.feats.train() if train_mode in ["frozen"]: super()._train(train_mode) diff --git a/hyperion/torch/models/xvectors/conformer_v1_xvector.py b/hyperion/torch/models/xvectors/conformer_v1_xvector.py index f52b8700..896cad77 100644 --- a/hyperion/torch/models/xvectors/conformer_v1_xvector.py +++ b/hyperion/torch/models/xvectors/conformer_v1_xvector.py @@ -34,9 +34,16 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=True, + head_use_norm=True, head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", + bias_weight_decay=None, ): if isinstance(encoder, dict): logging.info(f"making conformer encoder network={encoder}") @@ -65,10 +72,17 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) def get_config(self): @@ -88,6 +102,7 @@ def get_config(self): def change_config( self, encoder, + override_output=False, override_dropouts=False, dropout_rate=0, num_classes=None, @@ -100,6 +115,7 @@ def change_config( num_subcenters=2, ): super().change_config( + override_output, False, dropout_rate, num_classes, @@ -170,3 +186,22 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = Encoder.filter_finetune_args(**kwargs["encoder"]) + base_args["encoder"] = child_args + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + Encoder.add_finetune_args(parser, prefix="encoder", skip=set()) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py 
b/hyperion/torch/models/xvectors/efficient_net_xvector.py
index 132bb51d..923be8eb 100644
--- a/hyperion/torch/models/xvectors/efficient_net_xvector.py
+++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py
@@ -5,10 +5,9 @@
 
 import logging
 
-from jsonargparse import ActionParser, ArgumentParser
-
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
 
 from ...narchs import EfficientNet as EN
 from .xvector import XVector
@@ -52,11 +51,17 @@ def __init__(
         head_norm_layer=None,
         use_norm=True,
         norm_before=True,
+        head_use_norm=True,
         head_use_in_norm=False,
+        head_hid_dim=2048,
+        head_bottleneck_dim=256,
+        proj_head_use_norm=True,
+        proj_head_norm_before=True,
         embed_layer=0,
         proj_feats=None,
+        head_type="x-vector",
+        bias_weight_decay=None,
     ):
-
         logging.info("making %s encoder network", effnet_type)
         encoder_net = EN(
             effnet_type,
@@ -99,11 +104,18 @@ def __init__(
             head_norm_layer=head_norm_layer,
             use_norm=use_norm,
             norm_before=norm_before,
+            head_use_norm=head_use_norm,
             head_use_in_norm=head_use_in_norm,
+            head_hid_dim=head_hid_dim,
+            head_bottleneck_dim=head_bottleneck_dim,
+            proj_head_use_norm=proj_head_use_norm,
+            proj_head_norm_before=proj_head_norm_before,
             dropout_rate=dropout_rate,
             embed_layer=embed_layer,
             in_feats=in_feats,
             proj_feats=proj_feats,
+            head_type=head_type,
+            bias_weight_decay=bias_weight_decay,
         )
 
     @property
@@ -179,7 +191,6 @@ def time_se(self):
         return self.encoder_net.time_se
 
     def get_config(self):
-
         base_config = super().get_config()
         del base_config["encoder_cfg"]
 
@@ -208,7 +219,12 @@ def get_config(self):
         return config
 
     def change_config(
-        self, override_dropouts=False, dropout_rate=0, drop_connect_rate=0, **kwargs
+        self,
+        override_output=False,
+        override_dropouts=False,
+        dropout_rate=0,
+        drop_connect_rate=0,
+        **kwargs
     ):
         xvec_args = XVector.filter_finetune_args(**kwargs)
         xvec_args["override_dropouts"] = False
@@ -220,7 +236,6 @@ def change_config(
 
     @classmethod
     def load(cls, file_path=None, cfg=None, state_dict=None):
-
         cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict)
 
         model = cls(**cfg)
@@ -231,7 +246,6 @@ def load(cls, file_path=None, cfg=None, state_dict=None):
 
     @staticmethod
     def filter_args(**kwargs):
-
         base_args = XVector.filter_args(**kwargs)
         child_args = EN.filter_args(**kwargs)
 
@@ -273,3 +287,23 @@ def add_finetune_args(parser, prefix=None):
 
         if prefix is not None:
             outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_dino_teacher_args(**kwargs):
+        base_args = XVector.filter_dino_teacher_args(**kwargs)
+        child_args = EN.filter_finetune_args(**kwargs)
+
+        base_args.update(child_args)
+        return base_args
+
+    @staticmethod
+    def add_dino_teacher_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+        EN.add_finetune_args(parser)
+        XVector.add_dino_teacher_args(parser)
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
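The head_hid_dim / head_bottleneck_dim / head_type arguments threaded through these constructors describe a DINO-style projection head: an MLP with a wide hidden layer, an L2-normalized bottleneck, and a weight-normalized output layer whose magnitude is typically frozen, which is what the freeze_output_g() call earlier suggests. A generic sketch of such a head follows; this is an assumption about the shape of head_type="dino", not a copy of hyperion's implementation:

import torch.nn as nn
import torch.nn.functional as F

class DINOHeadSketch(nn.Module):
    def __init__(self, in_dim, num_classes, hid_dim=2048, bottleneck_dim=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(in_dim, hid_dim), nn.GELU(),
            nn.Linear(hid_dim, bottleneck_dim),
        )
        # weight-normalized output; freezing weight_g fixes the output row norms
        self.output = nn.utils.weight_norm(
            nn.Linear(bottleneck_dim, num_classes, bias=False)
        )
        self.output.weight_g.data.fill_(1.0)
        self.output.weight_g.requires_grad = False

    def forward(self, x):
        x = F.normalize(self.mlp(x), dim=-1)  # unit-norm bottleneck
        return self.output(x)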
diff --git a/hyperion/torch/models/xvectors/resnet1d_xvector.py b/hyperion/torch/models/xvectors/resnet1d_xvector.py
index 20865880..d305bb6a 100644
--- a/hyperion/torch/models/xvectors/resnet1d_xvector.py
+++ b/hyperion/torch/models/xvectors/resnet1d_xvector.py
@@ -5,17 +5,15 @@
 
 import logging
 
-from jsonargparse import ActionParser, ArgumentParser
-
 import torch
 import torch.nn as nn
+from jsonargparse import ActionParser, ArgumentParser
 
 from ...narchs import ResNet1dEncoder as Encoder
 from .xvector import XVector
 
 
 class ResNet1dXVector(XVector):
-
     def __init__(
         self,
         resnet_enc,
@@ -23,10 +21,7 @@ def __init__(
         pool_net="mean+stddev",
         embed_dim=256,
         num_embed_layers=1,
-        hid_act={
-            "name": "relu",
-            "inplace": True
-        },
+        hid_act={"name": "relu", "inplace": True},
         loss_type="arc-softmax",
         cos_scale=64,
         margin=0.3,
@@ -39,14 +34,19 @@ def __init__(
         head_norm_layer=None,
         use_norm=True,
         norm_before=True,
+        head_use_norm=True,
         head_use_in_norm=False,
+        head_hid_dim=2048,
+        head_bottleneck_dim=256,
+        proj_head_use_norm=True,
+        proj_head_norm_before=True,
         embed_layer=0,
         proj_feats=None,
+        head_type="x-vector",
+        bias_weight_decay=None,
     ):
-
         if isinstance(resnet_enc, dict):
-            logging.info("making %s resnet1d encoder network",
-                         resnet_enc["resb_type"])
+            logging.info("making %s resnet1d encoder network", resnet_enc["resb_type"])
             resnet_enc = Encoder(**resnet_enc)
 
         super().__init__(
@@ -67,14 +67,20 @@ def __init__(
             head_norm_layer=head_norm_layer,
             use_norm=use_norm,
             norm_before=norm_before,
+            head_use_norm=head_use_norm,
             head_use_in_norm=head_use_in_norm,
+            head_hid_dim=head_hid_dim,
+            head_bottleneck_dim=head_bottleneck_dim,
+            proj_head_use_norm=proj_head_use_norm,
+            proj_head_norm_before=proj_head_norm_before,
             dropout_rate=dropout_rate,
             embed_layer=embed_layer,
             proj_feats=proj_feats,
+            head_type=head_type,
+            bias_weight_decay=bias_weight_decay,
         )
 
     def get_config(self):
-
         base_config = super().get_config()
         del base_config["encoder_cfg"]
         del base_config["in_feats"]
@@ -91,6 +97,7 @@ def get_config(self):
     def change_config(
         self,
         resnet_enc,
+        override_output=False,
         override_dropouts=False,
         dropout_rate=0,
         num_classes=None,
@@ -103,6 +110,7 @@ def change_config(
         num_subcenters=2,
     ):
         super().change_config(
+            override_output,
             False,
             dropout_rate,
             num_classes,
@@ -122,7 +130,6 @@ def change_config(
 
     @classmethod
     def load(cls, file_path=None, cfg=None, state_dict=None):
-
         cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict)
         try:
             del cfg["in_feats"]
@@ -137,7 +144,6 @@ def load(cls, file_path=None, cfg=None, state_dict=None):
 
     @staticmethod
     def filter_args(**kwargs):
-
         base_args = XVector.filter_args(**kwargs)
         child_args = Encoder.filter_args(**kwargs["resnet_enc"])
 
@@ -151,12 +157,9 @@ def add_class_args(parser, prefix=None):
             parser = ArgumentParser(prog="")
 
         XVector.add_class_args(parser, skip=set(["in_feats"]))
-        Encoder.add_class_args(parser,
-                               prefix="resnet_enc",
-                               skip=set(["head_channels"]))
+        Encoder.add_class_args(parser, prefix="resnet_enc", skip=set(["head_channels"]))
 
         if prefix is not None:
-            outer_parser.add_argument("--" + prefix,
-                                      action=ActionParser(parser=parser))
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
 
     add_argparse_args = add_class_args
 
@@ -174,10 +177,30 @@ def add_finetune_args(parser, prefix=None):
             parser = ArgumentParser(prog="")
 
         XVector.add_finetune_args(parser)
-        Encoder.add_finetune_args(parser,
-                                  prefix="resnet_enc",
-                                  skip=set(["head_channels"]))
+        Encoder.add_finetune_args(
+            parser, prefix="resnet_enc", skip=set(["head_channels"])
+        )
+
+        if prefix is not None:
+            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
+
+    @staticmethod
+    def filter_dino_teacher_args(**kwargs):
+        base_args = XVector.filter_dino_teacher_args(**kwargs)
+        child_args = Encoder.filter_finetune_args(**kwargs["resnet_enc"])
+        base_args["resnet_enc"] = child_args
+        return base_args
+
+    @staticmethod
+    def add_dino_teacher_args(parser, prefix=None):
+        if prefix is not None:
+            outer_parser = parser
+            parser = ArgumentParser(prog="")
+
+
XVector.add_dino_teacher_args(parser) + Encoder.add_finetune_args( + parser, prefix="resnet_enc", skip=set(["head_channels"]) + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index 0e9eba22..efc24f27 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import ResNetFactory as RNF from .xvector import XVector @@ -46,14 +45,20 @@ def __init__( use_norm=True, norm_before=True, in_norm=False, + head_use_norm=True, head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", se_r=16, res2net_scale=4, res2net_width_factor=1, + bias_weight_decay=None, ): - logging.info("making %s encoder network", resnet_type) encoder_net = RNF.create( resnet_type, @@ -95,11 +100,18 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=in_feats, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) self.resnet_type = resnet_type @@ -157,12 +169,8 @@ def res2net_width_factor(self): return self.encoder_net.res2net_width_factor def get_config(self): - base_config = super().get_config() del base_config["encoder_cfg"] - - pool_cfg = self.pool_net.get_config() - config = { "resnet_type": self.resnet_type, "in_channels": self.in_channels, @@ -185,7 +193,6 @@ def get_config(self): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) model = cls(**cfg) @@ -196,7 +203,6 @@ def load(cls, file_path=None, cfg=None, state_dict=None): @staticmethod def filter_args(**kwargs): - base_args = XVector.filter_args(**kwargs) child_args = RNF.filter_args(**kwargs) @@ -219,7 +225,6 @@ def add_class_args(parser, prefix=None): @staticmethod def filter_finetune_args(**kwargs): - base_args = XVector.filter_finetune_args(**kwargs) child_args = RNF.filter_finetune_args(**kwargs) @@ -237,3 +242,23 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = RNF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + RNF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py 
b/hyperion/torch/models/xvectors/spinenet_xvector.py index 0b27a840..bf829b64 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -5,10 +5,9 @@ """ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import SpineNetFactory as SNF from .xvector import XVector @@ -50,14 +49,20 @@ def __init__( use_norm=True, norm_before=True, in_norm=False, + head_use_norm=True, head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", se_r=16, res2net_scale=4, res2net_width_factor=1, + bias_weight_decay=None, ): - logging.info("making %s encoder network", spinenet_type) encoder_net = SNF.create( spinenet_type, @@ -103,11 +108,18 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=in_feats, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) self.spinenet_type = spinenet_type @@ -181,7 +193,6 @@ def res2net_width_factor(self): return self.encoder_net.res2net_width_factor def get_config(self): - base_config = super().get_config() del base_config["encoder_cfg"] @@ -213,7 +224,6 @@ def get_config(self): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) model = cls(**cfg) @@ -263,3 +273,23 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = SNF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + SNF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index 38262cc3..19c075b6 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import TDNNFactory as TF from .xvector import XVector @@ -43,11 +42,17 @@ def __init__( use_norm=True, norm_before=False, in_norm=False, + head_use_norm=True, head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", + bias_weight_decay=None, ): - logging.info("making %s encoder network", tdnn_type) encoder_net = TF.create( tdnn_type, @@ -84,11 +89,18 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, 
norm_before=norm_before, + head_use_norm=head_use_norm, head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=None, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) self.tdnn_type = tdnn_type @@ -125,7 +137,6 @@ def in_norm(self): return self.encoder_net.in_norm def get_config(self): - base_config = super().get_config() del base_config["encoder_cfg"] @@ -197,3 +208,23 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + base_args = XVector.filter_dino_teacher_args(**kwargs) + child_args = TF.filter_finetune_args(**kwargs) + + base_args.update(child_args) + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + TF.add_finetune_args(parser) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index 25e9c894..00f54af7 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -5,10 +5,9 @@ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import TransformerEncoderV1 as TE from .xvector import XVector @@ -83,11 +82,17 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=False, + head_use_norm=True, head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, embed_layer=0, proj_feats=None, + head_type="x-vector", + bias_weight_decay=None, ): - logging.info("making transformer-v1 encoder network") encoder_net = TE( in_feats, @@ -127,11 +132,18 @@ def __init__( head_norm_layer=head_norm_layer, use_norm=use_norm, norm_before=norm_before, + head_use_norm=head_use_norm, head_use_in_norm=head_use_in_norm, + head_hid_dim=head_hid_dim, + head_bottleneck_dim=head_bottleneck_dim, + proj_head_use_norm=proj_head_use_norm, + proj_head_norm_before=proj_head_norm_before, dropout_rate=dropout_rate, embed_layer=embed_layer, in_feats=None, proj_feats=proj_feats, + head_type=head_type, + bias_weight_decay=bias_weight_decay, ) @property @@ -409,3 +421,51 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_dino_teacher_args(**kwargs): + """Filters arguments corresponding to TransformerXVector + from args dictionary + + Args: + kwargs: args dictionary + + Returns: + args dictionary + """ + base_args = XVector.filter_dino_teacher_args(**kwargs) + + valid_args = ( + "pos_dropout_rate", + "att_dropout_rate", + ) + + child_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + base_args.update(child_args) + return base_args + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + """Adds TransformerXVector config parameters for the DINO teacher to argparser + + Args: + parser:
argparse object + prefix: prefix string to add to the argument names + """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + XVector.add_dino_teacher_args(parser) + parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index de28ccae..b4926533 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -3,24 +3,32 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from enum import Enum -from typing import Optional + +# from enum import Enum +from dataclasses import dataclass +from typing import List, Optional import torch import torch.nn as nn from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from ....utils import HypDataClass from ....utils.misc import filter_func_args from ...layer_blocks import TDNNBlock from ...layers import GlobalPool1dFactory as PF -from ...narchs import ClassifHead, TorchNALoader +from ...narchs import ClassifHead, DINOHead, ProjHead, TorchNALoader from ...torch_model import TorchModel from ...utils import eval_nnet_by_chunks, scale_seq_lengths -# class XVectorTrainMode(Enum): -# full = 0 -# frozen = 1 -# ft_embed_affine = 2 + +@dataclass +class XVectorOutput(HypDataClass): + loss: torch.Tensor + logits: torch.Tensor + xvector: torch.Tensor + h_enc: Optional[List[torch.Tensor]] = None + h_classif: Optional[List[torch.Tensor]] = None + h_feats: Optional[List[torch.Tensor]] = None class XVector(TorchModel): @@ -45,13 +53,20 @@ def __init__( head_norm_layer=None, use_norm=True, norm_before=True, + head_use_norm=True, head_use_in_norm=False, + head_hid_dim=2048, + head_bottleneck_dim=256, + proj_head_use_norm=True, + proj_head_norm_before=True, dropout_rate=0, embed_layer=0, in_feats=None, proj_feats=None, + head_type="x-vector", + bias_weight_decay=None, ): - super().__init__() + super().__init__(bias_weight_decay=bias_weight_decay) # encoder network self.encoder_net = encoder_net @@ -112,78 +127,147 @@ def __init__( # create classification head logging.info("making classification head net") - self.classif_net = ClassifHead( - pool_feats, - num_classes, - embed_dim=embed_dim, - num_embed_layers=num_embed_layers, - hid_act=hid_act, - loss_type=loss_type, - cos_scale=cos_scale, - margin=margin, - margin_warmup_epochs=margin_warmup_epochs, - intertop_k=intertop_k, - intertop_margin=intertop_margin, - num_subcenters=num_subcenters, - norm_layer=head_norm_layer, - use_norm=use_norm, - norm_before=norm_before, - dropout_rate=dropout_rate, - use_in_norm=head_use_in_norm, - ) - + self.embed_dim = embed_dim + self.num_embed_layers = num_embed_layers + self.head_type = head_type self.hid_act = hid_act self.norm_layer = norm_layer - self.head_norm_layer = head_norm_layer self.use_norm = use_norm self.norm_before = norm_before self.head_use_in_norm = head_use_in_norm + self.head_use_norm = head_use_norm + self.head_norm_layer = head_norm_layer + self.head_hid_dim = head_hid_dim + self.head_bottleneck_dim = head_bottleneck_dim + self.proj_head_use_norm = proj_head_use_norm + self.proj_head_norm_before = proj_head_norm_before self.dropout_rate = dropout_rate self.embed_layer = embed_layer + if 
self.head_type == "x-vector": + self.proj_head_net = None + self.classif_net = ClassifHead( + pool_feats, + num_classes, + embed_dim=embed_dim, + num_embed_layers=num_embed_layers, + hid_act=hid_act, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + norm_layer=head_norm_layer, + use_norm=head_use_norm, + norm_before=norm_before, + dropout_rate=dropout_rate, + use_in_norm=head_use_in_norm, + ) + elif self.head_type == "dino": + self.proj_head_net = ProjHead( + pool_feats, + embed_dim, + use_norm=proj_head_use_norm, + norm_before=proj_head_norm_before, + ) + self.classif_net = DINOHead( + embed_dim, + num_classes, + hid_feats=head_hid_dim, + bottleneck_feats=head_bottleneck_dim, + num_hid_layers=num_embed_layers, + hid_act=hid_act, + output_type=loss_type, + norm_layer=head_norm_layer, + use_norm=head_use_norm, + norm_before=norm_before, + dropout_rate=dropout_rate, + use_in_norm=head_use_in_norm, + ) @property def pool_feats(self): - return self.classif_net.in_feats + if self.proj_head_net is None: + return self.classif_net.in_feats + else: + return self.proj_head_net.in_feats @property def num_classes(self): return self.classif_net.num_classes - @property - def embed_dim(self): - return self.classif_net.embed_dim - - @property - def num_embed_layers(self): - return self.classif_net.num_embed_layers - @property def cos_scale(self): - return self.classif_net.cos_scale + if self.head_type == "x-vector": + return self.classif_net.cos_scale + elif self.head_type == "dino": + return 1 + else: + raise ValueError @property def margin(self): - return self.classif_net.margin + if self.head_type == "x-vector": + return self.classif_net.margin + else: + return 0.0 @property def margin_warmup_epochs(self): - return self.classif_net.margin_warmup_epochs + if self.head_type == "x-vector": + return self.classif_net.margin_warmup_epochs + else: + return 0 @property def intertop_k(self): - return self.classif_net.intertop_k + if self.head_type == "x-vector": + return self.classif_net.intertop_k + else: + return 0 @property def intertop_margin(self): - return self.classif_net.intertop_margin + if self.head_type == "x-vector": + return self.classif_net.intertop_margin + else: + return 0.0 @property def num_subcenters(self): - return self.classif_net.num_subcenters + if self.head_type == "x-vector": + return self.classif_net.num_subcenters + else: + return 0 @property def loss_type(self): - return self.classif_net.loss_type + if self.head_type == "x-vector": + return self.classif_net.loss_type + elif self.head_type == "dino": + return self.classif_net.output_type + else: + raise ValueError() + + # def clone(self): + # # weight normalized layers cannot be copied with deepcopy, + # # we remove them to clone and put them back later + # modules, cloned_modules = self.before_cloning() + # new_self = super().clone() + # self.after_cloning(*modules) + # new_self.after_cloning(*cloned_modules) + # return new_self + + # def before_cloning(self): + # if self.head_type == "dino": + # return self.classif_net.before_cloning() + # else: + # return None, None + + # def after_cloning(self, output): + # if self.head_type == "dino": + # self.classif_net.after_cloning(output) def _make_pool_net(self, pool_net, enc_feats=None): """Makes the pooling block @@ -290,6 +374,8 @@ class logits tensor with shape=(batch, num_classes). 
x = x[0] x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) + if self.proj_head_net is not None: + p = self.proj_head_net(p) y = self.classif_net(p, y) return y @@ -329,6 +415,8 @@ def forward_hid_feats( x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) + if self.proj_head_net is not None: + p = self.proj_head_net(p) h_classif = self.classif_net.forward_hid_feats( p, y, return_classif_layers, return_logits=return_logits ) @@ -358,6 +446,9 @@ def extract_embed( x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) + if self.proj_head_net is not None: + return self.proj_head_net(p) + y = self.classif_net.extract_embed(p, embed_layer) return y @@ -491,7 +582,6 @@ def compute_slidwin_left_padding( def get_config(self): enc_cfg = self.encoder_net.get_config() pool_cfg = PF.get_config(self.pool_net) - config = { "encoder_cfg": enc_cfg, "pool_net": pool_cfg, @@ -507,14 +597,21 @@ def get_config(self): "intertop_margin": self.intertop_margin, "num_subcenters": self.num_subcenters, "norm_layer": self.norm_layer, - "head_norm_layer": self.head_norm_layer, "use_norm": self.use_norm, "norm_before": self.norm_before, + "head_norm_layer": self.head_norm_layer, + "head_use_norm": self.head_use_norm, "head_use_in_norm": self.head_use_in_norm, + "head_hid_dim": self.head_hid_dim, + "head_bottleneck_dim": self.head_bottleneck_dim, + "proj_head_use_norm": self.proj_head_use_norm, + "proj_head_norm_before": self.proj_head_norm_before, "dropout_rate": self.dropout_rate, "embed_layer": self.embed_layer, "in_feats": self.in_feats, "proj_feats": self.proj_feats, + "head_type": self.head_type, + "bias_weight_decay": self.bias_weight_decay, } base_config = super().get_config() @@ -535,6 +632,7 @@ def load(cls, file_path=None, cfg=None, state_dict=None): def change_config( self, + override_output=False, override_dropouts=False, dropout_rate=0, num_classes=None, @@ -547,16 +645,17 @@ def change_config( num_subcenters=2, ): logging.info("changing x-vector config") - self.rebuild_output_layer( - num_classes=num_classes, - loss_type=loss_type, - cos_scale=cos_scale, - margin=margin, - margin_warmup_epochs=margin_warmup_epochs, - intertop_k=intertop_k, - intertop_margin=intertop_margin, - num_subcenters=num_subcenters, - ) + if override_output: + self.rebuild_output_layer( + num_classes=num_classes, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + ) if override_dropouts: logging.info("overriding x-vector dropouts") @@ -605,6 +704,10 @@ def rebuild_output_layer( self.classif_net.set_intertop_margin(intertop_margin) self.classif_net.set_num_subcenters(num_subcenters) + def cancel_output_layer_grads(self): + for p in self.classif_net.output.parameters(): + p.grad = None + def freeze_preembed_layers(self): self.encoder_net.freeze() if self.proj is not None: @@ -630,6 +733,9 @@ def set_train_mode(self, mode): else: raise ValueError(f"invalid train_mode={mode}") + if self.head_type == "dino": + self.classif_net.freeze_output_g() + self._train_mode = mode def _train(self, train_mode: str): @@ -658,7 +764,7 @@ def valid_train_modes(): def filter_args(**kwargs): # get arguments for pooling pool_args = PF.filter_args(**kwargs["pool_net"]) - args = filter_func_args(ClassifHead.__init__, kwargs) + args = 
filter_func_args(XVector.__init__, kwargs) args["pool_net"] = pool_args return args @@ -672,6 +778,13 @@ def add_class_args(parser, prefix=None, skip=set()): parser, prefix="pool_net", skip=["dim", "in_feats", "keepdim"] ) + parser.add_argument( + "--head-type", + default="x-vector", + choices=["x-vector", "dino"], + help="type of classification head in [x-vector, dino]", + ) + parser.add_argument( "--embed-dim", default=256, type=int, help=("x-vector dimension") ) @@ -776,6 +889,12 @@ def add_class_args(parser, prefix=None, skip=set()): help="batch normalizaton before activation", ) + parser.add_argument( + "--head-use-norm", + default=True, + action=ActionYesNo, + help="batch normalization at the head", + ) parser.add_argument( "--head-use-in-norm", default=False, @@ -783,6 +902,33 @@ def add_class_args(parser, prefix=None, skip=set()): help="batch normalizaton at the head input", ) + parser.add_argument( + "--head-hid-dim", + default=2048, + type=int, + help="hidden dim of DINO head", + ) + + parser.add_argument( + "--head-bottleneck-dim", + default=256, + type=int, + help="bottleneck dim of DINO head", + ) + + parser.add_argument( + "--proj-head-use-norm", + default=True, + action=ActionYesNo, + help="batch normalization at projection head", + ) + parser.add_argument( + "--proj-head-norm-before", + default=False, + action=ActionYesNo, + help="batch normalization at the beginning of projection head", + ) + try: parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") except: @@ -808,6 +954,14 @@ def add_class_args(parser, prefix=None, skip=set()): "if None, there is not projection" ), ) + + parser.add_argument( + "--bias-weight-decay", + default=None, + type=float, + help="weight decay for bias parameters, if None the default weight decay is used", + ) + if prefix is not None: outer_parser.add_argument( "--" + prefix, @@ -817,15 +971,7 @@ def add_class_args(parser, prefix=None, skip=set()): @staticmethod def filter_finetune_args(**kwargs): - valid_args = ( - "loss_type", - "cos_scale", - "margin", - "margin_warmup_epochs", - "intertop_k", - "intertop_margin", - ) - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + args = filter_func_args(XVector.change_config, kwargs) return args @staticmethod @@ -834,6 +980,13 @@ def add_finetune_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") + parser.add_argument( + "--override-output", + default=False, + action=ActionYesNo, + help="changes the config of the output layer", + ) + parser.add_argument( "--loss-type", default="arc-softmax", @@ -894,5 +1047,36 @@ def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + @staticmethod + def filter_dino_teacher_args(**kwargs): + return XVector.filter_finetune_args(**kwargs) + + @staticmethod + def add_dino_teacher_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model."
+ ), + ) + except: + pass + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + add_argparse_args = add_class_args add_argparse_finetune_args = add_finetune_args diff --git a/hyperion/torch/narchs/__init__.py b/hyperion/torch/narchs/__init__.py index c46c87fa..0bf7ecf4 100644 --- a/hyperion/torch/narchs/__init__.py +++ b/hyperion/torch/narchs/__init__.py @@ -10,10 +10,12 @@ from .dc1d_encoder import DC1dEncoder from .dc2d_decoder import DC2dDecoder from .dc2d_encoder import DC2dEncoder +from .dino_head import DINOHead from .efficient_net import EfficientNet from .etdnn import ETDNNV1 from .fcnet import FCNetV1, FCNetV2 from .feat_fuser_mvn import FeatFuserMVN +from .proj_head import ProjHead from .resetdnn import ResETDNNV1 from .resnet import * from .resnet1d_decoder import ResNet1dDecoder diff --git a/hyperion/torch/narchs/audio_feats_mvn.py b/hyperion/torch/narchs/audio_feats_mvn.py index b42f48f1..dabf308f 100644 --- a/hyperion/torch/narchs/audio_feats_mvn.py +++ b/hyperion/torch/narchs/audio_feats_mvn.py @@ -4,7 +4,7 @@ """ import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layers import AudioFeatsFactory as AFF from ..layers import MeanVarianceNorm as MVN @@ -116,7 +116,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--aug-after-mvn", default=False, - action="store_true", + action=ActionYesNo, help=("do spec augment after st-mvn," "instead of before"), ) diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index e5d90f4f..a4a7e9a1 100644 --- a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -4,10 +4,9 @@ """ -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.nn import Linear from ...utils.misc import filter_func_args @@ -62,7 +61,6 @@ def __init__( dropout_rate=0, use_in_norm=False, ): - super().__init__() assert num_embed_layers >= 1, "num_embed_layers (%d < 1)" % num_embed_layers @@ -182,7 +180,6 @@ def rebuild_output_layer( intertop_margin=0.0, num_subcenters=2, ): - embed_dim = self.embed_dim self.num_classes = num_classes self.loss_type = loss_type @@ -283,7 +280,6 @@ def put_layers_in_eval_mode(self, layer_list): self.fc_blocks[l].eval() def forward(self, x, y=None): - if self.use_in_norm: x = self.in_norm(x) @@ -298,7 +294,6 @@ def forward(self, x, y=None): return y def forward_hid_feats(self, x, y=None, return_layers=None, return_logits=False): - assert return_layers is not None or return_logits if return_layers is None: return_layers = [] @@ -322,7 +317,6 @@ def forward_hid_feats(self, x, y=None, return_layers=None, return_logits=False): return h, None def extract_embed(self, x, embed_layer=0): - if self.use_in_norm: x = self.in_norm(x) @@ -344,7 +338,6 @@ def compute_prototype_affinity(self): return torch.mm(kernel, kernel.transpose(0, 1)) def get_config(self): - hid_act = AF.get_config(self.fc_blocks[0].activation) config = { @@ -372,7 +365,6 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if "wo_norm" in kwargs: kwargs["use_norm"] = not kwargs["wo_norm"] del kwargs["wo_norm"] @@ -413,7 +405,9 @@ def add_class_args(parser, prefix=None): help="loss type: softmax, 
arc-softmax, cos-softmax, subcenter-arc-softmax", ) - parser.add_argument("--s", default=64, type=float, help="scale for arcface") + parser.add_argument( + "--cos-scale", default=64, type=float, help="scale for arcface" + ) parser.add_argument( "--margin", default=0.3, type=float, help="margin for arcface, cosface,..." @@ -460,17 +454,17 @@ def add_class_args(parser, prefix=None): pass parser.add_argument( - "--wo-norm", - default=False, + "--use-norm", + default=True, action=ActionYesNo, - help="without batch normalization", + help="use batch normalization", ) parser.add_argument( - "--norm-after", - default=False, + "--norm-before", + default=True, action=ActionYesNo, - help="batch normalizaton after activation", + help="batch normalization before activation", ) parser.add_argument( @@ -487,6 +481,5 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='classification head options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index 54c2f400..f232c986 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -3,6 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging + import torch import torch.nn as nn from jsonargparse import ActionParser, ActionYesNo, ArgumentParser @@ -217,7 +219,7 @@ def _make_in_layer(self): d_model, self.pos_kernel_size, self.pos_num_groups, self.hid_act ) else: - raise Exception("wrong pos-enc-type={}".format(self.pos_enc_type)) + raise Exception(f"wrong pos-enc-type={self.pos_enc_type}") hid_act = AF.create(self.hid_act) @@ -274,6 +276,29 @@ def _forward_input(self, x, x_mask): return x, x_mask + def change_config( + self, override_dropouts, dropout_rate, pos_dropout_rate, att_dropout_rate + ): + if override_dropouts: + logging.info("changing conformer dropouts") + self.change_dropouts(dropout_rate, pos_dropout_rate, att_dropout_rate) + + def change_dropouts(self, dropout_rate, pos_dropout_rate, att_dropout_rate): + super().change_dropouts(dropout_rate) + from ..layers import PosEncoderBase + + for m in self.modules(): + if isinstance(m, PosEncoderBase): + if hasattr(m, "dropout_rate"): + m.dropout_rate = pos_dropout_rate + m.dropout.p = pos_dropout_rate + elif isinstance(m, EBlock): + m.change_attn_dropout(att_dropout_rate) + + self.dropout_rate = dropout_rate + self.pos_dropout_rate = pos_dropout_rate + self.att_dropout_rate = att_dropout_rate + def forward( self, x, x_lengths=None, x_mask=None, return_mask=False, target_shape=None ): @@ -611,3 +636,63 @@ def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + valid_args = ( + "override_dropouts", + "dropout_rate", + "pos_dropout_rate", + "att_dropout_rate", + ) + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return args + + @staticmethod + def add_finetune_args(parser, prefix=None, skip=set([])): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + try: + parser.add_argument( + "--override-dropouts", + default=False, + action=ActionYesNo, + help=( + "whether to use the dropout probabilities passed in the " + "arguments instead of the defaults in the pretrained model."
+ ), + except: + pass + + try: + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) + except: + pass + + try: + parser.add_argument( + "--pos-dropout-rate", + default=0, + type=float, + help="positional encoder dropout probability", + ) + except: + pass + + try: + parser.add_argument( + "--att-dropout-rate", + default=0, + type=float, + help="attention dropout probability", + ) + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/dino_head.py b/hyperion/torch/narchs/dino_head.py new file mode 100644 index 00000000..a59434bf --- /dev/null +++ b/hyperion/torch/narchs/dino_head.py @@ -0,0 +1,337 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +from typing import Optional + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from ..layer_blocks import FCBlock +from ..layers import ActivationFactory as AF +from ..layers import CosLossOutput +from ..layers import NormLayer1dFactory as NLF +from .net_arch import NetArch + +# class DINOHead1(nn.Module): +# def __init__( +# self, +# in_dim, +# out_dim, +# use_bn=False, +# norm_last_layer=True, +# nlayers=3, +# hidden_dim=2048, +# bottleneck_dim=256, +# ): +# super().__init__() +# nlayers = max(nlayers, 1) +# if nlayers == 1: +# self.mlp = nn.Linear(in_dim, bottleneck_dim) +# else: +# layers = [nn.Linear(in_dim, hidden_dim)] +# if use_bn: +# layers.append(nn.BatchNorm1d(hidden_dim)) +# layers.append(nn.GELU()) +# for _ in range(nlayers - 2): +# layers.append(nn.Linear(hidden_dim, hidden_dim)) +# if use_bn: +# layers.append(nn.BatchNorm1d(hidden_dim)) +# layers.append(nn.GELU()) +# layers.append(nn.Linear(hidden_dim, bottleneck_dim)) +# self.mlp = nn.Sequential(*layers) +# self.apply(self._init_weights) +# self.last_layer = nn.utils.weight_norm( +# nn.Linear(bottleneck_dim, out_dim, bias=False) +# ) +# self.last_layer.weight_g.data.fill_(1) +# if norm_last_layer: +# self.last_layer.weight_g.requires_grad = False + +# def _init_weights(self, m): +# if isinstance(m, nn.Linear): +# nn.init.trunc_normal_(m.weight, std=0.02) +# if isinstance(m, nn.Linear) and m.bias is not None: +# nn.init.constant_(m.bias, 0) + +# def forward(self, x): +# x = self.mlp(x) +# x = nn.functional.normalize(x, dim=-1, p=2) +# x = self.last_layer(x) +# return x + + +class DINOHead(NetArch): + """Classification Head for DINO x-vector style networks + + Attributes: + in_feats: input features + num_classes: number of output classes + hid_feats: dimension of hidden layers + bottleneck_feats: dimension of bottleneck layer before output + num_hid_layers: number of hidden layers + hid_act: str or dict hidden activation type in ['relu', 'relu6', 'swish', ... ] + output_type: type of output layer that will be used with the x-vector in ['softmax', 'cos-softmax'], + corresponding to standard cross-entropy or cosine scoring + norm_layer: norm_layer object or str indicating type norm layer, if None it uses BatchNorm1d + use_norm: if True it uses layer/batch-normalization + norm_before: if True, layer-norm is before the activation function + use_in_norm: put batchnorm at the input + """ + + def __init__( + self, + in_feats, + num_classes, + hid_feats=2048, + bottleneck_feats=256, + num_hid_layers=3, + hid_act="gelu", + output_type="softmax", + norm_layer=None, + use_norm=False, + norm_before=True, + dropout_rate=0, + use_in_norm=False, + ): + super().__init__() + assert num_hid_layers >= 1, "num_hid_layers (%d < 1)" % num_hid_layers + + self.num_hid_layers = num_hid_layers + self.in_feats = in_feats + self.hid_feats = hid_feats + self.bottleneck_feats = bottleneck_feats + self.num_classes = num_classes + self.hid_act = hid_act + self.norm_layer = norm_layer + self.use_in_norm = use_in_norm + + if use_norm: + norm_groups = None + if norm_layer == "group-norm": + norm_groups = min(hid_feats // 8, 32) + self._norm_layer = NLF.create(norm_layer, norm_groups) + else: + self._norm_layer = None + + self.use_norm = use_norm + self.norm_before = norm_before + + self.dropout_rate = dropout_rate + self.output_type = output_type + if use_in_norm: + assert not self.norm_before + self.in_norm = self._norm_layer(in_feats) + + if num_hid_layers == 1: + self.hid_layers = nn.Linear(in_feats, bottleneck_feats) + else: + layers = [nn.Linear(in_feats, hid_feats)] + if use_norm and norm_before: + layers.append(self._norm_layer(hid_feats)) + layers.append(AF.create(hid_act)) + if use_norm and not norm_before: + layers.append(self._norm_layer(hid_feats)) + if self.dropout_rate > 0: + layers.append(nn.Dropout(self.dropout_rate)) + + for _ in range(num_hid_layers - 2): + layers.append(nn.Linear(hid_feats, hid_feats)) + if use_norm and norm_before: + layers.append(self._norm_layer(hid_feats)) + layers.append(AF.create(hid_act)) + if use_norm and not norm_before: + layers.append(self._norm_layer(hid_feats)) + if self.dropout_rate > 0: + layers.append(nn.Dropout(self.dropout_rate)) + + layers.append(nn.Linear(hid_feats, bottleneck_feats)) + self.hid_layers = nn.Sequential(*layers) + + self.apply(self._init_weights) + if output_type == "softmax": + output = nn.Linear(bottleneck_feats, num_classes, bias=False) + with torch.no_grad(): + self.output = nn.utils.weight_norm(output) + self.output.weight_g.data.fill_(1) + self.output.weight_g.requires_grad = False + elif output_type == "cos-softmax": + # the cosine output operates on the L2-normalized bottleneck features + self.output = CosLossOutput( + bottleneck_feats, + num_classes, + cos_scale=1, + margin=0, + margin_warmup_epochs=0, + intertop_k=0, + intertop_margin=0, + ) + else: + raise ValueError(f"wrong loss_type={output_type}") + + # def before_cloning(self): + # if self.output_type == "cos-softmax": + # return None, None + + # torch.nn.utils.remove_weight_norm(self.output) + # return None, None + # cloned_output = self._clone_output() + # output = self.output + # self.output = None + # return output, cloned_output + + # def after_cloning(self, output: nn.Module): + # if self.output_type == "cos-softmax": + # return + + # self.output = nn.utils.weight_norm(self.output) + # self.output.weight_g.data.fill_(1) + # self.output.weight_g.requires_grad = False + + # def _clone_output(self): + # output = nn.utils.weight_norm( + # nn.Linear(self.bottleneck_feats, self.num_classes, bias=False) + # ) + # output.weight_g.data.fill_(1) + # output.weight_v.data = self.output_v.data.detach() + # output.weight_g.requires_grad = False + # return output + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None): + if self.use_in_norm: + x = self.in_norm(x) + assert not torch.any( + torch.isnan(x) + ), f"x is nan {x.size()} {torch.sum(torch.isnan(x))}" + x = self.hid_layers(x) + assert not torch.any( + torch.isnan(x) + ), f"x_hid is nan {x.size()} {torch.sum(torch.isnan(x))}" + x = nn.functional.normalize(x, dim=-1, p=2) + assert not torch.any( + torch.isnan(x) + ), f"x_l2 is nan {x.size()} {torch.sum(torch.isnan(x))}" + x = self.output(x) + assert not torch.any( + torch.isnan(x) + ), f"out is nan {x.size()} {torch.sum(torch.isnan(x))}" + return x + + def get_config(self): + config = { + "in_feats": self.in_feats, + "num_classes": self.num_classes, + "hid_feats": self.hid_feats, + "bottleneck_feats": self.bottleneck_feats, + "num_hid_layers": self.num_hid_layers, + "hid_act": self.hid_act, + "output_type": self.output_type, + "norm_layer": self.norm_layer, + "use_norm": self.use_norm, + "norm_before": self.norm_before, + "dropout_rate": self.dropout_rate, + "use_in_norm": self.use_in_norm, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + # if "wo_norm" in kwargs: + # kwargs["use_norm"] = not kwargs["wo_norm"] + # del kwargs["wo_norm"] + + # if "norm_after" in kwargs: + # kwargs["norm_before"] = not kwargs["norm_after"] + # del kwargs["norm_after"] + + args = filter_func_args(DINOHead.__init__, kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--bottleneck-feats", + default=256, + type=int, + help=("bottleneck dimension before output layer"), + ) + + parser.add_argument( + "--num-hid-layers", + default=3, + type=int, + help=("number of hidden layers in the classif head"), + ) + + try: + parser.add_argument("--hid-act", default="gelu", help="hidden activation") + except: + pass + + parser.add_argument( + "--output-type", + default="softmax", + choices=["softmax", "cos-softmax"], + help="loss type: softmax, cos-softmax", + ) + + try: + parser.add_argument( + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer for all components of x-vector network", + ) + except: + pass + + parser.add_argument( + "--use-norm", + default=True, + action=ActionYesNo, + help="use batch normalization", + ) + + parser.add_argument( + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalization before activation", + ) + + parser.add_argument( + "--use-in-norm", + default=False, + action=ActionYesNo, + help="batch normalization in the classif head input", + ) + + try: + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") + except: + pass + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/feat_fuser_mvn.py b/hyperion/torch/narchs/feat_fuser_mvn.py index 6fa4c6c0..0656e279 100644 ---
a/hyperion/torch/narchs/feat_fuser_mvn.py +++ b/hyperion/torch/narchs/feat_fuser_mvn.py @@ -6,7 +6,7 @@ import torch import torch.nn as nn -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..layers import FeatFuserFactory as FFF from ..layers import MeanVarianceNorm as MVN @@ -103,7 +103,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--aug-after-mvn", default=False, - action="store_true", + action=ActionYesNo, help=("do spec augment after st-mvn," "instead of before"), ) diff --git a/hyperion/torch/narchs/proj_head.py b/hyperion/torch/narchs/proj_head.py new file mode 100644 index 00000000..549f9e6a --- /dev/null +++ b/hyperion/torch/narchs/proj_head.py @@ -0,0 +1,149 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from torch.nn import Linear + +from ...utils.misc import filter_func_args +from ..layer_blocks import FCBlock +from ..layers import ActivationFactory as AF +from ..layers import NormLayer1dFactory as NLF +from .net_arch import NetArch + + +class ProjHead(NetArch): + """Projection head for x-vector style networks. It projects the pooled + features into the x-vector embedding space. + + Attributes: + in_feats: input features + out_feats: dimension of the output projection (embedding) + norm_layer: norm_layer object or str indicating type norm layer, if None it uses BatchNorm1d + use_norm: if True it uses layer/batch-normalization + norm_before: if True, the norm layer is applied to the input features, otherwise to the projected features + """ + + def __init__( + self, + in_feats, + out_feats=256, + norm_layer=None, + use_norm=True, + norm_before=True, + ): + super().__init__() + + self.in_feats = in_feats + self.out_feats = out_feats + self.norm_layer = norm_layer + self.use_norm = use_norm + self.norm_before = norm_before + + if use_norm: + norm_groups = None + if norm_layer == "group-norm": + norm_groups = min(out_feats // 8, 32) + _norm_layer = NLF.create(norm_layer, norm_groups) + if norm_before: + self._norm_layer = _norm_layer(in_feats) + else: + self._norm_layer = _norm_layer(out_feats) + else: + self._norm_layer = None + + self.proj = nn.Linear(in_feats, out_feats) + + def forward(self, x, y=None): + if self.use_norm and self.norm_before: + x = self._norm_layer(x) + assert not torch.any( + torch.isnan(x) + ), f"x before proj is nan {x.size()} {torch.sum(torch.isnan(x))}" + x = self.proj(x) + assert not torch.any( + torch.isnan(x) + ), f"x after proj is nan {x.size()} {torch.sum(torch.isnan(x))}" + if self.use_norm and not self.norm_before: + x = self._norm_layer(x) + assert not torch.any( + torch.isnan(x) + ), f"x after bn is nan {x.size()} {torch.sum(torch.isnan(x))}" + return x + + def get_config(self): + config = { + "in_feats": self.in_feats, + "out_feats": self.out_feats, + "norm_layer": self.norm_layer, + "use_norm": self.use_norm, + "norm_before": self.norm_before, + } + + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(ProjHead.__init__, kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--out-feats", default=256, type=int, help=("projection dimension") + ) + + try: + parser.add_argument( + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer for all components of x-vector network", + ) + except: + pass + + parser.add_argument( + "--use-norm", + default=True, + action=ActionYesNo, + help="use batch normalization", + ) + + parser.add_argument( + "--norm-before", + default=True, + action=ActionYesNo, + help="batch normalization before the projection", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/narchs/rnn_transducer_decoder.py b/hyperion/torch/narchs/rnn_transducer_decoder.py index 763ec67c..77c1234a 100644 --- a/hyperion/torch/narchs/rnn_transducer_decoder.py +++ b/hyperion/torch/narchs/rnn_transducer_decoder.py @@ -2,16 +2,15 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import logging from dataclasses import dataclass from typing import Dict, List, Optional, Tuple -import torchaudio -import torchaudio.functional -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +import torchaudio +import torchaudio.functional +from jsonargparse import
ActionParser, ActionYesNo, ArgumentParser try: import k2 @@ -36,8 +35,8 @@ class Hypothesis: class RNNTransducerDecoder(NetArch): - """ RNN-T Decoder composed of Predictor and Joiner networks - Implementation based on + """RNN-T Decoder composed of Predictor and Joiner networks + Implementation based on https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/transducer/transducer.py Attributes: @@ -48,15 +47,15 @@ class RNNTransducerDecoder(NetArch): blank_id: id of the null symbol. rnnt_loss: type of rnn-t loss between torchaudio, k2 or k2_pruned. rnnt_type: rnn-t variation between regular, modified or constrained. - delay_penalty: penalize symbol delay, which is used to make symbol + delay_penalty: penalize symbol delay, which is used to make symbol emit earlier. reduction: type of reduction for rnn-t loss between sum or mean - prune_range: how many symbols to keep for each frame in k2 rnn-t + prune_range: how many symbols to keep for each frame in k2 rnn-t pruned loss. lm_scale: language model scale in rnn-t smoothed loss. am_scale: acoustic model scale in rnn-t smoothed loss. simple_loss_scale: weight of rnn-t simple loss when using k2 pruned loss. - pruned_warmup_steps: number of steps to warm up the k2 rnn-t pruned loss + pruned_warmup_steps: number of steps to warm up the k2 rnn-t pruned loss from 0.1 to 1. """ @@ -77,7 +76,6 @@ def __init__( simple_loss_scale: float = 0.5, pruned_warmup_steps: int = 2000, ): - super().__init__() self.in_feats = in_feats self.vocab_size = vocab_size @@ -206,7 +204,6 @@ def _rnnt_loss_k2_pruned( y_lengths: torch.Tensor, pred_out: torch.Tensor, ): - y_padded = y.pad(mode="constant", padding_value=0) y_padded = y_padded.to(torch.int64) boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) @@ -281,7 +278,6 @@ def _rnnt_loss_k2_pruned( def forward( self, x: torch.Tensor, x_lengths: torch.Tensor, y: k2.RaggedTensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - # get y_lengths row_splits = y.shape.row_splits(1) y_lengths = row_splits[1:] - row_splits[:-1] @@ -415,7 +411,10 @@ def decode_time_sync_beam_search( if cached_key not in cache: pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) + pred_out, pred_state = self.predictor( + pred_in, + y_star.pred_state, + ) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -455,7 +454,9 @@ def decode_time_sync_beam_search( new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, ) A.append(new_hyp) @@ -528,7 +529,10 @@ def decode_align_length_sync_beam_search( if cached_key not in cache: pred_in = torch.tensor([y_star.ys[-1]], device=device).reshape(1, 1) - pred_out, pred_state = self.predictor(pred_in, y_star.pred_state,) + pred_out, pred_state = self.predictor( + pred_in, + y_star.pred_state, + ) cache[cached_key] = (pred_out, pred_state) else: pred_out, pred_state = cache[cached_key] @@ -565,7 +569,9 @@ def decode_align_length_sync_beam_search( new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v new_hyp = Hypothesis( - ys=new_ys, log_prob=new_log_prob, pred_state=pred_state, + ys=new_ys, + log_prob=new_log_prob, + pred_state=pred_state, ) A.append(new_hyp) @@ -574,7 +580,9 @@ def decode_align_length_sync_beam_search( # A_most_probable = max(A, key=lambda hyp: hyp.log_prob) # 
print("tuAB1", t, u, len(A), A_most_probable.log_prob, len(B)) B0 = sorted( - [hyp for hyp in A], key=lambda hyp: hyp.log_prob, reverse=True, + [hyp for hyp in A], + key=lambda hyp: hyp.log_prob, + reverse=True, ) B = [] B_ys = set() @@ -621,7 +629,6 @@ def filter_finetune_args(**kwargs): @staticmethod def add_pred_args(parser): - pred_parser = ArgumentParser(prog="") pred_parser.add_argument( "--pred-type", @@ -682,7 +689,6 @@ def add_pred_args(parser): @staticmethod def add_joiner_args(parser): - pred_parser = ArgumentParser(prog="") pred_parser.add_argument( "--joiner-type", @@ -702,7 +708,6 @@ def add_joiner_args(parser): def add_class_args( parser, prefix=None, skip=set(["in_feats", "blank_id", "vocab_size"]) ): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -791,7 +796,6 @@ def add_class_args( @staticmethod def add_finetune_args(parser, prefix=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/optim/__init__.py b/hyperion/torch/optim/__init__.py index fd05c755..33364d63 100644 --- a/hyperion/torch/optim/__init__.py +++ b/hyperion/torch/optim/__init__.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .ema import ExpMovingAvg from .factory import OptimizerFactory from .fgsm import FGSM from .radam import RAdam diff --git a/hyperion/torch/optim/factory.py b/hyperion/torch/optim/factory.py index 95117b05..b01d3b62 100644 --- a/hyperion/torch/optim/factory.py +++ b/hyperion/torch/optim/factory.py @@ -4,12 +4,11 @@ """ import logging -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.optim as optim +from jsonargparse import ActionParser, ArgumentParser -from ...utils.misc import filter_args +from ...utils.misc import filter_args, filter_func_args from .radam import RAdam @@ -39,7 +38,6 @@ def create( max_iter=20, oss=False, ): - kwargs = locals() base_opt = None if opt_type == "sgd": @@ -152,29 +150,30 @@ def create( @staticmethod def filter_args(**kwargs): - valid_args = ( - "opt_type", - "lr", - "momentum", - "beta1", - "beta2", - "rho", - "eps", - "weight_decay", - "amsgrad", - "nesterov", - "lambd", - "asgd_alpha", - "t0", - "rmsprop_alpha", - "centered", - "lr_decay", - "init_acc_val", - "max_iter", - "oss", - ) - - return filter_args(valid_args, kwargs) + return filter_func_args(OptimizerFactory.create, kwargs) + # valid_args = ( + # "opt_type", + # "lr", + # "momentum", + # "beta1", + # "beta2", + # "rho", + # "eps", + # "weight_decay", + # "amsgrad", + # "nesterov", + # "lambd", + # "asgd_alpha", + # "t0", + # "rmsprop_alpha", + # "centered", + # "lr_decay", + # "init_acc_val", + # "max_iter", + # "oss", + # ) + + # return filter_args(valid_args, kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -323,6 +322,5 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='optimizer options') add_argparse_args = add_class_args diff --git a/hyperion/torch/optim/radam.py b/hyperion/torch/optim/radam.py index 1b7a588f..1aa98517 100644 --- a/hyperion/torch/optim/radam.py +++ b/hyperion/torch/optim/radam.py @@ -1,7 +1,6 @@ """ Code taken from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam/radam.py """ -# import math @@ -62,13 +61,11 @@ def __setstate__(self, state): super().__setstate__(state) def step(self, closure=None): - loss = None if closure is not None: loss = closure() for group in 
self.param_groups: - for p in group["params"]: if p.grad is None: continue diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 97be320c..3d5c8c9e 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -23,9 +23,10 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) TorchModel.registry[cls.__name__] = cls - def __init__(self): + def __init__(self, bias_weight_decay=None): super().__init__() self._train_mode = "full" + self.bias_weight_decay = bias_weight_decay def get_config(self): config = {"class_name": self.__class__.__name__} @@ -91,10 +92,26 @@ def print_parameter_list(self): logging.info("buffers: %s", n) def has_param_groups(self): - return False + return self.bias_weight_decay is not None def trainable_param_groups(self): - return self.trainable_parameters() + if self.bias_weight_decay is None: + return [{"params": self.trainable_parameters()}] + + regularized = [] + not_regularized = [] + for name, param in self.trainable_named_parameters(): + # biases and 1-d (norm) parameters go in their own group + # with a separate weight decay + if name.endswith(".bias") or len(param.shape) == 1: + not_regularized.append(param) + else: + regularized.append(param) + + return [ + {"params": regularized}, + {"params": not_regularized, "weight_decay": self.bias_weight_decay}, + ] def freeze(self): for param in self.parameters(): @@ -315,6 +332,7 @@ def _try_to_get_from_hf( @staticmethod def auto_load( file_path: PathLike, + model_name: Optional[str] = None, extra_objs: dict = {}, map_location: Optional[ Union[ @@ -348,7 +366,9 @@ def auto_load( else: raise Exception("unknown object with class_name=%s" % (class_name)) - state_dict = model_data["model_state_dict"] + if model_name is None: + model_name = "model" + state_dict = model_data[f"{model_name}_state_dict"] if "n_averaged" in state_dict: del state_dict["n_averaged"] diff --git a/hyperion/torch/trainers/__init__.py b/hyperion/torch/trainers/__init__.py index c1530608..94326857 100644 --- a/hyperion/torch/trainers/__init__.py +++ b/hyperion/torch/trainers/__init__.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from .dino_xvector_trainer import DINOXVectorTrainer from .dvae_trainer import DVAETrainer from .torch_trainer import TorchTrainer from .transducer_trainer import TransducerTrainer @@ -13,6 +14,5 @@ from .xvector_adv_trainer_from_wav import XVectorAdvTrainerFromWav from .xvector_trainer import XVectorTrainer from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg -from .xvector_trainer_deep_feat_reg_from_wav import \ XVectorTrainerDeepFeatRegFromWav +from .xvector_trainer_deep_feat_reg_from_wav import XVectorTrainerDeepFeatRegFromWav from .xvector_trainer_from_wav import XVectorTrainerFromWav diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 9f5fafe6..a0f5f1d4 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset @@ -46,6 +45,7 @@ class AETrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs +
save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. @@ -69,7 +69,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -78,46 +78,17 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="x", ): - if loss is None: loss = nn.MSELoss() super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # loss, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - def train_epoch(self, data_loader): """Training epoch loop @@ -146,9 +117,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): @@ -156,17 +127,17 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - # total_batches += 1 logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): - batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() @@ -197,7 +168,6 @@ def validation_epoch(self, data_loader, swa_update_bn=False): @staticmethod def add_class_args(parser, prefix=None, train_modes=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/dino_xvector_trainer.py b/hyperion/torch/trainers/dino_xvector_trainer.py new file mode 100644 index 00000000..bb7b427d --- /dev/null +++ b/hyperion/torch/trainers/dino_xvector_trainer.py @@ -0,0 +1,385 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +from collections import OrderedDict as ODict + +import torch +import torch.cuda.amp as amp +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser +from torch.distributed.elastic.multiprocessing.errors import record + +from ...utils.misc import filter_func_args +from ..optim import ExpMovingAvg as EMA +from ..utils import MetricAcc, TorchDDP, tensors_subset +from .torch_trainer import TorchTrainer + + +class DINOXVectorTrainer(TorchTrainer): + """Trainer 
to train x-vector style models. + + Attributes: + model: x-Vector model object. + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + teacher_optim: options dict for the teacher EMA update + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: DINO loss object + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch + cpu_offload: CPU offload of gradients when using fully sharded ddp + input_key: dict. key for nnet input. + target_key: dict. key for nnet targets. + """ + + def __init__( + self, + student_model, + teacher_model, + loss, + optim, + teacher_optim, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + eff_batch_size=None, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + train_mode="full", + freeze_output_layer_steps=3000, + use_amp=False, + log_interval=1000, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + save_interval_steps=None, + cpu_offload=False, + input_key="x", + ): + super_args = filter_func_args(super().__init__, locals()) + self.teacher_model = teacher_model + self.teacher_optim = teacher_optim + self.freeze_output_layer_steps = freeze_output_layer_steps + super().__init__(student_model, **super_args) + + def prepare_models_for_training(self): + super().prepare_models_for_training() + self.teacher_model, self.teacher_optimizer = self._prepare_model_for_ema( + self.teacher_model, + self.teacher_optim, + self.device, + self.ddp, + ) + + def _prepare_model_for_ema(self, model, optim, device, ddp): + if device is not None: + model.to(device) + + optimizer = EMA(model.parameters(), **optim) + + if ddp: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + + return model, optimizer + + def set_train_mode(self): + super().set_train_mode() + self.teacher_model.freeze() + + @torch.no_grad() + def update_teacher_model(self): + self.teacher_optimizer.step(self.model.parameters()) + # print( + # "pmw", + # self.model.xvector.proj_head_net.proj.weight[:5, :5], + # self.teacher_model.xvector.proj_head_net.proj.weight[:5, :5], + # ) + # print( + # "mw", + # self.model.xvector.classif_net.output.weight[:5, :5], + # self.teacher_model.xvector.classif_net.output.weight[:5, :5], + # ) + # print( + # "mwg", + # self.model.xvector.classif_net.output.weight_g[:5, :5], + # self.teacher_model.xvector.classif_net.output.weight_g[:5, :5], + # ) + # print( + # "mwv", + #
self.model.xvector.classif_net.output.weight_v[:5, :5], + # self.teacher_model.xvector.classif_net.output.weight_v[:5, :5], + # flush=True, + # ) + # print("------------------------------", flush=True) + + @staticmethod + def get_augs_keys(batch, base_key, subset, skip=set()): + base_key = f"{base_key}_{subset}" + keys = [] + + chunk_idx = 0 + while True: + found_chunk = 0 + chunk_key = f"{base_key}_{chunk_idx}" + if chunk_key in batch: + if chunk_key not in skip: + keys.append(chunk_key) + found_chunk = True + aug_idx = 0 + while True: + aug_key = f"{chunk_key}_aug_{aug_idx}" + if aug_key in batch: + if aug_key not in skip: + keys.append(aug_key) + + aug_idx += 1 + found_chunk = True + else: + break + + if not found_chunk: + break + + chunk_idx += 1 + + return keys + + @record + def train_epoch(self, data_loader): + """Training epoch loop + + Args: + data_loader: pytorch data loader returning features and class labels. + """ + metric_acc = MetricAcc(device=self.device) + batch_metrics = ODict() + self.model.train() + self.teacher_model.train() + self.loss.update_temp(self.cur_epoch) + self.loss.train() + + for batch, data in enumerate(data_loader): + self.loggers.on_batch_begin(batch) + + if batch % self.grad_acc_steps == 0: + self.optimizer.zero_grad() + + teacher_keys = self.get_augs_keys(data, self.input_key, "teacher") + student_keys = self.get_augs_keys(data, self.input_key, "student") + with amp.autocast(enabled=self.use_amp): + with torch.no_grad(): + teacher_data = tensors_subset(data, teacher_keys, self.device) + batch_size = teacher_data[0].size(0) + num_teacher_crops = len(teacher_data) + teacher_data = torch.cat(teacher_data, dim=0) + teacher_out = self.teacher_model(teacher_data) + + if num_teacher_crops > 1: + student_out1 = self.model(teacher_data) + + student_data = tensors_subset(data, student_keys, self.device) + num_student_crops = len(student_data) + student_data = torch.cat(student_data, dim=0) + student_out2 = self.model(student_data) + assert not torch.any(torch.isnan(teacher_out)), "teacher is nan" + assert not torch.any(torch.isinf(teacher_out)), "teacher is inf" + assert not torch.any(torch.isnan(student_out1)), "s1 is nan" + assert not torch.any(torch.isinf(student_out1)), "s1 is inf" + assert not torch.any(torch.isnan(student_out2)), "s2 is nan" + assert not torch.any(torch.isinf(student_out2)), "s2 is inf" + if num_teacher_crops > 1: + student_out = torch.cat((student_out1, student_out2), dim=0) + num_student_crops += num_teacher_crops + else: + student_out = student_out2 + + loss = ( + self.loss( + student_out, teacher_out, num_student_crops, num_teacher_crops + ) + / self.grad_acc_steps + ) + assert not torch.isnan( + loss + ), f"loss is nan {batch} {torch.mean(teacher_out)} {torch.mean(student_out1)} {torch.mean(student_out2)}" + + if self.use_amp: + self.grad_scaler.scale(loss).backward() + else: + loss.backward() + + if (batch + 1) % self.grad_acc_steps == 0: + self.cur_batch = batch + 1 + if self.freeze_output_layer_steps > self.global_step: + self.model.cancel_output_layer_grads() + + self.update_model() + self.update_teacher_model() + self.save_checkpoint(partial=True) + + batch_metrics["loss"] = loss.item() * self.grad_acc_steps + # for k, metric in self.metrics.items(): + # batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + logs = metric_acc.metrics + lrs = self._get_lrs() + logs.update(lrs) + logs["ema_momentum"] = self.teacher_optimizer.momentum + self.loggers.on_batch_end(logs=logs, batch_size=batch_size) 
+ + logs = metric_acc.metrics + logs = ODict(("train_" + k, v) for k, v in logs.items()) + lrs = self._get_lrs() + logs.update(lrs) + logs["ema_momentum"] = self.teacher_optimizer.momentum + return logs + + @torch.no_grad() + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader returning input/output pairs. + swa_update_bn: whether or not to update batch-norm layers in SWA. + """ + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + self.teacher_model.eval() + self.loss.eval() + + if swa_update_bn: + log_tag = "train_" + self.model.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, data in enumerate(data_loader): + teacher_keys = self.get_augs_keys(data, self.input_key, "teacher") + student_keys = self.get_augs_keys(data, self.input_key, "student") + with amp.autocast(enabled=self.use_amp): + teacher_data = tensors_subset(data, teacher_keys, self.device) + batch_size = teacher_data[0].size(0) + num_teacher_crops = len(teacher_data) + teacher_data = torch.cat(teacher_data, dim=0) + teacher_out = self.teacher_model(teacher_data) + + if num_teacher_crops > 1: + student_out1 = self.model(teacher_data) + + student_data = tensors_subset(data, student_keys, self.device) + num_student_crops = len(student_data) + student_data = torch.cat(student_data, dim=0) + student_out2 = self.model(student_data) + if num_teacher_crops > 1: + student_out = torch.cat((student_out1, student_out2), dim=0) + num_student_crops += num_teacher_crops + else: + student_out = student_out2 + + loss = self.loss( + student_out, teacher_out, num_student_crops, num_teacher_crops + ) + + batch_metrics["loss"] = loss.item() + # for k, metric in self.metrics.items(): + # batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs + + def _load_checkpoint(self, checkpoint): + self.teacher_model.load_state_dict(checkpoint["teacher_model_state_dict"]) + # self.teacher_model.load_state_dict(checkpoint["teacher_state_dict"]) + self.teacher_optimizer.load_state_dict( + checkpoint["teacher_optimizer_state_dict"] + ) + return super()._load_checkpoint(checkpoint) + + def checkpoint(self, logs=None): + checkpoint = super().checkpoint(logs) + self.teacher_model.train() + checkpoint["teacher_model_state_dict"] = self.teacher_model.state_dict() + checkpoint["teacher_optimizer_state_dict"] = self.teacher_optimizer.state_dict() + return checkpoint + + def teacher_checkpoint(self, logs=None): + """Creates a checkpoint of the teacher model, to be saved for later recovery + + Args: + logs: logs containing the current value of the metrics.
+ """ + self.teacher_model.train() + checkpoint = { + "epoch": self.cur_epoch, + "batch": self.cur_batch, + "global_step": self.global_step, + "model_cfg": self.teacher_model.get_config(), + "model_state_dict": self.teacher_model.state_dict(), + "optimizer_state_dict": self.teacher_optimizer.state_dict(), + } + + if logs is not None: + checkpoint["logs"] = logs + + return checkpoint + + @staticmethod + def filter_args(**kwargs): + args = filter_func_args(DINOXVectorTrainer.__init__, kwargs) + return args + + @staticmethod + def add_class_args(parser, prefix=None, train_modes=None, skip=set()): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + skip.add("teacher_key") + TorchTrainer.add_class_args(parser, train_modes=train_modes) + EMA.add_class_args(parser, prefix="teacher_optim") + parser.add_argument( + "--freeze-output-layer-steps", + default=1500, + type=int, + help="freeze the output layer during the first updates of the model", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index e2d2d1f6..718630d6 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset @@ -45,6 +44,7 @@ class DVAETrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
@@ -67,7 +67,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -76,6 +76,7 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x_aug", target_key="x", @@ -144,9 +145,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["elbo"] = elbo.item() for metric in ["log_px", "kldiv_z"]: @@ -156,12 +157,14 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -201,7 +204,6 @@ def validation_epoch(self, data_loader, swa_update_bn=False): @staticmethod def add_class_args(parser, prefix=None, train_modes=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py index d6761e87..a0099c02 100644 --- a/hyperion/torch/trainers/plda_trainer.py +++ b/hyperion/torch/trainers/plda_trainer.py @@ -48,6 +48,7 @@ class PLDATrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
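The save_checkpoint(partial=True) calls that this patch inserts after every update_model() are cheap no-ops unless a save interval is configured. The gating logic, restated for reference (a sketch of the save_partial_checkpoint helper shown later in torch_trainer.py):

    def should_save_partial(global_step: int, save_interval_steps) -> bool:
        # partial (mid-epoch) saves happen only when an interval is set
        # and the current optimizer step lands on it
        return (
            save_interval_steps is not None
            and global_step % save_interval_steps == 0
        )

With save_interval_steps=5000, for example, a partial checkpoint such as model_ep0003_step00045000.pth is written every 5000 global steps, in addition to the end-of-epoch model_ep0003.pth.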
@@ -73,7 +74,7 @@ def __init__( p_tar=0.5, train_mode="train", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -82,46 +83,17 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="class_id", ): - if loss is None: loss = nn.CrossEntropyLoss() super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # loss, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - self.loss_bce = BCEWithLLR(p_tar) self.loss_weights = loss_weights @@ -179,9 +151,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps if return_bin: @@ -193,11 +165,13 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 7260595c..4fa5bdab 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -4,19 +4,21 @@ """ import contextlib +import glob import logging import math import os from collections import OrderedDict as ODict from enum import Enum from pathlib import Path +from typing import Any, Dict, Optional import torch import torch.cuda.amp as amp import torch.distributed as dist import torch.nn as nn from fairscale.optim.grad_scaler import ShardedGradScaler -from jsonargparse import ActionParser, ArgumentParser +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from torch.optim.swa_utils import SWALR, AveragedModel from ...utils.misc import filter_func_args @@ -71,6 +73,7 @@ class TorchTrainer(object): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
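The bias_weight_decay plumbing added to TorchModel at the top of this patch is consumed here: _make_optimizer (below) now builds the optimizer from model.trainable_param_groups() instead of a flat parameter list, so biases and 1-dim Norm parameters can carry their own weight decay. A standalone sketch of the equivalent construction with plain torch.optim (illustrative model and values):

    import torch
    import torch.nn as nn

    net = nn.Sequential(nn.Linear(10, 10), nn.LayerNorm(10))

    decay, no_decay = [], []
    for name, p in net.named_parameters():
        # same split as TorchModel.trainable_param_groups: biases and
        # 1-dim (Norm) parameters go to the separately-decayed group
        if name.endswith(".bias") or len(p.shape) == 1:
            no_decay.append(p)
        else:
            decay.append(p)

    optimizer = torch.optim.AdamW(
        [
            {"params": decay},  # uses the optimizer's default weight_decay
            {"params": no_decay, "weight_decay": 0.0},  # bias_weight_decay, often 0
        ],
        lr=1e-3,
        weight_decay=1e-2,
    )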
@@ -94,7 +97,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -103,6 +106,7 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="class_id", @@ -111,9 +115,12 @@ def __init__( self.loss = loss self.epochs = epochs self.cur_epoch = cur_epoch + self.cur_batch = 0 self.grad_acc_steps = grad_acc_steps self.eff_batch_size = eff_batch_size self.exp_path = Path(exp_path) + self.optim = optim + self.lrsched = lrsched if loggers is None: self.loggers = self._default_loggers( @@ -139,94 +146,212 @@ def __init__( self.target_key = target_key self.ddp = ddp self.ddp_type = ddp_type + self.cpu_offload = cpu_offload self.rank = 0 self.world_size = 1 + self.in_swa = False + self.global_step = 0 + self.save_interval_steps = save_interval_steps if ddp: self.rank = dist.get_rank() self.world_size = dist.get_world_size() self.set_train_mode() + self.prepare_models_for_training() + + # if device is not None: + # self.model.to(device) + # if loss is not None: + # self.loss.to(device) + + # if ddp: + # if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: + # self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + # if self.rank == 0: + # logging.info( + # "training in multiple gpus with distributed-data-parallel" + # ) + # oss = False if ddp_type == DDPType.DDP else True + # self.optimizer = self._make_optimizer(optim, self.model, oss=oss) + # self.model = TorchDDP( + # self.model, + # device_ids=[device], + # output_device=device, + # ) + # elif ddp_type == DDPType.OSS_SHARDED_DDP: + # self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + # if self.rank == 0: + # logging.info( + # "training in multiple gpus with fair sharded-distributed-data-parallel" + # ) + # self.optimizer = self._make_optimizer(optim, self.model, oss=True) + # self.model = FairShardedDDP(self.model, self.optimizer) + # else: + # if self.rank == 0: + # logging.info( + # "training in multiple gpus with fair fully-sharded-distributed-data-parallel" + # ) + # # syncbathcnorm is not supported here, it raises exception + # self.model = FairFullyShardedDDP( + # self.model, + # mixed_precision=self.use_amp, + # move_params_to_cpu=cpu_offload, + # ) + # self.optimizer = self._make_optimizer(optim, self.model, oss=False) + + # else: + # self.optimizer = self._make_optimizer(optim, self.model) + + # # make the learning rate scheduler + # self.lr_scheduler = self._make_lr_sched(lrsched, self.optimizer) + + # if self.use_amp: + # if ddp and ddp_type != DDPType.DDP: + # if self.rank == 0: + # logging.info( + # "using automatic mixed precision training with sharded-grad-scaler" + # ) + # self.grad_scaler = ShardedGradScaler() + # else: + # if self.rank == 0: + # logging.info( + # "using automatic mixed precision training with grad-scaler" + # ) + # self.grad_scaler = amp.GradScaler() + # self.amp_autocast = amp.autocast + # else: + # self.amp_autocast = contextlib.nullcontext + + # self.in_swa = False + # if self.do_swa: + # if self.rank == 0: + # logging.info("init SWA model") + # self.swa_model = AveragedModel(self.model) + # self.swa_scheduler = SWALR( + # self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs + # ) + + def prepare_models_for_training(self): + self.loss = self._prepare_loss_for_training(self.loss, self.device) + ( + self.model, + self.optimizer, + self.lr_scheduler, + self.grad_scaler, + 
self.swa_model, + self.swa_scheduler, + ) = self._prepare_model_for_training( + self.model, + self.optim, + self.lrsched, + self.device, + self.use_amp, + self.ddp, + self.ddp_type, + self.cpu_offload, + self.do_swa, + self.swa_lr, + self.swa_anneal_epochs, + ) + + def _prepare_loss_for_training(self, loss, device): + if loss is not None: + loss.to(device) + + return loss + def _prepare_model_for_training( + self, + model, + optim, + lrsched, + device, + use_amp, + ddp, + ddp_type, + cpu_offload, + do_swa, + swa_lr, + swa_anneal_epochs, + ): if device is not None: - self.model.to(device) - if loss is not None: - self.loss.to(device) + model.to(device) if ddp: if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if self.rank == 0: logging.info( "training in multiple gpus with distributed-data-parallel" ) oss = False if ddp_type == DDPType.DDP else True - self.optimizer = self._make_optimizer(optim, self.model, oss=oss) - self.model = TorchDDP( - self.model, + optimizer = self._make_optimizer(optim, model, oss=oss) + model = TorchDDP( + model, device_ids=[device], output_device=device, ) elif ddp_type == DDPType.OSS_SHARDED_DDP: - self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if self.rank == 0: logging.info( "training in multiple gpus with fair sharded-distributed-data-parallel" ) - self.optimizer = self._make_optimizer(optim, self.model, oss=True) - self.model = FairShardedDDP(self.model, self.optimizer) + optimizer = self._make_optimizer(optim, model, oss=True) + model = FairShardedDDP(model, optimizer) else: if self.rank == 0: logging.info( "training in multiple gpus with fair fully-sharded-distributed-data-parallel" ) # syncbathcnorm is not supported here, it raises exception - self.model = FairFullyShardedDDP( - self.model, - mixed_precision=self.use_amp, + model = FairFullyShardedDDP( + model, + mixed_precision=use_amp, move_params_to_cpu=cpu_offload, ) - self.optimizer = self._make_optimizer(optim, self.model, oss=False) + optimizer = self._make_optimizer(optim, model, oss=False) else: - self.optimizer = self._make_optimizer(optim, self.model) + optimizer = self._make_optimizer(optim, model) # make the learning rate scheduler - self.lr_scheduler = self._make_lr_sched(lrsched, self.optimizer) + lr_scheduler = self._make_lr_sched(lrsched, optimizer) - if self.use_amp: + if use_amp: if ddp and ddp_type != DDPType.DDP: if self.rank == 0: logging.info( "using automatic mixed precision training with sharded-grad-scaler" ) - self.grad_scaler = ShardedGradScaler() + grad_scaler = ShardedGradScaler() else: if self.rank == 0: logging.info( "using automatic mixed precision training with grad-scaler" ) - self.grad_scaler = amp.GradScaler() - self.amp_autocast = amp.autocast - else: - self.amp_autocast = contextlib.nullcontext + grad_scaler = amp.GradScaler() - self.in_swa = False - if self.do_swa: + swa_model = None + swa_scheduler = None + if do_swa: if self.rank == 0: logging.info("init SWA model") - self.swa_model = AveragedModel(self.model) - self.swa_scheduler = SWALR( - self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs + swa_model = AveragedModel(model) + swa_scheduler = SWALR( + optimizer, swa_lr=swa_lr, anneal_epochs=swa_anneal_epochs ) - def set_epoch(self, data_loader): + return model, optimizer, lr_scheduler, grad_scaler, swa_model, swa_scheduler + + def 
set_epoch(self, data_loader, cur_batch: int = 0): try: data_loader.dataset.set_epoch(self.cur_epoch) except AttributeError: logging.warning("dataset doesn't have set_epoch member function") try: - data_loader.batch_sampler.set_epoch(self.cur_epoch) + data_loader.batch_sampler.set_epoch(self.cur_epoch, cur_batch) except AttributeError: logging.warning("sampler doesn't have set_epoch member function") @@ -246,7 +371,7 @@ def fit(self, train_data, val_data=None): val_logs = {} self.loggers.on_train_begin(epochs=self.epochs) for epoch in range(self.cur_epoch, self.epochs): - self.set_epoch(train_data) + self.set_epoch(train_data, self.cur_batch) self.loggers.on_epoch_begin(epoch, batches=len(train_data)) if self.lr_scheduler is not None: # this is needed by cosine scheduler @@ -254,6 +379,7 @@ self.lr_scheduler.on_epoch_begin(epoch, epoch_updates=epoch_updates) logs = self.train_epoch(train_data) + self.cur_batch = 0 if val_data is not None: self.set_epoch(val_data) val_logs = self.validation_epoch(val_data) @@ -311,7 +437,7 @@ def train_epoch(self, data_loader): with amp.autocast(enabled=self.use_amp): output = self.model(input_data) - loss = self.loss(output, target).mean() / self.grad_acc_steps + loss = self.loss(output, target) / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -319,9 +445,9 @@ loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): @@ -329,13 +455,14 @@ metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - # total_batches += 1 logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -399,24 +526,54 @@ def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): model.parameters(), grad_clip, norm_type=grad_clip_norm ) - def update_model(self): + def _update_model_by_optim( + self, model, optimizer, grad_clip, grad_clip_norm, use_amp, grad_scaler + ): """Updates the model and does gradding clipping.""" - if self.use_amp: - if self.grad_clip > 0: - self.grad_scaler.unscale_(self.optimizer) - self._clip_grad_norm( - self.model, self.optimizer, self.grad_clip, self.grad_clip_norm - ) + if use_amp: + if grad_clip > 0: + grad_scaler.unscale_(optimizer) + self._clip_grad_norm(model, optimizer, grad_clip, grad_clip_norm) - self.grad_scaler.step(self.optimizer) - self.grad_scaler.update() + grad_scaler.step(optimizer) + grad_scaler.update() else: - if self.grad_clip > 0: - self._clip_grad_norm( - self.model, self.optimizer, self.grad_clip, self.grad_clip_norm - ) + if grad_clip > 0: + self._clip_grad_norm(model, optimizer, grad_clip, grad_clip_norm) + + optimizer.step() + + def update_model(self): + """Updates the model and does gradient clipping.""" + if self.lr_scheduler is not None and not self.in_swa: + self.lr_scheduler.on_opt_step() + + self._update_model_by_optim( + self.model, + self.optimizer, + self.grad_clip,
self.grad_clip_norm, + self.use_amp, + self.grad_scaler, + ) + self.global_step += 1 - self.optimizer.step() + # if self.use_amp: + # if self.grad_clip > 0: + # self.grad_scaler.unscale_(self.optimizer) + # self._clip_grad_norm( + # self.model, self.optimizer, self.grad_clip, self.grad_clip_norm + # ) + + # self.grad_scaler.step(self.optimizer) + # self.grad_scaler.update() + # else: + # if self.grad_clip > 0: + # self._clip_grad_norm( + # self.model, self.optimizer, self.grad_clip, self.grad_clip_norm + # ) + + # self.optimizer.step() def _make_optimizer(self, optim, model, oss=False): """Makes an optimizer object.""" @@ -429,7 +586,6 @@ def _make_optimizer(self, optim, model, oss=False): if self.rank == 0: logging.info("optimizer args={}".format(opt_args)) - # optimizer = OF.create(model.parameters(), **opt_args) optimizer = OF.create(model.trainable_param_groups(), **opt_args) return optimizer @@ -469,17 +625,14 @@ def _get_lr(self): def _get_lrs(self): """Returns the current learning rates of all param groups to show in the loggers""" - lrs = [param_group["lr"] for param_group in self.optimizer.param_groups] - all_eq = True - for lr in lrs: - if lr != lrs[0]: - all_eq = False - break - - if all_eq: - return {"lr": lrs[0]} + lrs = { + f"lr_{i}": param_group["lr"] + for i, param_group in enumerate(self.optimizer.param_groups) + } + if len(lrs) == 1: + lrs["lr"] = lrs.pop("lr_0") - return {f"lr_{i}": lr for i, lr in enumerate(lrs)} + return lrs def _compute_grad_acc_steps(self, data_loader): if self.eff_batch_size is None: @@ -524,6 +677,8 @@ def checkpoint(self, logs=None): self.model.train() checkpoint = { "epoch": self.cur_epoch, + "batch": self.cur_batch, + "global_step": self.global_step, "rng_state": torch.get_rng_state(), "model_cfg": self.model.get_config(), "model_state_dict": self.model.state_dict(), @@ -544,12 +699,22 @@ def checkpoint(self, logs=None): return checkpoint - def save_checkpoint(self, logs=None): + def save_partial_checkpoint(self): + return ( + self.save_interval_steps is not None + and self.global_step % self.save_interval_steps == 0 + ) + + def new_save_checkpoint(self, logs=None, partial: bool = False): """Saves a checkpoint of the training status Args: logs: logs containing the current value of the metrics. + partial: if True, it is saving in the middle of the epoch """ + if partial and not self.save_partial_checkpoint(): + return + if self.ddp and ( self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP ): @@ -564,7 +729,58 @@ def save_checkpoint(self, logs=None): return checkpoint = self.checkpoint(logs) - file_path = "%s/model_ep%04d.pth" % (self.exp_path, self.cur_epoch) + self.save_model_checkpoint("model", checkpoint, partial=partial) + + def save_model_checkpoint( + self, model_name: str, checkpoint: Dict[str, Any], partial: bool = False + ): + if partial: + file_path = "%s/%s_ep%04d_step%08d.pth" % ( + self.exp_path, + model_name, + self.cur_epoch, + self.global_step, + ) + else: + file_path = "%s/%s_ep%04d.pth" % (self.exp_path, model_name, self.cur_epoch) + + torch.save(checkpoint, file_path) + + def save_checkpoint(self, logs=None, partial: bool = False): + """Saves a checkpoint of the training status + + Args: + logs: logs containing the current value of the metrics.
+ partial: if True, it is saving in the middle of the epoch + """ + if partial and ( + self.save_interval_steps is None + or self.global_step % self.save_interval_steps != 0 + ): + return + + if self.ddp and ( + self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP + ): + # Not sure what this does, just copying from the example in + # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py + # Check the checkpointing in the case of the OSS optimizer + # Memory usage could spill over from there + # optimizer = cast(OSS, optimizer) + self.optimizer.consolidate_state_dict() + + if self.rank != 0: + return + + checkpoint = self.checkpoint(logs) + if partial: + file_path = "%s/model_ep%04d_step%08d.pth" % ( + self.exp_path, + self.cur_epoch, + self.global_step, + ) + else: + file_path = "%s/model_ep%04d.pth" % (self.exp_path, self.cur_epoch) torch.save(checkpoint, file_path) @@ -584,13 +800,7 @@ def save_swa_model(self, logs=None): torch.save(checkpoint, file_path) - def load_checkpoint(self, file_path): - """Loads a training checkpoint from file. - - Args: - file_path: checkpoint file path - """ - checkpoint = torch.load(file_path, map_location=torch.device("cpu")) + def _load_checkpoint(self, checkpoint): rng_state = checkpoint["rng_state"] torch.set_rng_state(rng_state) if self.rank > 0: @@ -600,6 +810,11 @@ def load_checkpoint(self, file_path): del dummy self.cur_epoch = checkpoint["epoch"] + if "batch" in checkpoint: + self.cur_batch = checkpoint["batch"] + else: + self.cur_batch = 0 + try: self.model.load_state_dict(checkpoint["model_state_dict"]) except: @@ -610,6 +825,12 @@ def load_checkpoint(self, file_path): if self.lr_scheduler is not None: self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"]) + if "global_step" in checkpoint: + self.global_step = checkpoint["global_step"] + elif self.lr_scheduler is not None: + # this for older models that didn't save the global step + self.global_step = self.lr_scheduler.step + # if self.use_amp: # amp.load_state_dict(checkpoint['amp']) if self.do_swa: @@ -638,17 +859,31 @@ def load_checkpoint(self, file_path): return logs + def load_checkpoint(self, file_path): + """Loads a training checkpoint from file. 
+ + Args: + file_path: checkpoint file path + """ + checkpoint = torch.load(file_path, map_location=torch.device("cpu")) + return self._load_checkpoint(checkpoint) + def load_last_checkpoint(self): """Loads the last training checkpoint in the experiment dir.""" for epoch in range(self.epochs, 0, -1): - file_path = "%s/model_ep%04d.pth" % (self.exp_path, epoch) - if os.path.isfile(file_path): + file_path = Path("%s/model_ep%04d.pth" % (self.exp_path, epoch)) + if file_path.is_file(): + steps_pattern = "%s/model_ep%04d_step*.pth" % (self.exp_path, epoch) + steps_file_paths = sorted(glob.glob(steps_pattern)) + if len(steps_file_paths) > 0: + file_path = steps_file_paths[-1] + return self.load_checkpoint(file_path) return None @staticmethod - def get_augs_keys(batch, base_key, skip={}): + def get_augs_keys(batch, base_key, skip=set()): keys = [] if base_key in batch and base_key not in skip: keys.append(base_key) @@ -712,12 +947,18 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): parser.add_argument( "--log-interval", type=int, - default=10, + default=1000, help="how many batches to wait before logging training status", ) + parser.add_argument( + "--save-interval-steps", + default=None, + type=int, + help="number of steps between model saves, if None only saves at the end of the epoch", + ) parser.add_argument( "--use-tensorboard", - action="store_true", + action=ActionYesNo, default=False, help="use tensorboard logger", ) @@ -745,13 +986,13 @@ ) parser.add_argument( "--use-amp", - action="store_true", + action=ActionYesNo, default=False, help="use mixed precision training", ) parser.add_argument( "--cpu-offload", - action="store_true", + action=ActionYesNo, default=False, help="CPU offload of gradients when using fully_sharded_ddp", ) diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 3a9cc288..541dc126 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -6,11 +6,10 @@ import os from collections import OrderedDict as ODict -import torchaudio -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +import torchaudio +from jsonargparse import ActionParser, ArgumentParser from torch.distributed.elastic.multiprocessing.errors import record from ...utils.misc import filter_func_args @@ -47,6 +46,7 @@ class TransducerTrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp """ @@ -68,7 +68,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -77,11 +77,11 @@ swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="text", ): - loss = None super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) @@ -93,9 +93,7 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels.
""" - batch_keys = [ - self.input_key, f"{self.input_key}_lengths", self.target_key - ] + batch_keys = [self.input_key, f"{self.input_key}_lengths", self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() @@ -110,15 +108,14 @@ def train_epoch(self, data_loader): # # TODO: Check and Modify data, target # data, audio_length, target = data.to(self.device), audio_length.to( # self.device), target.to(self.device) - #print(data.keys(), batch_keys, flush=True) + # print(data.keys(), batch_keys, flush=True) input_data, input_lengths, target = tensors_subset( - data, batch_keys, self.device) + data, batch_keys, self.device + ) batch_size = input_data.shape[0] with self.amp_autocast(): - output = self.model(input_data, - x_lengths=input_lengths, - y=target) + output = self.model(input_data, x_lengths=input_lengths, y=target) loss = output.loss loss = loss.mean() / self.grad_acc_steps @@ -128,9 +125,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) for k, v in output.items(): if "loss" in k and v is not None: @@ -141,12 +138,14 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -156,9 +155,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): data_loader: PyTorch data loader return input/output pairs. sw_update_bn: wheter or not, update batch-norm layers in SWA. """ - batch_keys = [ - self.input_key, f"{self.input_key}_lengths", self.target_key - ] + batch_keys = [self.input_key, f"{self.input_key}_lengths", self.target_key] metric_acc = MetricAcc(self.device) batch_metrics = ODict() with torch.no_grad(): @@ -170,9 +167,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.eval() for batch, data in enumerate(data_loader): - input_data, input_lengths, target = tensors_subset( - data, batch_keys, self.device) + data, batch_keys, self.device + ) batch_size = input_data.shape[0] # data, audio_length, target = data.to( @@ -183,9 +180,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): # batch_size = data.shape[0] with self.amp_autocast(): - output = self.model(input_data, - x_lengths=input_lengths, - y=target) + output = self.model(input_data, x_lengths=input_lengths, y=target) for k, v in output.items(): if "loss" in k and v is not None: @@ -208,14 +203,11 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): super_skip = skip.copy() super_skip.add("target_key") - TorchTrainer.add_class_args(parser, - train_modes=train_modes, - skip=super_skip) + TorchTrainer.add_class_args(parser, train_modes=train_modes, skip=super_skip) if "target_key" not in skip: - parser.add_argument("--target-key", - default="text", - help="dict. key for nnet targets") + parser.add_argument( + "--target-key", default="text", help="dict. 
key for nnet targets" + ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index f4877dc6..72942506 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset @@ -45,6 +44,7 @@ class VAETrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. @@ -67,7 +67,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -76,11 +76,11 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="x", ): - super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) @@ -145,9 +145,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["elbo"] = elbo.item() for metric in ["log_px", "kldiv_z"]: @@ -157,12 +157,14 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -204,7 +206,6 @@ def validation_epoch(self, data_loader, swa_update_bn=False): @staticmethod def add_class_args(parser, prefix=None, train_modes=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index fc9d98f1..c89cfd9a 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset @@ -45,6 +44,7 @@ class VQDVAETrainer(DVAETrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU 
offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. @@ -67,7 +67,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -76,44 +76,15 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x_aug", target_key="x", ): - super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - def train_epoch(self, data_loader): - batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() @@ -128,9 +99,8 @@ def train_epoch(self, data_loader): batch_size = input_data.size(0) with amp.autocast(enabled=self.use_amp): output = self.model(input_data, x_target=target, return_x_mean=True) - loss = output["loss"] + loss = output["loss"] / self.grad_acc_steps x_hat = output["x_mean"] - loss = loss.mean() / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -138,9 +108,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for metric in ["elbo", "log_px", "kldiv_z", "vq_loss"]: @@ -153,12 +123,14 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -197,7 +169,6 @@ def validation_epoch(self, data_loader, swa_update_bn=False): @staticmethod def add_class_args(parser, prefix=None, train_modes=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index 35946e96..7d82dde2 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -7,11 +7,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset @@ -45,6 +44,7 @@ class VQVAETrainer(VAETrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate 
swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. @@ -67,7 +67,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -76,6 +76,7 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="x", @@ -128,7 +129,7 @@ def train_epoch(self, data_loader): output = self.model(input_data, x_target=target, return_x_mean=True) loss = output["loss"] x_hat = output["x_mean"] - loss = loss.mean() / self.grad_acc_steps + loss = loss / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -136,9 +137,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for metric in ["elbo", "log_px", "kldiv_z", "vq_loss"]: @@ -151,12 +152,14 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -195,7 +198,6 @@ def validation_epoch(self, data_loader, swa_update_bn=False): @staticmethod def add_class_args(parser, prefix=None, train_modes=None, skip=set()): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 303427de..3943a681 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -7,11 +7,10 @@ import time from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset @@ -49,6 +48,7 @@ class XVectorAdvTrainer(XVectorTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
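Every trainer in this patch replaces logs["lr"] = self._get_lr() with logs.update(self._get_lrs()): with param groups there may be several learning rates to report. The behavior of _get_lrs (defined earlier in torch_trainer.py), restated as a standalone sketch:

    def get_lrs(optimizer):
        # one log entry per param group, collapsed to a single "lr" key
        # when the optimizer has only one group
        lrs = {f"lr_{i}": g["lr"] for i, g in enumerate(optimizer.param_groups)}
        if len(lrs) == 1:
            lrs["lr"] = lrs.pop("lr_0")
        return lrs

    # one group  -> {"lr": 1e-3}
    # two groups -> {"lr_0": 1e-3, "lr_1": 1e-4}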
@@ -75,7 +75,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -84,43 +84,13 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="class_id", ): - super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - - # super().__init__( - # model, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # loss=loss, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - self.attack = attack self.attack.to(device) self.p_attack = p_attack * self.grad_acc_steps @@ -155,7 +125,7 @@ def train_epoch(self, data_loader): # generate adversarial attacks logging.info("generating adv attack for batch=%d", batch) self.model.eval() - data_adv = self.attack.generate(inptu_data, target) + data_adv = self.attack.generate(input_data, target) max_delta = torch.max(torch.abs(data_adv - data)).item() logging.info("adv attack max perturbation=%f", max_delta) input_data = data_adv @@ -173,9 +143,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): @@ -183,12 +153,14 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 2a012dde..522d7e0b 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -7,11 +7,10 @@ import time from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset @@ -50,6 +49,7 @@ class XVectorAdvTrainerFromWav(XVectorTrainerFromWav): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
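For context, XVectorAdvTrainer interleaves adversarial and clean batches: with probability p_attack it switches the model to eval mode, generates an adversarial version of the batch, and trains on that instead. A condensed sketch (the exact gating condition is not visible in the hunk above and is an assumption):

    import logging
    import torch

    def maybe_adversarial(model, attack, input_data, target, p_attack: float):
        # with probability p_attack, replace the clean batch by an
        # adversarial one, as in XVectorAdvTrainer.train_epoch
        if torch.rand(1).item() < p_attack:
            model.eval()
            data_adv = attack.generate(input_data, target)
            max_delta = torch.max(torch.abs(data_adv - input_data)).item()
            logging.info("adv attack max perturbation=%f", max_delta)
            input_data = data_adv
            model.train()
        return input_data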
@@ -77,7 +77,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -86,44 +86,13 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="class_id", ): - super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - - # super().__init__( - # model, - # feat_extractor, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # loss=loss, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - self.attack = attack self.attack.to(device) self.p_attack = p_attack * self.grad_acc_steps @@ -182,9 +151,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): @@ -192,12 +161,14 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index aedd5be0..02c48577 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -45,6 +45,7 @@ class XVectorTrainer(TorchTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
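The partial checkpoints pay off at resume time: load_last_checkpoint (amended earlier in torch_trainer.py) walks the epochs backwards and, when it finds an epoch checkpoint, prefers the newest step checkpoint with the same prefix. A standalone sketch of that lookup:

    import glob
    from pathlib import Path

    def find_last_checkpoint(exp_path: str, epochs: int):
        for epoch in range(epochs, 0, -1):
            file_path = Path("%s/model_ep%04d.pth" % (exp_path, epoch))
            if file_path.is_file():
                # zero-padded step numbers make the lexicographic sort chronological
                step_files = sorted(
                    glob.glob("%s/model_ep%04d_step*.pth" % (exp_path, epoch))
                )
                return step_files[-1] if step_files else file_path
        return None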
@@ -68,7 +69,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -77,6 +78,7 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="class_id", @@ -104,14 +106,6 @@ def train_epoch(self, data_loader): for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) - # try: - # l1 = self.model.hf_feats.hf_model.encoder.layers[0].attention.v_proj - # # print(f"lora train {l1.training}") - # print(f"loraA {l1.lora_A}") - # print(f"loraB {l1.lora_B}", flush=True) - # except: - # pass - if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() @@ -131,9 +125,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * loss_scale for k, metric in self.metrics.items(): diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 9d04af42..d4a34abc 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -6,11 +6,10 @@ import os from collections import OrderedDict as ODict -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.cuda.amp as amp import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset @@ -51,6 +50,7 @@ class XVectorTrainerDeepFeatReg(XVectorTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
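All of these train_epoch loops share the same gradient-accumulation skeleton, which the cur_batch and partial-save changes hook into. Condensed to its core (schematic; dividing the loss by grad_acc_steps keeps the accumulated gradient an average over the micro-batches):

    def train_epoch(model, loss_fn, optimizer, loader, grad_acc_steps, global_step=0):
        for batch, (x, y) in enumerate(loader):
            if batch % grad_acc_steps == 0:
                optimizer.zero_grad()
            loss = loss_fn(model(x), y) / grad_acc_steps
            loss.backward()
            if (batch + 1) % grad_acc_steps == 0:
                # in the trainers, update_model() also steps the LR scheduler,
                # cur_batch is recorded, and save_checkpoint(partial=True) may fire
                optimizer.step()
                global_step += 1
        return global_step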
@@ -80,7 +80,7 @@ def __init__( reg_loss=None, train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -89,43 +89,14 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="class_id", ): - super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # loss=loss, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - self.prior_model = prior_model if reg_loss is None or reg_loss == "l1": reg_loss = nn.L1Loss() @@ -219,9 +190,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) @@ -229,11 +200,13 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs @staticmethod diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 6d06eac8..041a1ea7 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -50,6 +50,7 @@ class XVectorTrainerDeepFeatRegFromWav(XVectorTrainerDeepFeatReg): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
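Another repeated edit: logs["lr"] = self._get_lr() becomes logs.update(self._get_lrs()), so each optimizer param group reports its own learning rate. A plausible shape for that helper, mirroring the _get_wds weight-decay version added to torch_trainer.py later in this patch (a sketch, not the exact implementation):

    import torch

    def get_lrs(optimizer):
        # one log entry per param group: lr_0, lr_1, ...; a single group keeps the old "lr" key
        lrs = {f"lr_{i}": g["lr"] for i, g in enumerate(optimizer.param_groups)}
        if len(lrs) == 1:
            lrs["lr"] = lrs.pop("lr_0")
        return lrs

    w = [torch.nn.Parameter(torch.zeros(2)) for _ in range(2)]
    opt = torch.optim.SGD([{"params": w[:1], "lr": 1e-2}, {"params": w[1:], "lr": 1e-3}])
    print(get_lrs(opt))  # {'lr_0': 0.01, 'lr_1': 0.001}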
@@ -89,49 +90,14 @@ def __init__( swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="class_id", ): - super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # prior_model, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # reg_layers_enc=reg_layers_enc, - # reg_layers_classif=reg_layers_classif, - # reg_weight_enc=reg_weight_enc, - # reg_weight_classif=reg_weight_classif, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # loss=loss, - # reg_loss=reg_loss, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - self.feat_extractor = feat_extractor if device is not None: self.feat_extractor.to(device) @@ -218,21 +184,23 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics logs = ODict(("train_" + k, v) for k, v in logs.items()) - logs["lr"] = self._get_lr() + lrs = self._get_lrs() + logs.update(lrs) return logs def validation_epoch(self, data_loader, swa_update_bn=False): diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 6d00806a..2d6b5514 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -44,6 +44,7 @@ class XVectorTrainerFromWav(XVectorTrainer): swa_start: epoch to start doing swa swa_lr: SWA learning rate swa_anneal_epochs: SWA learning rate anneal epochs + save_interval_steps: number of steps between model saves, if None only saves at the end of the epoch cpu_offload: CPU offload of gradients when using fully sharded ddp input_key: dict. key for nnet input. target_key: dict. key for nnet targets. 
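The from-wav variants keep an acoustic feature extractor inside the trainer, move it to the training device next to the model, and run it on the waveform before the x-vector forward pass. A toy sketch of that order of operations; the modules, shapes, and the no-grad feature extraction are illustrative assumptions, not hyperion's exact trainer code:

    import torch
    import torch.nn as nn

    feat_extractor = nn.Conv1d(1, 80, kernel_size=400, stride=160)  # wav -> fbank-like feats
    model = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten(), nn.Linear(80, 10))
    loss_fn = nn.CrossEntropyLoss()
    grad_acc_steps = 2

    audio = torch.randn(4, 1, 16000)   # batch of 1 s waveforms at 16 kHz
    target = torch.randint(0, 10, (4,))
    with torch.no_grad():              # the extractor is frozen in this sketch
        feats = feat_extractor(audio)
    output = model(feats)
    loss = loss_fn(output, target) / grad_acc_steps  # scale for gradient accumulation
    loss.backward()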
@@ -68,7 +69,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, - log_interval=10, + log_interval=1000, use_tensorboard=False, use_wandb=False, wandb={}, @@ -77,6 +78,7 @@ swa_start=0, swa_lr=1e-3, swa_anneal_epochs=10, + save_interval_steps=None, cpu_offload=False, input_key="x", target_key="class_id", @@ -120,9 +122,9 @@ def train_epoch(self, data_loader): loss.backward() if (batch + 1) % self.grad_acc_steps == 0: - if self.lr_scheduler is not None and not self.in_swa: - self.lr_scheduler.on_opt_step() + self.cur_batch = batch + 1 self.update_model() + self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 4f006c0a..aa5efe37 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -54,8 +54,8 @@ def ddp_init( device = open_device(num_gpus) return device, 0, 1 - os.environ["MASTER_ADDR"] = master_addr - os.environ["MASTER_PORT"] = master_port + os.environ["MASTER_ADDR"] = str(master_addr) + os.environ["MASTER_PORT"] = str(master_port) logging.info( f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port} gpu_id={gpu_id}" diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index 51f0f37a..1b35364d 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -3,24 +3,26 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ import logging -from pathlib import Path -from typing import List, Dict, Optional, Union -from copy import deepcopy import math +from copy import deepcopy +from pathlib import Path +from typing import Dict, List, Optional, Union + +import lhotse import numpy as np import pandas as pd import yaml -from .info_table import InfoTable from .class_info import ClassInfo +from .enrollment_map import EnrollmentMap from .feature_set import FeatureSet +from .info_table import InfoTable from .misc import PathLike from .recording_set import RecordingSet from .segment_set import SegmentSet -from .enrollment_map import EnrollmentMap +from .sparse_trial_key import SparseTrialKey from .trial_key import TrialKey from .trial_ndx import TrialNdx -from .sparse_trial_key import SparseTrialKey class Dataset: @@ -822,7 +824,7 @@ def remove_recordings( def remove_classes(self, classes_name: str): if self._classes_paths[classes_name] is not None: - self._files_to_delete.append(self._class_paths[class_name]) + self._files_to_delete.append(self._classes_paths[classes_name]) del self._classes[classes_name] del self._classes_paths[classes_name] @@ -1219,3 +1221,12 @@ def split_train_val( val_ds.clean() return train_ds, val_ds + + @classmethod + def from_lhotse( + cls, + cuts: Optional[Union[lhotse.CutSet, PathLike]] = None, + recordings: Optional[Union[lhotse.RecordingSet, PathLike]] = None, + supervisions: Optional[Union[lhotse.SupervisionSet, PathLike]] = None, + ): + return None From 4593c4b70cdd3a11a9a82eb4ca08596162601def Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 14 Dec 2023 11:11:26 -0500 Subject: [PATCH 125/154] dino seems to be working --- egs/librispeech/v1/datapath.sh | 18 +++ hyperion/torch/losses/dino_loss.py | 2 +- hyperion/torch/optim/ema.py | 74 ++++++++++ hyperion/torch/torch_model.py | 1 - hyperion/torch/trainers/ae_trainer.py | 1 + .../torch/trainers/dino_xvector_trainer.py | 67 ++++++--- hyperion/torch/trainers/dvae_trainer.py | 1 + hyperion/torch/trainers/plda_trainer.py | 1 + hyperion/torch/trainers/torch_trainer.py | 132
++++++++++++++++-- hyperion/torch/trainers/transducer_trainer.py | 1 + hyperion/torch/trainers/vae_trainer.py | 1 + hyperion/torch/trainers/vq_dvae_trainer.py | 1 + hyperion/torch/trainers/vq_vae_trainer.py | 1 + .../torch/trainers/xvector_adv_trainer.py | 1 + .../trainers/xvector_adv_trainer_from_wav.py | 1 + hyperion/torch/trainers/xvector_trainer.py | 2 + .../trainers/xvector_trainer_deep_feat_reg.py | 1 + .../xvector_trainer_deep_feat_reg_from_wav.py | 1 + .../trainers/xvector_trainer_from_wav.py | 1 + hyperion/torch/wd_schedulers/__init__.py | 9 ++ hyperion/torch/wd_schedulers/cos_wd.py | 50 +++++++ hyperion/torch/wd_schedulers/factory.py | 89 ++++++++++++ hyperion/torch/wd_schedulers/wd_scheduler.py | 120 ++++++++++++++++ 23 files changed, 541 insertions(+), 35 deletions(-) create mode 100644 egs/librispeech/v1/datapath.sh create mode 100644 hyperion/torch/optim/ema.py create mode 100644 hyperion/torch/wd_schedulers/__init__.py create mode 100644 hyperion/torch/wd_schedulers/cos_wd.py create mode 100644 hyperion/torch/wd_schedulers/factory.py create mode 100644 hyperion/torch/wd_schedulers/wd_scheduler.py diff --git a/egs/librispeech/v1/datapath.sh b/egs/librispeech/v1/datapath.sh new file mode 100644 index 00000000..3e8de307 --- /dev/null +++ b/egs/librispeech/v1/datapath.sh @@ -0,0 +1,18 @@ +# Copyright +# 2018 Johns Hopkins University (Author: Jesus Villalba) +# +# Paths to the databases used in the experiment + + +if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then + librispeech_root=/export/corpora5/LibriSpeech + musan_root=/export/corpora5/JHU/musan +elif [ "$(hostname --domain)" == "cm.gemini" ];then + librispeech_root=/export/common/data/corpora/ASR/openslr/SLR12/LibriSpeech + musan_root=/export/common/data/corpora/MUSAN/musan +else + echo "Put your database paths here" + exit 1 +fi + + diff --git a/hyperion/torch/losses/dino_loss.py b/hyperion/torch/losses/dino_loss.py index b22489a3..55f8e846 100644 --- a/hyperion/torch/losses/dino_loss.py +++ b/hyperion/torch/losses/dino_loss.py @@ -51,7 +51,7 @@ def update_temp(self, epoch: int): * epoch / self.temp_warmup_epochs ) - logging.info("updating dino-loss teacher temp=%.2f", self.cur_teacher_temp) + logging.info("updating dino-loss teacher temp=%.3f", self.cur_teacher_temp) else: self.cur_teacher_temp = self.teacher_temp diff --git a/hyperion/torch/optim/ema.py b/hyperion/torch/optim/ema.py new file mode 100644 index 00000000..f120bf21 --- /dev/null +++ b/hyperion/torch/optim/ema.py @@ -0,0 +1,74 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import math + +import torch +from jsonargparse import ActionParser, ArgumentParser + + +class ExpMovingAvg: + def __init__( + self, params, init_momentum=0.996, momentum=0.996, warmup_steps=0, global_step=0 + ): + if not isinstance(params, list): + params = [params] + self.params = [list(p) for p in params] + self.init_momentum = init_momentum + self._momentum = momentum + self.warmup_steps = warmup_steps + self.global_step = global_step + + def state_dict(self): + """Returns the state of the optimizer as a :class:`dict` needed to restart the training.""" + return {"global_step": self.global_step} + + def load_state_dict(self, state_dict): + """Loads the optimizer state. + + Arguments: + state_dict (dict): scheduler state. Should be an object returned + from a call to :meth:`state_dict`. 
+ """ + self.__dict__.update(state_dict) + + @property + def momentum(self): + if self.global_step >= self.warmup_steps: + return self._momentum + else: + alpha = (1 + math.cos(self.global_step / self.warmup_steps * math.pi)) / 2 + return self.init_momentum * alpha + self._momentum * (1 - alpha) + + @torch.no_grad() + def step(self, new_params): + if not isinstance(new_params, list): + new_params = [new_params] + + assert len(self.params) == len(new_params) + momentum = self.momentum + for param_group, new_param_group in zip(self.params, new_params): + for p, p_new in zip(param_group, new_param_group): + p.data.mul_(momentum).add_((1 - momentum) * p_new.data) + + self.global_step += 1 + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--init-momentum", default=0.996, type=float, help="initial momentum" + ) + parser.add_argument( + "--momentum", default=0.996, type=float, help="final momentum" + ) + parser.add_argument( + "--warmup-steps", default=0, type=int, help="momentum warmup steps" + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 3d5c8c9e..242402bc 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -95,7 +95,6 @@ def has_param_groups(self): return self.bias_weight_decay is not None def trainable_param_groups(self): - assert self.bias_weight_decay is not None if self.bias_weight_decay is None: return [{"params": self.trainable_parameters()}] diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index a0f5f1d4..9939797e 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -64,6 +64,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/dino_xvector_trainer.py b/hyperion/torch/trainers/dino_xvector_trainer.py index bb7b427d..e4051058 100644 --- a/hyperion/torch/trainers/dino_xvector_trainer.py +++ b/hyperion/torch/trainers/dino_xvector_trainer.py @@ -15,7 +15,7 @@ from ...utils.misc import filter_func_args from ..optim import ExpMovingAvg as EMA from ..utils import MetricAcc, TorchDDP, tensors_subset -from .torch_trainer import TorchTrainer +from .torch_trainer import DDPType, TorchTrainer class DINOXVectorTrainer(TorchTrainer): @@ -69,6 +69,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", @@ -121,28 +122,6 @@ def set_train_mode(self): @torch.no_grad() def update_teacher_model(self): self.teacher_optimizer.step(self.model.parameters()) - # print( - # "pmw", - # self.model.xvector.proj_head_net.proj.weight[:5, :5], - # self.teacher_model.xvector.proj_head_net.proj.weight[:5, :5], - # ) - # print( - # "mw", - # self.model.xvector.classif_net.output.weight[:5, :5], - # self.teacher_model.xvector.classif_net.output.weight[:5, :5], - # ) - # print( - # "mwg", - # self.model.xvector.classif_net.output.weight_g[:5, :5], - # self.teacher_model.xvector.classif_net.output.weight_g[:5, :5], - # ) - # print( - # "mwv", - # self.model.xvector.classif_net.output.weight_v[:5, :5], - # self.teacher_model.xvector.classif_net.output.weight_v[:5, :5], - # flush=True, - # ) - # print("------------------------------", flush=True) @staticmethod def get_augs_keys(batch, 
base_key, subset, skip=set()): @@ -264,6 +243,7 @@ def train_epoch(self, data_loader): logs = ODict(("train_" + k, v) for k, v in logs.items()) lrs = self._get_lrs() logs.update(lrs) + logs.update(self._get_wds()) logs["ema_momentum"] = self.teacher_optimizer.momentum return logs @@ -332,6 +312,18 @@ def _load_checkpoint(self, checkpoint): ) return super()._load_checkpoint(checkpoint) + def _new_load_checkpoint(self, checkpoint, teacher_checkpoint): + self.teacher_model.load_state_dict(teacher_checkpoint["model_state_dict"]) + self.teacher_optimizer.load_state_dict( + teacher_checkpoint["optimizer_state_dict"] + ) + return super()._load_checkpoint(checkpoint) + + def load_checkpoint(self, epoch, step): + checkpoint = self.load_model_checkpoint("model", epoch, step) + teacher_checkpoint = self.load_model_checkpoint("teacher_model", epoch, step) + return self._new_load_checkpoint(checkpoint, teacher_checkpoint) + def checkpoint(self, logs=None): checkpoint = super().checkpoint(logs) self.teacher_model.train() @@ -360,6 +352,35 @@ def teacher_checkpoint(self, logs=None): return checkpoint + def save_checkpoint(self, logs=None, partial: bool = False): + """Saves a checkpoint of the training status + + Args: + logs: logs containing the current value of the metrics. + partial: if True, it is saving in the middle of the epoch + """ + if partial and not self.save_partial_checkpoint(): + return + + if self.ddp and ( + self.ddp_type == DDPType.OSS_DDP or self.ddp_type == DDPType.OSS_SHARDED_DDP + ): + # Not sure what this does, just copying from the example in + # https://github.com/facebookresearch/fairscale/blob/master/benchmarks/oss.py + # Check the checkpointing in the case of the OSS optimizer + # Memory usage could spill over from there + # optimizer = cast(OSS, optimizer) + self.optimizer.consolidate_state_dict() + + if self.rank != 0: + return + + checkpoint = self.checkpoint(logs) + self.save_model_checkpoint("model", checkpoint, partial=partial) + + teacher_checkpoint = self.teacher_checkpoint(logs) + self.save_model_checkpoint("teacher_model", teacher_checkpoint, partial=partial) + @staticmethod def filter_args(**kwargs): args = filter_func_args(DINOXVectorTrainer.__init__, kwargs) diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index 718630d6..f128db44 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -62,6 +62,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py index a0099c02..71845a4b 100644 --- a/hyperion/torch/trainers/plda_trainer.py +++ b/hyperion/torch/trainers/plda_trainer.py @@ -66,6 +66,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 4fa5bdab..b3d6cb9f 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -8,6 +8,7 @@ import logging import math import os +import re from collections import OrderedDict as ODict from enum import Enum from pathlib import Path @@ -33,6 +34,8 @@ TorchDDP, tensors_subset, ) +from ..wd_schedulers import WDScheduler as WDS +from ..wd_schedulers import WDSchedulerFactory as WDSF class DDPType(str, Enum): @@ -92,6 +95,7 @@ def __init__( device=None, metrics=None, lrsched=None, + 
wdsched=None, loggers=None, ddp=False, ddp_type="ddp", @@ -121,6 +125,7 @@ self.exp_path = Path(exp_path) self.optim = optim self.lrsched = lrsched + self.wdsched = wdsched if loggers is None: self.loggers = self._default_loggers( @@ -237,6 +242,7 @@ def prepare_models_for_training(self): self.model, self.optimizer, self.lr_scheduler, + self.wd_scheduler, self.grad_scaler, self.swa_model, self.swa_scheduler, @@ -244,6 +250,7 @@ self.model, self.optim, self.lrsched, + self.wdsched, self.device, self.use_amp, self.ddp, @@ -265,6 +272,7 @@ def _prepare_model_for_training( model, optim, lrsched, + wdsched, device, use_amp, ddp, @@ -318,6 +326,9 @@ # make the learning rate scheduler lr_scheduler = self._make_lr_sched(lrsched, optimizer) + # make weight decay scheduler if needed + wd_scheduler = self._make_wd_sched(wdsched, optimizer) + if use_amp: if ddp and ddp_type != DDPType.DDP: if self.rank == 0: @@ -342,7 +353,15 @@ optimizer, swa_lr=swa_lr, anneal_epochs=swa_anneal_epochs ) - return model, optimizer, lr_scheduler, grad_scaler, swa_model, swa_scheduler + return ( + model, + optimizer, + lr_scheduler, + wd_scheduler, + grad_scaler, + swa_model, + swa_scheduler, + ) def set_epoch(self, data_loader, cur_batch: int = 0): try: @@ -378,6 +397,9 @@ def fit(self, train_data, val_data=None): epoch_updates = int(len(train_data) / self.grad_acc_steps) self.lr_scheduler.on_epoch_begin(epoch, epoch_updates=epoch_updates) + if self.wd_scheduler is not None: + self.wd_scheduler.on_epoch_begin(epoch) + logs = self.train_epoch(train_data) self.cur_batch = 0 if val_data is not None: @@ -395,6 +417,8 @@ else: if self.lr_scheduler is not None: self.lr_scheduler.on_epoch_end(logs) + if self.wd_scheduler is not None: + self.wd_scheduler.on_epoch_end() self.save_checkpoint(logs) @@ -463,6 +487,7 @@ def train_epoch(self, data_loader): logs = ODict(("train_" + k, v) for k, v in logs.items()) lrs = self._get_lrs() logs.update(lrs) + logs.update(self._get_wds()) return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -502,7 +527,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): def bn_update_epoch(self, data_loader): logs = self.validation_epoch(data_loader, swa_update_bn=True) - logs["lr"] = self._get_lr() + logs.update(self._get_lrs()) return logs def _clip_grad_norm(self, model, optim, grad_clip, grad_clip_norm): @@ -597,10 +622,22 @@ def _make_lr_sched(self, lr_sched, optim): assert isinstance(lr_sched, dict) args = LRSF.filter_args(**lr_sched) if self.rank == 0: - logging.info("lr scheduler args={}".format(args)) + logging.info(f"lr scheduler args={args}") lr_sched = LRSF.create(optim, **args) return lr_sched + def _make_wd_sched(self, wd_sched, optim): + """Makes a weight decay scheduler object.""" + if wd_sched is None or isinstance(wd_sched, WDS): + return wd_sched + + assert isinstance(wd_sched, dict) + args = WDSF.filter_args(**wd_sched) + if self.rank == 0: + logging.info(f"wd scheduler args={args}") + wd_sched = WDSF.create(optim, **args) + return wd_sched + def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): """Creates the default data loaders""" prog_log = ProgLogger(interval=log_interval) @@ -634,6 +671,27 @@ def _get_lrs(self): return lrs + def _get_wd(self): + """Returns the current weight decay to show in the loggers""" + wds = [ + param_group["weight_decay"] for param_group
in self.optimizer.param_groups + ] + return max(wds) + + def _get_wds(self, if_scheduler=True): + """Returns the current weight decays of all param groups to show in the loggers""" + if if_scheduler and self.wd_scheduler is None: + return {} + + wds = { + f"wd_{i}": param_group["weight_decay"] + for i, param_group in enumerate(self.optimizer.param_groups) + } + if len(wds) == 1: + wds["wd"] = wds.pop("wd_0") + + return wds + def _compute_grad_acc_steps(self, data_loader): if self.eff_batch_size is None: return @@ -690,6 +748,9 @@ def checkpoint(self, logs=None): if self.lr_scheduler is not None: checkpoint["lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() + if self.wd_scheduler is not None: + checkpoint["wd_scheduler_state_dict"] = self.wd_scheduler.state_dict() + if logs is not None: checkpoint["logs"] = logs @@ -705,7 +766,7 @@ def save_partial_checkpoint(self): and self.global_step % self.save_interval_steps == 0 ) - def new_save_checkpoint(self, logs=None, partial: bool = False): + def save_checkpoint(self, logs=None, partial: bool = False): """Saves a checkpoint of the training status Args: @@ -735,18 +796,19 @@ def save_model_checkpoint( self, model_name: str, checkpoint: Dict[str, Any], partial: bool = False ): if partial: - file_path = "%s/%s_ep%04d_step%08d.pth" % ( - model_name, + file_path = "%s/%s_ep%04d_step%010d.pth" % ( self.exp_path, + model_name, self.cur_epoch, self.global_step, ) else: - file_path = "%s/%s_ep%04d.pth" % (model_name, self.exp_path, self.cur_epoch) + file_path = "%s/%s_ep%04d.pth" % (self.exp_path, model_name, self.cur_epoch) + logging.info("saving %s to %s", model_name, file_path) torch.save(checkpoint, file_path) - def save_checkpoint(self, logs=None, partial: bool = False): + def old_save_checkpoint(self, logs=None, partial: bool = False): """Saves a checkpoint of the training status Args: @@ -824,6 +886,8 @@ def _load_checkpoint(self, checkpoint): self.loss.load_state_dict(checkpoint["loss_state_dict"]) if self.lr_scheduler is not None: self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"]) + if self.wd_scheduler is not None: + self.wd_scheduler.load_state_dict(checkpoint["wd_scheduler_state_dict"]) if "global_step" in checkpoint: self.global_step = checkpoint["global_step"] @@ -859,6 +923,51 @@ def _load_checkpoint(self, checkpoint): return logs + def find_last_checkpoint(self, model_name="model"): + """finds the last checkpoint epoch and step in the experiment dir""" + last_epoch = 0 + last_step = 0 + file_pattern = "%s/%s_ep[0-9]*.pth" % (self.exp_path, model_name) + file_paths = sorted(glob.glob(file_pattern)) + if len(file_paths) > 0: + last_epoch = int(re.search(r"ep[0-9]*", file_paths[-1])[0][2:]) + + file_pattern = "%s/%s_ep%04d_step[0-9]*.pth" % ( + self.exp_path, + model_name, + last_epoch, + ) + file_paths = sorted(glob.glob(file_pattern)) + if len(file_paths) > 0: + last_step = int(re.search(r"step[0-9]*", file_paths[-1])[0][4:]) + + return last_epoch, last_step + + def load_last_checkpoint(self): + """Loads the last training checkpoint in the experiment dir.""" + last_epoch, last_step = self.find_last_checkpoint() + if last_epoch > 0 or last_step > 0: + return self.new_load_checkpoint(last_epoch, last_step) + + return None + + def load_model_checkpoint(self, model_name="model", epoch=0, step=0): + if step == 0: + file_path = "%s/%s_ep%04d.pth" % (self.exp_path, model_name, epoch) + else: + file_path = "%s/%s_ep%04d_step%010d.pth" % ( + self.exp_path, + model_name, + epoch, + step, + ) + logging.info("loading %s from
%s", model_name, file_path) + return torch.load(file_path, map_location=torch.device("cpu")) + + def new_load_checkpoint(self, epoch, step): + checkpoint = self.load_model_checkpoint("model", epoch, step) + return self._load_checkpoint(checkpoint) + def load_checkpoint(self, file_path): """Loads a training checkpoint from file. @@ -868,7 +977,7 @@ def load_checkpoint(self, file_path): checkpoint = torch.load(file_path, map_location=torch.device("cpu")) return self._load_checkpoint(checkpoint) - def load_last_checkpoint(self): + def old_load_last_checkpoint(self): """Loads the last training checkpoint in the experiment dir.""" for epoch in range(self.epochs, 0, -1): file_path = Path("%s/model_ep%04d.pth" % (self.exp_path, epoch)) @@ -924,11 +1033,14 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): if "lrsched" not in skip: LRSF.add_class_args(parser, prefix="lrsched") + if "wdsched" not in skip: + WDSF.add_class_args(parser, prefix="wdsched") + parser.add_argument( "--grad-acc-steps", type=int, default=1, - help="gradient accumulation batches before weigth update", + help="gradient accumulation batches before weight update", ) parser.add_argument( "--eff-batch-size", diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 541dc126..1d4665cf 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -62,6 +62,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index 72942506..79526122 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -62,6 +62,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index c89cfd9a..ff3f85cc 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -62,6 +62,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index 7d82dde2..4ec04fde 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -62,6 +62,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 3943a681..e19945d1 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -69,6 +69,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 522d7e0b..ad6a3262 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -71,6 +71,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 
02c48577..a9000f38 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -63,6 +63,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", @@ -143,4 +144,5 @@ def train_epoch(self, data_loader): logs = ODict(("train_" + k, v) for k, v in logs.items()) lrs = self._get_lrs() logs.update(lrs) + logs.update(self._get_wds()) return logs diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index d4a34abc..d80f03f1 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -73,6 +73,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 041a1ea7..cf956dc7 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -74,6 +74,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 2d6b5514..89c9b9a7 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -63,6 +63,7 @@ def __init__( device=None, metrics=None, lrsched=None, + wdsched=None, loggers=None, ddp=False, ddp_type="ddp", diff --git a/hyperion/torch/wd_schedulers/__init__.py b/hyperion/torch/wd_schedulers/__init__.py new file mode 100644 index 00000000..d8440b12 --- /dev/null +++ b/hyperion/torch/wd_schedulers/__init__.py @@ -0,0 +1,9 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +from .cos_wd import CosineWD +from .factory import WDSchedulerFactory +from .wd_scheduler import WDScheduler diff --git a/hyperion/torch/wd_schedulers/cos_wd.py b/hyperion/torch/wd_schedulers/cos_wd.py new file mode 100644 index 00000000..563e4353 --- /dev/null +++ b/hyperion/torch/wd_schedulers/cos_wd.py @@ -0,0 +1,50 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +import logging +import math + +import torch + +from .wd_scheduler import WDScheduler + + +class CosineWD(WDScheduler): + r"""Set the weight decay of each parameter group using a cosine schedule. + + Attributes: + optimizer: Pytorch optimizer object. + initial_wd: initial value of the weight decay. + warmup_steps: number of warm up steps to get the weight decay to its final value. + epoch: initial training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. + update_wd_on_opt_step: if True, updates the weight decay each time we update the model, + otherwise after each epoch.
+ """ + + def __init__( + self, + optimizer, + initial_wd=0, + warmup_steps=0, + epoch=0, + step=0, + update_wd_on_opt_step=False, + ): + super().__init__( + optimizer, initial_wd, warmup_steps, epoch, step, update_wd_on_opt_step + ) + + def get_wd(self, step): + if step >= self.warmup_steps: + return self.final_wds + + r = math.pi / self.warmup_steps + return [ + final_wd + (init_wd - final_wd) * (1 + math.cos(r * step)) / 2 + for init_wd, final_wd in zip(self.initial_wds, self.final_wds) + ] diff --git a/hyperion/torch/wd_schedulers/factory.py b/hyperion/torch/wd_schedulers/factory.py new file mode 100644 index 00000000..3820daa2 --- /dev/null +++ b/hyperion/torch/wd_schedulers/factory.py @@ -0,0 +1,89 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args +from .cos_wd import CosineWD + + +class WDSchedulerFactory: + def create( + optimizer, + wdsch_type, + initial_wd=None, + warmup_steps=0, + update_wd_on_opt_step=False, + ): + """Creates a weight decay scheduler object. + + Args: + optimizer: Pytorch optimizer object. + wdsched_type: type of scheduler in ["none", "cos_wd"]. + initial_wd: inital value of weight decay + warmup_steps: steps until reaching final weight decay + update_wd_on_opt_step: if True, updates the wd each time we update the model, + otherwise after each epoch. + """ + + if wdsch_type == "none": + return None + + if wdsch_type == "cos_lr": + return CosineWD( + optimizer, + initial_wd=initial_wd, + warmup_steps=warmup_steps, + update_wd_on_opt_step=update_wd_on_opt_step, + ) + + raise ValueError(f"invalid wdsch_type={wdsch_type}") + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(WDSchedulerFactory.create, kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--wdsch-type", + type=str.lower, + default="none", + choices=[ + "none", + "cos_wd", + ], + help=("weight decay schedulers: None," "Cosine Annealing."), + ) + + parser.add_argument( + "--initial-wd", + default=None, + type=float, + help=( + "Initial value of weight decay, it is expected to be lower than final value." + ), + ) + + parser.add_argument( + "--warmup-steps", + default=0, + type=int, + help=("Number of steps to reach the final value of weight decay"), + ) + + parser.add_argument( + "--update-wd-on-opt-step", + default=False, + action=ActionYesNo, + help=("Update weight decay based on batch number instead of epoch number"), + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/wd_schedulers/wd_scheduler.py b/hyperion/torch/wd_schedulers/wd_scheduler.py new file mode 100644 index 00000000..a3059edc --- /dev/null +++ b/hyperion/torch/wd_schedulers/wd_scheduler.py @@ -0,0 +1,120 @@ +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import torch +import torch.optim as optim + + +class WDScheduler: + """Base class for weight decay schedulers. + + Attributes: + optimizer: Pytorch optimizer object. + initial_wd: initial value of the weight decay. + warmup_steps: number of warm up steps to get the the weight decay to its final value. 
+ epoch: initial training epoch, this is needed to restart the model + training. + step: initial training step, this is needed to restart the model training. + update_wd_on_opt_step: if True, updates the weight decay each time we update the model, + otherwise after each epoch. + """ + + def __init__( + self, + optimizer, + initial_wd=0, + warmup_steps=0, + epoch=0, + step=0, + update_wd_on_opt_step=False, + ): + if not isinstance(optimizer, optim.Optimizer): + raise TypeError("%s is not an Optimizer" % (type(optimizer).__name__)) + self.optimizer = optimizer + + if epoch == 0: + for group in optimizer.param_groups: + group.setdefault("final_wd", group["weight_decay"]) + else: + for i, group in enumerate(optimizer.param_groups): + if "final_wd" not in group: + raise KeyError( + "param 'final_wd' is not specified " + "in param_groups[{}] when resuming an optimizer".format(i) + ) + + self.final_wds = list( + map(lambda group: group["final_wd"], optimizer.param_groups) + ) + + if isinstance(initial_wd, list) or isinstance(initial_wd, tuple): + if len(initial_wd) != len(optimizer.param_groups): + raise ValueError( + "expected {} initial_wds, got {}".format( + len(optimizer.param_groups), len(initial_wd) + ) + ) + self.initial_wds = list(initial_wd) + else: + max_wd = max([group["final_wd"] for group in optimizer.param_groups]) + self.initial_wds = [ + initial_wd * group["final_wd"] / max_wd + for group in optimizer.param_groups + ] + + self.warmup_steps = warmup_steps + self.epoch = epoch + self.step = step + self.update_wd_on_opt_step = update_wd_on_opt_step + + @property + def in_warmup(self): + return self.step < self.warmup_steps + + def state_dict(self): + """Returns the state of the scheduler as a :class:`dict`. + + It contains an entry for every variable in self.__dict__ which + is not the optimizer. + """ + return { + key: value for key, value in self.__dict__.items() if key != "optimizer" + } + + def load_state_dict(self, state_dict): + """Loads the scheduler state. + + Arguments: + state_dict (dict): scheduler state. Should be an object returned + from a call to :meth:`state_dict`.
+ """ + self.__dict__.update(state_dict) + + def get_wd(self): + raise NotImplementedError + + def on_epoch_begin(self, epoch=None, **kwargs): + if epoch is not None: + self.epoch = epoch + + if self.update_wd_on_opt_step: + return + + for param_group, wd in zip( + self.optimizer.param_groups, self.get_wd(self.epoch) + ): + param_group["weight_decay"] = wd + + def on_epoch_end(self, metrics=None): + self.epoch += 1 + + def on_opt_step(self): + if self.update_wd_on_opt_step: + for param_group, wd in zip( + self.optimizer.param_groups, self.get_wd(self.step) + ): + param_group["weight_decay"] = wd + + self.step += 1 From 249281957a5c6231b4c04d8c62c9cd1189ef617d Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 10 Jan 2024 07:08:10 -0500 Subject: [PATCH 126/154] added clustering to dino --- .../ssl.v1/conf/teacher_reverb_noise_aug.yaml | 26 ++ ...n_006_extract_dino_embeds_cluster_eval.sh} | 82 +++- egs/voxceleb/ssl.v1/run_007_train_xvector.sh | 90 +++++ ...train_ecapatdnn512x3_xvec_stage1_v3.1.yaml | 99 +++++ ...train_ecapatdnn512x3_xvec_stage2_v3.1.yaml | 74 ++++ .../train_lresnet34_xvec_stage1_v3.1.yaml | 75 ++++ hyperion/bin/cluster_embeddings.py | 362 ++++++++++++++++++ hyperion/bin/eval_plda_backend.py | 232 +++++++++++ hyperion/bin/train_plda.py | 161 ++++++++ hyperion/helpers/plda_factory.py | 83 ++-- hyperion/np/clustering/__init__.py | 3 +- hyperion/np/clustering/kmeans.py | 184 ++++++++- hyperion/np/clustering/spectral_clustering.py | 312 +++++++++++++++ hyperion/np/np_model.py | 19 + hyperion/np/pdfs/mixtures/gmm_diag_cov.py | 8 +- hyperion/np/pdfs/plda/__init__.py | 3 +- hyperion/np/pdfs/plda/factory.py | 204 ++++++++++ hyperion/np/pdfs/plda/frplda.py | 5 +- hyperion/np/pdfs/plda/plda.py | 15 +- hyperion/np/pdfs/plda/plda_base.py | 61 ++- hyperion/np/pdfs/plda/splda.py | 13 +- hyperion/np/preprocessing/__init__.py | 6 + hyperion/np/preprocessing/resampler.py | 46 +++ hyperion/np/transforms/lda.py | 36 ++ hyperion/np/transforms/pca.py | 26 +- hyperion/torch/data/audio_dataset.py | 50 +-- hyperion/torch/layers/global_pool.py | 26 +- hyperion/torch/narchs/dino_head.py | 73 +--- hyperion/torch/narchs/proj_head.py | 28 +- .../torch/trainers/dino_xvector_trainer.py | 6 +- hyperion/torch/trainers/torch_trainer.py | 12 +- hyperion/torch/wd_schedulers/factory.py | 2 +- hyperion/torch/wd_schedulers/wd_scheduler.py | 4 + 33 files changed, 2203 insertions(+), 223 deletions(-) create mode 100644 egs/voxceleb/ssl.v1/conf/teacher_reverb_noise_aug.yaml rename egs/voxceleb/ssl.v1/{run_006_extract_dino_embeds.sh => run_006_extract_dino_embeds_cluster_eval.sh} (60%) create mode 100755 egs/voxceleb/ssl.v1/run_007_train_xvector.sh create mode 100644 egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage1_v3.1.yaml create mode 100644 hyperion/bin/cluster_embeddings.py create mode 100755 hyperion/bin/eval_plda_backend.py create mode 100644 hyperion/bin/train_plda.py create mode 100644 hyperion/np/clustering/spectral_clustering.py create mode 100644 hyperion/np/pdfs/plda/factory.py create mode 100644 hyperion/np/preprocessing/__init__.py create mode 100644 hyperion/np/preprocessing/resampler.py diff --git a/egs/voxceleb/ssl.v1/conf/teacher_reverb_noise_aug.yaml b/egs/voxceleb/ssl.v1/conf/teacher_reverb_noise_aug.yaml new file mode 100644 index 00000000..6c2fecc0 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/teacher_reverb_noise_aug.yaml 
@@ -0,0 +1,26 @@ +reverb_aug: + reverb_prob: 0.3 + max_reverb_context: 0.5 + rir_types: + smallroom: + weight: 1 + rir_path: csv:data/rirs_smallroom/rirs.csv + rir_norm: max +noise_aug: + noise_prob: 0.7 + noise_types: + noise: + weight: 1 + noise_path: data/musan_noise_proc_audio/recordings.csv + min_snr: 10 + max_snr: 28 + music: + weight: 1 + noise_path: data/musan_music_proc_audio/recordings.csv + min_snr: 13 + max_snr: 28 + babble: + weight: 1 + noise_path: data/musan_speech_babble/recordings.csv + min_snr: 13 + max_snr: 28 diff --git a/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds.sh b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh similarity index 60% rename from egs/voxceleb/ssl.v1/run_006_extract_dino_embeds.sh rename to egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh index 36ccd294..8973483c 100755 --- a/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds.sh +++ b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh @@ -30,25 +30,14 @@ if [ $nnet_stage -eq 1 ];then elif [ $nnet_stage -eq 2 ];then nnet=$nnet_s2 nnet_name=$nnet_s2_name -elif [ $nnet_stage -eq 3 ];then - nnet=$nnet_s3 - nnet_name=$nnet_s3_name -elif [ $nnet_stage -eq 4 ];then - nnet=$nnet_s4 - nnet_name=$nnet_s4_name -elif [ $nnet_stage -eq 5 ];then - nnet=$nnet_s5 - nnet_name=$nnet_s5_name -elif [ $nnet_stage -eq 6 ];then - nnet=$nnet_s6 - nnet_name=$nnet_s6_name fi xvector_dir=exp/xvectors/$nnet_name score_dir=exp/scores/$nnet_name score_cosine_dir=$score_dir/cosine +score_plda_dir=$score_dir/${cluster_name}_plda -if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qmf" == "true" || "$do_pca" == "true") ]]; then +if [ $stage -le 1 ]; then # Extract xvectors for training LDA/PLDA nj=100 for name in voxceleb2cat_train @@ -63,7 +52,7 @@ if [[ $stage -le 1 && ( "$do_plda" == "true" || "$do_snorm" == "true" || "$do_qm hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ - --random-utt-length --min-utt-length 2 --max-utt-length 30 \ + --random-utt-length --min-utt-length 30 --max-utt-length 30 \ --model-path $nnet \ --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv hyperion-tables cat \ @@ -135,5 +124,70 @@ if [ $stage -le 3 ];then --output-file $score_cosine_dir/voxceleb1_results.csv cat $score_cosine_dir/voxceleb1_results.csv + exit fi + +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name +if [ $stage -le 4 ];then + echo "Cluster Vox2" + mkdir -p $cluster_dir + $train_cmd --mem 50G --num-threads 32 $cluster_dir/clustering.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV \ + hyperion-cluster-embeddings $cluster_method --cfg $cluster_cfg \ + --segments-file data/voxceleb2cat_train_xvector_train/segments.csv \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --output-file $cluster_dir/voxceleb2cat_train_xvector_train/segments.csv +fi + +if [ $stage -le 5 ];then + echo "Train PLDA" + $train_cmd $cluster_dir/plda.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV \ + hyperion-train-plda --cfg $plda_cfg \ + --segments-file $cluster_dir/voxceleb2cat_train_xvector_train/segments.csv \ + --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --preproc-file $cluster_dir/plda/preproc.h5 \ + --plda-file $cluster_dir/plda/plda.h5 + + +fi + +if [ $stage -le 6 ];then + + echo "Eval Voxceleb 1 with PLDA" + num_parts=8 + for((i=1;i<=$num_parts;i++)); + do + for((j=1;j<=$num_parts;j++)); + do + $train_cmd 
$score_plda_dir/log/voxceleb1_${i}_${j}.log \ + hyp_utils/conda_env.sh \ + hyperion-eval-plda-backend \ + --feats-file csv:$xvector_dir/voxceleb1_test/xvector.csv \ + --ndx-file data/voxceleb1_test/trials.csv \ + --enroll-map-file data/voxceleb1_test/enrollment.csv \ + --score-file $score_plda_dir/voxceleb1_scores.csv \ + --preproc-file $cluster_dir/plda/preproc.h5 \ + --plda-file $cluster_dir/plda/plda.h5 \ + --enroll-part-idx $i --num-enroll-parts $num_parts \ + --test-part-idx $j --num-test-parts $num_parts & + done + done + wait + hyperion-merge-scores --output-file $score_plda_dir/voxceleb1_scores.csv \ + --num-enroll-parts $num_parts --num-test-parts $num_parts + + $train_cmd --mem 12G --num-threads 6 $score_plda_dir/log/score_voxceleb1.log \ + hyperion-eval-verification-metrics \ + --score-files $score_plda_dir/voxceleb1_scores.csv \ + --key-files data/voxceleb1_test/trials_{o,e,h}.csv \ + --score-names voxceleb1 \ + --key-names O E H \ + --sparse \ + --output-file $score_plda_dir/voxceleb1_results.csv + + cat $score_plda_dir/voxceleb1_results.csv + exit +fi +exit diff --git a/egs/voxceleb/ssl.v1/run_007_train_xvector.sh b/egs/voxceleb/ssl.v1/run_007_train_xvector.sh new file mode 100755 index 00000000..40aceb07 --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_007_train_xvector.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +train_data_dir=data/${nnet_data}_xvector_train +val_data_dir=data/${nnet_data}_xvector_val + +#add extra args from the command line arguments +if [ -n "$num_workers" ];then + extra_args="--data.train.data_loader.num-workers $num_workers" +fi +if [ "$use_tb" == "true" ];then + extra_args="$extra_args --trainer.use-tensorboard" +fi +if [ "$use_wandb" == "true" ];then + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" +fi + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +xvector_dir=exp/xvectors/$nnet_s1_name/voxceleb2cat_train +output_dir=exp/clustering/$nnet_s1_name/$cluster_method/voxceleb2cat_train_xvector_train +if [ $stage -le 1 ];then + mkdir -p $output_dir + $train_cmd --mem 50G --num-threads 32 $output_dir/clustering.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV \ + hyperion-cluster-embeddings $cluster_method --cfg $cluster_cfg \ + --segments-file data/voxceleb2cat_train_xvector_train/segments.csv \ + --feats-file csv:$xvector_dir/xvector.csv \ + --output-file $output_dir/segments.csv +fi +exit +# Network Training +if [ $stage -le 2 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir \ + --num-gpus $ngpu \ + +fi + + +# Large Margin Fine-tuning +if [ $stage -le 2 ]; then + if [ "$use_wandb" == "true" ];then 
+ extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" + fi + mkdir -p $nnet_s2_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s2_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ + --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ + --data.train.dataset.segments-file $train_data_dir/segments.csv \ + --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ + --data.val.dataset.segments-file $val_data_dir/segments.csv \ + --in-model-file $nnet_s1 \ + --trainer.exp-path $nnet_s2_dir \ + --num-gpus $ngpu \ + +fi diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..958c6237 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml @@ -0,0 +1,99 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.0 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 40 + eff_batch_size: 256 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..c19546e8 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 6.0 + min_chunk_length: 6.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0. +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + swa_start: 31 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker + train_mode: full diff --git a/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..2244fd38 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage1_v3.1.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/hyperion/bin/cluster_embeddings.py b/hyperion/bin/cluster_embeddings.py new file mode 100644 index 00000000..998b1f17 --- /dev/null +++ b/hyperion/bin/cluster_embeddings.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python +""" + Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +import sys +import time +from pathlib import Path + +import numpy as np +import pandas as pd +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) +from scipy import sparse + +from hyperion.hyp_defs import config_logger +from hyperion.io import RandomAccessDataReaderFactory as DRF +from hyperion.np.clustering import AHC, KMeans, KMeansInitMethod, SpectralClustering +from hyperion.np.pdfs import DiagGMM +from 
hyperion.np.transforms import PCA, LNorm +from hyperion.utils import SegmentSet +from hyperion.utils.math_funcs import cosine_scoring + +subcommand_list = [ + "cos_ahc", + "spectral_clustering", +] + + +def add_common_args(parser): + parser.add_argument("--feats-file", required=True) + parser.add_argument("--segments-file", required=True) + parser.add_argument("--output-file", required=True) + parser.add_argument( + "--filter-by-gmm-post", + default=0, + type=float, + help="remove segments with gmm posterior lower than threshold", + ) + + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def load_data(segments_file, feats_file): + logging.info("loading data") + segments = SegmentSet.load(segments_file) + reader = DRF.create(feats_file) + x = reader.read(segments["id"], squeeze=True) + return segments, x + + +def do_pca(x, pca_args): + pca_var_r = pca_args["pca_var_r"] + logging.info("computing pca pca_var_r=%f", pca_var_r) + if pca_var_r < 1: + pca = PCA(**pca_args) + pca.fit(x) + x = pca(x) + logging.info("pca-dim=%d", x.shape[1]) + + return x + + +def do_kmeans(x, samples_per_cluster, epochs, rtol, init_method, num_workers): + if samples_per_cluster > 1: + km_clusters = x.shape[0] // samples_per_cluster + logging.info("kmeans with num_clusters=%d", km_clusters) + kmeans = KMeans( + num_clusters=km_clusters, + rtol=rtol, + epochs=epochs, + init_method=init_method, + num_workers=num_workers, + ) + kmeans.fit(x) + idx_km, _ = kmeans(x) + x_km = kmeans.mu + del kmeans + else: + idx_km = None + x_km = x + + return x_km, idx_km + + +def get_gmm_post(x, y): + logging.info("computing cluster posteriors with gmm") + num_comp = np.max(y) + 1 + gmm = DiagGMM(num_comp=num_comp, x_dim=x.shape[1], min_N=1) + u_dim = gmm.compute_suff_stats(x[:1]).shape[1] + N = np.zeros((num_comp,), dtype=float) + 1e-5 + u_x = np.zeros((num_comp, u_dim), dtype=float) + + for c in range(num_comp): + mask = y == c + N_c = np.sum(mask) + if N_c == 0: + continue + + N[c] = N_c + u_x_c = gmm.compute_suff_stats(x[mask]) + u_x[c] = np.sum(u_x_c, axis=0) + + gmm.Mstep(N, u_x) + p = gmm.compute_pz(x, mode="std") + p_max = p[np.arange(x.shape[0]), y] + zz = p_max < 0.5 + print(np.mean(p[zz]), np.max(p[zz]), p_max[zz]) + p_2nd = np.sort(p, axis=1, kind="heapsort")[:, -2] + return p_max, p_2nd + + +def cos_ahc( + segments_file, + feats_file, + output_file, + lnorm, + pca, + linkage_method, + stop_criterion, + num_clusters, + threshold, + ahc_precision, + pre_kmeans, + num_workers, + filter_by_gmm_post, +): + segments, x = load_data(segments_file, feats_file) + if lnorm: + x = LNorm()(x) + + x = do_pca(x, pca) + x_km, idx_km = do_kmeans(x, num_workers=num_workers, **pre_kmeans) + + logging.info("compute affinity matrix") + if ahc_precision == "single": + x_lowprec = x_km.astype(np.float32) + elif ahc_precision == "half": + x_lowprec = x_km.astype(np.float16) + else: + x_lowprec = x_km + + scores = cosine_scoring(x_lowprec, x_lowprec) + + logging.info("running AHC") + ahc = AHC(method=linkage_method) + ahc.fit(scores) + if stop_criterion == "threshold": + y = ahc.get_flat_clusters_from_thr(threshold) + else: + y = ahc.get_flat_clusters_from_num_clusters(num_clusters) + + del ahc + if idx_km is not None: + y = y[idx_km] + del x_km + + p_max, p_2nd = get_gmm_post(x, y) + segments["cluster"] = y + segments["post_cluster"] = p_max + segments["post_cluster_2nd"] = p_2nd + if filter_by_gmm_post > 0: + idx = segments["post_cluster"] > filter_by_gmm_post + segments = 
+def make_cos_ahc_parser():
+    parser = ArgumentParser()
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    add_common_args(parser)
+    parser.add_argument("--lnorm", default=False, action=ActionYesNo)
+    PCA.add_class_args(parser, prefix="pca")
+    parser.add_argument(
+        "--linkage-method",
+        default="average",
+        choices=["single", "complete", "average", "weighted", "ward"],
+        help="linkage method",
+    )
+    parser.add_argument(
+        "--stop-criterion",
+        default="threshold",
+        choices=["threshold", "num_clusters"],
+        help="stopping criterion",
+    )
+    parser.add_argument(
+        "--num-clusters", default=None, type=int, help="number of AHC clusters"
+    )
+    parser.add_argument("--threshold", default=0, type=float, help="stopping threshold")
+    parser.add_argument(
+        "--ahc-precision", default="single", choices=["half", "single", "double"]
+    )
+    parser.add_argument(
+        "--pre_kmeans.samples-per-cluster",
+        default=1,
+        type=int,
+        help="first k-means is done to reduce the computing cost of AHC",
+    )
+    parser.add_argument(
+        "--pre_kmeans.init_method",
+        default=KMeansInitMethod.max_dist,
+        choices=KMeansInitMethod.choices(),
+    )
+    parser.add_argument("--pre_kmeans.epochs", default=100, type=int)
+    parser.add_argument("--pre_kmeans.rtol", default=0.001, type=float)
+    parser.add_argument("--num_workers", default=1, type=int)
+    return parser
+
+
+def compute_sc_affinity(x, aff_func, gauss_sigma, aff_thr, precision):
+    if precision == "single":
+        x = x.astype(np.float32)
+    elif precision == "half":
+        x = x.astype(np.float16)
+
+    scores = cosine_scoring(x, x)
+    if aff_func == "gauss_cos":
+        assert gauss_sigma > 0
+        d2 = 1 - scores
+        scores = np.exp(-d2 / gauss_sigma)
+
+    assert aff_thr < 1
+    scores[scores < aff_thr] = 0
+    num_nodes = scores.shape[0]
+    scores.flat[:: num_nodes + 1] = 0
+    aff_size = num_nodes**2
+    num_edges = np.sum(scores > 0)
+    r = aff_size / num_edges
+    logging.info("num_nodes^2=%d, num_edges=%d r=%f", aff_size, num_edges, r)
+    if r > 4:
+        scores = sparse.csr_matrix(scores)
+    return scores
+
+
+def spectral_clustering(
+    segments_file,
+    feats_file,
+    output_file,
+    lnorm,
+    pca,
+    pre_kmeans,
+    affinity,
+    spectral_clustering,
+    filter_by_gmm_post,
+):
+    segments, x = load_data(segments_file, feats_file)
+    if lnorm:
+        x = LNorm()(x)
+
+    x = do_pca(x, pca)
+    x_km, idx_km = do_kmeans(x, **pre_kmeans)
+    A = compute_sc_affinity(x_km, **affinity)
+    sc = SpectralClustering(**spectral_clustering)
+    y, num_clusters, eigengap_stats = sc.fit(A)
+    if idx_km is not None:
+        y = y[idx_km]
+        del x_km
+
+    segments["cluster"] = y
+    if num_clusters > 1:
+        p_max, p_2nd = get_gmm_post(x, y)
+        segments["post_cluster"] = p_max
+        segments["post_cluster_2nd"] = p_2nd
+
+        if filter_by_gmm_post > 0:
+            idx = segments["post_cluster"] > filter_by_gmm_post
+            segments = SegmentSet(segments.loc[idx])
+
+    segments.save(output_file)
+    output_file = Path(output_file)
+    fig_file = output_file.with_stem(output_file.stem + "_eigengap").with_suffix(".png")
+    sc.plot_eigengap_stats(eigengap_stats, num_clusters, fig_file)
+
+    df_eig = pd.DataFrame(
+        {k: eigengap_stats[k] for k in ["eig_vals", "eigengap", "d_eig_vals"]}
+    )
+    df_eig["num_clusters"] = np.arange(1, len(df_eig) + 1)
+    eig_file = fig_file.with_suffix(".csv")
+    df_eig.to_csv(eig_file, index=False)
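compute_sc_affinity switches the thresholded affinity to a CSR matrix when the graph is sparse enough (more than three quarters of the entries zero, i.e. r > 4), which cuts memory and lets a sparse eigensolver handle the Laplacian. A toy illustration of the density test, with a synthetic matrix and an illustrative threshold:

import numpy as np
from scipy import sparse

rng = np.random.default_rng(0)
A = rng.random((1000, 1000))
A[A < 0.9] = 0.0                   # keep roughly 10% of the edges

density = np.sum(A > 0) / A.size   # ~0.1, so r = 1/density ~ 10 > 4
A_sp = sparse.csr_matrix(A)        # stores only the non-zeros
print(density, A_sp.nnz)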
+def make_spectral_clustering_parser():
+    parser = ArgumentParser()
+    parser.add_argument("--cfg", action=ActionConfigFile)
+    add_common_args(parser)
+    parser.add_argument("--lnorm", default=False, action=ActionYesNo)
+    PCA.add_class_args(parser, prefix="pca")
+    parser.add_argument(
+        "--pre_kmeans.samples-per-cluster",
+        default=1,
+        type=int,
+        help="first k-means is done to reduce the computing cost of AHC",
+    )
+    parser.add_argument(
+        "--pre_kmeans.init_method",
+        default=KMeansInitMethod.max_dist,
+        choices=KMeansInitMethod.choices(),
+    )
+    parser.add_argument("--pre_kmeans.epochs", default=100, type=int)
+    parser.add_argument("--pre_kmeans.rtol", default=0.001, type=float)
+    parser.add_argument("--pre_kmeans.num_workers", default=1, type=int)
+    parser.add_argument(
+        "--affinity.aff_func", default="cos", choices=["cos", "gauss_cos"]
+    )
+    parser.add_argument(
+        "--affinity.gauss-sigma",
+        default=1,
+        type=float,
+        help="std. dev. of gauss function",
+    )
+    parser.add_argument(
+        "--affinity.aff-thr",
+        default=0,
+        type=float,
+        help="values under this are set to 0",
+    )
+    parser.add_argument(
+        "--affinity.precision", default="single", choices=["half", "single", "double"]
+    )
+    SpectralClustering.add_class_args(parser, prefix="spectral_clustering")
+
+    return parser
+
+
+def main():
+    parser = ArgumentParser(
+        description="Cluster embeddings into classes, usually speakers"
+    )
+    parser.add_argument("--cfg", action=ActionConfigFile)
+
+    subcommands = parser.add_subcommands()
+    for subcommand in subcommand_list:
+        parser_func = f"make_{subcommand}_parser"
+        subparser = globals()[parser_func]()
+        subcommands.add_subcommand(subcommand, subparser)
+
+    args = parser.parse_args()
+    subcommand = args.subcommand
+    kwargs = namespace_to_dict(args)[args.subcommand]
+    config_logger(kwargs["verbose"])
+    del kwargs["verbose"]
+    del kwargs["cfg"]
+    globals()[subcommand](**kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hyperion/bin/eval_plda_backend.py b/hyperion/bin/eval_plda_backend.py
new file mode 100755
index 00000000..2058b2cb
--- /dev/null
+++ b/hyperion/bin/eval_plda_backend.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python
+"""
+ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+import logging
+import time
+from pathlib import Path
+
+import numpy as np
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ArgumentParser,
+    namespace_to_dict,
+)
+
+from hyperion.hyp_defs import config_logger
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.np import NPModel
+from hyperion.np.pdfs import PLDAFactory, PLDALLRNvsMMethod
+from hyperion.np.score_norm import AdaptSNorm
+from hyperion.np.transforms import LNorm, TransformList
+from hyperion.utils import EnrollmentMap, SegmentSet, TrialKey, TrialNdx, TrialScores
+from hyperion.utils.math_funcs import cosine_scoring
+
+
+def load_trial_data(
+    enroll_map_file,
+    ndx_file,
+    enroll_feats_file,
+    feats_file,
+    enroll_part_idx,
+    num_enroll_parts,
+    test_part_idx,
+    num_test_parts,
+):
+    test_feats_reader = DRF.create(feats_file)
+    if enroll_feats_file is not None and enroll_feats_file != feats_file:
+        enroll_feats_reader = DRF.create(enroll_feats_file)
+    else:
+        enroll_feats_reader = test_feats_reader
+
+    enroll_map = EnrollmentMap.load(enroll_map_file)
+    try:
+        ndx = TrialNdx.load(ndx_file)
+    except:
+        ndx = TrialKey.load(ndx_file).to_ndx()
+
+    if num_enroll_parts > 1 or num_test_parts > 1:
+        ndx = ndx.split(
+            enroll_part_idx, num_enroll_parts, test_part_idx, num_test_parts
+        )
+
+    enroll_map = enroll_map.filter(items=ndx.model_set)
+    x_e = enroll_feats_reader.read(enroll_map["segmentid"], squeeze=True)
+    x_t = test_feats_reader.read(ndx.seg_set, squeeze=True)
+    return enroll_map, ndx, x_e, x_t
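main() above resolves both the subparser factory and the handler function by name through globals(), so adding a subcommand only requires adding its name to subcommand_list. A self-contained toy of the same jsonargparse pattern, with hypothetical names:

from jsonargparse import ArgumentParser, namespace_to_dict

def make_foo_parser():
    parser = ArgumentParser()
    parser.add_argument("--x", type=int, default=1)
    return parser

def foo(x):
    print("running foo with x =", x)

parser = ArgumentParser()
subcommands = parser.add_subcommands()
subcommands.add_subcommand("foo", make_foo_parser())

args = parser.parse_args(["foo", "--x", "3"])
kwargs = namespace_to_dict(args)[args.subcommand]
globals()[args.subcommand](**kwargs)  # dispatch by name, as in main()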
+
+
+def load_cohort_data(segments_file, feats_file):
+    segments = SegmentSet.load(segments_file)
+    feats_reader = DRF.create(feats_file)
+    x = feats_reader.read(segments["id"], squeeze=True)
+    return segments, x
+
+
+def eval_backend(
+    enroll_map_file,
+    ndx_file,
+    enroll_feats_file,
+    feats_file,
+    preproc_file,
+    plda_file,
+    llr_method,
+    score_file,
+    enroll_part_idx,
+    num_enroll_parts,
+    test_part_idx,
+    num_test_parts,
+    cohort_segments_file,
+    cohort_feats_file,
+    cohort_nbest,
+    avg_cohort_by,
+):
+    logging.info("loading data")
+    enroll_map, ndx, x_e, x_t = load_trial_data(
+        enroll_map_file,
+        ndx_file,
+        enroll_feats_file,
+        feats_file,
+        enroll_part_idx,
+        num_enroll_parts,
+        test_part_idx,
+        num_test_parts,
+    )
+    enroll_set, enroll_ids = np.unique(enroll_map["id"], return_inverse=True)
+    # Nvs1 scoring is needed when some model enrolls with more than one segment
+    is_Nvs1 = len(enroll_set) < len(enroll_ids)
+
+    t1 = time.time()
+
+    if preproc_file is not None:
+        logging.info("Loading Preprocessor")
+        preprocessor = TransformList.load(preproc_file)
+        x_e = preprocessor(x_e)
+        x_t = preprocessor(x_t)
+        if llr_method == PLDALLRNvsMMethod.vavg and isinstance(
+            preprocessor.transforms[-1], LNorm
+        ):
+            llr_method = PLDALLRNvsMMethod.lnorm_vavg
+
+    logging.info("Loading PLDA model")
+    plda_model = NPModel.auto_load(plda_file)
+    logging.info("computing score")
+    if is_Nvs1:
+        scores = plda_model.llr_Nvs1(x_e, x_t, ids1=enroll_ids, method=llr_method)
+    else:
+        scores = plda_model.llr_1vs1(x_e, x_t)
+
+    dt = time.time() - t1
+    num_trials = scores.shape[0] * scores.shape[1]
+    logging.info(
+        "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.",
+        dt,
+        dt / num_trials * 1000,
+    )
+
+    if cohort_segments_file is not None:
+        t1 = time.time()
+        cohort_segments, x_coh = load_cohort_data(
+            cohort_segments_file, cohort_feats_file
+        )
+        if preproc_file is not None:
+            x_coh = preprocessor(x_coh)
+
+        if avg_cohort_by is not None:
+            cohort_class = cohort_segments[avg_cohort_by]
+            _, cohort_ids = np.unique(cohort_class, return_inverse=True)
+        else:
+            cohort_ids = None
+
+        logging.info("computing enroll vs cohort")
+        scores_enr_coh = plda_model.llr_NvsM(
+            x_e, x_coh, ids1=enroll_ids, ids2=cohort_ids, method=llr_method
+        )
+        logging.info("computing cohort vs test")
+        scores_coh_test = plda_model.llr_Nvs1(
+            x_coh, x_t, ids1=cohort_ids, method=llr_method
+        )
+        snorm = AdaptSNorm(cohort_nbest)
+        scores = snorm(scores, scores_coh_test, scores_enr_coh)
+        dt = time.time() - t1
+        logging.info(
+            "s-norm elapsed time: %.2f s. elapsed time per trial: %.2f ms.",
+            dt,
+            dt / num_trials * 1000,
+        )
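AdaptSNorm implements adaptive s-norm: each trial score is z-normalized against the statistics of its nbest best-matching cohort scores on the enroll side and on the test side, then the two normalized scores are averaged. The sketch below is one common top-nbest variant written from scratch for illustration; hyperion's AdaptSNorm may select the cohort subset differently:

import numpy as np

def adapt_snorm(scores, scores_coh_test, scores_enr_coh, nbest=100, eps=1e-8):
    # scores: (n_enroll, n_test), scores_enr_coh: (n_enroll, n_coh),
    # scores_coh_test: (n_coh, n_test)
    coh_e = np.sort(scores_enr_coh, axis=1)[:, -nbest:]   # top cohort per model
    mu_e = coh_e.mean(axis=1, keepdims=True)
    s_e = coh_e.std(axis=1, keepdims=True) + eps
    coh_t = np.sort(scores_coh_test, axis=0)[-nbest:, :]  # top cohort per test seg
    mu_t = coh_t.mean(axis=0, keepdims=True)
    s_t = coh_t.std(axis=0, keepdims=True) + eps
    return 0.5 * ((scores - mu_e) / s_e + (scores - mu_t) / s_t)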
+
+    if num_enroll_parts > 1 or num_test_parts > 1:
+        score_file = Path(score_file)
+        new_suffix = f".{enroll_part_idx}.{test_part_idx}{score_file.suffix}"
+        score_file = score_file.with_suffix(new_suffix)
+
+    logging.info("saving scores to %s", score_file)
+    # sort score rows to match the ndx model_set order
+    sort_idx = [np.nonzero(enroll_set == e)[0][0] for e in ndx.model_set]
+    scores = scores[sort_idx]
+    scores = TrialScores(ndx.model_set, ndx.seg_set, scores, ndx.trial_mask)
+    scores.save(score_file)
+
+
+def main():
+    parser = ArgumentParser(description="Eval PLDA LLR with optional AS-Norm")
+
+    parser.add_argument("--enroll-feats-file", default=None)
+    parser.add_argument("--feats-file", required=True)
+    parser.add_argument("--ndx-file", required=True)
+    parser.add_argument("--enroll-map-file", required=True)
+    parser.add_argument("--preproc-file", default=None)
+    parser.add_argument("--plda-file", required=True)
+    parser.add_argument(
+        "--llr-method",
+        default=PLDALLRNvsMMethod.vavg,
+        choices=PLDALLRNvsMMethod.choices(),
+    )
+    parser.add_argument("--cohort-segments-file", default=None)
+    parser.add_argument("--cohort-feats-file", default=None)
+    parser.add_argument("--cohort-nbest", type=int, default=1000)
+    parser.add_argument(
+        "--avg-cohort-by",
+        default=None,
+        help="segments file column to average vectors from the same class",
+    )
+    parser.add_argument("--score-file", required=True)
+    parser.add_argument(
+        "--enroll-part-idx", default=1, type=int, help="enroll part index"
+    )
+    parser.add_argument(
+        "--num-enroll-parts",
+        default=1,
+        type=int,
+        help="""number of parts in which we divide the enroll
+                list to run evaluation in parallel""",
+    )
+    parser.add_argument("--test-part-idx", default=1, type=int, help="test part index")
+    parser.add_argument(
+        "--num-test-parts",
+        default=1,
+        type=int,
+        help="""number of parts in which we divide the test list
+                to run evaluation in parallel""",
+    )
+
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int
+    )
+
+    args = parser.parse_args()
+    config_logger(args.verbose)
+    del args.verbose
+    logging.debug(args)
+
+    eval_backend(**namespace_to_dict(args))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hyperion/bin/train_plda.py b/hyperion/bin/train_plda.py
new file mode 100644
index 00000000..b33afa31
--- /dev/null
+++ b/hyperion/bin/train_plda.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+"""
+ Copyright 2024 Johns Hopkins University (Author: Jesus Villalba)
+ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+"""
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from jsonargparse import (
+    ActionConfigFile,
+    ActionParser,
+    ActionYesNo,
+    ArgumentParser,
+    namespace_to_dict,
+)
+
+from hyperion.hyp_defs import config_logger
+from hyperion.io import RandomAccessDataReaderFactory as DRF
+from hyperion.np.pdfs import PLDAFactory
+from hyperion.np.transforms import LDA, PCA, CentWhiten, LNorm, TransformList
+from hyperion.utils import SegmentSet
+
+
+def load_data(segments_file, feats_file, class_name):
+    logging.info("loading data")
+    segments = SegmentSet.load(segments_file)
+    reader = DRF.create(feats_file)
+    x = reader.read(segments["id"], squeeze=True)
+    _, y = np.unique(segments[class_name], return_inverse=True)
+    return segments, x, y
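load_data maps the string speaker labels in the segments table to the integer class ids that PLDA training expects, via np.unique(..., return_inverse=True). For example:

import numpy as np

spk = np.array(["spk2", "spk0", "spk2", "spk1"])
classes, y = np.unique(spk, return_inverse=True)
print(classes)  # ['spk0' 'spk1' 'spk2']
print(y)        # [2 0 2 1] -> integer class ids aligned with the segments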
logging.info("computing pca pca_var_r=%f", pca_var_r) + pca = None + pca_lnorm = None + if pca_var_r < 1: + if pca_lnorm: + logging.info("LNorm before PCA") + pca_lnorm = LNorm(name="pca_lnorm") + x = pca_lnorm(x) + + pca = PCA(**pca_args) + pca.fit(x) + x = pca(x) + logging.info("pca-dim=%d", x.shape[1]) + + return x, pca_lnorm, pca + + +def train_plda( + segments_file, + feats_file, + class_name, + preproc_file, + plda_file, + pca, + lda, + plda, + pca_lnorm, + do_lda, + lda_lnorm, + plda_lnorm, + plda_center, + plda_whiten, +): + segments, x, y = load_data(segments_file, feats_file, class_name) + transform_list = [] + + x, pca_lnorm, pca_model = train_pca(x, pca_lnorm, pca) + if pca_lnorm is not None: + transform_list.append(pca_lnorm) + + if pca_model is not None: + transform_list.append(pca_model) + + if do_lda and x.shape[1] > lda["lda_dim"]: + if lda_lnorm: + logging.info("LNorm before LDA") + t = LNorm(name="lda_lnorm") + x = t(x) + transform_list.append(t) + + logging.info("Training LDA") + lda_model = LDA(**lda) + lda_model.fit(x, y) + x = lda_model(x) + transform_list.append(lda_model) + + if plda_center or plda_whiten: + if plda_lnorm: + t = LNorm(update_mu=plda_center, update_T=plda_whiten, name="plda_lnorm") + else: + t = CentWhiten(update_mu=plda_center, update_T=plda_whiten, name="plda_cw") + + logging.info("Training Center/Whiten/LNorm") + t.fit(x) + logging.info("Center/Whiten/LNorm before PLDA") + x = t(x) + transform_list.append(t) + elif plda_lnorm: + logging.info("LNorm before PLDA") + t = LNorm(name="plda_lnorm") + x = t(x) + transform_list.append(t) + + logging.info("Training PLDA") + plda["y_dim"] = min(x.shape[1], plda["y_dim"]) + plda = PLDAFactory.create(**plda) + elbo, elbo_norm = plda.fit(x, y) + + logging.info("Saving Models") + if len(transform_list) > 0: + transform_list = TransformList(transform_list) + transform_list.save(preproc_file) + + plda.save(plda_file) + loss_file = Path(plda_file).with_suffix(".csv") + loss_df = pd.DataFrame( + {"epoch": np.arange(1, len(elbo) + 1), "elbo": elbo, "elbo_norm": elbo_norm} + ) + loss_df.to_csv(loss_file, index=False) + + +def main(): + parser = ArgumentParser(description="Trains PLDA model and embedding preprocessor") + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--feats-file", required=True) + parser.add_argument("--segments-file", required=True) + parser.add_argument("--class-name", default="speaker") + parser.add_argument("--preproc-file", required=True) + parser.add_argument("--plda-file", required=True) + PCA.add_class_args(parser, prefix="pca") + LDA.add_class_args(parser, prefix="lda") + PLDAFactory.add_class_args(parser, prefix="plda") + parser.add_argument("--pca-lnorm", default=False, action=ActionYesNo) + parser.add_argument("--lda-lnorm", default=False, action=ActionYesNo) + parser.add_argument("--do-lda", default=False, action=ActionYesNo) + parser.add_argument("--plda-lnorm", default=True, action=ActionYesNo) + parser.add_argument("--plda-center", default=True, action=ActionYesNo) + parser.add_argument("--plda-whiten", default=True, action=ActionYesNo) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + args = parser.parse_args() + config_logger(args.verbose) + logging.debug(args) + del args["verbose"] + del args["cfg"] + train_plda(**namespace_to_dict(args)) + + +if __name__ == "__main__": + main() diff --git a/hyperion/helpers/plda_factory.py b/hyperion/helpers/plda_factory.py index 16cf01c4..0b90b334 100644 
--- a/hyperion/helpers/plda_factory.py +++ b/hyperion/helpers/plda_factory.py @@ -3,16 +3,30 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from enum import Enum + import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..np.pdfs.plda import FRPLDA, PLDA, SPLDA +from ..utils.misc import filter_func_args + + +class PLDAType(str, Enum): + frplda = "frplda" + splda = "splda" + plda = "plda" + + @staticmethod + def choices(): + return [PLDAType.frplda, PLDAType.splda, PLDAType.plda] class PLDAFactory(object): """Class to create PLDA objects.""" @staticmethod - def create_plda( + def create( plda_type, y_dim=None, z_dim=None, @@ -27,8 +41,7 @@ def create_plda( name="plda", **kwargs ): - - if plda_type == "frplda": + if plda_type == PLDAType.frplda: return FRPLDA( fullcov_W=fullcov_W, update_mu=update_mu, @@ -37,7 +50,7 @@ def create_plda( name=name, **kwargs ) - if plda_type == "splda": + if plda_type == PLDAType.splda: return SPLDA( y_dim=y_dim, fullcov_W=fullcov_W, @@ -48,7 +61,7 @@ def create_plda( **kwargs ) - if plda_type == "plda": + if plda_type == PLDAType.plda: return PLDA( y_dim=y_dim, z_dim=z_dim, @@ -71,7 +84,9 @@ def load_plda(plda_type, model_file): return PLDA.load(model_file) @staticmethod - def filter_train_args(prefix=None, **kwargs): + def filter_args(**kwargs): + return filter_func_args(PLDAFactory.create, kwargs) + valid_args = ( "plda_type", "y_dim", @@ -109,7 +124,7 @@ def filter_train_args(prefix=None, **kwargs): "update_D", ) - for a, b in zip(ne_args1, neg_args2): + for a, b in zip(neg_args1, neg_args2): d[b] = not d[a] del d[a] @@ -117,63 +132,62 @@ def filter_train_args(prefix=None, **kwargs): @staticmethod def add_class_args(parser, prefix=None): - if prefix is None: - p1 = "--" - else: - p1 = "--" + prefix + "." + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") parser.add_argument( - p1 + "plda-type", - default="splda", - choices=["frplda", "splda", "plda"], + "--plda-type", + default=PLDAType.splda, + choices=PLDAType.choices(), help="PLDA type", ) parser.add_argument( - p1 + "y-dim", type=int, default=150, help="num. of eigenvoices" + "--y-dim", type=int, default=150, help="num. of eigenvoices" ) parser.add_argument( - p1 + "z-dim", type=int, default=400, help="num. of eigenchannels" + "--z-dim", type=int, default=400, help="num. 
of eigenchannels" ) parser.add_argument( - p1 + "diag-W", - default=False, - action="store_false", - help="use diagonal covariance W", + "--fullcov-W", + default=True, + action=ActionYesNo, + help="use full covariance W", ) parser.add_argument( - p1 + "no-update-mu", - default=False, - action="store_true", + "--update-mu", + default=True, + action=ActionYesNo, help="not update mu", ) parser.add_argument( - p1 + "no-update-V", default=False, action="store_true", help="not update V" + "--update-V", default=True, action=ActionYesNo, help="update V" ) parser.add_argument( - p1 + "no-update-U", default=False, action="store_true", help="not update U" + "--update-U", default=True, action=ActionYesNo, help="update U" ) parser.add_argument( - p1 + "no-update-B", default=False, action="store_true", help="not update B" + "--update-B", default=True, action=ActionYesNo, help="update B" ) parser.add_argument( - p1 + "no-update-W", default=False, action="store_true", help="not update W" + "--update-W", default=True, action=ActionYesNo, help="update W" ) parser.add_argument( - p1 + "no-update-D", default=False, action="store_true", help="not update D" + "--update-D", default=True, action=ActionYesNo, help="update D" ) parser.add_argument( - p1 + "floor-iD", + "--floor-iD", type=float, default=1e-5, help="floor for inverse of D matrix", ) - parser.add_argument(p1 + "epochs", type=int, default=40, help="num. of epochs") + parser.add_argument("--epochs", type=int, default=40, help="num. of epochs") parser.add_argument( - p1 + "ml-md", + "--ml-md", default="ml+md", choices=["ml+md", "ml", "md"], help=("optimization type"), @@ -187,7 +201,12 @@ def add_class_args(parser, prefix=None): help=("epochs in which we do MD, if None we do it in all the epochs"), ) - parser.add_argument(p1 + "name", default="plda", help="model name") + parser.add_argument("--name", default="plda", help="model name") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) @staticmethod def filter_eval_args(prefix=None, **kwargs): diff --git a/hyperion/np/clustering/__init__.py b/hyperion/np/clustering/__init__.py index 0841d47e..80cfaa2c 100644 --- a/hyperion/np/clustering/__init__.py +++ b/hyperion/np/clustering/__init__.py @@ -4,4 +4,5 @@ """ from .ahc import AHC -from .kmeans import KMeans +from .kmeans import KMeans, KMeansInitMethod +from .spectral_clustering import SpectralClustering diff --git a/hyperion/np/clustering/kmeans.py b/hyperion/np/clustering/kmeans.py index abb88463..82d257d1 100644 --- a/hyperion/np/clustering/kmeans.py +++ b/hyperion/np/clustering/kmeans.py @@ -5,6 +5,8 @@ import logging import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from enum import Enum import h5py import numpy as np @@ -13,6 +15,15 @@ from ..np_model import NPModel +class KMeansInitMethod(str, Enum): + max_dist = "max_dist" + random = "random" + + @staticmethod + def choices(): + return [KMeansInitMethod.max_dist, KMeansInitMethod.random] + + class KMeans(NPModel): """K-Means clustering class. @@ -22,13 +33,30 @@ class KMeans(NPModel): rtol: minimum delta in loss function used as stopping criterion. 
""" - def __init__(self, num_clusters, mu=None, rtol=0.001, **kwargs): - super(KMeans, self).__init__(**kwargs) + def __init__( + self, + num_clusters, + mu=None, + rtol=0.001, + epochs=100, + init_method=KMeansInitMethod.max_dist, + num_workers=1, + verbose=True, + rng_seed=11235813, + **kwargs + ): + super().__init__(**kwargs) self.num_clusters = num_clusters self.mu = mu self.rtol = rtol + self.epochs = epochs + self.verbose = verbose + self.num_workers = num_workers + self.init_method = init_method + if self.init_method == KMeansInitMethod.random: + self.rng = np.random.default_rng(seed=rng_seed) - def fit(self, x, epochs=100): + def fit(self, x): """Performs the clustering. Args: @@ -39,22 +67,59 @@ def fit(self, x, epochs=100): loss: value of loss function (num_epochs,). cluster_index: clustering labels as int numpy array with shape=(num_samples,) """ - loss = np.zeros((epochs,), dtype=float_cpu()) - self.mu = self._choose_seeds(x) - cluster_index, err2 = self.predict(x) - for epoch in range(epochs): - self.mu = self._compute_centroids(x, cluster_index) - cluster_index, err2 = self.predict(x) + loss = np.zeros((self.epochs,), dtype=float_cpu()) + if self.init_method == KMeansInitMethod.max_dist: + if self.num_workers == 1: + self.mu = self._choose_seeds_max_dist(x) + else: + self.mu = self._choose_seeds_max_dist_multithread(x) + else: + self.mu = self._choose_seeds_random(x) + + cluster_index, err2 = self(x) + for epoch in range(self.epochs): + if self.num_workers == 1: + self.mu = self._compute_centroids(x, cluster_index) + else: + self.mu = self._compute_centroids_multithread(x, cluster_index) + cluster_index, err2 = self(x) loss[epoch] = np.mean(err2) if epoch > 0: - delta = np.abs(loss[epoch - 1] - loss[epoch]) / loss[epoch - 1] + delta = np.abs(loss[epoch - 1] - loss[epoch]) / ( + loss[epoch - 1] + 1e-10 + ) + if self.verbose: + logging.info( + "epoch: %d loss: %f rdelta: %f", epoch, loss[epoch], delta + ) if delta < self.rtol: loss = loss[: epoch + 1] break + else: + if self.verbose: + logging.info("epoch: %d loss: %f", epoch, loss[epoch]) return loss, cluster_index - def _choose_seeds(self, x): + def _choose_seeds_random(self, x): + """Chooses the initial seeds for the clustering randomly. + + Args: + x: input data (num_samples, feat_dim). + + Returns: + Initial centers (num_clusters, feat_dim) + """ + if self.verbose: + logging.info("choosing seeds") + + mu = self.rng.choice(x, size=(self.num_clusters,), replace=False, shuffle=False) + if self.verbose: + logging.info("%d seeds chosen", self.num_clusters) + + return mu + + def _choose_seeds_max_dist(self, x): """Chooses the initial seeds for the clustering. Args: @@ -63,6 +128,8 @@ def _choose_seeds(self, x): Returns: Initial centers (num_clusters, feat_dim) """ + if self.verbose: + logging.info("choosing seeds") mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) mu[0] = x[0] for i in range(1, self.num_clusters): @@ -73,6 +140,40 @@ def _choose_seeds(self, x): mu[i] = x[index] return mu + @staticmethod + def _compute_d2(x, mu): + return np.sum(np.square(x - mu), axis=-1) + + def _choose_seeds_max_dist_multithread(self, x): + """Chooses the initial seeds for the clustering. + + Args: + x: input data (num_samples, feat_dim). 
+ + Returns: + Initial centers (num_clusters, feat_dim) + """ + if self.verbose: + logging.info("choosing seeds") + + mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) + with ThreadPoolExecutor(max_workers=self.num_workers) as executor: + mu[0] = x[0] + for i in range(1, self.num_clusters): + d = np.zeros((x.shape[0],), dtype=float_cpu()) + + futures = { + executor.submit(KMeans._compute_d2, x, mu[j]): j for j in range(i) + } + for future in as_completed(futures): + d += future.result() + + index = np.argmax(d) + mu[i] = x[index] + if self.verbose and (i % 10 == 0 or i == self.num_clusters - 1): + logging.info("%d seeds chosen", i + 1) + return mu + def _compute_centroids(self, x, index): """Compute the centroids given cluster assigments. @@ -90,6 +191,38 @@ def _compute_centroids(self, x, index): mu[k] = np.mean(x[index == k], axis=0) return mu + @staticmethod + def _compute_centroid(x, index, k): + r = index == k + if np.sum(r) > 0: + return np.mean(x[index == k], axis=0) + else: + return None + + def _compute_centroids_multithread(self, x, index): + """Compute the centroids given cluster assigments. + + Args: + x: input data (num_samples, feat_dim) + index: cluster assignments as integers with shape=(num_samples,) + + Returns: + Cluster centroids (num_clusters, feat_dim) + """ + mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) + with ThreadPoolExecutor(max_workers=self.num_workers) as executor: + futures = { + executor.submit(KMeans._compute_centroid, x, index, k): k + for k in range(self.num_clusters) + } + for future in as_completed(futures): + k = futures[future] + mu_k = future.result() + if mu_k is not None: + mu[k] = mu_k + + return mu + def predict(self, x): """Compute the cluster labels for new data. @@ -106,3 +239,32 @@ def predict(self, x): index = np.argmin(err2, axis=-1) return index, err2[np.arange(x.shape[0]), index] + + def predict_multithread(self, x): + """Compute the cluster labels for new data. + + Args: + x: input data (num_samples, feat_dim) + + Returns: + Cluster assignments as integer array (num_samples,) + Square distance of each element to the center of its cluster. 
+ """ + err2 = np.zeros((x.shape[0], self.num_clusters), dtype=float_cpu()) + with ThreadPoolExecutor(max_workers=self.num_workers) as executor: + futures = { + executor.submit(KMeans._compute_d2, x, self.mu[k]): k + for k in range(self.num_clusters) + } + for future in as_completed(futures): + k = futures[future] + err2[:, k] = future.result() + + index = np.argmin(err2, axis=-1) + return index, err2[np.arange(x.shape[0]), index] + + def __call__(self, x): + if self.num_workers == 1: + return self.predict(x) + else: + return self.predict_multithread(x) diff --git a/hyperion/np/clustering/spectral_clustering.py b/hyperion/np/clustering/spectral_clustering.py new file mode 100644 index 00000000..ab2fad26 --- /dev/null +++ b/hyperion/np/clustering/spectral_clustering.py @@ -0,0 +1,312 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from copy import copy +from enum import Enum +from typing import Any, Dict, Optional + +import h5py +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from scipy import sparse +from scipy.linalg import eigh +from scipy.sparse.csgraph import laplacian as csgraph_laplacian +from scipy.sparse.linalg import eigsh +from sklearn.metrics import completeness_score, homogeneity_score +from sklearn.preprocessing import normalize + +from ...hyp_defs import float_cpu +from ...utils import PathLike +from ..np_model import NPModel +from .kmeans import KMeans, KMeansInitMethod + + +class LaplacianType(str, Enum): + unnormalized = "unnormalized" + norm_sym = "norm_sym" + norm_rw = "norm_rw" + + @staticmethod + def choices(): + return [ + LaplacianType.unnormalized, + LaplacianType.norm_sym, + LaplacianType.norm_rw, + ] + + +class SpectralClusteringNumClassCriterion(str, Enum): + max_eigengap = "max_eigengap" + max_d_eig_vals = "max_d_eig_vals" + thr_eigengap = "thr_eigengap" + thr_d_eig_vals = "thr_d_eig_vals" + + @staticmethod + def choices(): + return [ + SpectralClusteringNumClassCriterion.max_eigengap, + SpectralClusteringNumClassCriterion.max_d_eig_vals, + SpectralClusteringNumClassCriterion.thr_eigengap, + SpectralClusteringNumClassCriterion.thr_d_eig_vals, + ] + + +class SpectralClustering(NPModel): + """Spectral Clustering class""" + + def __init__( + self, + laplacian: str = "norm_sym", + num_clusters: Optional[int] = None, + max_num_clusters: Optional[int] = None, + criterion: SpectralClusteringNumClassCriterion = SpectralClusteringNumClassCriterion.max_eigengap, + thr_eigengap: float = 1e-3, + kmeans_epochs: int = 100, + kmeans_init_method: KMeansInitMethod = KMeansInitMethod.max_dist, + num_workers: int = 1, + ): + self.laplacian = laplacian + self.num_clusters = num_clusters + self.max_num_clusters = max_num_clusters + self.criterion = criterion + self.kmeans_epochs = kmeans_epochs + self.thr_eigengap = thr_eigengap + self.kmeans_init_method = kmeans_init_method + self.num_workers = num_workers + + def spectral_embedding(self, x: np.ndarray): + num_nodes = x.shape[0] + if not sparse.issparse(x): + x.flat[:: num_nodes + 1] = 0 + r = num_nodes**2 / np.sum(x > 0) + if r > 4: + x = sparse.csr_matrix(x) + + D = None + if self.laplacian in LaplacianType.unnormalized: + L = csgraph_laplacian(x, normed=False) + elif self.laplacian == LaplacianType.norm_sym: + L = csgraph_laplacian(x, normed=True) + elif self.laplacian == LaplacianType.norm_rw: + L, dd = csgraph_laplacian(x, 
normed=False, return_diag=True) + if sparse.issparse(L): + D = sparse.diags(dd) + else: + D = np.diag(dd) + + max_num_clusters = num_nodes - 1 + if self.max_num_clusters is not None: + max_num_clusters = min(max_num_clusters, self.max_num_clusters) + if self.num_clusters is not None: + max_num_clusters = min(max_num_clusters, self.num_clusters) + + eig_vals, eig_vecs = eigsh(L, k=max_num_clusters, M=D, which="SM") + eig_vals = eig_vals[1:] + eig_vecs = eig_vecs[:, 1:] + return eig_vals, eig_vecs + + def spectral_embedding_0(self, x: np.ndarray): + num_nodes = x.shape[0] + x.flat[:: num_nodes + 1] = 0 + d = np.sum(x, axis=1) + D = None + if self.laplacian in LaplacianType.unnormalized: + L = np.diag(d) - x + elif self.laplacian == LaplacianType.norm_sym: + idsqrt = 1 / np.sqrt(d) + L = np.eye(num_nodes) - idsqrt[:, None] * x * idsqrt + elif self.laplacian == LaplacianType.norm_rw: + D = np.diag(d) + L = D - x + + max_num_clusters = num_nodes + if self.max_num_clusters is not None: + max_num_clusters = min(max_num_clusters, self.max_num_clusters) + if self.num_clusters is not None: + max_num_clusters = min(max_num_clusters, self.num_clusters) + + eig_vals, eig_vecs = eigh( + L, b=D, overwrite_a=True, subset_by_index=[1, max_num_clusters - 1] + ) + + return eig_vals, eig_vecs + + def compute_eigengap(self, eig_vals: np.ndarray): + eig_vals = np.concatenate(([0.0], eig_vals)) + eigengap = np.diff(np.concatenate(([0.0], eig_vals))) + filter = np.array([1 / 60, -3 / 20, 3 / 4, 0.0, -3 / 4, 3 / 20, -1 / 60]) + eig_vals_ext = np.concatenate((eig_vals, [eig_vals[-1]] * 3)) + d_eig_vals = np.convolve(eig_vals, filter)[3:-6] + k_max = np.argmax(eigengap) + gap_max = eigengap[k_max] + # k_relmax = [] + # gap_relmax = [] + # gap_norm_relmax = [] + # for k in range(len(eigengap)): + # if k == 0 and eigengap[k] > eigengap[k + 1]: + # k_relmax.append(k) + # gap_relmax.append(eigengap[k]) + # gap_norm_relmax.append(eigengap[k] / eigengap[k + 1]) + # elif k == len(eigengap) - 1 and eigengap[k] > eigengap[k - 1]: + # k_relmax.append(k) + # gap_relmax.append(eigengap[k]) + # gap_norm_relmax.append(eigengap[k] / eigengap[k - 1]) + # elif eigengap[k] > eigengap[k - 1] and eigengap[k] > eigengap[k + 1]: + # k_relmax.append(k) + # gap_relmax.append(eigengap[k]) + # gap_norm_relmax.append( + # 2 * eigengap[k] / (eigengap[k - 1] + eigengap[k + 1]) + # ) + + # idx = np.argmax(gap_norm_relmax) + # gap_norm_relmax_max = gap_norm_relmax[idx] + # k_relmax_max = k_relmax[idx] + eigengap_stats = { + "eig_vals": eig_vals, + "eigengap": eigengap, + "gap_max": gap_max, + "k_max": k_max, + # "gap_relmax": gap_relmax, + # "k_relmax": k_relmax, + # "gap_norm_relmax": gap_norm_relmax, + # "gap_norm_relmax_max": gap_norm_relmax_max, + # "k_relmax_max": k_relmax_max, + "d_eig_vals": d_eig_vals, + } + return eigengap_stats + + def predict_num_clusters(self, eigengap_stats: np.ndarray): + if self.num_clusters is not None: + num_clusters = self.num_clusters + + elif self.criterion == SpectralClusteringNumClassCriterion.max_eigengap: + num_clusters = eigengap_stats["k_max"] + 1 + elif self.criterion == SpectralClusteringNumClassCriterion.max_d_eig_vals: + num_clusters = np.argmax(eigengap_stats["d_eig_vals"]) + 1 + elif self.criterion == SpectralClusteringNumClassCriterion.thr_eigengap: + nz = (eigengap_stats["eigengap"] < self.thr_eigengap).nonzero()[0] + num_clusters = nz[nz > eigengap_stats["k_max"]][0] + 1 + elif self.criterion == SpectralClusteringNumClassCriterion.thr_d_eig_vals: + nz = (eigengap_stats["d_eig_vals"] < 
self.thr_eigengap).nonzero()[0] + num_clusters = nz[nz > eigengap_stats["k_max"]][0] + 1 + else: + raise ValueError(f"invalid num clusters criterion {self.criterion}") + return num_clusters + + def normalize_eigvecs(self, eig_vecs: np.ndarray): + if self.laplacian == LaplacianType.norm_sym: + return normalize(eig_vecs, axis=1) + else: + return eig_vecs + + def do_kmeans(self, x: np.ndarray, num_clusters: Optional[int] = None): + if num_clusters is None: + num_clusters = x.shape[1] + 1 + kmeans = KMeans( + num_clusters=num_clusters, + epochs=self.kmeans_epochs, + init_method=self.kmeans_init_method, + num_workers=self.num_workers, + ) + kmeans.fit(x) + y, _ = kmeans(x) + return y + + def fit(self, x: np.ndarray): + logging.info("compute spectral embeddings") + + eig_vals, eig_vecs = self.spectral_embedding(x) + if self.num_clusters is None: + logging.info("compute eigengap stats") + eigengap_stats = self.compute_eigengap(eig_vals) + else: + eigengap_stats = None + + logging.info("predicting number of clusters") + num_clusters = self.predict_num_clusters(eigengap_stats) + logging.info("predicted num_clusters=%d", num_clusters) + if num_clusters == 1: + return np.zeros((x.shape[0]), dtype=int), num_clusters, eigengap_stats + # minus one because we already removed the first eig vector + logging.info("normalizing embeddings") + eig_vecs = eig_vecs[:, : num_clusters - 1] + eig_vecs = self.normalize_eigvecs(eig_vecs) + logging.info("running k-means") + y = self.do_kmeans(eig_vecs, num_clusters) + return y, num_clusters, eigengap_stats + + def plot_eigengap_stats( + self, + eigengap_stats: Dict[str, Any], + num_clusters: int, + fig_file: Optional[PathLike] = None, + ): + fig, (ax0, ax1, ax2) = plt.subplots( + nrows=1, ncols=3, sharex=True, figsize=(12, 6) + ) + eig_vals = eigengap_stats["eig_vals"] + ax0.plot(np.arange(1, len(eig_vals) + 1), eig_vals, "b") + ax0.vlines( + num_clusters, ymin=np.min(eig_vals), ymax=np.max(eig_vals), colors="r" + ) + ax0.grid() + ax0.set_title("eigen_vals") + eigengap = eigengap_stats["eigengap"] + ax1.plot(np.arange(1, len(eigengap) + 1), eigengap, "b") + ax1.vlines( + num_clusters, ymin=np.min(eigengap), ymax=np.max(eigengap), colors="r" + ) + ax1.grid() + ax1.set_title("eigengap") + d_eig_vals = eigengap_stats["d_eig_vals"] + ax2.plot(np.arange(1, len(d_eig_vals) + 1), d_eig_vals, "b") + ax2.vlines( + num_clusters, ymin=np.min(d_eig_vals), ymax=np.max(d_eig_vals), colors="r" + ) + ax2.grid() + ax2.set_title("d_eigen_val") + if fig_file is not None: + fig.savefig(fig_file) + + @staticmethod + def add_class_args(parser, prefix=None): + """It adds the arguments corresponding to the class to jsonarparse. + Args: + parser: jsonargparse object + prefix: argument prefix. 
+ """ + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--laplacian", + default=LaplacianType.norm_sym, + choices=LaplacianType.choices(), + ) + parser.add_argument("--num-clusters", default=None, type=int) + parser.add_argument("--max-num-clusters", default=None, type=int) + parser.add_argument( + "--criterion", + default=SpectralClusteringNumClassCriterion.max_eigengap, + choices=SpectralClusteringNumClassCriterion.choices(), + ) + parser.add_argument("--thr-eigengap", default=1e-3, type=float) + parser.add_argument("--kmeans-epochs", default=100, type=int) + parser.add_argument( + "--kmeans-init-method", + default=KMeansInitMethod.max_dist, + choices=KMeansInitMethod.choices(), + ) + parser.add_argument("--num-workers", default=1, type=int) + + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) diff --git a/hyperion/np/np_model.py b/hyperion/np/np_model.py index aa635fc5..7b3b2e1c 100644 --- a/hyperion/np/np_model.py +++ b/hyperion/np/np_model.py @@ -10,6 +10,7 @@ import numpy as np from ..hyp_defs import float_cpu, float_save +from ..utils.misc import PathLike class NPModel(object): @@ -19,6 +20,12 @@ class NPModel(object): name: optional identifier for the model. """ + registry = {} + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + NPModel.registry[cls.__name__] = cls + def __init__(self, name=None, **kwargs): if name is None: name = self.__class__.__name__ @@ -227,3 +234,15 @@ def get_json_type(obj): def load_config_from_json(json_str): """Converts json string into dict.""" return json.loads(json_str) + + @staticmethod + def auto_load(file_path: PathLike, extra_objs: dict = {}): + class_name = NPModel.load_config(file_path)["class_name"] + if class_name in NPModel.registry: + class_obj = NPModel.registry[class_name] + elif class_name in extra_objs: + class_obj = extra_objs[class_name] + else: + raise Exception("unknown object with class_name=%s" % (class_name)) + + return class_obj.load(file_path) diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py index 7589243e..ecc7bad7 100644 --- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py @@ -159,7 +159,7 @@ def norm_suff_stats(self, N, u_x, return_order2=False): F, S = self.unstack_suff_stats(u_x) F_norm = self.cholLambda * (F - N[:, None] * self.mu) if return_order2: - S = S - 2 * self.mu * F + N * self.mu ** 2 + S = S - 2 * self.mu * F + N * self.mu**2 S *= self.Lambda return N, self.stack_suff_stats(F_norm, S) @@ -179,9 +179,11 @@ def Mstep(self, N, u_x): self.mu = F / N[:, None] if self.update_Lambda: - S = S / N[:, None] - self.mu ** 2 + S = S / N[:, None] - self.mu**2 S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0) + S_floor = np.maximum(S_floor, 1e-10) S = np.maximum(S, S_floor) + print(np.min(S)) self.Lambda = 1 / S self._Sigma = S self._cholLambda = None @@ -212,7 +214,7 @@ def split_comp(self, K=2): num_comp = self.num_comp * K pi = np.repeat(self.pi, K) / K - Lambda = np.repeat(self.Lambda, K, axis=0) * (K ** 2) + Lambda = np.repeat(self.Lambda, K, axis=0) * (K**2) mu = np.repeat(self.mu, K, axis=0) if K == 2: diff --git a/hyperion/np/pdfs/plda/__init__.py b/hyperion/np/pdfs/plda/__init__.py index 13bc2d81..5961b71f 100644 --- a/hyperion/np/pdfs/plda/__init__.py +++ b/hyperion/np/pdfs/plda/__init__.py @@ -4,7 +4,8 @@ """ +from .factory import PLDAFactory, PLDAType from 
.frplda import FRPLDA from .plda import PLDA -from .plda_base import PLDABase +from .plda_base import PLDABase, PLDALLRNvsMMethod from .splda import SPLDA diff --git a/hyperion/np/pdfs/plda/factory.py b/hyperion/np/pdfs/plda/factory.py new file mode 100644 index 00000000..dd19ab9f --- /dev/null +++ b/hyperion/np/pdfs/plda/factory.py @@ -0,0 +1,204 @@ +""" + Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from enum import Enum + +import numpy as np +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ....utils.misc import filter_func_args +from .frplda import FRPLDA +from .plda import PLDA +from .plda_base import PLDALLRNvsMMethod +from .splda import SPLDA + + +class PLDAType(str, Enum): + frplda = "frplda" + splda = "splda" + plda = "plda" + + @staticmethod + def choices(): + return [PLDAType.frplda, PLDAType.splda, PLDAType.plda] + + +class PLDAFactory(object): + """Class to create PLDA objects.""" + + @staticmethod + def create( + plda_type, + y_dim=None, + z_dim=None, + fullcov_W=True, + update_mu=True, + update_V=True, + update_U=True, + update_B=True, + update_W=True, + update_D=True, + floor_iD=1e-5, + name="plda", + **kwargs + ): + if plda_type == PLDAType.frplda: + return FRPLDA( + fullcov_W=fullcov_W, + update_mu=update_mu, + update_B=update_B, + update_W=update_W, + name=name, + **kwargs + ) + if plda_type == PLDAType.splda: + return SPLDA( + y_dim=y_dim, + fullcov_W=fullcov_W, + update_mu=update_mu, + update_V=update_V, + update_W=update_W, + name=name, + **kwargs + ) + + if plda_type == PLDAType.plda: + return PLDA( + y_dim=y_dim, + z_dim=z_dim, + floor_iD=floor_iD, + update_mu=update_mu, + update_V=update_V, + update_U=update_U, + update_D=update_D, + name=name, + **kwargs + ) + + @staticmethod + def load_plda(plda_type, model_file): + if plda_type == "frplda": + return FRPLDA.load(model_file) + elif plda_type == "splda": + return SPLDA.load(model_file) + elif plda_type == "plda": + return PLDA.load(model_file) + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(PLDAFactory.create, kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--plda-type", + default=PLDAType.splda, + choices=PLDAType.choices(), + help="PLDA type", + ) + + parser.add_argument( + "--y-dim", type=int, default=150, help="num. of eigenvoices" + ) + parser.add_argument( + "--z-dim", type=int, default=400, help="num. of eigenchannels" + ) + + parser.add_argument( + "--fullcov-W", + default=True, + action=ActionYesNo, + help="use full covariance W", + ) + parser.add_argument( + "--update-mu", + default=True, + action=ActionYesNo, + help="not update mu", + ) + parser.add_argument( + "--update-V", default=True, action=ActionYesNo, help="update V" + ) + parser.add_argument( + "--update-U", default=True, action=ActionYesNo, help="update U" + ) + + parser.add_argument( + "--update-B", default=True, action=ActionYesNo, help="update B" + ) + parser.add_argument( + "--update-W", default=True, action=ActionYesNo, help="update W" + ) + parser.add_argument( + "--update-D", default=True, action=ActionYesNo, help="update D" + ) + parser.add_argument( + "--floor-iD", + type=float, + default=1e-5, + help="floor for inverse of D matrix", + ) + + parser.add_argument("--epochs", type=int, default=40, help="num. 
of epochs") + parser.add_argument( + "--ml-md", + default="ml+md", + choices=["ml+md", "ml", "md"], + help=("optimization type"), + ) + + parser.add_argument( + "--md-epochs", + default=None, + type=int, + nargs="+", + help=("epochs in which we do MD, if None we do it in all the epochs"), + ) + + parser.add_argument("--name", default="plda", help="model name") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + @staticmethod + def filter_eval_args(**kwargs): + valid_args = "eval_method" + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_llr_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--llr-method", default="vavg", choices=PLDALLRNvsMMethod.choices() + ) + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) + + # @staticmethod + # def add_eval_args(parser, prefix=None): + # if prefix is None: + # p1 = "--" + # else: + # p1 = "--" + prefix + "." + + # parser.add_argument( + # p1 + "plda-type", + # default="splda", + # choices=["frplda", "splda", "plda"], + # help=("PLDA type"), + # ) + # parser.add_argument(p1 + "model-file", required=True, help=("model file")) diff --git a/hyperion/np/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py index af8c5d8b..591948f9 100644 --- a/hyperion/np/pdfs/plda/frplda.py +++ b/hyperion/np/pdfs/plda/frplda.py @@ -36,9 +36,12 @@ def __init__( update_mu=True, update_B=True, update_W=True, + epochs=20, + ml_md="ml+md", + md_epochs=None, **kwargs ): - super().__init__(mu=mu, update_mu=update_mu, **kwargs) + super().__init__(mu=mu, update_mu=update_mu, epochs=epochs, **kwargs) if mu is not None: self.y_dim = mu.shape[0] self.B = B diff --git a/hyperion/np/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py index 76299970..35b133c2 100644 --- a/hyperion/np/pdfs/plda/plda.py +++ b/hyperion/np/pdfs/plda/plda.py @@ -44,9 +44,20 @@ def __init__( update_V=True, update_U=True, update_D=True, + epochs=20, + ml_md="ml+md", + md_epochs=None, **kwargs ): - super().__init__(y_dim=y_dim, mu=mu, update_mu=update_mu, **kwargs) + super().__init__( + y_dim=y_dim, + mu=mu, + update_mu=update_mu, + epochs=epochs, + ml_md=ml_md, + md_epochs=md_epochs, + **kwargs + ) self.z_dim = z_dim if V is not None: self.y_dim = V.shape[0] @@ -526,7 +537,7 @@ def log_probx_g_yz(self, x, y, z): logp = ( -x.shape[-1] * np.log(2 * np.pi) + logD - - np.sum(self.D * delta ** 2, axis=-1) + - np.sum(self.D * delta**2, axis=-1) ) logp /= 2 return logp diff --git a/hyperion/np/pdfs/plda/plda_base.py b/hyperion/np/pdfs/plda/plda_base.py index 9dde58b1..09544cae 100644 --- a/hyperion/np/pdfs/plda/plda_base.py +++ b/hyperion/np/pdfs/plda/plda_base.py @@ -3,6 +3,8 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +from enum import Enum + import numpy as np from ....hyp_defs import float_cpu @@ -10,6 +12,22 @@ from ..core.pdf import PDF +class PLDALLRNvsMMethod(str, Enum): + vavg = "vavg" + lnorm_vavg = "lnorm-vavg" + savg = "savg" + book = "book" + + @staticmethod + def choices(): + return [ + PLDALLRNvsMMethod.vavg, + PLDALLRNvsMMethod.lnorm_vavg, + PLDALLRNvsMMethod.savg, + PLDALLRNvsMMethod.book, + ] + + class PLDABase(PDF): """Abstract Base class for different versions of Probabilistic Linear Discriminant Analysis (PLDA) models. @@ -21,7 +39,16 @@ class PLDABase(PDF): x_dim: data dimension. 
""" - def __init__(self, y_dim=None, mu=None, update_mu=True, **kwargs): + def __init__( + self, + y_dim=None, + mu=None, + update_mu=True, + epochs=20, + ml_md="ml+md", + md_epochs=None, + **kwargs + ): super().__init__(**kwargs) self.mu = mu self.y_dim = y_dim @@ -29,6 +56,10 @@ def __init__(self, y_dim=None, mu=None, update_mu=True, **kwargs): if mu is not None: self.x_dim = mu.shape[0] + self.epochs = epochs + self.ml_md = ml_md + self.md_epochs = md_epochs + def initialize(self, D): """initializes the model. @@ -55,8 +86,8 @@ def fit( class_ids_val=None, ptheta_val=None, sample_weight_val=None, - epochs=20, - ml_md="ml+md", + epochs=None, + ml_md=None, md_epochs=None, ): """Trains the model. @@ -80,6 +111,12 @@ def fit( log p(X) of the val. data, if present. log p(x) of the val. data per sample, if present. """ + if epochs is None: + epochs = self.epochs + if ml_md is None: + ml_md = self.ml_md + if md_epochs is None: + md_epochs = self.md_epochs use_ml = False if ml_md == "md" else True use_md = False if ml_md == "ml" else True @@ -107,7 +144,6 @@ def fit( elbo = np.zeros((epochs,), dtype=float_cpu()) elbo_val = np.zeros((epochs,), dtype=float_cpu()) for epoch in range(epochs): - stats = self.Estep(D) elbo[epoch] = self.elbo(stats) if x_val is not None: @@ -206,7 +242,6 @@ def fit_adapt_weighted_avg_model( elbo = np.zeros((epochs,), dtype=float_cpu()) elbo_val = np.zeros((epochs,), dtype=float_cpu()) for epoch in range(epochs): - stats = self.Estep(D) elbo[epoch] = self.elbo(stats) if x_val is not None: @@ -363,17 +398,17 @@ def llr_NvsM(self, x1, x2, ids1=None, ids2=None, method="vavg-lnorm"): Returns: Score matrix with shape (num_enrollment_sides, num_test_sides). """ - if method == "savg": + if method == PLDALLRNvsMMethod.savg: return self.llr_NvsM_savg(x1, ids1, x2, ids2) D1 = x1 if ids1 is None else self.compute_stats_hard(x1, class_ids=ids1) D2 = x2 if ids2 is None else self.compute_stats_hard(x2, class_ids=ids2) - if method == "book": + if method == PLDALLRNvsMMethod.book: return self.llr_NvsM_book(D1, D2) - if method == "vavg": + if method == PLDALLRNvsMMethod.vavg: return self.llr_NvsM_vavg(D1, D2, do_lnorm=False) - if method == "vavg-lnorm": + if method == PLDALLRNvsMMethod.lnorm_vavg: return self.llr_NvsM_vavg(D1, D2, do_lnorm=True) def llr_NvsM_vavg(self, D1, D2, do_lnorm=True): @@ -436,17 +471,17 @@ def llr_Nvs1(self, x1, x2, ids1=None, method="vavg-lnorm"): Returns: Score matrix with shape (num_enrollment_sides, num_test_sides). 
""" - if method == "savg": + if method == PLDALLRNvsMMethod.savg: return self.llr_Nvs1_savg(x1, ids1, x2) D1 = x1 if ids1 is None else self.compute_stats_hard(x1, class_ids=ids1) - if method == "book": + if method == PLDALLRNvsMMethod.book: D2 = self.compute_stats_hard(x2, np.arange(x2.shape[0])) return self.llr_NvsM_book(D1, D2) - if method == "vavg": + if method == PLDALLRNvsMMethod.vavg: return self.llr_Nvs1_vavg(D1, x2, do_lnorm=False) - if method == "vavg-lnorm": + if method == PLDALLRNvsMMethod.lnorm_vavg: return self.llr_Nvs1_vavg(D1, x2, do_lnorm=True) def llr_Nvs1_vavg(self, D1, x2, do_lnorm=True): diff --git a/hyperion/np/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py index 5d397183..9e0c2a20 100644 --- a/hyperion/np/pdfs/plda/splda.py +++ b/hyperion/np/pdfs/plda/splda.py @@ -37,9 +37,20 @@ def __init__( update_mu=True, update_V=True, update_W=True, + epochs=20, + ml_md="ml+md", + md_epochs=None, **kwargs ): - super().__init__(y_dim=y_dim, mu=mu, update_mu=update_mu, **kwargs) + super().__init__( + y_dim=y_dim, + mu=mu, + update_mu=update_mu, + epochs=epochs, + ml_md=ml_md, + md_epochs=md_epochs, + **kwargs + ) if V is not None: self.y_dim = V.shape[0] self.V = V diff --git a/hyperion/np/preprocessing/__init__.py b/hyperion/np/preprocessing/__init__.py new file mode 100644 index 00000000..8cbe932a --- /dev/null +++ b/hyperion/np/preprocessing/__init__.py @@ -0,0 +1,6 @@ +""" + Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .resampler import Resampler diff --git a/hyperion/np/preprocessing/resampler.py b/hyperion/np/preprocessing/resampler.py new file mode 100644 index 00000000..1c3e5901 --- /dev/null +++ b/hyperion/np/preprocessing/resampler.py @@ -0,0 +1,46 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + + +class Resampler: + def __init__(self, target_sample_freq: float): + self.target_sample_freq = target_sample_freq + self.resamplers = {} + + def _get_resampler(self, input_sample_freq): + if input_sample_freq in self.resamplers: + return self.resamplers[input_sample_freq] + + import torch + import torchaudio.transforms as tat + + try: + resampler = tat.Resample( + int(input_sample_freq), + int(self.target_sample_freq), + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="sinc_interp_kaiser", + beta=14.769656459379492, + ) + except: + resampler = tat.Resample( + int(input_sample_freq), + int(self.target_sample_freq), + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="kaiser_window", + beta=14.769656459379492, + ) + resampler_f = lambda x: resampler(torch.from_numpy(x)).numpy() + self.resamplers[fs] = resampler_f + return resampler_f + + def __call__(self, x, sample_freq: float): + if sample_freq == self.target_sample_freq: + return x, sample_freq + + resampler = self._get_resampler(sample_freq) + return resampler(x), self.target_sample_freq diff --git a/hyperion/np/transforms/lda.py b/hyperion/np/transforms/lda.py index fc886ede..b7a50f80 100644 --- a/hyperion/np/transforms/lda.py +++ b/hyperion/np/transforms/lda.py @@ -6,6 +6,7 @@ import h5py import numpy as np import scipy.linalg as la +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ..np_model import NPModel from .sb_sw import SbSw @@ -157,3 +158,38 @@ def save_mat(self, file_path): with h5py.File(file_path, "w") as f: f.create_dataset("mu", data=self.mu) 
f.create_dataset("T", data=self.T) + + @staticmethod + def filter_args(**kwargs): + valid_args = ("update_mu", "update_T", "name", "lda_dim") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--update-mu", + default=True, + action=ActionYesNo, + help=("updates centering parameter"), + ) + parser.add_argument( + "--update-T", + default=True, + action=ActionYesNo, + help=("updates projection parameter"), + ) + + parser.add_argument( + "--lda-dim", required=True, help=("output dimension of LDA") + ) + + parser.add_argument("--name", dest="name", default="lda") + if prefix is not None: + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + ) diff --git a/hyperion/np/transforms/pca.py b/hyperion/np/transforms/pca.py index aa25d8e9..98b6c192 100644 --- a/hyperion/np/transforms/pca.py +++ b/hyperion/np/transforms/pca.py @@ -91,7 +91,7 @@ def get_pca_dim_for_var_ratio(x, var_r=1, min_dim=2): rank = matrix_rank(np.dot(x.T, x)) else: sv = la.svd(x, compute_uv=False) - Ecc = np.cumsum(sv ** 2) + Ecc = np.cumsum(sv**2) Ecc = Ecc / Ecc[-1] rank = np.where(Ecc > var_r)[0][0] @@ -186,7 +186,11 @@ def load_params(cls, f, config): """ param_list = ["mu", "T"] params = cls._load_params_to_dict(f, config["name"], param_list) - return cls(mu=params["mu"], T=params["T"], **config,) + return cls( + mu=params["mu"], + T=params["T"], + **config, + ) @classmethod def load_mat(cls, file_path): @@ -202,12 +206,19 @@ def save_mat(self, file_path): @staticmethod def filter_args(**kwargs): - valid_args = ("update_mu", "update_T", "name", "pca_dim", "pca_var_r") + valid_args = ( + "update_mu", + "update_T", + "name", + "pca_dim", + "pca_var_r", + "pca_min_dim", + "whiten", + ) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): - if prefix is not None: outer_parser = parser parser = ArgumentParser(prog="") @@ -242,10 +253,15 @@ def add_class_args(parser, prefix=None): help=("proportion of variance to keep when choosing the PCA dimension"), ) + parser.add_argument( + "--pca-min-dim", default=2, type=int, help=("min. 
output dimension of PCA") + ) + parser.add_argument("--name", dest="name", default="pca") if prefix is not None: outer_parser.add_argument( - "--" + prefix, action=ActionParser(parser=parser), + "--" + prefix, + action=ActionParser(parser=parser), ) add_argparse_args = add_class_args diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 2329d0b1..e19ec329 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -21,6 +21,7 @@ from ...io import RandomAccessAudioReader as AR from ...np.augment import SpeechAugment +from ...np.preprocessing import Resampler from ...utils.class_info import ClassInfo from ...utils.misc import filter_func_args from ...utils.segment_set import SegmentSet @@ -126,6 +127,7 @@ def __init__( self.target_sample_freq = target_sample_freq self.resamplers = {} + self.resampler = Resampler(target_sample_freq) def _load_legacy_durations(self, time_durs_file): if self.rank == 0: @@ -353,14 +355,19 @@ def _get_resampler(self, fs): return resampler_f def _resample(self, x, fs): - try: - if self.target_sample_freq is None or fs == self.target_sample_freq: - return x, fs - resampler = self._get_resampler(fs) - return resampler(x), self.target_sample_freq - except: + if self.target_sample_freq is None: return x, fs + return self.resampler(x, fs) + + # try: + # if self.target_sample_freq is None or fs == self.target_sample_freq: + # return x, fs + # resampler = self._get_resampler(fs) + # return resampler(x), self.target_sample_freq + # except: + # return x, fs + def __getitem__(self, segment): seg_id, start, duration = self._parse_segment_item(segment) x, fs = self._read_audio(seg_id, start, duration) @@ -368,39 +375,8 @@ def __getitem__(self, segment): data = {"seg_id": seg_id, "sample_freq": fs} x_augs = self._apply_augs(x, duration, fs) data.update(x_augs) - - # if self.augmenters: - # # augmentations - # if duration == 0: - # num_samples = len(x) - # else: - # num_samples = int(duration * fs) - - # reverb_context_samples = len(x) - num_samples - # x_augs = self._apply_augs(x, reverb_context_samples) - # data.update(x_augs) - - # # add original non augmented audio - # if self.return_orig: - # x_orig = x[reverb_context_samples:] - # data["x"] = x_orig - - # else: - # data["x"] = x - seg_info = self._get_segment_info(seg_id) data.update(seg_info) - # if np.any(~np.isfinite(data["x"])): - # print( - # "zzz", - # x.max(), - # x.min(), - # x.mean(), - # data["x"].max(), - # data["x"].min(), - # data["x"].mean(), - # flush=True, - # ) return data @staticmethod diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index 4587fbd2..d314490c 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -201,7 +201,7 @@ def forward(self, x, x_lengths=None, weights=None): # this can produce slightly negative variance when relu6 saturates in all time steps # add 1e-5 for stability s = torch.sqrt( - torch.mean(delta**2, dim=self.dim, keepdim=False).clamp(min=SQRT_EPS) + torch.mean(delta ** 2, dim=self.dim, keepdim=False).clamp(min=SQRT_EPS) ) mus = torch.cat((mu, s), dim=1) @@ -214,7 +214,7 @@ def forward(self, x, x_lengths=None, weights=None): wbar = torch.mean(weights, dim=self.dim, keepdim=True) mu = xbar / wbar delta = x - mu - var = torch.mean(weights * delta**2, dim=self.dim, keepdim=True) / wbar + var = torch.mean(weights * delta ** 2, dim=self.dim, keepdim=True) / wbar s = torch.sqrt(var.clamp(min=SQRT_EPS)) mu = mu.squeeze(self.dim) s 
= s.squeeze(self.dim) @@ -254,9 +254,9 @@ def _forward_slidwin_int(self, x, win_length, win_shift, snip_edges): c_x = torch.cumsum(x, dim=-1).view(-1, x.shape[-1]) m_x = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length - c_x = torch.cumsum(x**2, dim=-1).view(-1, x.shape[-1]) + c_x = torch.cumsum(x ** 2, dim=-1).view(-1, x.shape[-1]) m_x2 = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length - s_x = torch.sqrt(m_x2 - m_x**2).clamp(min=SQRT_EPS) + s_x = torch.sqrt(m_x2 - m_x ** 2).clamp(min=SQRT_EPS) mus = self._post_slidwin(m_x, s_x, out_shape) return mus @@ -265,7 +265,7 @@ def _forward_slidwin_float(self, x, win_length, win_shift, snip_edges): x, out_shape = self._pre_slidwin(x, win_length, win_shift, snip_edges) num_frames = out_shape[-1] c_x = torch.cumsum(x, dim=-1).view(-1, x.shape[-1]) - c_x2 = torch.cumsum(x**2, dim=-1).view(-1, x.shape[-1]) + c_x2 = torch.cumsum(x ** 2, dim=-1).view(-1, x.shape[-1]) # xx = x.view(-1, x.shape[-1]) # print(xx.shape[1]) @@ -309,7 +309,7 @@ def _forward_slidwin_float(self, x, win_length, win_shift, snip_edges): k += win_shift - var_x = (m_x2 - m_x**2).clamp(min=SQRT_EPS) + var_x = (m_x2 - m_x ** 2).clamp(min=SQRT_EPS) s_x = torch.sqrt(var_x) # idx = torch.isnan(s_x) #.any(dim=1) # if torch.sum(idx) > 0: @@ -400,14 +400,14 @@ def forward(self, x, x_lengths=None, weights=None): weights = self._standardize_weights(x, x_lengths, weights) if weights is None: mu = torch.mean(x, dim=self.dim, keepdim=self.keepdim) - x2bar = torch.mean(x**2, dim=self.dim, keepdim=self.keepdim) + x2bar = torch.mean(x ** 2, dim=self.dim, keepdim=self.keepdim) logvar = torch.log(x2bar - mu * mu + 1e-5) # for stability in case var=0 return torch.cat((mu, logvar), dim=-1) xbar = torch.mean(weights * x, dim=self.dim, keepdim=self.keepdim) wbar = torch.mean(weights, dim=self.dim, keepdim=self.keepdim) mu = xbar / wbar - x2bar = torch.mean(weights * x**2, dim=self.dim, keepdim=self.keepdim) / wbar + x2bar = torch.mean(weights * x ** 2, dim=self.dim, keepdim=self.keepdim) / wbar var = (x2bar - mu * mu).clamp(min=1e-5) logvar = torch.log(var) @@ -444,7 +444,7 @@ def __init__( if dist_pow == 1: self.dist_f = lambda x: torch.norm(x, p=2, dim=-1) else: - self.dist_f = lambda x: torch.sum(x**2, dim=-1) + self.dist_f = lambda x: torch.sum(x ** 2, dim=-1) self.size_multiplier = num_comp @@ -503,7 +503,7 @@ def forward(self, x, x_lengths=None, weights=None): delta = x - self.mu # (batch, time, num_comp, feat_dim) dist = self.dist_f(delta) # (batch, time, num_comp) - llk = -self.prec**2 * dist + self.bias + llk = -self.prec ** 2 * dist + self.bias r = nnf.softmax(llk, dim=-1) # (batch, time, num_comp) if weights is not None: r *= weights @@ -784,7 +784,7 @@ def forward(self, x, x_lengths=None, weights=None): assert not torch.any( torch.isinf(x_inner) ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(x))}" - # logging.info('x_inner1={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) + if self.use_global_context: global_mus = self.stats_pool(x, weights=weights) x_inner = x_inner + self.lin_global(global_mus).unsqueeze(-1) @@ -794,7 +794,7 @@ def forward(self, x, x_lengths=None, weights=None): assert not torch.any( torch.isinf(x_inner) ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(global_mus))}" - # logging.info('x_inner2={} {}'.format(torch.sum(torch.isnan(x_inner)), torch.sum(torch.isinf(x_inner)))) + attn = self.conv2( self.activation(self.norm_layer(x_inner)) ) # (batch, feat_dim, time) @@ -821,7 
+821,7 @@ def forward(self, x, x_lengths=None, weights=None): torch.isinf(attn) ), f"attn is inf {torch.sum(torch.isinf(attn))}" mus = self.stats_pool(x, weights=attn) - # logging.info('mus={} {}'.format(torch.sum(torch.isnan(mus)), torch.sum(torch.isinf(mus)))) + if self.keepdim: mus = mus.unsqueeze(self.dim) diff --git a/hyperion/torch/narchs/dino_head.py b/hyperion/torch/narchs/dino_head.py index a59434bf..87c8daae 100644 --- a/hyperion/torch/narchs/dino_head.py +++ b/hyperion/torch/narchs/dino_head.py @@ -2,8 +2,6 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - - from typing import Optional import torch @@ -17,53 +15,6 @@ from ..layers import NormLayer1dFactory as NLF from .net_arch import NetArch -# class DINOHead1(nn.Module): -# def __init__( -# self, -# in_dim, -# out_dim, -# use_bn=False, -# norm_last_layer=True, -# nlayers=3, -# hidden_dim=2048, -# bottleneck_dim=256, -# ): -# super().__init__() -# nlayers = max(nlayers, 1) -# if nlayers == 1: -# self.mlp = nn.Linear(in_dim, bottleneck_dim) -# else: -# layers = [nn.Linear(in_dim, hidden_dim)] -# if use_bn: -# layers.append(nn.BatchNorm1d(hidden_dim)) -# layers.append(nn.GELU()) -# for _ in range(nlayers - 2): -# layers.append(nn.Linear(hidden_dim, hidden_dim)) -# if use_bn: -# layers.append(nn.BatchNorm1d(hidden_dim)) -# layers.append(nn.GELU()) -# layers.append(nn.Linear(hidden_dim, bottleneck_dim)) -# self.mlp = nn.Sequential(*layers) -# self.apply(self._init_weights) -# self.last_layer = nn.utils.weight_norm( -# nn.Linear(bottleneck_dim, out_dim, bias=False) -# ) -# self.last_layer.weight_g.data.fill_(1) -# if norm_last_layer: -# self.last_layer.weight_g.requires_grad = False - -# def _init_weights(self, m): -# if isinstance(m, nn.Linear): -# nn.init.trunc_normal_(m.weight, std=0.02) -# if isinstance(m, nn.Linear) and m.bias is not None: -# nn.init.constant_(m.bias, 0) - -# def forward(self, x): -# x = self.mlp(x) -# x = nn.functional.normalize(x, dim=-1, p=2) -# x = self.last_layer(x) -# return x - class DINOHead(NetArch): """Classification Head for DINO x-vector style networks @@ -208,21 +159,21 @@ def _init_weights(self, m): def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None): if self.use_in_norm: x = self.in_norm(x) - assert not torch.any( - torch.isnan(x) - ), f"x is nan {x.size()} {torch.sum(torch.isnan(x))}" + # assert not torch.any( + # torch.isnan(x) + # ), f"x is nan {x.size()} {torch.sum(torch.isnan(x))}" x = self.hid_layers(x) - assert not torch.any( - torch.isnan(x) - ), f"x_hid is nan {x.size()} {torch.sum(torch.isnan(x))}" + # assert not torch.any( + # torch.isnan(x) + # ), f"x_hid is nan {x.size()} {torch.sum(torch.isnan(x))}" x = nn.functional.normalize(x, dim=-1, p=2) - assert not torch.any( - torch.isnan(x) - ), f"x_l2 is nan {x.size()} {torch.sum(torch.isnan(x))}" + # assert not torch.any( + # torch.isnan(x) + # ), f"x_l2 is nan {x.size()} {torch.sum(torch.isnan(x))}" x = self.output(x) - assert not torch.any( - torch.isnan(x) - ), f"out is nan {x.size()} {torch.sum(torch.isnan(x))}" + # assert not torch.any( + # torch.isnan(x) + # ), f"out is nan {x.size()} {torch.sum(torch.isnan(x))}" return x def get_config(self): diff --git a/hyperion/torch/narchs/proj_head.py b/hyperion/torch/narchs/proj_head.py index 549f9e6a..e2838013 100644 --- a/hyperion/torch/narchs/proj_head.py +++ b/hyperion/torch/narchs/proj_head.py @@ -40,12 +40,7 @@ class ProjHead(NetArch): """ def __init__( - self, - in_feats, - out_feats=256, - 
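Backing up to the `global_pool.py` hunks above: `_forward_slidwin_int` obtains sliding-window means and standard deviations from two cumulative sums instead of looping over windows, and the variance is clamped at `SQRT_EPS` because `E[x^2] - E[x]^2` can come out slightly negative in floating point. A small self-contained check of that identity, using stride-1 windows for simplicity (the layer's actual indexing also handles `snip_edges` and fractional shifts):

```python
import torch

SQRT_EPS = 1e-5
x = torch.randn(2, 4, 100)                 # (batch, feat_dim, time)
w = 10                                     # window length

# O(T) windowed mean/std via cumulative sums, as in _forward_slidwin_int.
zero = torch.zeros_like(x[..., :1])
c1 = torch.cat((zero, torch.cumsum(x, dim=-1)), dim=-1)
c2 = torch.cat((zero, torch.cumsum(x ** 2, dim=-1)), dim=-1)
m1 = (c1[..., w:] - c1[..., :-w]) / w      # E[x] per window
m2 = (c2[..., w:] - c2[..., :-w]) / w      # E[x^2] per window
s = torch.sqrt((m2 - m1 ** 2).clamp(min=SQRT_EPS))

# Reference computation with explicit windows.
u = x.unfold(2, w, 1)                      # (batch, feat_dim, frames, w)
print(torch.allclose(m1, u.mean(dim=-1), atol=1e-5))                  # True
print(torch.allclose(s, u.std(dim=-1, unbiased=False), atol=1e-3))    # True
```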
norm_layer=None, - use_norm=True, - norm_before=True, + self, in_feats, out_feats=256, norm_layer=None, use_norm=True, norm_before=True, ): super().__init__() @@ -72,23 +67,21 @@ def __init__( def forward(self, x, y=None): if self.use_norm and self.norm_before: x = self._norm_layer(x) - assert not torch.any( - torch.isnan(x) - ), f"x before proj is nan {x.size()} {torch.sum(torch.isnan(x))}" + # assert not torch.any( + # torch.isnan(x) + # ), f"x before proj is nan {x.size()} {torch.sum(torch.isnan(x))}" x = self.proj(x) - assert not torch.any( - torch.isnan(x) - ), f"x after proj is nan {x.size()} {torch.sum(torch.isnan(x))}" + # assert not torch.any( + # torch.isnan(x) + # ), f"x after proj is nan {x.size()} {torch.sum(torch.isnan(x))}" if self.use_norm and not self.norm_before: x = self._norm_layer(x) - assert not torch.any( - torch.isnan(x) - ), f"x after bn is nan {x.size()} {torch.sum(torch.isnan(x))}" + # assert not torch.any( + # torch.isnan(x) + # ), f"x after bn is nan {x.size()} {torch.sum(torch.isnan(x))}" return x def get_config(self): - hid_act = AF.get_config(self.fc_blocks[0].activation) - config = { "in_feats": self.in_feats, "out_feats": self.out_feats, @@ -96,7 +89,6 @@ def get_config(self): "use_norm": self.use_norm, "norm_before": self.norm_before, } - base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/hyperion/torch/trainers/dino_xvector_trainer.py b/hyperion/torch/trainers/dino_xvector_trainer.py index e4051058..26d6a434 100644 --- a/hyperion/torch/trainers/dino_xvector_trainer.py +++ b/hyperion/torch/trainers/dino_xvector_trainer.py @@ -304,7 +304,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): logs = ODict((log_tag + k, v) for k, v in logs.items()) return logs - def _load_checkpoint(self, checkpoint): + def _old_load_checkpoint(self, checkpoint): self.teacher_model.load_state_dict(checkpoint["teacher_model_state_dict"]) # self.teacher_model.load_state_dict(checkpoint["teacher_state_dict"]) self.teacher_optimizer.load_state_dict( @@ -312,7 +312,7 @@ def _load_checkpoint(self, checkpoint): ) return super()._load_checkpoint(checkpoint) - def _new_load_checkpoint(self, checkpoint, teacher_checkpoint): + def _load_checkpoint(self, checkpoint, teacher_checkpoint): self.teacher_model.load_state_dict(teacher_checkpoint["model_state_dict"]) self.teacher_optimizer.load_state_dict( teacher_checkpoint["optimizer_state_dict"] @@ -322,7 +322,7 @@ def _new_load_checkpoint(self, checkpoint, teacher_checkpoint): def load_checkpoint(self, epoch, step): checkpoint = self.load_model_checkpoint("model", epoch, step) teacher_checkpoint = self.load_model_checkpoint("teacher_model", epoch, step) - return self._new_load_checkpoint(checkpoint, teacher_checkpoint) + return self._load_checkpoint(checkpoint, teacher_checkpoint) def checkpoint(self, logs=None): checkpoint = super().checkpoint(logs) diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index b3d6cb9f..36a9a43f 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -634,7 +634,7 @@ def _make_wd_sched(self, wd_sched, optim): assert isinstance(wd_sched, dict) args = WDSF.filter_args(**wd_sched) if self.rank == 0: - logging.info("wd scheduler args={args}") + logging.info(f"wd scheduler args={args}") wd_sched = WDSF.create(optim, **args) return wd_sched @@ -930,7 +930,7 @@ def find_last_checkpoint(self, model_name="model"): file_pattern = "%s/%s_ep[0-9]*.pth" % 
(self.exp_path, model_name) file_paths = sorted(glob.glob(file_pattern)) if len(file_paths) > 0: - last_epoch = int(re.search(r"ep[0-9]*", file_paths[-1])[2:]) + last_epoch = int(re.search(r"ep[0-9]*", file_paths[-1]).group()[2:]) file_pattern = "%s/%s_ep%04d_step[0-9]*.pth" % ( self.exp_path, @@ -939,7 +939,7 @@ def find_last_checkpoint(self, model_name="model"): ) file_paths = sorted(glob.glob(file_pattern)) if len(file_paths) > 0: - last_step = int(re.search(r"step[0-9]*", file_paths[-1])[4:]) + last_step = int(re.search(r"step[0-9]*", file_paths[-1]).group()[4:]) return last_epoch, last_step @@ -947,7 +947,7 @@ def load_last_checkpoint(self): """Loads the last training checkpoint in the experiment dir.""" last_epoch, last_step = self.find_last_checkpoint() if last_epoch > 0 or last_step > 0: - return self.new_load_checkpoint(last_epoch, last_step) + return self.load_checkpoint(last_epoch, last_step) return None @@ -964,11 +964,11 @@ def load_model_checkpoint(self, model_name="model", epoch=0, step=0): logging.info("loading %s from %s", model_name, file_path) return torch.load(file_path, map_location=torch.device("cpu")) - def new_load_checkpoint(self, epoch, step): + def load_checkpoint(self, epoch, step): checkpoint = self.load_model_checkpoint("model", epoch, step) return self._load_checkpoint(checkpoint) - def load_checkpoint(self, file_path): + def old_load_checkpoint(self, file_path): """Loads a training checkpoint from file. Args: diff --git a/hyperion/torch/wd_schedulers/factory.py b/hyperion/torch/wd_schedulers/factory.py index 3820daa2..dc72bd2c 100644 --- a/hyperion/torch/wd_schedulers/factory.py +++ b/hyperion/torch/wd_schedulers/factory.py @@ -31,7 +31,7 @@ def create( if wdsch_type == "none": return None - if wdsch_type == "cos_lr": + if wdsch_type == "cos_wd": return CosineWD( optimizer, initial_wd=initial_wd, diff --git a/hyperion/torch/wd_schedulers/wd_scheduler.py b/hyperion/torch/wd_schedulers/wd_scheduler.py index a3059edc..3a092c3d 100644 --- a/hyperion/torch/wd_schedulers/wd_scheduler.py +++ b/hyperion/torch/wd_schedulers/wd_scheduler.py @@ -64,6 +64,10 @@ def __init__( for group in optimizer.param_groups ] + if epoch == 0: + for group, wd in zip(optimizer.param_groups, self.initial_wds): + group["weight_decay"] = wd + self.warmup_steps = warmup_steps self.epoch = epoch self.step = step From 68cc0931f86e6380a0cf3d0600f58b2282b2e676 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 18 Jan 2024 18:06:45 -0500 Subject: [PATCH 127/154] added ecapa and lresnet 3.1 --- egs/voxceleb/v1.2/README.md | 26 +++++++ .../train_cfwseresnet34_xvec_stage1_v3.0.yaml | 9 ++- .../train_cfwseresnet34_xvec_stage2_v3.0.yaml | 13 ++-- .../train_cwseresnet34_xvec_stage1_v3.0.yaml | 9 ++- .../train_cwseresnet34_xvec_stage2_v3.0.yaml | 13 ++-- ...rain_ecapatdnn2048x4_xvec_stage1_v3.0.yaml | 9 ++- ...rain_ecapatdnn2048x4_xvec_stage2_v3.0.yaml | 13 ++-- .../train_fwseresnet34_xvec_stage1_v3.0.yaml | 9 ++- .../train_fwseresnet34_xvec_stage2_v3.0.yaml | 13 ++-- ...rain_idrnd_resnet100_xvec_stage1_v3.0.yaml | 9 ++- ...rain_idrnd_resnet100_xvec_stage2_v3.0.yaml | 14 ++-- .../train_lresnet34_xvec_stage2_v3.1.yaml | 73 +++++++++++++++++++ .../conf/train_resnet34_xvec_stage1_v3.0.yaml | 9 ++- .../conf/train_resnet34_xvec_stage2_v3.0.yaml | 13 ++-- .../train_tseresnet34_xvec_stage1_v3.0.yaml | 9 ++- .../train_tseresnet34_xvec_stage2_v3.0.yaml | 13 ++-- ...config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | 46 ++++++++++++ .../config_fbank80_stmn_lresnet34.v3.1.sh | 44 +++++++++++ 
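Two of the `torch_trainer.py` fixes above are worth spelling out: `find_last_checkpoint` used to index the `re.Match` object directly (`re.search(...)[2:]`), which fails because `Match.__getitem__` selects capture groups rather than characters, and `load_last_checkpoint` still called the old `new_load_checkpoint` name after the rename. A quick illustration of the regex fix, with a hypothetical checkpoint filename:

```python
import re

path = "exp/xvector_nnets/model_ep0035.pth"  # hypothetical checkpoint file
m = re.search(r"ep[0-9]*", path)
# Old code did m[2:], which asks for a match *group* and raises;
# the fix slices the matched string instead.
print(m.group())            # 'ep0035'
print(int(m.group()[2:]))   # 35
```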
hyperion/torch/layer_blocks/res2net_blocks.py | 38 +++++++--- hyperion/torch/layer_blocks/resnet_blocks.py | 37 ++++++++-- .../torch/layer_blocks/seresnet_blocks.py | 26 +++++-- hyperion/torch/narchs/resnet.py | 16 +++- hyperion/torch/narchs/resnet_factory.py | 24 +++--- hyperion/torch/trainers/xvector_trainer.py | 47 +++++++++++- 24 files changed, 420 insertions(+), 112 deletions(-) create mode 100644 egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage2_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.1.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_lresnet34.v3.1.sh diff --git a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md index 6e8ba07a..6a2502e6 100644 --- a/egs/voxceleb/v1.2/README.md +++ b/egs/voxceleb/v1.2/README.md @@ -96,9 +96,15 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.11 | 0.069 | 0.126 | | | | | Cosine + AS-Norm | 1.10 | 0.065 | 0.108 | | | | | Cosine + QMF | 0.95 | 0.059 | 0.084 | +| config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | ECAPA-TDNN 512x3 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.02 | 0.067 | 0.109 | +| | | | Cosine + AS-Norm | 0.98 | 0.062 | 0.092 | +| | | | Cosine + QMF | 0.85 | 0.061 | 0.091 | | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | | | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | | | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | +| config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.59 | 0.1 | 0.172 | +| | | | Cosine + AS-Norm | 1.54 | 0.927 | 0.140 | +| | | | Cosine + QMF | 1.32 | 0.083 | 0.121 | | config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 | | | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 | | | | | Cosine + QMF | 0.62 | 0.034 | 0.042 | @@ -129,9 +135,15 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.16 | 0.073 | 0.130 | | | | | Cosine + AS-Norm | 1.13 | 0.068 | 0.118 | | | | | Cosine + QMF | 1.06 | 0.064 | 0.112 | +| config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | ECAPA-TDNN 512x3 | Stage2: SubCenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.06 | 0.066 | 0.116 | +| | | | Cosine + AS-Norm | 1.01 | 0.061 | 0.106 | +| | | | Cosine + QMF | 0.96 | 0.058 | 0.097 | | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | | | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | | | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | +| config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.69 | 0.103 | 0.174 | +| | | | Cosine + AS-Norm | 1.62 | 0.096 | 0.156 | +| | | | Cosine + QMF | 1.51 | 0.091 | 0.152 | | config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | | | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | | | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | @@ -154,6 +166,8 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | | | | Cosine + AS-Norm | 
0.70 | 0.042 | 0.072 | | | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | + + ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -161,9 +175,15 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.10 | 0.128 | 0.209 | | | | | Cosine + AS-Norm | 1.99 | 0.118 | 0.190 | | | | | Cosine + QMF | 1.84 | 0.111 | 0.184 | +| config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | ECAPA-TDNN 512x3 | Stage2: SubCenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.93 | 0.120 | 0.198 | +| | | | Cosine + AS-Norm | 1.84 | 0.113 | 0.184 | +| | | | Cosine + QMF | 1.73 | 0.108 | 0.177 | | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | | | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | | | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | +| config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.84 | 0.167 | 0.267 | +| | | | Cosine + AS-Norm | 2.58 | 0.150 | 0.252 | +| | | | Cosine + QMF | 2.45 | 0.144 | 0.234 | | config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 | | | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 | | | | | Cosine + QMF | 1.36 | 0.082 | 0.137 | @@ -194,9 +214,15 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn512x3.v3.0.sh | ECAPA-TDNN 512x3 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.87 | 0.185 | 0.304 | | | | | Cosine + AS-Norm | 2.84 | 0.182 | 0.304 | | | | | Cosine + QMF | 2.61 | 0.172 | 0.283 | +| config_fbank80_stmn_ecapatdnn512x3.v3.1.sh | ECAPA-TDNN 512x3 | Stage2: SubCenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.60 | 0.174 | 0.287 | +| | | | Cosine + AS-Norm | 2.58 | 0.172 | 0.291 | +| | | | Cosine + QMF | 2.44 | 0.161 | 0.274 | | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | | | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | | | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | +| config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 3.74 | 0.239 | 0.394 | +| | | | Cosine + AS-Norm | 3.45 | 0.225 | 0.377 | +| | | | Cosine + QMF | 3.27 | 0.213 | 0.356 | | config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | | | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | | | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml index f4306e2e..132438bf 100644 --- a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: 
seg_chunk_sampler min_batch_size: 64 @@ -70,3 +70,4 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml index c0bd44e5..34c0801e 100644 --- a/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_cfwseresnet34_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,18 +21,18 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -68,3 +68,4 @@ trainer: swa_start: 10 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml index b5458f9d..f576e411 100644 --- a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -70,3 +70,4 @@ trainer: log_interval: 1000 epochs: 25 eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml index c0bd44e5..34c0801e 100644 --- a/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_cwseresnet34_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,18 +21,18 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -68,3 +68,4 @@ trainer: swa_start: 
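The v3.1 stage-2 recipes in this patch switch the output layer to `loss_type: subcenter-arc-softmax` with `num_subcenters: 2` plus `intertop_margin: 0.1`, which is the main change the new README rows track against the v3.0 ArcFace configs. A minimal sketch of what a sub-center ArcFace head computes, keeping only the closest of the K sub-centers per class before applying the angular margin; class and argument names here are illustrative, not hyperion's actual layer (which also implements the inter-top margin):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SubcenterArcFaceHead(nn.Module):
    """Illustrative sub-center ArcFace output layer (not hyperion's API)."""

    def __init__(self, embed_dim, num_classes, num_subcenters=2,
                 cos_scale=30.0, margin=0.3):
        super().__init__()
        self.weight = nn.Parameter(
            torch.randn(num_classes * num_subcenters, embed_dim))
        self.num_classes = num_classes
        self.num_subcenters = num_subcenters
        self.cos_scale = cos_scale
        self.margin = margin

    def forward(self, x, y):
        # Cosine similarity to every sub-center: (batch, classes * subcenters).
        cos = F.linear(F.normalize(x, dim=-1), F.normalize(self.weight, dim=-1))
        # Keep the best-matching sub-center of each class: (batch, classes).
        cos = cos.view(-1, self.num_classes, self.num_subcenters).amax(dim=-1)
        # Additive angular margin on the target class only.
        theta = torch.acos(cos.clamp(-1 + 1e-7, 1 - 1e-7))
        target = F.one_hot(y, self.num_classes).bool()
        theta = torch.where(target, theta + self.margin, theta)
        return self.cos_scale * torch.cos(theta)


head = SubcenterArcFaceHead(192, 5994, num_subcenters=2)  # VoxCeleb2-sized
x = torch.randn(8, 192)
y = torch.randint(0, 5994, (8,))
loss = F.cross_entropy(head(x, y), y)
```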
10 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml index 2cf31713..23f03de7 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -95,3 +95,4 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml index c348e7c5..79d510ae 100644 --- a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,18 +21,18 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -69,3 +69,4 @@ trainer: swa_start: 10 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml index 01b2cc50..41748978 100644 --- a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -70,3 +70,4 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml index c0bd44e5..34c0801e 100644 --- a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - 
class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,18 +21,18 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -68,3 +68,4 @@ trainer: swa_start: 10 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml index 74553395..eff62765 100644 --- a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 32 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 32 @@ -71,3 +71,4 @@ trainer: log_interval: 1000 epochs: 30 eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml index b14cfc75..571411ca 100644 --- a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 16 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,18 +21,18 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 16 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -68,3 +68,5 @@ trainer: swa_start: 10 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker + \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..ac859010 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_lresnet34_xvec_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + 
sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml index 6659b2f6..e35b273a 100644 --- a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -69,3 +69,4 @@ trainer: log_interval: 1000 epochs: 35 eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml index c0bd44e5..34c0801e 100644 --- a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,18 +21,18 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -68,3 +68,4 @@ trainer: swa_start: 10 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml 
b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml index 58d22733..f4c381d6 100644 --- a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage1_v3.0.yaml @@ -2,11 +2,11 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -17,11 +17,11 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: seg_chunk_sampler min_batch_size: 64 @@ -70,3 +70,4 @@ trainer: log_interval: 1000 epochs: 25 eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml index c0bd44e5..34c0801e 100644 --- a/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml +++ b/egs/voxceleb/v1.2/conf/train_tseresnet34_xvec_stage2_v3.0.yaml @@ -2,18 +2,18 @@ data: train: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -21,18 +21,18 @@ data: val: dataset: class_names: - - class_id + - speaker aug_cfgs: - conf/reverb_noise_aug.yaml return_segment_info: - - class_id + - speaker sampler: sampler_type: class_weighted_random_seg_chunk_sampler min_batch_size: 64 max_chunk_length: 4.0 min_chunk_length: 4.0 num_chunks_per_seg_epoch: 6 - class_name: class_id + class_name: speaker seg_weight_mode: data-prior num_hard_prototypes: 8 data_loader: @@ -68,3 +68,4 @@ trainer: swa_start: 10 swa_lr: 1e-4 swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.1.sh new file mode 100644 index 00000000..05aa4033 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn512x3.v3.1.sh @@ -0,0 +1,46 @@ +# ECAPA-TDNN small + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3.v3.1 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0030.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage2_v3.1.yaml +nnet_name=${feat_type}_ecapatdnn512x3.v3.1 +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0030.pth +nnet_s2=$nnet_s2_dir/swa_model_ep0036.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_lresnet34.v3.1.sh 
b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_lresnet34.v3.1.sh new file mode 100644 index 00000000..019ac827 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_lresnet34.v3.1.sh @@ -0,0 +1,44 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34.v3.1 + +nnet_s1_base_cfg=conf/train_lresnet34_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_lresnet34_xvec_stage2_v3.1.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 8de700c4..55e35e5f 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -9,6 +9,7 @@ from torch.nn import BatchNorm2d, Conv2d, Dropout2d from ..layers import ActivationFactory as AF +from .resnet_blocks import FreqPosEnc from .se_blocks import CFwSEBlock2d, FwSEBlock2d, SEBlock2d, TSEBlock2d @@ -32,7 +33,6 @@ def _conv1x1(in_channels, out_channels, stride=1, bias=False): def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before): - if norm_before: return nn.Sequential( _conv1x1(in_channels, out_channels, stride, bias=False), @@ -61,8 +61,10 @@ class Res2NetBasicBlock(nn.Module): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. se_r: squeeze-excitation compression ratio. - time_se: If true, squeeze is done only in time dimension. - num_feats: Number of features in dimension 2, needed if time_se=True. + se_type: type of squeeze excitation in [t-se, cw-se, fw-se, cfw-se] + freq_pos_enc: use frequency wise positional encoder + num_feats: Number of features in dimension 2, needed if se_type!=cw-se or freq_pos_enc=True. 
+ time_se: (legacy deprecated) If true, use t-se """ expansion = 1 @@ -82,10 +84,10 @@ def __init__( self, norm_before=True, se_r=None, se_type="cw-se", - time_se=False, + freq_pos_enc=False, num_feats=None, + time_se=False, ): - super().__init__() self.in_channels = in_channels @@ -148,9 +150,13 @@ def __init__( self.context = dilation self.downsample_factor = stride + self.pos_enc = None + if freq_pos_enc: + self.pos_enc = FreqPosEnc(num_feats) + if se_r is not None: if time_se: - se_type = "cw-se" + se_type = "t-se" if se_type == "t-se": self.se_layer = TSEBlock2d(channels, num_feats, se_r, activation) @@ -182,6 +188,9 @@ def forward(self, x, x_mask=None): if self.downsample is not None: residual = self.downsample(residual) + if self.pos_enc is not None: + x = self.pos_enc(x) + split_size = [self.width_in for i in range(self.scale - 1)] split_size.append(self.in_channels % self.width_in + self.width_in) split_x = torch.split(x, split_size, 1) @@ -247,8 +256,10 @@ class Res2NetBNBlock(nn.Module): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. se_r: squeeze-excitation compression ratio. - time_se: If true, squeeze is done only in time dimension. - num_feats: Number of features in dimension 2, needed if time_se=True. + se_type: type of squeeze excitation in [t-se, cw-se, fw-se, cfw-se] + freq_pos_enc: use frequency wise positional encoder + num_feats: Number of features in dimension 2, needed if se_type!=cw-se or freq_pos_enc=True. + time_se: (legacy deprecated) If true, use t-se """ expansion = 4 @@ -268,10 +279,10 @@ def __init__( self, norm_before=True, se_r=None, se_type="cw-se", - time_se=False, + freq_pos_enc=False, num_feats=None, + time_se=False, ): - super().__init__() self.in_channels = in_channels @@ -329,6 +340,10 @@ def __init__( self.context = dilation self.downsample_factor = stride + self.pos_enc = None + if freq_pos_enc: + self.pos_enc = FreqPosEnc(num_feats) + if se_r is not None: if time_se: se_type = "t-se" @@ -364,6 +379,9 @@ def forward(self, x, x_mask=None): if self.downsample is not None: residual = self.downsample(residual) + if self.pos_enc is not None: + x = self.pos_enc(x) + x = self.conv1(x) if self.norm_before: x = self.bn1(x) diff --git a/hyperion/torch/layer_blocks/resnet_blocks.py b/hyperion/torch/layer_blocks/resnet_blocks.py index c077a54b..428d8139 100644 --- a/hyperion/torch/layer_blocks/resnet_blocks.py +++ b/hyperion/torch/layer_blocks/resnet_blocks.py @@ -2,7 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - +import torch import torch.nn as nn import torch.nn.functional as nnf from torch.nn import BatchNorm2d, Conv2d, Dropout2d @@ -30,7 +30,6 @@ def _conv1x1(in_channels, out_channels, stride=1, bias=False): def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before): - if norm_before: return nn.Sequential( _conv1x1(in_channels, out_channels, stride, bias=False), @@ -40,6 +39,15 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) return _conv1x1(in_channels, out_channels, stride, bias=True) +class FreqPosEnc(nn.Module): + def __init__(self, num_feats): + super().__init__() + self.pos_enc = nn.Parameter(torch.zeros((num_feats, 1))) + + def forward(self, x): + return x + self.pos_enc + + class ResNetInputBlock(nn.Module): """Input block for ResNet architecture @@ -67,7 +74,6 @@ def __init__( self, norm_before=True, do_maxpool=True, ): -
super().__init__() padding = int((kernel_size - 1) / 2) @@ -96,7 +102,6 @@ def __init__( self.downsample_factor *= 2 def forward(self, x): - x = self.conv(x) if self.norm_before: x = self.bn(x) @@ -125,6 +130,9 @@ class ResNetBasicBlock(nn.Module): dilation: dilation factor of the conv. kernels. norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. + freq_pos_enc: use frequency wise positional encoder + num_feats: Number of features in dimension 2, needed if freq_pos_enc=True. + """ expansion = 1 @@ -140,8 +148,9 @@ def __init__( dilation=1, norm_layer=None, norm_before=True, + freq_pos_enc=False, + num_feats=None, ): - super().__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d @@ -174,6 +183,9 @@ def __init__( self.context = dilation + stride self.downsample_factor = stride + self.pos_enc = None + if freq_pos_enc: + self.pos_enc = FreqPosEnc(num_feats) @property def out_channels(self): @@ -193,6 +205,9 @@ def forward(self, x, x_mask=None): if self.downsample is not None: residual = self.downsample(residual) + if self.pos_enc is not None: + x = self.pos_enc(x) + x = self.conv1(x) if self.norm_before: x = self.bn1(x) @@ -232,6 +247,8 @@ class ResNetBNBlock(nn.Module): dilation: dilation factor of the conv. kernels. norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. + freq_pos_enc: use frequency wise positional encoder + num_feats: Number of features in dimension 2, needed if freq_pos_enc=True. """ expansion = 4 @@ -248,8 +265,9 @@ def __init__( dilation=1, norm_layer=None, norm_before=True, + freq_pos_enc=False, + num_feats=None, ): - super().__init__() self.in_channels = in_channels @@ -286,6 +304,9 @@ def __init__( self.context = dilation self.downsample_factor = stride + self.pos_enc = None + if freq_pos_enc: + self.pos_enc = FreqPosEnc(num_feats) @property def out_channels(self): @@ -305,6 +326,9 @@ def forward(self, x, x_mask=None): if self.downsample is not None: residual = self.downsample(residual) + if self.pos_enc is not None: + x = self.pos_enc(x) + x = self.conv1(x) if self.norm_before: x = self.bn1(x) @@ -369,7 +393,6 @@ def __init__( norm_layer=None, norm_before=True, ): - super().__init__() if norm_layer is None: diff --git a/hyperion/torch/layer_blocks/seresnet_blocks.py b/hyperion/torch/layer_blocks/seresnet_blocks.py index b13a7ff3..9c25055b 100644 --- a/hyperion/torch/layer_blocks/seresnet_blocks.py +++ b/hyperion/torch/layer_blocks/seresnet_blocks.py @@ -26,8 +26,10 @@ class SEResNetBasicBlock(ResNetBasicBlock): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. se_r: squeeze-excitation compression ratio. - time_se: If true, squeeze is done only in time dimension. + se_type: type of squeeze excitation in [t-se, cw-se, fw-se, cfw-se] + freq_pos_enc: use frequency wise positional encoder. num_feats: Number of features in dimension 2, needed if time_se=True. 
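The `FreqPosEnc` module introduced above (note the `super().__init__()` call, without which registering the `nn.Parameter` would raise an `AttributeError`) learns one offset per frequency bin. Because the parameter has shape `(num_feats, 1)`, adding it to a `(batch, channels, freq, time)` feature map broadcasts over batch, channels, and time; a quick shape check:

```python
import torch

num_feats, batch, channels, time = 80, 4, 64, 200
pos_enc = torch.zeros(num_feats, 1)        # as in FreqPosEnc.__init__
x = torch.randn(batch, channels, num_feats, time)
print((x + pos_enc).shape)                 # torch.Size([4, 64, 80, 200])
```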
+ time_se: (legacy deprecated) If true, use t-se """ def __init__( @@ -43,10 +45,10 @@ def __init__( norm_before=True, se_r=16, se_type="cw-se", - time_se=False, + freq_pos_enc=False, num_feats=None, + time_se=False, ): - super().__init__( in_channels, channels, @@ -57,6 +59,8 @@ def __init__( dilation=dilation, norm_layer=norm_layer, norm_before=norm_before, + freq_pos_enc=freq_pos_enc, + num_feats=num_feats, ) if time_se: @@ -84,6 +88,9 @@ def forward(self, x, x_mask=None): """ residual = x + if self.pos_enc is not None: + x = self.pos_enc(x) + x = self.conv1(x) if self.norm_before: x = self.bn1(x) @@ -129,8 +136,10 @@ class SEResNetBNBlock(ResNetBNBlock): norm_layer: normalization layer constructor, if None BatchNorm2d is used. norm_before: if True, normalization layer is before the activation, after otherwise. se_r=None: squeeze-excitation compression ratio. - time_se: If true, squeeze is done only in time dimension. + se_type: type of squeeze excitation in [t-se, cw-se, fw-se, cfw-se] + freq_pos_enc: use frequency wise positional encoder. num_feats: Number of features in dimension 2, needed if time_se=True. + time_se: (legacy deprecated) If true, use t-se """ def __init__( @@ -146,10 +155,10 @@ def __init__( norm_before=True, se_r=16, se_type="cw-se", - time_se=False, + freq_pos_enc=False, num_feats=None, + time_se=False, ): - super().__init__( in_channels, channels, @@ -160,6 +169,8 @@ def __init__( dilation=dilation, norm_layer=norm_layer, norm_before=norm_before, + freq_pos_enc=freq_pos_enc, + num_feats=num_feats, ) if time_se: @@ -190,6 +201,9 @@ def forward(self, x, x_mask=None): if self.downsample is not None: residual = self.downsample(residual) + if self.pos_enc is not None: + x = self.pos_enc(x) + x = self.conv1(x) if self.norm_before: x = self.bn1(x) diff --git a/hyperion/torch/narchs/resnet.py b/hyperion/torch/narchs/resnet.py index 5d3b9793..7abe4e54 100644 --- a/hyperion/torch/narchs/resnet.py +++ b/hyperion/torch/narchs/resnet.py @@ -5,7 +5,6 @@ import logging import numpy as np - import torch import torch.nn as nn from torch.nn import BatchNorm1d, Conv1d, Linear @@ -65,6 +64,7 @@ class ResNet(NetArch): required when time_se=True to calculcate the size of the squeeze excitation matrices. res2net_scale: Res2Net scale parameter res2net_width_factor: Res2Net multiplier for the width of the bottlneck layers. 
+ freq_pos_enc: use frequency wise positional encoder """ def __init__( @@ -96,8 +96,8 @@ def __init__( res2net_width_factor=1, resb_channels=None, time_se=False, + freq_pos_enc=False, ): - super().__init__() logging.info("{}".format(locals())) self.block = block @@ -128,6 +128,8 @@ def __init__( else: self._block = block + assert not self.has_se and not freq_pos_enc or in_feats is not None + self.num_layers = num_layers self.in_channels = in_channels self.conv_channels = conv_channels @@ -154,6 +156,7 @@ def __init__( self.multilevel = multilevel self.endpoint_channels = endpoint_channels + self.freq_pos_enc = freq_pos_enc self.norm_layer = norm_layer norm_groups = None @@ -195,7 +198,7 @@ def __init__( self._downsample_factor = self.in_block.downsample_factor if resb_channels is None: - resb_channels = [base_channels * (2 ** i) for i in range(4)] + resb_channels = [base_channels * (2**i) for i in range(4)] self.cur_in_channels = conv_channels self.layer1 = self._make_layer(self._block, resb_channels[0], num_layers[0]) @@ -308,6 +311,11 @@ def _make_layer(self, block, channels, num_blocks, stride=1, dilate=False): "num_feats": num_feats, } + if self.freq_pos_enc: + kwargs["freq_pos_enc"] = True + num_feats = int(self.in_feats / (self._downsample_factor * stride)) + kwargs["num_feats"] = num_feats + if self.is_res2net: kwargs["scale"] = self.res2net_scale kwargs["width_factor"] = self.res2net_width_factor @@ -595,6 +603,7 @@ def get_config(self): "res2net_scale": self.res2net_scale, "res2net_width_factor": self.res2net_width_factor, "resb_channels": self.resb_channels, + "freq_pos_enc": self.freq_pos_enc, } base_config = super().get_config() @@ -1106,6 +1115,7 @@ def __init__(self, in_channels, **kwargs): #################### Res2Net variants ######################## + # Standard Res2Nets class Res2Net18(ResNet): def __init__(self, in_channels, **kwargs): diff --git a/hyperion/torch/narchs/resnet_factory.py b/hyperion/torch/narchs/resnet_factory.py index 35ed9af0..ba9d21a5 100644 --- a/hyperion/torch/narchs/resnet_factory.py +++ b/hyperion/torch/narchs/resnet_factory.py @@ -162,8 +162,8 @@ def create( in_feats=None, res2net_scale=4, res2net_width_factor=1, + freq_pos_enc=False, ): - try: resnet_class = resnet_dict[resnet_type] except: @@ -190,15 +190,12 @@ def create( in_feats=in_feats, res2net_scale=res2net_scale, res2net_width_factor=res2net_width_factor, + freq_pos_enc=freq_pos_enc, ) return resnet def filter_args(**kwargs): - # if "norm_after" in kwargs: - # kwargs["norm_before"] = not kwargs["norm_after"] - # del kwargs["norm_after"] - if "no_maxpool" in kwargs: kwargs["do_maxpool"] = not kwargs["no_maxpool"] del kwargs["no_maxpool"] @@ -224,6 +221,7 @@ def filter_args(**kwargs): "se_r", "res2net_scale", "res2net_width_factor", + "freq_pos_enc", ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -319,9 +317,6 @@ def add_class_args(parser, prefix=None): help="Zero-initialize the last BN in each residual branch", ) - # parser.add_argument('--replace-stride-with-dilation', default=None, nargs='+', type=bool, - # help='replaces strides with dilations to increase context without downsampling') - parser.add_argument( "--se-r", default=16, @@ -353,12 +348,6 @@ def add_class_args(parser, prefix=None): help="batch normalizaton before activation", ) - # parser.add_argument( - # "--norm-after", - # default=False, - # action="store_true", - # help="batch normalizaton after activation", - # ) except: pass @@ -367,6 +356,13 @@ def add_class_args(parser, prefix=None): except: pass + 
parser.add_argument( + "--freq-pos-enc", + default=False, + action=ActionYesNo, + help="use frequency wise positional encoder", + ) + if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index a9000f38..666c9a9d 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -97,7 +97,7 @@ def train_epoch(self, data_loader): Args: data_loader: pytorch data loader returning features and class labels. """ - batch_keys = [self.input_key, self.target_key] + # batch_keys = [self.input_key, self.target_key] self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc(device=self.device) @@ -112,6 +112,7 @@ def train_epoch(self, data_loader): input_keys = self.get_augs_keys(data, self.input_key) loss_scale = self.grad_acc_steps * len(input_keys) + loss_acc = 0.0 for aug_key in input_keys: batch_keys = [aug_key, self.target_key] x, target = tensors_subset(data, batch_keys, self.device) @@ -119,6 +120,7 @@ with amp.autocast(enabled=self.use_amp): output = self.model(x, y=target) loss = self.loss(output, target) / loss_scale + loss_acc += loss.item() if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -130,7 +132,7 @@ self.update_model() self.save_checkpoint(partial=True) - batch_metrics["loss"] = loss.item() * loss_scale + batch_metrics["loss"] = loss_acc * self.grad_acc_steps for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) @@ -146,3 +148,44 @@ logs.update(lrs) logs.update(self._get_wds()) return logs + + def validation_epoch(self, data_loader, swa_update_bn=False): + """Validation epoch loop + + Args: + data_loader: PyTorch data loader returning input/output pairs. + swa_update_bn: whether or not to update batch-norm layers in SWA.
+ """ + # batch_keys = [self.input_key, self.target_key] + metric_acc = MetricAcc(self.device) + batch_metrics = ODict() + with torch.no_grad(): + if swa_update_bn: + log_tag = "train_" + self.model.train() + else: + log_tag = "val_" + self.model.eval() + + for batch, data in enumerate(data_loader): + input_keys = self.get_augs_keys(data, self.input_key) + loss_scale = len(input_keys) + loss_acc = 0.0 + for aug_key in input_keys: + batch_keys = [aug_key, self.target_key] + x, target = tensors_subset(data, batch_keys, self.device) + batch_size = x.size(0) + with amp.autocast(enabled=self.use_amp): + output = self.model(x) + loss = self.loss(output, target) / loss_scale + loss_acc += loss.item() + + batch_metrics["loss"] = loss_acc + for k, metric in self.metrics.items(): + batch_metrics[k] = metric(output, target) + + metric_acc.update(batch_metrics, batch_size) + + logs = metric_acc.metrics + logs = ODict((log_tag + k, v) for k, v in logs.items()) + return logs From ecdc31946552cfb0c755d18c91eadf81338a54ea Mon Sep 17 00:00:00 2001 From: System User Date: Thu, 18 Jan 2024 19:14:16 -0500 Subject: [PATCH 128/154] added some vox confs --- ...rain_ecapatdnn2048x4_xvec_stage1_v3.1.yaml | 101 ++++++++++++++++++ ...rain_ecapatdnn2048x4_xvec_stage2_v3.1.yaml | 74 +++++++++++++ .../train_fwseresnet34_xvec_stage1_v3.1.yaml | 76 +++++++++++++ .../train_fwseresnet34_xvec_stage2_v3.1.yaml | 73 +++++++++++++ ...rain_idrnd_resnet100_xvec_stage1_v3.1.yaml | 77 +++++++++++++ ...rain_idrnd_resnet100_xvec_stage2_v3.1.yaml | 74 +++++++++++++ .../conf/train_resnet34_xvec_stage1_v3.1.yaml | 75 +++++++++++++ .../conf/train_resnet34_xvec_stage2_v3.1.yaml | 73 +++++++++++++ ...onfig_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | 44 ++++++++ .../config_fbank80_stmn_fwseresnet34.v3.1.sh | 44 ++++++++ ...onfig_fbank80_stmn_idrnd_resnet100.v3.1.sh | 44 ++++++++ .../config_fbank80_stmn_resnet34.v3.1.sh | 44 ++++++++ 12 files changed, 799 insertions(+) create mode 100644 egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.1.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.1.sh diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..b7fab34b --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml @@ -0,0 +1,101 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + 
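The `validation_epoch` added in the `xvector_trainer.py` hunk above mirrors `train_epoch`'s multi-view bookkeeping: each augmented view's loss is divided by the number of views before being accumulated, so `loss_acc` ends up as the mean per-view loss. In `train_epoch` the divisor additionally includes `grad_acc_steps` so that `backward()` accumulates a correctly scaled gradient, which is why the logged value is rescaled by `grad_acc_steps`. A toy check of that arithmetic:

```python
# Pure-arithmetic check of the loss bookkeeping above; values are hypothetical.
grad_acc_steps, per_view_losses = 4, [0.9, 1.1]

loss_scale = grad_acc_steps * len(per_view_losses)
loss_acc = sum(l / loss_scale for l in per_view_losses)
logged = loss_acc * grad_acc_steps       # what train_epoch reports
print(logged)                            # 1.0 == mean per-view loss
```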
num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 2048 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + - 1 + resb_channels: + - 2048 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + - 5 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 4096 + norm_before: false + dropout_rate: 0.2 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.2 + norm_before: false +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + #min_lr: 1.0e-05 + min_lr: 1.0e-06 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..e147dbb3 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + resnet_enc: + override_dropouts: true + dropout_rate: 0.25 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..ca15bbba --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage1_v3.1.yaml @@ -0,0 +1,76 @@ +data: + 
train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..ac859010 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34_xvec_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..03897a19 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml @@ -0,0 +1,77 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - 
conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 32 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseidrndresnet100 + in_channels: 1 + in_feats: 80 + conv_channels: 128 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.05 + se_r: 4 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..3b8d716a --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker + \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..0ec78598 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage1_v3.1.yaml @@ -0,0 +1,75 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + 
num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: resnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.1.yaml new file mode 100644 index 00000000..ac859010 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_resnet34_xvec_stage2_v3.1.yaml @@ -0,0 +1,73 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.0 +trainer: + optim: + opt_type: sgd + lr: 1e-3 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 15 + eff_batch_size: 256 + swa_start: 10 + swa_lr: 1e-4 + swa_anneal_epochs: 2 + target_key: speaker \ No newline at end of file diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh new file mode 100644 index 00000000..5a9b6028 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh @@ -0,0 +1,44 @@ +# ECAPA-TDNN large + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg 
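+# nnet_type selects the model family: this ECAPA-TDNN is built from
+# hyperion's 1d-ResNet (resnet1d) x-vector class; the stage1/stage2 yamls
+# below are assumed to be consumed by the recipe's numbered training stages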
+nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn2048x4.v3.1 + +nnet_s1_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_ecapatdnn2048x4_xvec_stage2_v3.1.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh new file mode 100644 index 00000000..12b86dd1 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh @@ -0,0 +1,44 @@ +# Freq-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34.v3.1 + +nnet_s1_base_cfg=conf/train_fwseresnet34_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.1.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=false #true +do_qmf=false #true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.1.sh new file mode 100644 index 00000000..f06bcbea --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.1.sh @@ -0,0 +1,44 @@ +# IdRnd ResNet100 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet100.v3.1 + +nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0029.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.1.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.1.sh new file mode 100644 index 00000000..e954b63d 
--- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_resnet34.v3.1.sh @@ -0,0 +1,44 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_resnet34.v3.1 + +nnet_s1_base_cfg=conf/train_resnet34_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.1.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + From c26d17d9d72e35e16c5e6d1b2334d48aeb463c96 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Thu, 18 Jan 2024 19:16:55 -0500 Subject: [PATCH 129/154] pos enc in resnet xvector --- hyperion/torch/models/xvectors/resnet_xvector.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index efc24f27..a639bdb8 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -57,6 +57,7 @@ def __init__( se_r=16, res2net_scale=4, res2net_width_factor=1, + freq_pos_enc=False, bias_weight_decay=None, ): logging.info("making %s encoder network", resnet_type) @@ -80,6 +81,7 @@ def __init__( in_feats=in_feats, res2net_scale=res2net_scale, res2net_width_factor=res2net_width_factor, + freq_pos_enc=freq_pos_enc, ) super().__init__( @@ -168,6 +170,10 @@ def res2net_scale(self): def res2net_width_factor(self): return self.encoder_net.res2net_width_factor + @property + def freq_pos_enc(self): + return self.encoder_net.freq_pos_enc + def get_config(self): base_config = super().get_config() del base_config["encoder_cfg"] @@ -186,6 +192,7 @@ def get_config(self): "se_r": self.se_r, "res2net_scale": self.res2net_scale, "res2net_width_factor": self.res2net_width_factor, + "freq_pos_enc": self.freq_pos_enc, } config.update(base_config) From d99189e22d19b8561961a9376818b8509f2cd5b3 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 1 Mar 2024 19:03:23 -0500 Subject: [PATCH 130/154] saving progress in dino --- .../conf/train_ecapatdnn512x3_dino_v1.1.yaml | 115 ++++++++++++++++++ .../conf/train_lresnet34_dino_v1.0.yaml | 92 ++++++++++++++ .../conf/train_lresnet34_dino_v1.1.yaml | 92 ++++++++++++++ .../config_fbank80_stmn_lresnet34.v1.0.sh | 52 ++++++++ .../config_fbank80_stmn_lresnet34.v1.1.sh | 52 ++++++++ ...un_006_extract_dino_embeds_cluster_eval.sh | 54 ++++---- egs/voxceleb/v1.2/README.md | 65 +++++++++- hyperion/bin/cluster_embeddings.py | 11 ++ hyperion/torch/layers/global_pool.py | 36 +++--- .../torch/trainers/dino_xvector_trainer.py | 29 +++-- hyperion/torch/trainers/torch_trainer.py | 105 +++++----------- 11 files changed, 566 insertions(+), 137 deletions(-) create mode 100644 egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.1.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.0.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.1.yaml create mode 100644 
egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.0.sh create mode 100644 egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh diff --git a/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.1.yaml b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.1.yaml new file mode 100644 index 00000000..5dec90f3 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.1.yaml @@ -0,0 +1,115 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.0 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.005 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 25 + use_amp: false + log_interval: 1000 + epochs: 140 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.0.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.0.yaml new file mode 100644 index 00000000..cb82c539 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.0.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
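+      # v1.0 reuses the same chunks for teacher and student
+      # (same_teacher_student_chunks: true below) and draws shorter 4-8 s
+      # segments; v1.1 decouples the crops and lengthens segments to 6-12 s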
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: true + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 8.0 + min_chunk_length: 4.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: true + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 8.0 + min_chunk_length: 4.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.0025 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 25 + use_amp: true + log_interval: 1000 + epochs: 60 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.1.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.1.yaml new file mode 100644 index 00000000..ac185913 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.1.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
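+      # DINO multi-crop: 2 long (4 s) teacher chunks and 4 short (2 s) student
+      # chunks per example, assumed to be cut from the same 6-12 s segment
+      # drawn by the sampler below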
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.0025 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 25 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.0.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.0.sh new file mode 100644 index 00000000..0a621148 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.0.sh @@ -0,0 +1,52 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34_dino.v1.0 + +nnet_s1_base_cfg=conf/train_lresnet34_dino_v1.0.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0060.pth + +nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# clustering +cluster_method=cos_ahc +cluster_name=${cluster_method}_1 +cluster_cfg=conf/ahc.yaml + +# plda +plda_cfg=conf/plda.yaml + +# back-end +do_plda=false +# do_snorm=true +# do_qmf=true +# do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh new file mode 100644 index 00000000..752f7048 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh @@ -0,0 +1,52 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34_dino.v1.1 + +nnet_s1_base_cfg=conf/train_lresnet34_dino_v1.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0080.pth + 
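+# s1 is the teacher (EMA) checkpoint, which is assumed to provide the
+# embeddings for the cos_ahc clustering defined below; s2 fine-tunes with the
+# supervised stage-2 recipe, presumably using the cluster labels as targets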
+nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# clustering +cluster_method=cos_ahc +cluster_name=${cluster_method}_1 +cluster_cfg=conf/ahc.yaml + +# plda +plda_cfg=conf/plda.yaml + +# back-end +do_plda=false +# do_snorm=true +# do_qmf=true +# do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh index 8973483c..d848b466 100755 --- a/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh +++ b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh @@ -7,7 +7,7 @@ . ./path.sh set -e -stage=2 +stage=1 nnet_stage=1 config_file=default_config.sh use_gpu=false @@ -38,31 +38,6 @@ score_cosine_dir=$score_dir/cosine score_plda_dir=$score_dir/${cluster_name}_plda if [ $stage -le 1 ]; then - # Extract xvectors for training LDA/PLDA - nj=100 - for name in voxceleb2cat_train - do - if [ -n "$vad_config" ];then - vad_args="--vad csv:data/$name/vad.csv" - fi - output_dir=$xvector_dir/$name - echo "Extracting x-vectors for $name" - $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ - hyp_utils/conda_env.sh --num-gpus $num_gpus \ - hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ - --part-idx JOB --num-parts $nj \ - --recordings-file data/$name/recordings.csv \ - --random-utt-length --min-utt-length 30 --max-utt-length 30 \ - --model-path $nnet \ - --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv - hyperion-tables cat \ - --table-type features \ - --output-file $output_dir/xvector.csv --num-tables $nj - - done -fi - -if [ $stage -le 2 ]; then # Extracts x-vectors for evaluation nj=100 if [ "$do_voxsrc22" == "true" ];then @@ -91,7 +66,7 @@ if [ $stage -le 2 ]; then done fi -if [ $stage -le 3 ];then +if [ $stage -le 2 ];then echo "Eval Voxceleb 1 with Cosine scoring" num_parts=8 @@ -127,6 +102,31 @@ if [ $stage -le 3 ];then exit fi +if [ $stage -le 3 ]; then + # Extract xvectors for training LDA/PLDA + nj=100 + for name in voxceleb2cat_train + do + if [ -n "$vad_config" ];then + vad_args="--vad csv:data/$name/vad.csv" + fi + output_dir=$xvector_dir/$name + echo "Extracting x-vectors for $name" + $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ + hyp_utils/conda_env.sh --num-gpus $num_gpus \ + hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + --part-idx JOB --num-parts $nj \ + --recordings-file data/$name/recordings.csv \ + --random-utt-length --min-utt-length 30 --max-utt-length 30 \ + --model-path $nnet \ + --output-spec ark,csv:$output_dir/xvector.JOB.ark,$output_dir/xvector.JOB.csv + hyperion-tables cat \ + --table-type features \ + --output-file $output_dir/xvector.csv --num-tables $nj + + done +fi + cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name if [ $stage -le 4 ];then diff --git a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md index 6a2502e6..e1199a3b 100644 --- a/egs/voxceleb/v1.2/README.md +++ b/egs/voxceleb/v1.2/README.md @@ -85,10 +85,8 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr - `run_007_eval_be.sh` - Trains PLDA and evals PLDA and cosine scoring back-ends - ## 
Results - ### VoxCeleb 1 Original-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.68 | 0.052 | 0.088 | | | | | Cosine + AS-Norm | 0.63 | 0.049 | 0.083 | | | | | Cosine + QMF | 0.57 | 0.037 | 0.071 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | ECAPA-TDNN 2048x4 | Stage2: Subcenter ArcFace m=0.3/intertop_m=0.1/centers=2 Dropout=0.25 | Cosine | 0.62 | 0.049 | 0.076 | +| | | | Cosine + AS-Norm | 0.61 | 0.044 | 0.075 | +| | | | Cosine + QMF | 0.53 | 0.037 | 0.076 | | config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.59 | 0.1 | 0.172 | | | | | Cosine + AS-Norm | 1.54 | 0.927 | 0.140 | | | | | Cosine + QMF | 1.32 | 0.083 | 0.121 | | config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.071 | | | | | Cosine + AS-Norm | 0.70 | 0.039 | 0.048 | | | | | Cosine + QMF | 0.62 | 0.034 | 0.042 | +| config_fbank80_stmn_resnet34.v3.1.sh | ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.68 | 0.039 | 0.048 | +| | | | Cosine + AS-Norm | 0.60 | 0.036 | 0.052 | +| | | | Cosine + QMF | 0.53 | 0.033 | 0.050 | | config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.76 | 0.048 | 0.071 | | | | | Cosine + AS-Norm | 0.70 | 0.041 | 0.061 | | | | | Cosine + QMF | 0.62 | 0.037 | 0.056 | | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.77 | 0.048 | 0.077 | | | | | Cosine + AS-Norm | 0.68 | 0.040 | 0.062 | | | | | Cosine + QMF | 0.62 | 0.036 | 0.063 | +| config_fbank80_stmn_fwseresnet34.v3.1.sh | FwSE-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.66 | 0.046 | 0.060 | +| | | | Cosine + AS-Norm | 0.61 | 0.040 | 0.052 | +| | | | Cosine + QMF | 0.57 | 0.037 | 0.058 | +| config_fbank80_stmn_fwseresnet34pe.v3.1.sh | FwSE-ResNet34-FPE | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.73 | 0.042 | 0.053 | +| | | | Cosine + AS-Norm | 0.64 | 0.034 | 0.047 | +| | | | Cosine + QMF | 0.60 | 0.033 | 0.044 | | config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.78 | 0.053 | 0.082 | | | | | Cosine + AS-Norm | 0.70 | 0.043 | 0.076 | | | | | Cosine + QMF | 0.63 | 0.042 | 0.071 | @@ -123,6 +133,9 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.56 | 0.040 | 0.065 | | | | | Cosine + AS-Norm | 0.52 | 0.033 | 0.045 | | | | | Cosine + QMF | 0.45 | 0.027 | 0.043 | +| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.50 | 0.035 | 0.038 | +| | | | Cosine + AS-Norm | 0.47 | 0.031 | 0.038 | +| | | | Cosine + QMF | 0.40 | 0.027 | 0.032 | | config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | | | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | | | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | @@ -141,18 +154,30 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 0.85 | 0.055 | 0.100 | | | | | Cosine + AS-Norm | 0.80 | 0.050 | 0.087 | | | | | Cosine + QMF | 0.76 | 0.047 | 0.083 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | ECAPA-TDNN 2048x4 | Stage2: Subcenter ArcFace m=0.3/intertop_m=0.1/centers=2 Dropout=0.25 | Cosine | 0.83 | 0.052 | 0.096 | +| | | | Cosine + AS-Norm | 0.77 | 0.049 | 0.086 | +| | | | Cosine + QMF | 0.74 | 0.047 | 0.082 | | config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.69 | 0.103 | 0.174 | | | | | Cosine + AS-Norm | 1.62 | 0.096 | 0.156 | | | | | Cosine + QMF | 1.51 | 0.091 | 0.152 | | config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.86 | 0.054 | 0.098 | | | | | Cosine + AS-Norm | 0.81 | 0.049 | 0.087 | | | | | Cosine + QMF | 0.77 | 0.046 | 0.082 | +| config_fbank80_stmn_resnet34.v3.1.sh | ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.80 | 0.049 | 0.094 | +| | | | Cosine + AS-Norm | 0.76 | 0.046 | 0.081 | +| | | | Cosine + QMF | 0.70 | 0.043 | 0.074 | | config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.89 | 0.058 | 0.098 | | | | | Cosine + AS-Norm | 0.84 | 0.053 | 0.087| | | | | Cosine + QMF | 0.80 | 0.050 | 0.081 | | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.83 | 0.053 | 0.098 | | | | | Cosine + AS-Norm | 0.78 | 0.047| 0.085 | | | | | Cosine + QMF | 0.74 | 0.045 | 0.081 | +| config_fbank80_stmn_fwseresnet34.v3.1.sh | FwSE-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.80 | 0.051 | 0.090 | +| | | | Cosine + AS-Norm | 0.74 | 0.046 | 0.081 | +| | | | Cosine + QMF | 0.70 | 0.044 | 0.076 | +| config_fbank80_stmn_fwseresnet34pe.v3.1.sh | FwSE-ResNet34-FPE | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.80 | 0.052 | 0.094 | +| | | | Cosine + AS-Norm | 0.76 | 0.047 | 0.081 | +| | | | Cosine + QMF | 0.72 | 0.045 | 0.076 | | config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 0.91 | 0.057 | 0.100 | | | | | Cosine + AS-Norm | 0.85 | 0.052 | 0.089 | | | | | Cosine + QMF | 0.81 | 0.049 | 0.085 | @@ -162,12 +187,14 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.71 | 0.044 | 0.076| | | | | Cosine + AS-Norm | 0.66 | 0.040 | 0.069 | | | | | Cosine + QMF | 0.63 | 0.037 | 0.067 | +| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.69 | 0.043 | 0.074 | +| | | | Cosine + AS-Norm | 0.65 | 0.039 | 0.068 | +| | | | Cosine + QMF | 0.63 | 0.036 | 0.065 | | config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | | | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | | | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | - ### VoxCeleb 1 Hard-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 1.66 | 0.103 | 0.168 | | | | | Cosine + AS-Norm | 1.53 | 0.091 | 0.151 | | | | | Cosine + QMF | 1.44 | 0.087 | 0.145 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | ECAPA-TDNN 2048x4 | Stage2: Subcenter ArcFace m=0.3/intertop_m=0.1/centers=2 Dropout=0.25 | Cosine | 1.65 | 0.101 | 0.169 | +| | | | Cosine + AS-Norm | 1.53 | 0.090 | 0.149 | +| | | | Cosine + QMF | 1.46 | 0.087 | 0.144 | | config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.84 | 0.167 | 0.267 | | | | | Cosine + AS-Norm | 2.58 | 0.150 | 0.252 | | | | | Cosine + QMF | 2.45 | 0.144 | 0.234 | | config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.62 | 0.098 | 0.164 | | | | | Cosine + AS-Norm | 1.45 | 0.085 | 0.142 | | | | | Cosine + QMF | 1.36 | 0.082 | 0.137 | +| config_fbank80_stmn_resnet34.v3.1.sh | ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.56 | 0.091 | 0.157 | +| | | | Cosine + AS-Norm | 1.40 | 0.080 | 0.135 | +| | | | Cosine + QMF | 1.33 | 0.076 | 0.128 | | config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.70 | 0.1 | 0.165 | | | | | Cosine + AS-Norm | 1.50 | 0.086 | 0.138 | | | | | Cosine + QMF | 1.44 | 0.085 | 0.139 | | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.59 | 0.096 | 0.165 | | | | | Cosine + AS-Norm | 1.41 | 0.083 | 0.143 | | | | | Cosine + QMF | 1.34 | 0.079 | 0.136 | +| config_fbank80_stmn_fwseresnet34.v3.1.sh | FwSE-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.58 | 0.096 | 0.162 | +| | | | Cosine + AS-Norm | 1.43 | 0.083 | 0.140 | +| | | | Cosine + QMF | 1.34 | 0.079 | 0.134 | +| config_fbank80_stmn_fwseresnet34pe.v3.1.sh | FwSE-ResNet34-FPE | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.61 | 0.097 | 0.163 | +| | | | Cosine + AS-Norm | 1.44 | 0.085 | 0.138 | +| | | | Cosine + QMF | 1.37 | 0.080 | 0.132 | | config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 1.75 | 0.104 | 0.171 | | | | | Cosine + AS-Norm | 1.56 | 0.091 | 0.152 | | | | | Cosine + QMF | 1.50 | 0.087 | 0.145 | @@ -202,11 +241,15 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.30 | 0.076 | 0.125 | | | | | Cosine + AS-Norm | 1.15 | 0.066 | 0.109 | | | | | Cosine + QMF | 1.11 | 0.065 | 0.103 | +| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.36 | 0.077 | 0.122 | +| | | | Cosine + AS-Norm | 1.23 | 0.069 | 0.112 | +| | | | Cosine + QMF | 1.17 | 0.065 | 0.110 | | config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | | | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | | | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | + ### VoxSRC2022 dev | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -220,18 +263,30 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_ecapatdnn2048x4.v3.0.sh | ECAPA-TDNN 2048x4 | Stage2: ArcFace m=0.3/intertop_m=0.1 Dropout=0.25 | Cosine | 2.33 | 0.156 | 0.260 | | | | | Cosine + AS-Norm | 2.19 | 0.144 | 0.263 | | | | | Cosine + QMF | 2.06 | 0.137 | 0.251 | +| config_fbank80_stmn_ecapatdnn2048x4.v3.1.sh | ECAPA-TDNN 2048x4 | Stage2: Subcenter ArcFace m=0.3/intertop_m=0.1/centers=2 Dropout=0.25 | Cosine | 2.34 | 0.152 | 0.275 | +| | | | Cosine + AS-Norm | 2.24 | 0.143 | 0.268 | +| | | | Cosine + QMF | 2.12 | 0.139 | 0.255 | | config_fbank80_stmn_lresnet34.v3.1.sh | Thin-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 3.74 | 0.239 | 0.394 | | | | | Cosine + AS-Norm | 3.45 | 0.225 | 0.377 | | | | | Cosine + QMF | 3.27 | 0.213 | 0.356 | | config_fbank80_stmn_resnet34.v3.0.sh | ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.19 | 0.142 | 0.242 | | | | | Cosine + AS-Norm | 2.00 | 0.133 | 0.254 | | | | | Cosine + QMF | 1.86 | 0.126 | 0.229 | +| config_fbank80_stmn_resnet34.v3.1.sh | ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.15 | 0.135 | 0.233 | +| | | | Cosine + AS-Norm | 1.98 | 0.126 | 0.245 | +| | | | Cosine + QMF | 1.86 | 0.119 | 0.222 | | config_fbank80_stmn_cwseresnet34.v3.0.sh | CwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.34 | 0.145 | 0.246 | | | | | Cosine + AS-Norm | 2.10 | 0.135 | 0.248 | | | | | Cosine + QMF | 2.01 | 0.127 | 0.218 | | config_fbank80_stmn_fwseresnet34.v3.0.sh | FwSE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.25 | 0.136 | 0.239 | | | | | Cosine + AS-Norm | 1.99 | 0.127 | 0.232 | | | | | Cosine + QMF | 1.87 | 0.119 | 0.216 | +| config_fbank80_stmn_fwseresnet34.v3.1.sh | FwSE-ResNet34 | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.14 | 0.134 | 0.228 | +| | | | Cosine + AS-Norm | 1.97 | 0.124 | 0.223 | +| | | | Cosine + QMF | 1.82 | 0.116 | 0.205 | +| config_fbank80_stmn_fwseresnet34pe.v3.1.sh | FwSE-ResNet34-FPE | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.27 | 0.138 | 0.238 | +| | | | Cosine + AS-Norm | 2.08 | 0.129 | 0.223 | +| | | | Cosine + QMF | 1.94 | 0.120 | 0.207 | | config_fbank80_stmn_tseresnet34.v3.0.sh | Time-SE-ResNet34 | Stage2: ArcFace m=0.3/intertop_m=0.1 | Cosine | 2.36 | 0.153 | 0.259 | | | | | Cosine + AS-Norm | 2.18 | 0.139 | 0.249 | | | | | Cosine + QMF | 2.08 | 0.128 | 0.222 | @@ -241,6 +296,10 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.0.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.92 | 0.124 | 0.208 | | | | | Cosine + AS-Norm | 1.71 | 0.109 | 0.212 | | | | | Cosine + QMF | 1.62 | 0.103 | 0.192 | +| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.02 | 0.116 | 0.194 | +| | | | Cosine + AS-Norm | 1.81 | 0.107 | 0.199 | +| | | | Cosine + QMF | 1.72 | 0.099 | 0.186 | | config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | | | | | Cosine + AS-Norm | 1.79 | 0.118 | 0.239 | | | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | + diff --git a/hyperion/bin/cluster_embeddings.py b/hyperion/bin/cluster_embeddings.py index 998b1f17..10b6344e 100644 --- a/hyperion/bin/cluster_embeddings.py +++ b/hyperion/bin/cluster_embeddings.py @@ -9,6 +9,7 @@ import time from pathlib import Path +import matplotlib.pyplot as plt import numpy as np import pandas as pd from jsonargparse import ( @@ -124,6 +125,14 @@ def get_gmm_post(x, y): return p_max, p_2nd +def plot_score_hist(scores, fig_file): + mask = np.triu(np.ones_like(scores, dtype=bool)) + fig = plt.figure() + scores = scores[mask] + plt.hist(scores, bins=100, density=True) + fig.savefig(fig_file) + + def cos_ahc( segments_file, feats_file, @@ -155,6 +164,8 @@ def cos_ahc( x_lowprec = x_km scores = cosine_scoring(x_lowprec, x_lowprec) + fig_file = Path(output_file).parent / "score_hist.png" + plot_score_hist(scores, fig_file) logging.info("running AHC") ahc = AHC(method=linkage_method) diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py index d314490c..aa14f743 100644 --- a/hyperion/torch/layers/global_pool.py +++ b/hyperion/torch/layers/global_pool.py @@ -781,9 +781,9 @@ def forward(self, x, x_lengths=None, weights=None): assert not torch.any( torch.isnan(x_inner) ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(x))} {torch.mean(x)} {torch.sum(torch.isinf(x))} {x.size()}" - assert not torch.any( - torch.isinf(x_inner) - ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(x))}" + # assert not torch.any( + # torch.isinf(x_inner) + # ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(x))}" if self.use_global_context: global_mus = self.stats_pool(x, weights=weights) @@ -791,9 +791,9 @@ def forward(self, x, x_lengths=None, weights=None): assert not torch.any( torch.isnan(x_inner) ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(global_mus))}" - assert not torch.any( - torch.isinf(x_inner) - ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(global_mus))}" + # assert not torch.any( + # torch.isinf(x_inner) + # ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(global_mus))}" attn = self.conv2( self.activation(self.norm_layer(x_inner)) @@ -814,23 +814,23 @@ def forward(self, x, x_lengths=None, weights=None): if weights is not None: attn = attn * weights - assert not torch.any( - torch.isnan(attn) - ), f"attn is nan {torch.sum(torch.isnan(attn))}" - assert not torch.any( - torch.isinf(attn) - ), f"attn is inf {torch.sum(torch.isinf(attn))}" + # assert not torch.any( + # torch.isnan(attn) + # ), f"attn is nan {torch.sum(torch.isnan(attn))}" + # assert not torch.any( + # torch.isinf(attn) + # ), f"attn is inf {torch.sum(torch.isinf(attn))}" mus = self.stats_pool(x, weights=attn) if self.keepdim: mus = mus.unsqueeze(self.dim) - assert not torch.any( - torch.isnan(mus) - ), f"mus is nan {torch.sum(torch.isnan(mus))}" - assert not torch.any( - torch.isinf(mus) - ), f"mus is inf {torch.sum(torch.isinf(mus))}" + # assert not torch.any( + # torch.isnan(mus) + # ), f"mus is nan 
{torch.sum(torch.isnan(mus))}" + # assert not torch.any( + # torch.isinf(mus) + # ), f"mus is inf {torch.sum(torch.isinf(mus))}" return mus def get_config(self): diff --git a/hyperion/torch/trainers/dino_xvector_trainer.py b/hyperion/torch/trainers/dino_xvector_trainer.py index 26d6a434..16a15304 100644 --- a/hyperion/torch/trainers/dino_xvector_trainer.py +++ b/hyperion/torch/trainers/dino_xvector_trainer.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os from collections import OrderedDict as ODict @@ -15,7 +16,7 @@ from ...utils.misc import filter_func_args from ..optim import ExpMovingAvg as EMA from ..utils import MetricAcc, TorchDDP, tensors_subset -from .torch_trainer import DDPType, TorchTrainer +from .torch_trainer import AMPDType, DDPType, TorchTrainer class DINOXVectorTrainer(TorchTrainer): @@ -39,6 +40,7 @@ class DINOXVectorTrainer(TorchTrainer): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: float16 | bfloat16 log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -76,6 +78,7 @@ def __init__( train_mode="full", freeze_output_layer_steps=3000, use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, @@ -98,10 +101,7 @@ def __init__( def prepare_models_for_training(self): super().prepare_models_for_training() self.teacher_model, self.teacher_optimizer = self._prepare_model_for_ema( - self.teacher_model, - self.teacher_optim, - self.device, - self.ddp, + self.teacher_model, self.teacher_optim, self.device, self.ddp, ) def _prepare_model_for_ema(self, model, optim, device, ddp): @@ -177,25 +177,25 @@ def train_epoch(self, data_loader): teacher_keys = self.get_augs_keys(data, self.input_key, "teacher") student_keys = self.get_augs_keys(data, self.input_key, "student") - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): with torch.no_grad(): teacher_data = tensors_subset(data, teacher_keys, self.device) batch_size = teacher_data[0].size(0) num_teacher_crops = len(teacher_data) teacher_data = torch.cat(teacher_data, dim=0) teacher_out = self.teacher_model(teacher_data) + assert not torch.any(torch.isnan(teacher_out)), "teacher is nan" + assert not torch.any(torch.isinf(teacher_out)), "teacher is inf" if num_teacher_crops > 1: student_out1 = self.model(teacher_data) + assert not torch.any(torch.isnan(student_out1)), "s1 is nan" + assert not torch.any(torch.isinf(student_out1)), "s1 is inf" student_data = tensors_subset(data, student_keys, self.device) num_student_crops = len(student_data) student_data = torch.cat(student_data, dim=0) student_out2 = self.model(student_data) - assert not torch.any(torch.isnan(teacher_out)), "teacher is nan" - assert not torch.any(torch.isinf(teacher_out)), "teacher is inf" - assert not torch.any(torch.isnan(student_out1)), "s1 is nan" - assert not torch.any(torch.isinf(student_out1)), "s1 is inf" assert not torch.any(torch.isnan(student_out2)), "s2 is nan" assert not torch.any(torch.isinf(student_out2)), "s2 is inf" if num_teacher_crops > 1: @@ -261,7 +261,6 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.loss.eval() if swa_update_bn: - log_tag = "train_" self.model.train() else: log_tag = "val_" @@ -270,20 +269,26 @@ def 
validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): teacher_keys = self.get_augs_keys(data, self.input_key, "teacher") student_keys = self.get_augs_keys(data, self.input_key, "student") - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): teacher_data = tensors_subset(data, teacher_keys, self.device) batch_size = teacher_data[0].size(0) num_teacher_crops = len(teacher_data) teacher_data = torch.cat(teacher_data, dim=0) teacher_out = self.teacher_model(teacher_data) + assert not torch.any(torch.isnan(teacher_out)), "teacher is nan" + assert not torch.any(torch.isinf(teacher_out)), "teacher is inf" if num_teacher_crops > 1: student_out1 = self.model(teacher_data) + assert not torch.any(torch.isnan(student_out1)), "s1 is nan" + assert not torch.any(torch.isinf(student_out1)), "s1 is inf" student_data = tensors_subset(data, student_keys, self.device) num_student_crops = len(student_data) student_data = torch.cat(student_data, dim=0) student_out2 = self.model(student_data) + assert not torch.any(torch.isnan(student_out2)), "s2 is nan" + assert not torch.any(torch.isinf(student_out2)), "s2 is inf" if num_teacher_crops > 1: student_out = torch.cat((student_out1, student_out2), dim=0) num_student_crops += num_teacher_crops diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 36a9a43f..8bbdcb47 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -44,6 +44,23 @@ class DDPType(str, Enum): OSS_SHARDED_DDP = "oss_sharded_ddp" FULLY_SHARDED_DDP = "fully_sharded_ddp" + @staticmethod + def choices(): + return [o.value for o in DDPType] + + +class AMPDType(str, Enum): + FLOAT16 = "float16" + BFLOAT16 = "bfloat16" + + @staticmethod + def choices(): + return [o.value for o in AMPDType] + + @staticmethod + def to_dtype(dtype): + return torch.float16 if dtype == AMPDType.FLOAT16 else torch.bfloat16 + ddp_choices = [o.value for o in DDPType] @@ -67,6 +84,7 @@ class TorchTrainer(object): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['full', 'frozen'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -101,6 +119,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, @@ -140,13 +159,13 @@ def __init__( self.device = device self.train_mode = train_mode self.use_amp = use_amp + self.amp_dtype = AMPDType.to_dtype(amp_dtype) self.grad_clip = grad_clip self.grad_clip_norm = grad_clip_norm self.swa_start = swa_start self.do_swa = swa_start > 0 self.swa_lr = swa_lr self.swa_anneal_epochs = swa_anneal_epochs - self.amp_args = {} self.input_key = input_key self.target_key = target_key self.ddp = ddp @@ -164,78 +183,6 @@ def __init__( self.set_train_mode() self.prepare_models_for_training() - # if device is not None: - # self.model.to(device) - # if loss is not None: - # self.loss.to(device) - - # if ddp: - # if ddp_type == DDPType.DDP or ddp_type == DDPType.OSS_DDP: - # self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) - # if self.rank == 0: - # logging.info( - # "training in multiple gpus with distributed-data-parallel" - # ) - # oss = False if ddp_type == DDPType.DDP else True - # self.optimizer = self._make_optimizer(optim, self.model, oss=oss) - # self.model = TorchDDP( - # self.model, - # device_ids=[device], - # output_device=device, - # ) - # elif ddp_type == DDPType.OSS_SHARDED_DDP: - # self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) - # if self.rank == 0: - # logging.info( - # "training in multiple gpus with fair sharded-distributed-data-parallel" - # ) - # self.optimizer = self._make_optimizer(optim, self.model, oss=True) - # self.model = FairShardedDDP(self.model, self.optimizer) - # else: - # if self.rank == 0: - # logging.info( - # "training in multiple gpus with fair fully-sharded-distributed-data-parallel" - # ) - # # syncbathcnorm is not supported here, it raises exception - # self.model = FairFullyShardedDDP( - # self.model, - # mixed_precision=self.use_amp, - # move_params_to_cpu=cpu_offload, - # ) - # self.optimizer = self._make_optimizer(optim, self.model, oss=False) - - # else: - # self.optimizer = self._make_optimizer(optim, self.model) - - # # make the learning rate scheduler - # self.lr_scheduler = self._make_lr_sched(lrsched, self.optimizer) - - # if self.use_amp: - # if ddp and ddp_type != DDPType.DDP: - # if self.rank == 0: - # logging.info( - # "using automatic mixed precision training with sharded-grad-scaler" - # ) - # self.grad_scaler = ShardedGradScaler() - # else: - # if self.rank == 0: - # logging.info( - # "using automatic mixed precision training with grad-scaler" - # ) - # self.grad_scaler = amp.GradScaler() - # self.amp_autocast = amp.autocast - # else: - # self.amp_autocast = contextlib.nullcontext - - # self.in_swa = False - # if self.do_swa: - # if self.rank == 0: - # logging.info("init SWA model") - # self.swa_model = AveragedModel(self.model) - # self.swa_scheduler = SWALR( - # self.optimizer, swa_lr=self.swa_lr, anneal_epochs=self.swa_anneal_epochs - # ) - def prepare_models_for_training(self): self.loss = self._prepare_loss_for_training(self.loss, self.device) ( @@ -329,6 +276,7 @@ def _prepare_model_for_training( # make weight decay scheduler if needed wd_scheduler = self._make_wd_sched(wdsched, optimizer) + grad_scaler = None if use_amp: if ddp and ddp_type != DDPType.DDP: if self.rank == 0: @@ -741,9 +689,9 @@ def checkpoint(self, logs=None): "model_cfg": self.model.get_config(), "model_state_dict": 
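Editorial note: the large commented-out constructor block deleted above has moved into prepare_models_for_training / _prepare_model_for_training, where grad_scaler is now initialized to None before a scaler is conditionally created (end of the hunk above). Condensed into a standalone sketch; the fairscale import path is an assumption inferred from the deleted comments:

    from torch.cuda import amp

    def make_grad_scaler(use_amp: bool, ddp: bool, ddp_type: str):
        # No scaler at all when mixed precision is off ...
        if not use_amp:
            return None
        # ... the sharded scaler for fairscale's sharded-DDP variants ...
        if ddp and ddp_type != "ddp":
            from fairscale.optim.grad_scaler import ShardedGradScaler
            return ShardedGradScaler()
        # ... and the standard scaler for plain DDP or single-GPU runs.
        return amp.GradScaler()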
self.model.state_dict(), "optimizer_state_dict": self.optimizer.state_dict(), - "loss_state_dict": self.loss.state_dict() - if self.loss is not None - else None, + "loss_state_dict": ( + self.loss.state_dict() if self.loss is not None else None + ), } if self.lr_scheduler is not None: checkpoint["lr_scheduler_state_dict"] = self.lr_scheduler.state_dict() @@ -1093,7 +1041,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): parser.add_argument( "--ddp-type", default="ddp", - choices=ddp_choices, + choices=DDPType.choices(), help="DDP type in {}".format(ddp_choices), ) parser.add_argument( @@ -1102,6 +1050,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): default=False, help="use mixed precision training", ) + parser.add_argument( + "--amp-dtype", default=AMPDType.FLOAT16, choices=AMPDType.choices() + ) parser.add_argument( "--cpu-offload", action=ActionYesNo, From 12a1bd9250e8ca210bf53290d10bc3e772816c65 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 23 Apr 2024 18:54:32 -0400 Subject: [PATCH 131/154] added ft steps to voxceleb/ssl.v1 --- ..._ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml | 18 ++ ...uster_ecapatdnn512x3_v1.2_ft1_cos_ahc.yaml | 12 + ...patdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml | 20 ++ ...uster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml | 18 ++ .../cluster_lresnet34_v1.2_ft1_cos_ahc.yaml | 12 + ...r_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml | 18 ++ egs/voxceleb/ssl.v1/conf/plda.yaml | 11 + .../conf/train_ecapatdnn512x3_dino_v1.2.yaml | 115 ++++++++ ...ain_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml | 68 +++++ ...ain_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml | 68 +++++ .../conf/train_fwseresnet34_dino_v1.2.yaml | 93 ++++++ .../conf/train_lresnet34_dino_v1.2.yaml | 92 ++++++ .../train_lresnet34_xvec_stage1.1_v1.2.yaml | 70 +++++ .../train_lresnet34_xvec_stage1.2_v1.2.yaml | 70 +++++ ...config_fbank80_stmn_ecapatdnn512x3.v1.2.sh | 68 +++++ ...config_fbank80_stmn_fwseresnet34.v1.2.1.sh | 66 +++++ .../config_fbank80_stmn_fwseresnet34.v1.2.sh | 66 +++++ .../config_fbank80_stmn_lresnet34.v1.1.sh | 52 ++-- .../config_fbank80_stmn_lresnet34.v1.2.1.sh | 65 +++++ .../config_fbank80_stmn_lresnet34.v1.2.sh | 65 +++++ ...un_006_extract_dino_embeds_cluster_eval.sh | 99 +++++-- egs/voxceleb/ssl.v1/run_007_train_xvector.sh | 57 ++-- ...08_extract_ft1_xvec_embeds_cluster_eval.sh | 25 ++ .../ssl.v1/run_009_finetune_xvector_s2.sh | 22 ++ ...10_extract_ft2_xvec_embeds_cluster_eval.sh | 25 ++ egs/voxceleb/v1.2/README.md | 15 +- hyperion/bin/cluster_embeddings.py | 266 ++++++++++++++++-- hyperion/bin/hyperion_dataset.py | 77 ++++- hyperion/bin/hyperion_tables.py | 90 +++++- hyperion/np/pdfs/plda/frplda.py | 7 +- hyperion/np/pdfs/plda/plda.py | 11 +- hyperion/np/pdfs/plda/plda_base.py | 12 +- hyperion/np/pdfs/plda/splda.py | 9 +- hyperion/torch/data/audio_dataset.py | 33 +-- .../torch/models/wav2xvectors/wav2xvector.py | 9 +- hyperion/torch/models/xvectors/xvector.py | 103 +++++-- hyperion/torch/narchs/conformer_encoder_v1.py | 7 +- hyperion/torch/narchs/dino_head.py | 6 +- hyperion/torch/narchs/proj_head.py | 25 +- hyperion/torch/trainers/ae_trainer.py | 8 +- hyperion/torch/trainers/dvae_trainer.py | 6 +- hyperion/torch/trainers/plda_trainer.py | 6 +- hyperion/torch/trainers/transducer_trainer.py | 5 +- hyperion/torch/trainers/vae_trainer.py | 8 +- hyperion/torch/trainers/vq_dvae_trainer.py | 7 +- hyperion/torch/trainers/vq_vae_trainer.py | 35 +-- .../torch/trainers/xvector_adv_trainer.py | 4 + .../trainers/xvector_adv_trainer_from_wav.py | 4 + 
hyperion/torch/trainers/xvector_trainer.py | 5 +- .../trainers/xvector_trainer_deep_feat_reg.py | 4 + .../xvector_trainer_deep_feat_reg_from_wav.py | 5 +- .../trainers/xvector_trainer_from_wav.py | 4 + hyperion/utils/dataset.py | 76 +++-- hyperion/utils/info_table.py | 39 ++- 54 files changed, 1926 insertions(+), 255 deletions(-) create mode 100644 egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/plda.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.2.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.2.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.1_v1.2.yaml create mode 100644 egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.2_v1.2.yaml create mode 100644 egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh create mode 100644 egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh create mode 100644 egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh create mode 100644 egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.1.sh create mode 100644 egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh create mode 100755 egs/voxceleb/ssl.v1/run_008_extract_ft1_xvec_embeds_cluster_eval.sh create mode 100755 egs/voxceleb/ssl.v1/run_009_finetune_xvector_s2.sh create mode 100755 egs/voxceleb/ssl.v1/run_010_extract_ft2_xvec_embeds_cluster_eval.sh diff --git a/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml new file mode 100644 index 00000000..fb6673df --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml @@ -0,0 +1,18 @@ +pca: + pca_var_r: 0.995 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold_stage_1: 0.875 +threshold_stage_2: -100. 
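Editorial note on the cluster_*.yaml files (the first of which continues just below): threshold_stage_1 cuts the first AHC pass on cosine affinities, which are roughly bounded by [-1, 1], while threshold_stage_2 cuts the second pass on PLDA log-likelihood-ratio scores, which is why values like -100, -400, 0, or 30 appear alongside thresholds near 0.8. Both cuts go through the same dendrogram logic; a sketch condensed from the do_ahc helper this patch set adds to hyperion/bin/cluster_embeddings.py further below:

    import numpy as np
    from hyperion.np.clustering import AHC

    def flat_clusters(scores: np.ndarray, linkage_method: str,
                      stop_criterion: str, threshold: float,
                      num_clusters: int) -> np.ndarray:
        # Agglomerate on the pairwise score matrix, then cut the tree
        # either at a score threshold or at a fixed number of clusters.
        ahc = AHC(method=linkage_method)
        ahc.fit(scores)
        if stop_criterion == "threshold":
            return ahc.get_flat_clusters_from_thr(threshold)
        return ahc.get_flat_clusters_from_num_clusters(num_clusters)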
+plda: + plda_type: splda + y_dim: 100 +max_samples_per_cluster: 50 +min_samples_per_cluster: 8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc.yaml new file mode 100644 index 00000000..c1bf8c94 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc.yaml @@ -0,0 +1,12 @@ +pca: + pca_var_r: 0.995 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold: 0.8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml new file mode 100644 index 00000000..1a04d084 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml @@ -0,0 +1,20 @@ +pca: + pca_var_r: 0.995 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +#threshold_stage_1: 0.75 +#threshold_stage_2: 25 +threshold_stage_1: 0.8 +threshold_stage_2: 30 +plda: + plda_type: splda + y_dim: 100 +max_samples_per_cluster: 50 +min_samples_per_cluster: 8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml new file mode 100644 index 00000000..3740d0e7 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml @@ -0,0 +1,18 @@ +pca: + pca_var_r: 0.99 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold_stage_1: 0.80 +threshold_stage_2: -400 +plda: + plda_type: splda + y_dim: 100 +max_samples_per_cluster: 50 +min_samples_per_cluster: 8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml new file mode 100644 index 00000000..9c446a2e --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml @@ -0,0 +1,12 @@ +pca: + pca_var_r: 0.99 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold: 0.6 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml new file mode 100644 index 00000000..cf3adf41 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml @@ -0,0 +1,18 @@ +pca: + pca_var_r: 0.99 +pre_kmeans: + samples_per_cluster: 4 + epochs: 10 + rtol: 0.01 + init_method: random +stop_criterion: threshold +threshold_stage_1: 0.6 +threshold_stage_2: 0 +plda: + plda_type: splda + y_dim: 100 +max_samples_per_cluster: 50 +min_samples_per_cluster: 8 +ahc_precision: single +num_workers: 32 +filter_by_gmm_post: 0.9 diff --git a/egs/voxceleb/ssl.v1/conf/plda.yaml b/egs/voxceleb/ssl.v1/conf/plda.yaml new file mode 100644 index 00000000..bbb8f051 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/plda.yaml @@ -0,0 +1,11 @@ +class_name: cluster +pca: + #pca_var_r: 0.975 + pca_var_r: 0.99 +do_lda: true +lda: + lda_dim: 120 
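Editorial note: conf/plda.yaml (concluded just below with its splda block) describes the supervised back-end trained on the discovered clusters: length normalization, PCA by variance ratio, optional LDA, then SPLDA. A hypothetical end-to-end driver using the hyperion.np classes this patch set already imports in cluster_embeddings.py; the exact constructor keyword names are assumptions taken from the YAML keys:

    import numpy as np
    from hyperion.np.transforms import LNorm, PCA
    from hyperion.np.pdfs import PLDAFactory

    def train_backend(x: np.ndarray, class_ids: np.ndarray):
        # Length-normalize, then reduce dimension keeping ~99% of variance.
        lnorm = LNorm()
        x = lnorm(x)
        pca = PCA(pca_var_r=0.99)  # assumed kwarg, named after the YAML key
        pca.fit(x)
        x = pca(x)
        # The LDA step (lda_dim: 120) is omitted here for brevity.
        plda = PLDAFactory.create(plda_type="splda",
                                  y_dim=min(100, x.shape[1]))
        plda.fit(x, class_ids=class_ids)  # same call train_plda() uses below
        return lnorm, pca, plda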
+plda: + plda_type: splda + y_dim: 100 + epochs: 20 diff --git a/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.2.yaml new file mode 100644 index 00000000..ae9ab1fa --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_dino_v1.2.yaml @@ -0,0 +1,115 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_enc: + in_feats: 80 + in_conv_channels: 512 + in_kernel_size: 5 + in_stride: 1 + resb_type: seres2bn + resb_repeats: + - 1 + - 1 + - 1 + resb_channels: + - 512 + resb_kernel_sizes: + - 3 + resb_dilations: + - 2 + - 3 + - 4 + resb_strides: + - 1 + res2net_width_factor: 1 + res2net_scale: 8 + se_r: 4 + multilayer: true + multilayer_concat: true + endpoint_channels: 1536 + norm_before: false + dropout_rate: 0.002 + hid_act: swish + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.0 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.005 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 15 + use_amp: false + log_interval: 1000 + epochs: 120 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml new file mode 100644 index 00000000..480ae04f --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + 
max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 10 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: false + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 + target_key: cluster + train_mode: ft-embed-affine + \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml new file mode 100644 index 00000000..8a7a700c --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml @@ -0,0 +1,68 @@ +data: + train: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 10 + intertop_margin: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + target_key: cluster + train_mode: full + \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.yaml new file mode 100644 index 00000000..24d09678 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.yaml @@ -0,0 +1,93 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
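Editorial note: the stage1.x fine-tuning configs above swap the DINO head for a sub-center ArcFace output (2 sub-centers, cosine scale 30, margin 0.2 with warmup, inter-top margin 0.1); the fwseresnet34 DINO config continues just below. For orientation, a minimal sketch of plain sub-center ArcFace scoring, with inter-top margin and margin warmup omitted; this illustrates the standard technique, not hyperion's exact implementation:

    import torch
    import torch.nn.functional as F

    def subcenter_arcface_logits(x, weight, labels, num_subcenters=2,
                                 cos_scale=30.0, margin=0.2):
        # weight: (num_classes * num_subcenters, embed_dim). Keep the best
        # sub-center cosine per class, add the angular margin only to the
        # target class, then rescale.
        cos = F.linear(F.normalize(x), F.normalize(weight))
        cos = cos.view(x.size(0), -1, num_subcenters).max(dim=-1).values
        theta = torch.acos(cos.clamp(-1.0 + 1e-7, 1.0 - 1e-7))
        target = F.one_hot(labels, num_classes=cos.size(1)).bool()
        logits = torch.where(target, torch.cos(theta + margin), cos)
        return cos_scale * logits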
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + se_r: 4 + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.005 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 15 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.2.yaml new file mode 100644 index 00000000..fa6466ce --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_dino_v1.2.yaml @@ -0,0 +1,92 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. 
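Editorial note: in all of the DINO configs, teacher_optim ramps the teacher's EMA momentum from init_momentum 0.996 toward 1.0 over 500k warmup steps, so the teacher tracks the student closely early in training and is nearly frozen at the end; the lresnet34 config continues just below. The actual schedule lives in the ExpMovingAvg optimizer the trainer imports; a linear ramp is assumed in this illustration:

    def ema_momentum(step, init_momentum=0.996, final_momentum=1.0,
                     warmup_steps=500_000):
        # Linear ramp (an assumption; DINO implementations often use cosine).
        t = min(step / warmup_steps, 1.0)
        return init_momentum + t * (final_momentum - init_momentum)

    # The teacher is then updated each step as
    #   p_teacher <- m * p_teacher + (1 - m) * p_student
    m = ema_momentum(15_000)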
+ num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: lresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +trainer: + optim: + opt_type: adamw + lr: 0.0025 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 15 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.1_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.1_v1.2.yaml new file mode 100644 index 00000000..945fd42b --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.1_v1.2.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 10 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.1 +trainer: + optim: + opt_type: sgd + lr: 0.1 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 16000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 30 + eff_batch_size: 256 + target_key: cluster + train_mode: ft-embed-affine + \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.2_v1.2.yaml b/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.2_v1.2.yaml new file mode 100644 index 00000000..e8fd36a2 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_lresnet34_xvec_stage1.2_v1.2.yaml @@ -0,0 +1,70 @@ +data: + train: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: 
class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - cluster + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - cluster + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: cluster + seg_weight_mode: data-prior + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.2 + margin_warmup_epochs: 10 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.01 +trainer: + optim: + opt_type: sgd + lr: 0.01 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 70 + eff_batch_size: 256 + target_key: cluster + train_mode: full + \ No newline at end of file diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh new file mode 100644 index 00000000..de643f1e --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh @@ -0,0 +1,68 @@ +# ECAPA-TDNN 512x3 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet1d +nnet_name=${feat_type}_ecapatdnn512x3_dino.v1.2 + +nnet_s1_base_cfg=conf/train_ecapatdnn512x3_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0120.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_ecapatdnn512x3_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc_plda_ahc +cluster_ft_s1_cfg=conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s1_name=${cluster_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_ecapatdnn512x3_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft 
embeddings from stage 1.2 +cluster_ft_s2_method=cos_ahc_plda_ahc +cluster_ft_s2_cfg=conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s2_name=${cluster_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name + + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh new file mode 100644 index 00000000..102fbaef --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh @@ -0,0 +1,66 @@ +# ECAPA-TDNN 512x3 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34_dino.v1.2 + +nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0034.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s1_name=${cluster_method_ft_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.1.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.1.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s2_name=${cluster_method_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh new file mode 100644 index 00000000..b3a6e963 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh @@ -0,0 +1,66 @@ +# ECAPA-TDNN 512x3 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34_dino.v1.2 + +nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/teacher_model_ep0034.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc_plda_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s1_name=${cluster_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc_plda_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s2_name=${cluster_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name + diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh index 752f7048..18fafd95 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.1.sh @@ -19,11 +19,6 @@ nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/teacher_model_ep0080.pth -nnet_s2_base_cfg=conf/train_resnet34_xvec_stage2_v3.0.yaml -nnet_s2_name=${nnet_name}.s2 -nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name -nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth - # clustering cluster_method=cos_ahc cluster_name=${cluster_method}_1 @@ -32,21 +27,34 @@ cluster_cfg=conf/ahc.yaml # plda plda_cfg=conf/plda.yaml -# back-end -do_plda=false -# do_snorm=true -# do_qmf=true -# do_voxsrc22=true - -plda_aug_config=conf/reverb_noise_aug.yaml -plda_num_augs=0 -if [ $plda_num_augs -eq 0 ]; then - plda_data=voxceleb2cat_train -else - plda_data=voxceleb2cat_train_augx${plda_num_augs} -fi -plda_type=splda -lda_dim=200 -plda_y_dim=150 -plda_z_dim=200 +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_stage1.1_v1.1.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0010.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_stage1.2_v1.1.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0080.pth + + +# # back-end +# do_plda=false +# # do_snorm=true +# # do_qmf=true +# # do_voxsrc22=true + +# plda_aug_config=conf/reverb_noise_aug.yaml +# 
plda_num_augs=0 +# if [ $plda_num_augs -eq 0 ]; then +# plda_data=voxceleb2cat_train +# else +# plda_data=voxceleb2cat_train_augx${plda_num_augs} +# fi +# plda_type=splda +# lda_dim=200 +# plda_y_dim=150 +# plda_z_dim=200 diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.1.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.1.sh new file mode 100644 index 00000000..7b822cf4 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.1.sh @@ -0,0 +1,65 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34_dino.v1.2 + +nnet_s1_base_cfg=conf/train_lresnet34_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0100.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s1_name=${cluster_ft_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.1.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.1.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s2_name=${cluster_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh new file mode 100644 index 00000000..788b3b4b --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh @@ -0,0 +1,65 @@ +# ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_lresnet34_dino.v1.2 + +nnet_s1_base_cfg=conf/train_lresnet34_dino_v1.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0100.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc 
+cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2 +cluster_ft_s1_method=cos_ahc_plda_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s1_name=${cluster_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc_plda_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml +cluster_ft_s2_name=${cluster_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh index d848b466..5bf085ae 100755 --- a/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh +++ b/egs/voxceleb/ssl.v1/run_006_extract_dino_embeds_cluster_eval.sh @@ -9,9 +9,11 @@ set -e stage=1 nnet_stage=1 +ft_stage=0 config_file=default_config.sh -use_gpu=false +use_gpu=true xvec_chunk_length=120.0 +do_clustering=true . parse_options.sh || exit 1; . 
$config_file @@ -24,14 +26,39 @@ else num_gpus=0 fi -if [ $nnet_stage -eq 1 ];then - nnet=$nnet_s1 - nnet_name=$nnet_s1_name -elif [ $nnet_stage -eq 2 ];then - nnet=$nnet_s2 - nnet_name=$nnet_s2_name +if [ $ft_stage -eq 0 ];then + if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name + elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name + fi +elif [ $ft_stage -eq 1 ];then + if [ $nnet_stage -eq 1 ];then + nnet=$nnet_ft_s1_1 + nnet_name=$nnet_ft_s1_1_name + elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_ft_s1_2 + nnet_name=$nnet_ft_s1_2_name + fi + cluster_method=$cluster_ft_s1_method + cluster_cfg=$cluster_ft_s1_cfg + cluster_name=$cluster_ft_s1_name + cluster_dir=$cluster_ft_s1_dir +elif [ $ft_stage -eq 2 ];then + if [ $nnet_stage -eq 1 ];then + nnet=$nnet_ft_s2_1 + nnet_name=$nnet_ft_s2_1_name + elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_ft_s2_2 + nnet_name=$nnet_ft_s2_2_name + fi + cluster_method=$cluster_ft_s2_method + cluster_cfg=$cluster_ft_s2_cfg + cluster_name=$cluster_ft_s2_name + cluster_dir=$cluster_ft_s2_dir fi - xvector_dir=exp/xvectors/$nnet_name score_dir=exp/scores/$nnet_name score_cosine_dir=$score_dir/cosine @@ -99,22 +126,22 @@ if [ $stage -le 2 ];then --output-file $score_cosine_dir/voxceleb1_results.csv cat $score_cosine_dir/voxceleb1_results.csv - exit +fi + +if [ "$do_clustering" == "false" ];then + exit 0 fi if [ $stage -le 3 ]; then # Extract xvectors for training LDA/PLDA nj=100 - for name in voxceleb2cat_train + for name in voxceleb2cat_train_filtered do - if [ -n "$vad_config" ];then - vad_args="--vad csv:data/$name/vad.csv" - fi output_dir=$xvector_dir/$name echo "Extracting x-vectors for $name" $xvec_cmd JOB=1:$nj $output_dir/log/extract_xvectors.JOB.log \ hyp_utils/conda_env.sh --num-gpus $num_gpus \ - hyperion-extract-wav2xvectors ${xvec_args} ${vad_args} \ + hyperion-extract-wav2xvectors ${xvec_args} \ --part-idx JOB --num-parts $nj \ --recordings-file data/$name/recordings.csv \ --random-utt-length --min-utt-length 30 --max-utt-length 30 \ @@ -128,32 +155,46 @@ if [ $stage -le 3 ]; then fi -cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name if [ $stage -le 4 ];then echo "Cluster Vox2" mkdir -p $cluster_dir $train_cmd --mem 50G --num-threads 32 $cluster_dir/clustering.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV \ hyperion-cluster-embeddings $cluster_method --cfg $cluster_cfg \ - --segments-file data/voxceleb2cat_train_xvector_train/segments.csv \ - --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ - --output-file $cluster_dir/voxceleb2cat_train_xvector_train/segments.csv + --segments-file data/voxceleb2cat_train_filtered/segments.csv \ + --feats-file csv:$xvector_dir/voxceleb2cat_train_filtered/xvector.csv \ + --output-file $cluster_dir/voxceleb2cat_train/segments.csv fi if [ $stage -le 5 ];then + hyperion-dataset add_cols_to_segments \ + --dataset data/voxceleb2cat_train_filtered \ + --column-names cluster \ + --right-table $cluster_dir/voxceleb2cat_train/segments.csv \ + --output-dataset $cluster_dir/voxceleb2cat_train_clustered \ + --remove-missing --create-class-info + + hyperion-dataset remove_classes_few_toomany_segments \ + --dataset $cluster_dir/voxceleb2cat_train_clustered \ + --class-name cluster \ + --min-segs 10 \ + --max-segs 50 \ + --rebuild-idx \ + --output-dataset $cluster_dir/voxceleb2cat_train_clustered_filtered +fi + +if [ $stage -le 6 ];then echo "Train PLDA" $train_cmd $cluster_dir/plda.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV \ hyperion-train-plda --cfg $plda_cfg \ 
- --segments-file $cluster_dir/voxceleb2cat_train_xvector_train/segments.csv \ - --feats-file csv:$xvector_dir/voxceleb2cat_train/xvector.csv \ + --segments-file $cluster_dir/voxceleb2cat_train_clustered_filtered/segments.csv \ + --feats-file csv:$xvector_dir/voxceleb2cat_train_filtered/xvector.csv \ --preproc-file $cluster_dir/plda/preproc.h5 \ --plda-file $cluster_dir/plda/plda.h5 - - fi -if [ $stage -le 6 ];then +if [ $stage -le 7 ];then echo "Eval Voxceleb 1 with PLDA" num_parts=8 @@ -188,6 +229,14 @@ if [ $stage -le 6 ];then --output-file $score_plda_dir/voxceleb1_results.csv cat $score_plda_dir/voxceleb1_results.csv - exit fi -exit + +if [ $stage -le 8 ];then + hyperion-dataset split_train_val \ + --dataset $cluster_dir/voxceleb2cat_train_clustered_filtered \ + --val-prob 0.03 \ + --seed 1123581321 \ + --train-dataset $cluster_dir/voxceleb2cat_train_clustered_train \ + --val-dataset $cluster_dir/voxceleb2cat_train_clustered_val +fi + diff --git a/egs/voxceleb/ssl.v1/run_007_train_xvector.sh b/egs/voxceleb/ssl.v1/run_007_train_xvector.sh index 40aceb07..9732078a 100755 --- a/egs/voxceleb/ssl.v1/run_007_train_xvector.sh +++ b/egs/voxceleb/ssl.v1/run_007_train_xvector.sh @@ -7,6 +7,7 @@ . ./path.sh set -e +ft_stage=1 stage=1 ngpu=4 config_file=default_config.sh @@ -19,8 +20,27 @@ use_wandb=false . $config_file . datapath.sh -train_data_dir=data/${nnet_data}_xvector_train -val_data_dir=data/${nnet_data}_xvector_val +if [ $ft_stage -eq 1 ];then + nnet_s1_base_cfg=$nnet_ft_s1_1_base_cfg + nnet_s2_base_cfg=$nnet_ft_s1_2_base_cfg + nnet_s1_dir=$nnet_ft_s1_1_dir + nnet_s2_dir=$nnet_ft_s1_2_dir + nnet_s0=$nnet_s1 + nnet_s1=$nnet_ft_s1_1 + nnet_s2=$nnet_ft_s1_2 + train_data_dir=$cluster_dir/${nnet_data}_clustered_train + val_data_dir=$cluster_dir/${nnet_data}_clustered_val +elif [ $ft_stage -eq 2 ];then + nnet_s1_base_cfg=$nnet_ft_s2_1_base_cfg + nnet_s2_base_cfg=$nnet_ft_s2_2_base_cfg + nnet_s1_dir=$nnet_ft_s2_1_dir + nnet_s2_dir=$nnet_ft_s2_2_dir + nnet_s0=$nnet_ft_s1_2 + nnet_s1=$nnet_ft_s2_1 + nnet_s2=$nnet_ft_s2_2 + train_data_dir=$cluster_ft_s1_dir/${nnet_data}_clustered_train + val_data_dir=$cluster_ft_s1_dir/${nnet_data}_clustered_val +fi #add extra args from the command line arguments if [ -n "$num_workers" ];then @@ -30,45 +50,32 @@ if [ "$use_tb" == "true" ];then extra_args="$extra_args --trainer.use-tensorboard" fi if [ "$use_wandb" == "true" ];then - extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" + extra_args="$extra_args --trainer.use-wandb --trainer.wandb.project voxceleb-ssl.v1.1 --trainer.wandb.name $nnet_name.$(date -Iminutes)" fi if [ "$interactive" == "true" ];then export cuda_cmd=run.pl fi -xvector_dir=exp/xvectors/$nnet_s1_name/voxceleb2cat_train -output_dir=exp/clustering/$nnet_s1_name/$cluster_method/voxceleb2cat_train_xvector_train -if [ $stage -le 1 ];then - mkdir -p $output_dir - $train_cmd --mem 50G --num-threads 32 $output_dir/clustering.log \ - hyp_utils/conda_env.sh --conda-env $HYP_ENV \ - hyperion-cluster-embeddings $cluster_method --cfg $cluster_cfg \ - --segments-file data/voxceleb2cat_train_xvector_train/segments.csv \ - --feats-file csv:$xvector_dir/xvector.csv \ - --output-file $output_dir/segments.csv -fi -exit -# Network Training -if [ $stage -le 2 ]; then - +# Fine-tune last layer and embedding projection +if [ $stage -le 1 ]; then mkdir -p $nnet_s1_dir/log $cuda_cmd \ --gpu $ngpu $nnet_s1_dir/log/train.log \ hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu 
\ - hyperion-train-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ + hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s1_base_cfg $nnet_s1_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ - --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.train.dataset.class-files $train_data_dir/cluster.csv \ --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ --data.val.dataset.segments-file $val_data_dir/segments.csv \ --trainer.exp-path $nnet_s1_dir \ - --num-gpus $ngpu \ - + --in-model-file $nnet_s0 \ + --num-gpus $ngpu fi -# Large Margin Fine-tuning +# Fine-tune full model if [ $stage -le 2 ]; then if [ "$use_wandb" == "true" ];then extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)" @@ -80,11 +87,11 @@ if [ $stage -le 2 ]; then hyperion-finetune-wav2xvector $nnet_type --cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \ --data.train.dataset.recordings-file $train_data_dir/recordings.csv \ --data.train.dataset.segments-file $train_data_dir/segments.csv \ - --data.train.dataset.class-files $train_data_dir/speaker.csv \ + --data.train.dataset.class-files $train_data_dir/cluster.csv \ --data.val.dataset.recordings-file $val_data_dir/recordings.csv \ --data.val.dataset.segments-file $val_data_dir/segments.csv \ --in-model-file $nnet_s1 \ --trainer.exp-path $nnet_s2_dir \ - --num-gpus $ngpu \ + --num-gpus $ngpu fi diff --git a/egs/voxceleb/ssl.v1/run_008_extract_ft1_xvec_embeds_cluster_eval.sh b/egs/voxceleb/ssl.v1/run_008_extract_ft1_xvec_embeds_cluster_eval.sh new file mode 100755 index 00000000..71cab44a --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_008_extract_ft1_xvec_embeds_cluster_eval.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=true +xvec_chunk_length=120.0 +do_clustering=true +. parse_options.sh || exit 1; + +./run_006_extract_dino_embeds_cluster_eval.sh \ + --config-file $config_file \ + --stage $stage \ + --nnet-stage $nnet_stage \ + --ft-stage 1 \ + --use-gpu $use_gpu \ + --xvec-chunk-length $xvec_chunk_length \ + --do-clustering $do_clustering diff --git a/egs/voxceleb/ssl.v1/run_009_finetune_xvector_s2.sh b/egs/voxceleb/ssl.v1/run_009_finetune_xvector_s2.sh new file mode 100755 index 00000000..ca7d058a --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_009_finetune_xvector_s2.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright +# 2019 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=4 +config_file=default_config.sh +interactive=false +. parse_options.sh || exit 1; + +./run_007_train_xvector.sh \ + --config-file $config_file \ + --ngpu $ngpu \ + --stage $stage \ + --ft-stage 2 \ + --interactive $interactive + diff --git a/egs/voxceleb/ssl.v1/run_010_extract_ft2_xvec_embeds_cluster_eval.sh b/egs/voxceleb/ssl.v1/run_010_extract_ft2_xvec_embeds_cluster_eval.sh new file mode 100755 index 00000000..4f09dfaf --- /dev/null +++ b/egs/voxceleb/ssl.v1/run_010_extract_ft2_xvec_embeds_cluster_eval.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nnet_stage=2 +config_file=default_config.sh +use_gpu=true +xvec_chunk_length=120.0 +do_clustering=true +. 
parse_options.sh || exit 1; + +./run_006_extract_dino_embeds_cluster_eval.sh \ + --config-file $config_file \ + --stage $stage \ + --nnet-stage $nnet_stage \ + --ft-stage 2 \ + --use-gpu $use_gpu \ + --xvec-chunk-length $xvec_chunk_length \ + --do-clustering $do_clustering diff --git a/egs/voxceleb/v1.2/README.md b/egs/voxceleb/v1.2/README.md index e1199a3b..f6fec0a6 100644 --- a/egs/voxceleb/v1.2/README.md +++ b/egs/voxceleb/v1.2/README.md @@ -136,11 +136,13 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.50 | 0.035 | 0.038 | | | | | Cosine + AS-Norm | 0.47 | 0.031 | 0.038 | | | | | Cosine + QMF | 0.40 | 0.027 | 0.032 | +| config_fbank80_stmn_idrnd_resnet100.v3.2.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.49 | 0.032 | 0.038 | +| | | | Cosine + AS-Norm | 0.43 | 0.025 | 0.034 | +| | | | Cosine + QMF | 0.37 | 0.024 | 0.033 | | config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.60 | 0.043 | 0.071 | | | | | Cosine + AS-Norm | 0.53 | 0.034 | 0.063 | | | | | Cosine + QMF | 0.49 | 0.033 | 0.054 | - ### VoxCeleb 1 Entire-Clean trial list | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -190,6 +192,9 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.69 | 0.043 | 0.074 | | | | | Cosine + AS-Norm | 0.65 | 0.039 | 0.068 | | | | | Cosine + QMF | 0.63 | 0.036 | 0.065 | +| config_fbank80_stmn_idrnd_resnet100.v3.2.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 0.66 | 0.040 | 0.072 | +| | | | Cosine + AS-Norm | 0.62 | 0.037 | 0.066 | +| | | | Cosine + QMF | 0.59 | 0.035 | 0.064 | | config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 0.75 | 0.047 | 0.077 | | | | | Cosine + AS-Norm | 0.70 | 0.042 | 0.072 | | | | | Cosine + QMF | 0.68 | 0.040 | 0.069 | @@ -244,12 +249,14 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.36 | 0.077 | 0.122 | | | | | Cosine + AS-Norm | 1.23 | 0.069 | 0.112 | | | | | Cosine + QMF | 1.17 | 0.065 | 0.110 | +| config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.27 | 0.072 | 0.121 | +| | | | Cosine + AS-Norm | 1.15 | 0.065 | 0.107 | +| | | | Cosine + QMF | 1.10 | 0.062 | 0.102 | | config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.41 | 0.081 | 0.132 | | | | | Cosine + AS-Norm | 1.28 | 0.071 | 0.116 | | | | | Cosine + QMF | 1.21 | 0.069 | 0.113 | - ### VoxSRC2022 dev | Config | Model Type | Model Details | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | @@ -299,7 +306,9 @@ run_007_eval_be.sh --config-file config_fbank80_stmn_resnet34_arcs30m0.3_adam_lr | config_fbank80_stmn_idrnd_resnet100.v3.1.sh | ResNet100 / BasicBlock 128-256 ch. 
| Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 2.02 | 0.116 | 0.194 | | | | | Cosine + AS-Norm | 1.81 | 0.107 | 0.199 | | | | | Cosine + QMF | 1.72 | 0.099 | 0.186 | +| config_fbank80_stmn_idrnd_resnet100.v3.2.sh | ResNet100 / BasicBlock 128-256 ch. | Stage2: Subcenter-ArcFace m=0.3/intertop_m=0.1/centers=2 | Cosine | 1.91 | 0.111 | 0.192 | +| | | | Cosine + AS-Norm | 1.75 | 0.105 | 0.194 | +| | | | Cosine + QMF | 1.64 | 0.098 | 0.181 | | config_fbank80_stmn_res2net50w26s8.v3.0.sh | Res2Net50 w26 scale=8 | Stage2: ArcFace m=0.4/intertop_m=0.1 | Cosine | 1.96 | 0.124 | 0.211 | | | | | Cosine + AS-Norm | 1.79 | 0.118 | 0239 | | | | | Cosine + QMF | 1.68 | 0.114 | 0.216 | - diff --git a/hyperion/bin/cluster_embeddings.py b/hyperion/bin/cluster_embeddings.py index 10b6344e..fb30fcae 100644 --- a/hyperion/bin/cluster_embeddings.py +++ b/hyperion/bin/cluster_embeddings.py @@ -24,15 +24,12 @@ from hyperion.hyp_defs import config_logger from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.np.clustering import AHC, KMeans, KMeansInitMethod, SpectralClustering -from hyperion.np.pdfs import DiagGMM +from hyperion.np.pdfs import SPLDA, DiagGMM, PLDAFactory from hyperion.np.transforms import PCA, LNorm from hyperion.utils import SegmentSet from hyperion.utils.math_funcs import cosine_scoring -subcommand_list = [ - "cos_ahc", - "spectral_clustering", -] +subcommand_list = ["cos_ahc", "spectral_clustering", "cos_ahc_plda_ahc"] def add_common_args(parser): @@ -98,6 +95,59 @@ def do_kmeans(x, samples_per_cluster, epochs, rtol, init_method, num_workers): return x_km, idx_km +def change_precision(x, precision=None): + if precision == "single": + return x.astype(np.float32) + elif precision == "half": + return x.astype(np.float16) + else: + return x + + +def do_cosine_scoring(x, precision=None): + logging.info("compute cosine affinity matrix") + x = change_precision(x) + return cosine_scoring(x, x) + + +def train_plda(x, y, plda, min_samples_per_cluster, max_samples_per_cluster=None): + logging.info("Train Centering/Whitening + PLDA") + _, cluster_idx, counts = np.unique(y, return_inverse=True, return_counts=True) + max_samples_per_cluster = ( + np.max(counts) if max_samples_per_cluster is None else max_samples_per_cluster + ) + transforms = LNorm() + transforms.fit(x) + if plda["y_dim"] > x.shape[1]: + plda["y_dim"] = x.shape[1] + plda_model = PLDAFactory.create(**plda) + + counts = counts[cluster_idx] + keep = np.logical_and( + counts >= min_samples_per_cluster, counts <= max_samples_per_cluster + ) + x = x[keep] + cluster_idx = cluster_idx[keep] + _, cluster_idx = np.unique(cluster_idx, return_inverse=True) + plda_model.fit(x, class_ids=cluster_idx) + + return transforms, plda_model + + +def do_ahc(scores, linkage_method, stop_criterion, threshold, num_clusters): + logging.info( + f"running AHC stop_criterion: {stop_criterion} thr: {threshold} num_clusters: {num_clusters}", + ) + ahc = AHC(method=linkage_method) + ahc.fit(scores) + if stop_criterion == "threshold": + y = ahc.get_flat_clusters_from_thr(threshold) + else: + y = ahc.get_flat_clusters_from_num_clusters(num_clusters) + + return y + + def get_gmm_post(x, y): logging.info("computing cluster posteriors with gmm") num_comp = np.max(y) + 1 @@ -119,8 +169,6 @@ def get_gmm_post(x, y): gmm.Mstep(N, u_x) p = gmm.compute_pz(x, mode="std") p_max = p[np.arange(x.shape[0]), y] - zz = p_max < 0.5 - print(np.mean(p[zz]), np.max(p[zz]), p_max[zz]) p_2nd = np.sort(p, axis=1, kind="heapsort")[:, -2] return p_max, 
@@ -129,10 +177,32 @@ def plot_score_hist(scores, fig_file): mask = np.triu(np.ones_like(scores, dtype=bool)) fig = plt.figure() scores = scores[mask] + logging.info( + "score-mean=%f score-std=%f score-max=%f score-min=%f", + scores.mean(), + scores.std(), + scores.max(), + scores.min(), + ) + if np.any(scores < -1.1) or np.any(scores > 1.1): + # if scores come from PLDA, limit the max and min values + thr = 2 * np.std(scores) + scores = scores.copy() + scores[scores > thr] = thr + scores[scores < -thr] = -thr + plt.hist(scores, bins=100, density=True) fig.savefig(fig_file) +def plot_cluster_size_hist(y, fig_file): + _, counts = np.unique(y, return_counts=True) + fig = plt.figure() + bins = np.arange(1, np.max(counts) + 1) + plt.hist(counts, bins=bins, density=False) + fig.savefig(fig_file) + + def cos_ahc( segments_file, feats_file, @@ -148,34 +218,17 @@ num_workers, filter_by_gmm_post, ): + Path(output_file).parent.mkdir(exist_ok=True, parents=True) segments, x = load_data(segments_file, feats_file) if lnorm: x = LNorm()(x) x = do_pca(x, pca) x_km, idx_km = do_kmeans(x, num_workers=num_workers, **pre_kmeans) - - logging.info("compute affinity matrix") - if ahc_precision == "single": - x_lowprec = x_km.astype(np.float32) - elif ahc_precision == "half": - x_lowprec = x_km.astype(np.float16) - else: - x_lowprec = x_km - - scores = cosine_scoring(x_lowprec, x_lowprec) + scores = do_cosine_scoring(x_km, ahc_precision) fig_file = Path(output_file).parent / "score_hist.png" plot_score_hist(scores, fig_file) - - logging.info("running AHC") - ahc = AHC(method=linkage_method) - ahc.fit(scores) - if stop_criterion == "threshold": - y = ahc.get_flat_clusters_from_thr(threshold) - else: - y = ahc.get_flat_clusters_from_num_clusters(num_clusters) - - del ahc + y = do_ahc(scores, linkage_method, stop_criterion, threshold, num_clusters) if idx_km is not None: y = y[idx_km] del x_km @@ -189,6 +242,8 @@ segments = SegmentSet(segments.loc[idx]) segments.save(output_file) + fig_file = Path(output_file).parent / "cluster_size_hist.png" + plot_cluster_size_hist(segments["cluster"], fig_file) def make_cos_ahc_parser(): @@ -229,7 +284,158 @@ ) parser.add_argument("--pre_kmeans.epochs", default=100, type=int) parser.add_argument("--pre_kmeans.rtol", default=0.001, type=float) - parser.add_argument("--num_workers", default=1, type=int) + parser.add_argument("--num-workers", default=1, type=int) + return parser
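Before the full `cos_ahc_plda_ahc` implementation below, a condensed sketch of its two-stage flow may help. It reuses the `train_plda` helper and the AHC/LNorm/cosine-scoring utilities seen above, hard-codes average linkage and threshold stopping, and omits the k-means pre-clustering, precision control, and GMM-posterior filtering that the real subcommand adds:

```python
# Condensed sketch of the two-stage clustering idea (assumptions: average
# linkage, threshold stopping criterion); not the full subcommand.
from hyperion.np.clustering import AHC
from hyperion.np.transforms import LNorm
from hyperion.utils.math_funcs import cosine_scoring

def two_stage_cluster(x, plda_cfg, thr1=0.0, thr2=0.0):
    x = LNorm()(x)
    # stage 1: AHC over cosine scores -> initial pseudo-speaker labels
    ahc = AHC(method="average")
    ahc.fit(cosine_scoring(x, x))
    y0 = ahc.get_flat_clusters_from_thr(thr1)
    # stage 2: train PLDA on the stage-1 labels, re-cluster on PLDA LLRs
    transform, plda_model = train_plda(x, y0, plda_cfg, min_samples_per_cluster=8)
    x = transform(x)
    ahc = AHC(method="average")
    ahc.fit(plda_model.llr_1vs1(x, x))
    return ahc.get_flat_clusters_from_thr(thr2)
```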
+ + +def cos_ahc_plda_ahc( + segments_file, + feats_file, + output_file, + lnorm, + pca, + linkage_method, + stop_criterion, + num_clusters_stage_1, + threshold_stage_1, + num_clusters_stage_2, + threshold_stage_2, + min_samples_per_cluster, + max_samples_per_cluster, + plda, + ahc_precision, + pre_kmeans, + num_workers, + filter_by_gmm_post, +): + Path(output_file).parent.mkdir(exist_ok=True, parents=True) + segments, x = load_data(segments_file, feats_file) + if lnorm: + x = LNorm()(x) + + x = do_pca(x, pca) + + # stage 1 + x_km, idx_km = do_kmeans(x, num_workers=num_workers, **pre_kmeans) + scores = do_cosine_scoring(x_km, ahc_precision) + fig_file = Path(output_file).parent / "cosine_score_hist.png" + plot_score_hist(scores, fig_file) + y = do_ahc( + scores, linkage_method, stop_criterion, threshold_stage_1, num_clusters_stage_1 + ) + if idx_km is not None: + y = y[idx_km] + del x_km + + fig_file = Path(output_file).parent / "cosine_cluster_size_hist.png" + plot_cluster_size_hist(y, fig_file) + # stage 2 + transform, plda_model = train_plda( + x, y, plda, min_samples_per_cluster, max_samples_per_cluster + ) + x = transform(x) + z = plda_model.compute_py_g_x(x) + _, idx_km = do_kmeans(z, num_workers=num_workers, **pre_kmeans) + + if idx_km is None: + scores = plda_model.llr_1vs1(x, x) + else: + scores = plda_model.llr_NvsM(x, x, ids1=idx_km, ids2=idx_km) + + scores = change_precision(scores, ahc_precision) + fig_file = Path(output_file).parent / "plda_score_hist.png" + plot_score_hist(scores, fig_file) + y = do_ahc( + scores, linkage_method, stop_criterion, threshold_stage_2, num_clusters_stage_2 + ) + if idx_km is not None: + y = y[idx_km] + + p_max, p_2nd = get_gmm_post(x, y) + segments["cluster"] = y + segments["post_cluster"] = p_max + segments["post_cluster_2nd"] = p_2nd + if filter_by_gmm_post > 0: + idx = segments["post_cluster"] > filter_by_gmm_post + segments = SegmentSet(segments.loc[idx]) + + segments.save(output_file) + fig_file = Path(output_file).parent / "plda_cluster_size_hist.png" + plot_cluster_size_hist(segments["cluster"], fig_file) + + +def make_cos_ahc_plda_ahc_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + add_common_args(parser) + parser.add_argument("--lnorm", default=False, action=ActionYesNo) + PCA.add_class_args(parser, prefix="pca") + parser.add_argument( + "--linkage-method", + default="average", + choices=["single", "complete", "average", "weighted", "ward"], + help="linkage method", + ) + parser.add_argument( + "--stop-criterion", + default="threshold", + choices=["threshold", "num_clusters"], + help="stopping criterion", + ) + parser.add_argument( + "--num-clusters-stage-1", + default=None, + type=int, + help="number of AHC clusters for first stage", + ) + parser.add_argument( + "--threshold-stage-1", + default=0, + type=float, + help="stopping threshold for first stage", + ) + parser.add_argument( + "--num-clusters-stage-2", + default=None, + type=int, + help="number of AHC clusters for second stage", + ) + parser.add_argument( + "--threshold-stage-2", + default=0, + type=float, + help="stopping threshold for second stage", + ) + parser.add_argument( + "--ahc-precision", default="single", choices=["half", "single", "double"] + ) + parser.add_argument( + "--min-samples-per-cluster", + default=8, + type=int, + help="minimum samples/cluster for a cluster to be used to train PLDA", + ) + parser.add_argument( + "--max-samples-per-cluster", + default=50, + type=int, + help="maximum samples/cluster for a cluster to be used to train PLDA", + ) + PLDAFactory.add_class_args(parser, prefix="plda") + parser.add_argument( + "--pre_kmeans.samples-per-cluster", + default=1, + type=int, + help="first k-means is done to reduce the computing cost of AHC", + ) + parser.add_argument( + "--pre_kmeans.init_method", + default=KMeansInitMethod.max_dist, + choices=KMeansInitMethod.choices(), + ) + parser.add_argument("--pre_kmeans.epochs", default=100, type=int) + parser.add_argument("--pre_kmeans.rtol", default=0.001, type=float) + parser.add_argument("--num-workers", default=1, type=int) + return parser @@ -269,6 +475,7 @@ def spectral_clustering( spectral_clustering, filter_by_gmm_post, ): + Path(output_file).parent.mkdir(exist_ok=True, parents=True) segments, x = load_data(segments_file, feats_file) if lnorm: x = LNorm()(x) @@ -294,6 +501,9 @@ segments.save(output_file) output_file = Path(output_file) + fig_file = Path(output_file).parent / "cluster_size_hist.png" + plot_cluster_size_hist(segments["cluster"], fig_file) + fig_file = 
output_file.with_stem(output_file.stem + "_eigengap").with_suffix(".png") sc.plot_eigengap_stats(eigengap_stats, num_clusters, fig_file) diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 17fff2ba..2bd01f2d 100755 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -34,6 +34,7 @@ "remove_short_segments", "rebuild_class_idx", "remove_classes_few_segments", + "remove_classes_few_toomany_segments", "split_train_val", "copy", "add_cols_to_segments", @@ -281,6 +282,55 @@ def remove_classes_few_segments( dataset.save(output_dataset) +def make_remove_classes_few_toomany_segments_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--class-name", required=True, help="""name of the class type e.g.: speaker""" + ) + parser.add_argument( + "--min-segs", default=1, type=int, help="""min. num. of segments/class""" + ) + parser.add_argument( + "--max-segs", default=None, type=int, help="""max. num. of segments/class""" + ) + parser.add_argument( + "--rebuild-idx", + default=False, + action=ActionYesNo, + help="""regenerate class indexes from 0 to new_num_classes-1""", + ) + parser.add_argument( + "--output-dataset", + default=None, + help="""output dataset dir, if None, we use the same as input""", + ) + + add_common_args(parser) + return parser + + +def remove_classes_few_toomany_segments( + dataset: PathLike, + class_name: str, + min_segs: int, + max_segs: Union[int, None], + rebuild_idx: bool, + output_dataset: PathLike, +): + if output_dataset is None: + output_dataset = dataset + + dataset = Dataset.load(dataset, lazy=True) + dataset.remove_classes_few_toomany_segments( + class_name, min_segs, max_segs, rebuild_idx + ) + dataset.save(output_dataset) + + def make_split_train_val_parser(): parser = ArgumentParser() parser.add_argument("--cfg", action=ActionConfigFile) @@ -397,7 +447,7 @@ def make_add_cols_to_segments_parser(): "--right-table", required=True, help="table where the new data is" ) parser.add_argument( - "--columns", + "--column-names", required=True, nargs="+", help="""columns to copy to segments table""", @@ -421,6 +471,20 @@ def make_add_cols_to_segments_parser(): help="""output dataset dir, if None, we use the same as input""", ) + parser.add_argument( + "--remove-missing", + default=False, + action=ActionYesNo, + help="remove dataset entries that don't have a value in the right table", + ) + + parser.add_argument( + "--create-class-info", + default=False, + action=ActionYesNo, + help="creates class-info tables for the new columns added to the dataset", + ) + add_common_args(parser) return parser @@ -432,12 +496,21 @@ def add_cols_to_segments( on: List[str], right_on: List[str], output_dataset: PathLike, + remove_missing: bool = False, + create_class_info: bool = False, ): if output_dataset is None: output_dataset = dataset dataset = Dataset.load(dataset, lazy=True) - dataset.add_cols_to_segments(right_table, column_names, on, right_on) + dataset.add_cols_to_segments( + right_table, + column_names, + on, + right_on, + remove_missing=remove_missing, + create_class_info=create_class_info, + ) dataset.save(output_dataset) diff --git a/hyperion/bin/hyperion_tables.py b/hyperion/bin/hyperion_tables.py index 59472d83..3f847d29 100755 --- a/hyperion/bin/hyperion_tables.py +++ b/hyperion/bin/hyperion_tables.py @@ -7,9 +7,12 @@ from pathlib import Path from typing import 
List, Optional, Union +import numpy as np +import pandas as pd from jsonargparse import ( ActionConfigFile, ActionParser, + ActionYesNo, ArgumentParser, namespace_to_dict, ) @@ -25,7 +28,7 @@ SegmentSet, ) -subcommand_list = ["cat"] +subcommand_list = ["cat", "filter", "make_class_file_from_column"] table_dict = { "segments": SegmentSet, "recordings": RecordingSet, @@ -108,6 +111,91 @@ def cat( output_table.save(output_file) +def make_filter_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input-file", required=True, help="input table file") + parser.add_argument( + "--filter-file", required=True, help="table file that we use as filter" + ) + parser.add_argument( + "--filter-by", default="id", help="column that we use to filter " + ) + parser.add_argument( + "--output-file", + required=True, + help="""output table file""", + ) + parser.add_argument( + "--raise-if-missing", + default=True, + action=ActionYesNo, + help="raise exception if filter values are not in input file", + ) + add_common_args(parser) + return parser + + +def filter( + table_type: str, + input_file: PathLike, + filter_file: PathLike, + output_file: PathLike, + filter_by: str, + raise_if_missing: bool, +): + + input_file = Path(input_file) + filter_file = Path(filter_file) + output_file = Path(output_file) + + table_class = table_dict[table_type] + input_table = table_class.load(input_file) + filter_table = table_class.load(filter_file) + output_table = input_table.filter( + items=filter_table[filter_by], by=filter_by, raise_if_missing=raise_if_missing + ) + output_table.save(output_file) + + +def make_make_class_file_from_column_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input-file", required=True, help="input table file") + + parser.add_argument( + "--column", + required=True, + help="column that we want to use to create a class-file", + ) + parser.add_argument( + "--output-file", + required=True, + help="""output class-file table""", + ) + + add_common_args(parser) + return parser + + +def make_class_file_from_column( + table_type: str, + input_file: PathLike, + output_file: PathLike, + column: str, +): + + input_file = Path(input_file) + output_file = Path(output_file) + + table_class = table_dict[table_type] + input_table = table_class.load(input_file) + class_ids = np.unique(input_table[column]) + df = pd.DataFrame({"id": class_ids}) + output_table = ClassInfo(df) + output_table.save(output_file) + + def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion data tables") parser.add_argument("--cfg", action=ActionConfigFile) diff --git a/hyperion/np/pdfs/plda/frplda.py b/hyperion/np/pdfs/plda/frplda.py index 591948f9..84cf0ace 100644 --- a/hyperion/np/pdfs/plda/frplda.py +++ b/hyperion/np/pdfs/plda/frplda.py @@ -120,7 +120,12 @@ def compute_py_g_x( assert self.is_init - N, F, S = D + if isinstance(D, tuple): + N, F, S = D + else: + F = D + N = np.ones((F.shape[0],), dtype=F.dtype) + S = None M = F.shape[0] y_dim = self.y_dim diff --git a/hyperion/np/pdfs/plda/plda.py b/hyperion/np/pdfs/plda/plda.py index 35b133c2..92f77090 100644 --- a/hyperion/np/pdfs/plda/plda.py +++ b/hyperion/np/pdfs/plda/plda.py @@ -172,8 +172,13 @@ def compute_py_g_x( Py accumlator for MD step with shape (y_dim, y_dim) """ assert self.is_init + if isinstance(D, tuple): + N, F, S = D + else: + F = D + N = np.ones((F.shape[0],), dtype=F.dtype) + S = None - N, F, S = D Fc = F - 
self.mu M = F.shape[0] @@ -535,9 +540,7 @@ def log_probx_g_yz(self, x, y, z): logD = np.sum(np.log(self.D)) delta = x - self.mu - np.dot(y, self.V) - np.dot(z, self.U) logp = ( - -x.shape[-1] * np.log(2 * np.pi) - + logD - - np.sum(self.D * delta**2, axis=-1) + -x.shape[-1] * np.log(2 * np.pi) + logD - np.sum(self.D * delta**2, axis=-1) ) logp /= 2 return logp diff --git a/hyperion/np/pdfs/plda/plda_base.py b/hyperion/np/pdfs/plda/plda_base.py index 09544cae..a4a308e0 100644 --- a/hyperion/np/pdfs/plda/plda_base.py +++ b/hyperion/np/pdfs/plda/plda_base.py @@ -47,7 +47,7 @@ def __init__( epochs=20, ml_md="ml+md", md_epochs=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.mu = mu @@ -380,7 +380,9 @@ def llr_NvsM_book(self, D1, D2): """ pass - def llr_NvsM(self, x1, x2, ids1=None, ids2=None, method="vavg-lnorm"): + def llr_NvsM( + self, x1, x2, ids1=None, ids2=None, method=PLDALLRNvsMMethod.lnorm_vavg + ): """log-likelihood ratio between target and non-target hypothesis for the case of N segments/enrollment-side and M segments/test-side @@ -411,6 +413,8 @@ def llr_NvsM(self, x1, x2, ids1=None, ids2=None, method="vavg-lnorm"): if method == PLDALLRNvsMMethod.lnorm_vavg: return self.llr_NvsM_vavg(D1, D2, do_lnorm=True) + raise ValueError(f"wrong llr {method}") + def llr_NvsM_vavg(self, D1, D2, do_lnorm=True): """log-likelihood ratio between target and non-target hypothesis for the case of N segments/enrollment-side and M segments/test-side @@ -455,7 +459,7 @@ def llr_NvsM_savg(self, x1, ids1, x2, ids2): scores = F.T / N return scores - def llr_Nvs1(self, x1, x2, ids1=None, method="vavg-lnorm"): + def llr_Nvs1(self, x1, x2, ids1=None, method=PLDALLRNvsMMethod.lnorm_vavg): """log-likelihood ratio between target and non-target hypothesis for the case of N segments/enrollment-side and M segments/test-side @@ -484,6 +488,8 @@ def llr_Nvs1(self, x1, x2, ids1=None, method="vavg-lnorm"): if method == PLDALLRNvsMMethod.lnorm_vavg: return self.llr_Nvs1_vavg(D1, x2, do_lnorm=True) + raise ValueError(f"wrong llr {method}") + def llr_Nvs1_vavg(self, D1, x2, do_lnorm=True): """log-likelihood ratio between target and non-target hypothesis for the case of N segments/enrollment-side and M segments/test-side diff --git a/hyperion/np/pdfs/plda/splda.py b/hyperion/np/pdfs/plda/splda.py index 9e0c2a20..32fc4628 100644 --- a/hyperion/np/pdfs/plda/splda.py +++ b/hyperion/np/pdfs/plda/splda.py @@ -2,6 +2,7 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import numpy as np from scipy import linalg as sla @@ -122,7 +123,13 @@ def compute_py_g_x( Ry accumlator for ML step with shape (y_dim, y_dim) Py accumlator for MD step with shape (y_dim, y_dim) """ - N, F, S = D + if isinstance(D, tuple): + N, F, S = D + else: + F = D + N = np.ones((F.shape[0],), dtype=F.dtype) + S = None + Fc = F - self.mu M = F.shape[0] diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index e19ec329..83f314e1 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -360,17 +360,12 @@ def _resample(self, x, fs): return self.resampler(x, fs) - # try: - # if self.target_sample_freq is None or fs == self.target_sample_freq: - # return x, fs - # resampler = self._get_resampler(fs) - # return resampler(x), self.target_sample_freq - # except: - # return x, fs - def __getitem__(self, segment): seg_id, start, duration = self._parse_segment_item(segment) x, fs = self._read_audio(seg_id, 
start, duration) + assert ( + len(x) > 0 + ), f"read audio empty seg_id={seg_id}, start={start}, dur={duration}" x, fs = self._resample(x, fs) data = {"seg_id": seg_id, "sample_freq": fs} x_augs = self._apply_augs(x, duration, fs) @@ -384,28 +379,6 @@ def filter_args(**kwargs): args = filter_func_args(AudioDataset.__init__, kwargs) return args - # @staticmethod - # def filter_args(**kwargs): - - # ar_args = AR.filter_args(**kwargs) - # valid_args = ( - # "recordings_file", - # "segments_file", - # "aug_cfgs", - # "num_augs", - # "class_names", - # "class_files", - # "bpe_model", - # "text_file", - # "return_segment_info", - # "return_orig", - # "time_durs_file", - # "target_sample_freq", - # ) - # args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - # args.update(ar_args) - # return args - @staticmethod def add_class_args(parser, prefix=None, skip=set()): if prefix is not None: diff --git a/hyperion/torch/models/wav2xvectors/wav2xvector.py b/hyperion/torch/models/wav2xvectors/wav2xvector.py index 501fa7f8..69e7b3ca 100644 --- a/hyperion/torch/models/wav2xvectors/wav2xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2xvector.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import contextlib import logging @@ -155,14 +156,18 @@ def trainable_param_groups(self): def set_train_mode(self, mode): if mode == self._train_mode: return - + logging.info("setting Wav2XVector train mode to %s", mode) if mode == "full-feats-grad": self._feats_context = contextlib.nullcontext() xvector_mode = "full" else: logging.info("using torch.no_grad for feats") self._feats_context = torch.no_grad() + xvector_mode = mode + logging.info( + "setting Wav2XVector XVector object train mode to %s", xvector_mode + ) self.xvector.set_train_mode(xvector_mode) self._train_mode = mode @@ -173,7 +178,7 @@ def _train(self, train_mode: str): elif train_mode in ["full-feats-grad", "full"]: self.xvector._train("full") elif train_mode == "ft-embed-affine": - self.xvector._train("ft-embed_affine") + self.xvector._train(train_mode) else: raise ValueError(f"invalid train_mode={train_mode}") diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index b4926533..c20f5520 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -2,10 +2,10 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import logging -# from enum import Enum +import logging from dataclasses import dataclass +from enum import Enum from typing import List, Optional import torch @@ -21,6 +21,15 @@ from ...utils import eval_nnet_by_chunks, scale_seq_lengths +class XVectorHeadType(str, Enum): + XVECTOR = "x-vector" + DINO = "dino" + + @staticmethod + def choices(): + return [o.value for o in XVectorHeadType] + + @dataclass class XVectorOutput(HypDataClass): loss: torch.Tensor @@ -63,7 +72,7 @@ def __init__( embed_layer=0, in_feats=None, proj_feats=None, - head_type="x-vector", + head_type=XVectorHeadType.XVECTOR, bias_weight_decay=None, ): super().__init__(bias_weight_decay=bias_weight_decay) @@ -143,7 +152,7 @@ def __init__( self.proj_head_norm_before = proj_head_norm_before self.dropout_rate = dropout_rate self.embed_layer = embed_layer - if self.head_type == "x-vector": + if self.head_type == XVectorHeadType.XVECTOR: self.proj_head_net = None self.classif_net = ClassifHead( pool_feats, @@ 
-164,10 +173,11 @@ def __init__( dropout_rate=dropout_rate, use_in_norm=head_use_in_norm, ) - elif self.head_type == "dino": + elif self.head_type == XVectorHeadType.DINO: self.proj_head_net = ProjHead( pool_feats, embed_dim, + norm_layer=head_norm_layer, use_norm=proj_head_use_norm, norm_before=proj_head_norm_before, ) @@ -199,53 +209,53 @@ def num_classes(self): @property def cos_scale(self): - if self.head_type == "x-vector": + if self.head_type == XVectorHeadType.XVECTOR: return self.classif_net.cos_scale - elif self.head_type == "dino": + elif self.head_type == XVectorHeadType.DINO: return 1 else: raise ValueError @property def margin(self): - if self.head_type == "x-vector": + if self.head_type == XVectorHeadType.XVECTOR: return self.classif_net.margin else: return 0.0 @property def margin_warmup_epochs(self): - if self.head_type == "x-vector": + if self.head_type == XVectorHeadType.XVECTOR: return self.classif_net.margin_warmup_epochs else: return 0 @property def intertop_k(self): - if self.head_type == "x-vector": + if self.head_type == XVectorHeadType.XVECTOR: return self.classif_net.intertop_k else: return 0 @property def intertop_margin(self): - if self.head_type == "x-vector": + if self.head_type == XVectorHeadType.XVECTOR: return self.classif_net.intertop_margin else: return 0.0 @property def num_subcenters(self): - if self.head_type == "x-vector": + if self.head_type == XVectorHeadType.XVECTOR: return self.classif_net.num_subcenters else: return 0 @property def loss_type(self): - if self.head_type == "x-vector": + if self.head_type == XVectorHeadType.XVECTOR: return self.classif_net.loss_type - elif self.head_type == "dino": + elif self.head_type == XVectorHeadType.DINO: return self.classif_net.output_type else: raise ValueError() @@ -260,13 +270,13 @@ def loss_type(self): # return new_self # def before_cloning(self): - # if self.head_type == "dino": + # if self.head_type == XVectorHeadType.DINO: # return self.classif_net.before_cloning() # else: # return None, None # def after_cloning(self, output): - # if self.head_type == "dino": + # if self.head_type == XVectorHeadType.DINO: # self.classif_net.after_cloning(output) def _make_pool_net(self, pool_net, enc_feats=None): @@ -643,6 +653,7 @@ def change_config( intertop_k=5, intertop_margin=0.0, num_subcenters=2, + head_type=XVectorHeadType.XVECTOR, ): logging.info("changing x-vector config") if override_output: @@ -655,6 +666,7 @@ def change_config( intertop_k=intertop_k, intertop_margin=intertop_margin, num_subcenters=num_subcenters, + head_type=head_type, ) if override_dropouts: @@ -672,7 +684,60 @@ def rebuild_output_layer( intertop_k=5, intertop_margin=0.0, num_subcenters=2, + head_type=XVectorHeadType.XVECTOR, ): + + if head_type != self.head_type: + # only from dino to x-vector + assert self.head_type == XVectorHeadType.DINO + logging.info("transforming dino head into x-vector head") + self.num_embed_layers = 1 + self.head_use_in_norm = ( + self.proj_head_use_norm and self.proj_head_norm_before + ) + self.head_use_norm = ( + self.proj_head_use_norm and not self.proj_head_norm_before + ) + self.classif_net = ClassifHead( + self.proj_head_net.in_feats, + num_classes, + embed_dim=self.proj_head_net.out_feats, + num_embed_layers=1, + hid_act=None, + loss_type=loss_type, + cos_scale=cos_scale, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + intertop_k=intertop_k, + intertop_margin=intertop_margin, + num_subcenters=num_subcenters, + norm_layer=self.head_norm_layer, + use_norm=self.proj_head_use_norm, + 
norm_before=self.norm_before, + dropout_rate=self.dropout_rate, + use_in_norm=self.head_use_in_norm, + ) + + if ( + self.classif_net.fc_blocks[0].linear.bias is not None + and self.proj_head_net.proj.bias is not None + ): + self.classif_net.fc_blocks[0].linear.bias.data.copy_( + self.proj_head_net.proj.bias.data + ) + + self.classif_net.fc_blocks[0].linear.weight.data.copy_( + self.proj_head_net.proj.weight.data + ) + if self.head_use_norm: + self.classif_net.fc_blocks[0].bn1.load_state_dict( + self.proj_head_net._norm_layer.state_dict() + ) + del self.proj_head_net + self.proj_head_net = None + self.head_type = XVectorHeadType.XVECTOR + return + if ( (self.num_classes is not None and self.num_classes != num_classes) or (self.loss_type != loss_type) @@ -733,7 +798,7 @@ def set_train_mode(self, mode): else: raise ValueError(f"invalid train_mode={mode}") - if self.head_type == "dino": + if self.head_type == XVectorHeadType.DINO: self.classif_net.freeze_output_g() self._train_mode = mode @@ -780,8 +845,8 @@ def add_class_args(parser, prefix=None, skip=set()): parser.add_argument( "--head-type", - default="x-vector", - choices=["x-vector", "dino"], + default=XVectorHeadType.XVECTOR, + choices=XVectorHeadType.choices(), help="type of classification head in [x-vector, dino]", ) diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index f232c986..ff36096b 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -221,9 +221,8 @@ def _make_in_layer(self): else: raise Exception(f"wrong pos-enc-type={self.pos_enc_type}") - hid_act = AF.create(self.hid_act) - if self.in_layer_type == "linear": + hid_act = AF.create(self.hid_act) self.in_layer = nn.Sequential( nn.Linear(in_feats, d_model), nn.LayerNorm(d_model), @@ -235,7 +234,7 @@ def _make_in_layer(self): self.in_layer = Conv2dSubsampler( in_feats, d_model, - hid_act, + self.hid_act, self.in_stride, pos_enc, time_dim=self.in_time_dim, @@ -244,7 +243,7 @@ def _make_in_layer(self): self.in_layer = Conv1dSubsampler( in_feats, d_model, - hid_act, + self.hid_act, self.in_stride, pos_enc, time_dim=self.in_time_dim, diff --git a/hyperion/torch/narchs/dino_head.py b/hyperion/torch/narchs/dino_head.py index 87c8daae..9f05aa7f 100644 --- a/hyperion/torch/narchs/dino_head.py +++ b/hyperion/torch/narchs/dino_head.py @@ -2,6 +2,7 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + from typing import Optional import torch @@ -80,7 +81,8 @@ def __init__( if num_hid_layers == 1: self.fc_layers = nn.Linear(in_feats, bottleneck_feats) else: - layers = [nn.Linear(in_feats, hid_feats)] + use_bias = False if use_norm and norm_before else True + layers = [nn.Linear(in_feats, hid_feats, bias=use_bias)] if use_norm and norm_before: layers.append(self._norm_layer(hid_feats)) layers.append(AF.create(hid_act)) @@ -90,7 +92,7 @@ def __init__( layers.append(nn.Dropout(self.dropout_rate)) for _ in range(num_hid_layers - 2): - layers.append(nn.Linear(hid_feats, hid_feats)) + layers.append(nn.Linear(hid_feats, hid_feats, bias=use_bias)) if use_norm and norm_before: layers.append(self._norm_layer(hid_feats)) layers.append(AF.create(hid_act)) diff --git a/hyperion/torch/narchs/proj_head.py b/hyperion/torch/narchs/proj_head.py index e2838013..63a5e128 100644 --- a/hyperion/torch/narchs/proj_head.py +++ b/hyperion/torch/narchs/proj_head.py @@ -3,7 +3,6 @@ Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) """ - import torch import torch.nn as nn from jsonargparse import ActionParser, ActionYesNo, ArgumentParser @@ -40,7 +39,12 @@ class ProjHead(NetArch): """ def __init__( - self, in_feats, out_feats=256, norm_layer=None, use_norm=True, norm_before=True, + self, + in_feats, + out_feats=256, + norm_layer=None, + use_norm=True, + norm_before=True, ): super().__init__() @@ -49,7 +53,7 @@ def __init__( self.norm_layer = norm_layer self.use_norm = use_norm self.norm_before = norm_before - + use_bias = True if use_norm: norm_groups = None if norm_layer == "group-norm": @@ -59,26 +63,21 @@ def __init__( self._norm_layer = _norm_layer(in_feats) else: self._norm_layer = _norm_layer(out_feats) + use_bias = False else: self._norm_layer = None - self.proj = nn.Linear(in_feats, out_feats) + self.proj = nn.Linear(in_feats, out_feats, bias=use_bias) def forward(self, x, y=None): if self.use_norm and self.norm_before: x = self._norm_layer(x) - # assert not torch.any( - # torch.isnan(x) - # ), f"x before proj is nan {x.size()} {torch.sum(torch.isnan(x))}" + x = self.proj(x) - # assert not torch.any( - # torch.isnan(x) - # ), f"x after proj is nan {x.size()} {torch.sum(torch.isnan(x))}" + if self.use_norm and not self.norm_before: x = self._norm_layer(x) - # assert not torch.any( - # torch.isnan(x) - # ), f"x after bn is nan {x.size()} {torch.sum(torch.isnan(x))}" + return x def get_config(self): diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 9939797e..4004a565 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -14,7 +14,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset -from .torch_trainer import TorchTrainer +from .torch_trainer import AMPDType, TorchTrainer class AETrainer(TorchTrainer): @@ -36,6 +36,7 @@ class AETrainer(TorchTrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -70,6 +71,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, @@ -173,7 +175,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - super().add_class_args(parser, train_modes, skip=skip.union({"target_key"})) + TorchTrainer.add_class_args( + parser, train_modes, skip=skip.union({"target_key"}) + ) if "target_key" not in skip: parser.add_argument( "--target-key", default="x", help="dict. 
key for nnet targets" diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index f128db44..10bc2edc 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -14,7 +14,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset -from .torch_trainer import TorchTrainer +from .torch_trainer import AMPDType, TorchTrainer class DVAETrainer(TorchTrainer): @@ -35,6 +35,7 @@ class DVAETrainer(TorchTrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -68,6 +69,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, @@ -209,7 +211,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - super().add_class_args( + TorchTrainer.add_class_args( parser, train_modes, skip=skip.union({"input_key", "target_key"}) ) if "input_key" not in skip: diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py index 71845a4b..cd0b17e8 100644 --- a/hyperion/torch/trainers/plda_trainer.py +++ b/hyperion/torch/trainers/plda_trainer.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os from collections import OrderedDict as ODict @@ -14,7 +15,7 @@ from ..losses import BCEWithLLR from ..utils import MetricAcc, tensors_subset from ..utils.misc import get_selfsim_tarnon -from .torch_trainer import TorchTrainer +from .torch_trainer import AMPDType, TorchTrainer class PLDATrainer(TorchTrainer): @@ -36,9 +37,9 @@ class PLDATrainer(TorchTrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) loss: if None, it uses cross-entropy loss_weights: dictionary with weights for multiclass and binary cross-entropies - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -75,6 +76,7 @@ def __init__( p_tar=0.5, train_mode="train", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 1d4665cf..808cce3e 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -2,6 +2,7 @@ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os from collections import OrderedDict as ODict @@ -14,7 +15,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset -from .torch_trainer import TorchTrainer +from .torch_trainer import AMPDType, TorchTrainer class TransducerTrainer(TorchTrainer): @@ -37,6 +38,7 @@ class TransducerTrainer(TorchTrainer): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -69,6 +71,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index 79526122..dbf5dfdd 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -14,7 +14,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset -from .torch_trainer import TorchTrainer +from .torch_trainer import AMPDType, TorchTrainer class VAETrainer(TorchTrainer): @@ -35,6 +35,7 @@ class VAETrainer(TorchTrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger @@ -68,6 +69,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, @@ -211,7 +213,9 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - super().add_class_args(parser, train_modes, skip=skip.union({"target_key"})) + TorchTrainer.add_class_args( + parser, train_modes, skip=skip.union({"target_key"}) + ) if "target_key" not in skip: parser.add_argument( "--target-key", default="x", help="dict. 
key for nnet targets" diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index ff3f85cc..43aa59a5 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -2,6 +2,7 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import math import os @@ -14,7 +15,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset -from .dvae_trainer import DVAETrainer +from .dvae_trainer import AMPDType, DVAETrainer class VQDVAETrainer(DVAETrainer): @@ -35,6 +36,7 @@ class VQDVAETrainer(DVAETrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -68,6 +70,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, @@ -174,7 +177,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - super().add_class_args( + DVAETrainer.add_class_args( parser, train_modes, skip=skip.union({"input_key", "target_key"}) ) if "input_key" not in skip: diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index 4ec04fde..64db2e64 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -2,6 +2,7 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import math import os @@ -14,7 +15,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset -from .vae_trainer import VAETrainer +from .vae_trainer import AMPDType, VAETrainer class VQVAETrainer(VAETrainer): @@ -35,6 +36,7 @@ class VQVAETrainer(VAETrainer): ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -68,6 +70,7 @@ def __init__( ddp_type="ddp", train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, @@ -85,34 +88,6 @@ def __init__( super_args = filter_func_args(super().__init__, locals()) super().__init__(**super_args) - # super().__init__( - # model, - # optim, - # epochs, - # exp_path, - # cur_epoch=cur_epoch, - # grad_acc_steps=grad_acc_steps, - # eff_batch_size=eff_batch_size, - # device=device, - # metrics=metrics, - # lrsched=lrsched, - # loggers=loggers, - # ddp=ddp, - # ddp_type=ddp_type, - # train_mode=train_mode, - # use_amp=use_amp, - # log_interval=log_interval, - # use_tensorboard=use_tensorboard, - # use_wandb=use_wandb, - # wandb=wandb, - # grad_clip=grad_clip, - # grad_clip_norm=grad_clip_norm, - # swa_start=swa_start, - # swa_lr=swa_lr, - # swa_anneal_epochs=swa_anneal_epochs, - # cpu_offload=cpu_offload, - # ) - def train_epoch(self, data_loader): batch_keys = [self.input_key, self.target_key] metric_acc = MetricAcc(device=self.device) @@ -203,7 +178,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - super().add_class_args(parser, train_modes, skip=skip.union({"target_key"})) + VAETrainer.add_class_args(parser, train_modes, skip=skip.union({"target_key"})) if "target_key" not in skip: parser.add_argument( "--target-key", default="x", help="dict. key for nnet targets" diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index e19945d1..b9dd67d7 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os import time @@ -14,6 +15,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer import XVectorTrainer @@ -39,6 +41,7 @@ class XVectorAdvTrainer(XVectorTrainer): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger @@ -76,6 +79,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index ad6a3262..8ece7de2 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os import time @@ -14,6 +15,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer_from_wav import XVectorTrainerFromWav @@ -41,6 +43,7 @@ class XVectorAdvTrainerFromWav(XVectorTrainerFromWav): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -78,6 +81,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 666c9a9d..2902f23d 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os from collections import OrderedDict as ODict @@ -13,7 +14,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset -from .torch_trainer import TorchTrainer +from .torch_trainer import AMPDType, TorchTrainer class XVectorTrainer(TorchTrainer): @@ -36,6 +37,7 @@ class XVectorTrainer(TorchTrainer): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -70,6 +72,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index d80f03f1..98bc404d 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os from collections import OrderedDict as ODict @@ -13,6 +14,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer import XVectorTrainer @@ -41,6 +43,7 @@ class XVectorTrainerDeepFeatReg(XVectorTrainer): reg_loss: nn.Module loss used for regularization, if None it uses L1 loss. 
train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -81,6 +84,7 @@ def __init__( reg_loss=None, train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index cf956dc7..1005435f 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os from collections import OrderedDict as ODict @@ -12,7 +13,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, tensors_subset -from .torch_trainer import TorchTrainer +from .torch_trainer import AMPDType from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg @@ -41,6 +42,7 @@ class XVectorTrainerDeepFeatRegFromWav(XVectorTrainerDeepFeatReg): reg_loss: nn.Module loss used for regularization, if None it uses L1 loss. train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -82,6 +84,7 @@ def __init__( reg_loss=None, train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=10, use_tensorboard=False, use_wandb=False, diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 89c9b9a7..2a238a06 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import os from collections import OrderedDict as ODict @@ -12,6 +13,7 @@ from ...utils.misc import filter_func_args from ..utils import MetricAcc, TorchDDP, tensors_subset +from .torch_trainer import AMPDType from .xvector_trainer import XVectorTrainer @@ -35,6 +37,7 @@ class XVectorTrainerFromWav(XVectorTrainer): loss: if None, it uses cross-entropy train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] use_amp: uses mixed precision training. + amp_dtype: "float16" | "bfloat16" log_interval: number of optim. 
steps between log outputs use_tensorboard: use tensorboard logger use_wandb: use wandb logger @@ -70,6 +73,7 @@ def __init__( loss=None, train_mode="full", use_amp=False, + amp_dtype=AMPDType.FLOAT16, log_interval=1000, use_tensorboard=False, use_wandb=False, diff --git a/hyperion/utils/dataset.py b/hyperion/utils/dataset.py index 1b35364d..e485f1a5 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/dataset.py @@ -2,6 +2,7 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import math from copy import deepcopy @@ -619,8 +620,6 @@ def save_all( def update_from_disk(self): self.segments() self.recordings() - # for k, v in self.recordings(): - # pass for k, v in self.features(): pass @@ -810,18 +809,6 @@ def remove_recordings( self._recordings = None self._recordings_path = None - # def remove_recordings( - # self, - # recordings_name: str, - # ): - # if self._recordingsr_paths[recordings_name] is not None: - # file_path = Path(self._recordings_paths[recordings_name]) - # if file_path.is_file(): - # file_path.unlink() - - # del self._recordings[recordings_name] - # del self._recordings_paths[recordings_name] - def remove_classes(self, classes_name: str): if self._classes_paths[classes_name] is not None: self._files_to_delete.append(self._class_paths[classes_name]) @@ -855,6 +842,8 @@ def add_cols_to_segments( column_names: Union[None, str, List[str], np.ndarray] = None, on: Union[str, List[str], np.ndarray] = "id", right_on: Union[None, str, List[str], np.ndarray] = None, + remove_missing: bool = False, + create_class_info: bool = False, ): if isinstance(right_table, (str, Path)): file_path = Path(right_table) @@ -871,30 +860,50 @@ def add_cols_to_segments( raise ValueError("%s not found", right_table) segments = self.segments(keep_loaded=True) - segments.add_columns(right_table, column_names, on=on, right_on=right_on) + num_segs_0 = len(segments) + segments.add_columns( + right_table, + column_names, + on=on, + right_on=right_on, + remove_missing=remove_missing, + ) + if remove_missing and len(segments) < num_segs_0: + self.clean() + + if create_class_info and column_names is not None: + self.create_class_info_from_col(column_names) + + def create_class_info_from_col( + self, + column_names: Union[str, List[str], np.ndarray], + ): + if isinstance(column_names, str): + column_names = [column_names] + + for col in column_names: + if col not in self._classes: + df = pd.DataFrame( + {"id": np.unique(self.segments(keep_loaded=True)[col])} + ) + class_info = ClassInfo(df) + self.add_classes(col, class_info) def clean(self, rebuild_class_idx=False): + rec_ids = self.segments().recordings() - # for k, table in self.recordings(): - # # table = table.loc[table["id"].isin(rec_ids)].copy() - # # self._recordings[k] = RecordingSet(table) self._recordings = self.recordings().filter(lambda df: df["id"].isin(rec_ids)) ids = self.segments()["id"].values for k, table in self.features(): self._features[k] = table.filter(lambda df: df["id"].isin(ids)) - # table = table.loc[table["id"].isin(ids)].copy() - # self._features[k] = FeatureSet(table) for k, table in self.classes(): class_ids = self.segments()[k].unique() self._classes[k] = table.filter(lambda df: df["id"].isin(class_ids)) - # table = table[table["id"].isin(class_ids)].copy() - # self._classes[k] = ClassInfo(table) remove_keys = [] for k, table in self.enrollments(): - # table = table.loc[table["segmentid"].isin(ids)].copy() table = 
table.filter(lambda df: df["segmentid"].isin(ids)) if len(table) > 0: self._enrollments[k] = table @@ -1054,6 +1063,27 @@ def remove_classes_few_segments( class_info = self.classes_value(class_name) class_info.add_class_idx() + def remove_classes_few_toomany_segments( + self, + class_name: str, + min_segs: int, + max_segs: int, + rebuild_idx: bool = False, + ): + segments = self.segments() + classes, counts = np.unique(segments[class_name], return_counts=True) + if max_segs is None: + keep_classes = classes[counts >= min_segs] + else: + keep_classes = classes[ + np.logical_and(counts >= min_segs, counts <= max_segs) + ] + self._segments = segments.filter(lambda df: df[class_name].isin(keep_classes)) + self.clean() + if rebuild_idx: + class_info = self.classes_value(class_name) + class_info.add_class_idx() + def rebuild_class_idx(self, class_name: str): class_info = self.classes_value(class_name) class_info.add_class_idx() diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index b94d9752..ea03f058 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -8,7 +8,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path -from typing import Optional, Union, List +from typing import List, Optional, Union import numpy as np import pandas as pd @@ -194,7 +194,14 @@ def cat(cls, tables): return cls(df) def filter( - self, predicate=None, items=None, iindex=None, columns=None, by="id", keep=True + self, + predicate=None, + items=None, + iindex=None, + columns=None, + by="id", + keep=True, + raise_if_missing=True, ): """Filters the table and produce a new table with the elements to keep @@ -243,15 +250,20 @@ def filter( elif items is not None: if by != "id": missing = [False if v in df[by] else True for v in items] - if any(missing): + if any(missing) and raise_if_missing: raise Exception(f"{items[missing]} not found in table") items = [True if v in items else False for v in df[by]] + elif not raise_if_missing: + items = [item for item in items if item in df.index] if columns is None: df = df.loc[items] else: df = df.loc[items, columns] else: + if not raise_if_missing: + iindex = iindex[iindex < len(df)] + if iindex is not None: df = self.df.iloc[iindex] @@ -327,6 +339,7 @@ def add_columns( column_names: Union[None, str, List[str], np.ndarray] = None, on: Union[str, List[str], np.ndarray] = "id", right_on: Union[None, str, List[str], np.ndarray] = None, + remove_missing: bool = False, ): if isinstance(right_table, InfoTable): right_table = right_table.df @@ -337,7 +350,25 @@ def add_columns( if right_on is None: right_on = on - self.df = self.df.merge(right_table, how="left", left_on=on, right_on=right_on) + how = "inner" if remove_missing else "left" + left_index = False + right_index = False + if on == "id" or on == ["id"]: + on = None + left_index = True + + if (right_on == "id" or right_on == ["id"]) and "id" in right_table: + right_on = None + right_index = True + + self.df = self.df.merge( + right_table, + how=how, + left_on=on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) # def __len__(self): From ab8c2859fc3531da01cb7003e2881f9271130ba6 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 26 Apr 2024 16:17:15 -0400 Subject: [PATCH 132/154] added results to README in voxceleb/ssl.v1 --- egs/voxceleb/ssl.v1/README.md | 188 ++++++++++++++++++ egs/voxceleb/ssl.v1/cmd.sh | 2 +- egs/voxceleb/ssl.v1/default_config.sh | 1 + hyperion/bin/hyperion_dataset.py | 60 +++++- 
.../split_dataset_into_trials_and_cohort.py | 4 +- hyperion/data_prep/musan.py | 7 +- hyperion/data_prep/rirs.py | 18 +- hyperion/data_prep/voxceleb1.py | 7 +- hyperion/data_prep/voxceleb2.py | 7 +- hyperion/data_prep/voxsrc22.py | 7 +- hyperion/torch/lr_schedulers/cos_lr.py | 6 +- hyperion/torch/lr_schedulers/lr_scheduler.py | 2 +- hyperion/utils/__init__.py | 4 +- hyperion/utils/{dataset.py => hyp_dataset.py} | 107 ++++++++-- 14 files changed, 369 insertions(+), 51 deletions(-) create mode 100644 egs/voxceleb/ssl.v1/README.md create mode 120000 egs/voxceleb/ssl.v1/default_config.sh rename hyperion/utils/{dataset.py => hyp_dataset.py} (91%) diff --git a/egs/voxceleb/ssl.v1/README.md b/egs/voxceleb/ssl.v1/README.md new file mode 100644 index 00000000..03b2e1c4 --- /dev/null +++ b/egs/voxceleb/ssl.v1/README.md @@ -0,0 +1,188 @@ +# VoxCeleb SSL V1 + +Recipe for the Unsupervised VoxCeleb Speaker Verification Task: + - Trains embedding extractor using DINO + - Clusters embeddings of VoxCeleb2 to get pseudo-speaker labels + - Embedding model is fine-tuned with Large Margin Softmax loss on the pseudo-speaker labels + - Repeats embedding clustering to get new pseudo-speaker labels + - Embedding model is fine-tuned with Large Margin Softmax loss on the new pseudo-speaker labels + +## Citing + +If you use our DINO implementation, please cite these works: + +``` +@ARTICLE{9852303, + author={Cho, Jaejin and Villalba, Jesús and Moro-Velazquez, Laureano and Dehak, Najim}, + journal={IEEE Journal of Selected Topics in Signal Processing}, + title={Non-Contrastive Self-Supervised Learning for Utterance-Level Information Extraction From Speech}, + year={2022}, + volume={16}, + number={6}, + pages={1284-1295}, + keywords={Alzheimer's disease;Transfer learning;Speech processing;Feature extraction;Self-supervised learning;Training;Emotion recognition;Self-supervised learning;transfer learning;speaker verification;emotion recognition;Alzheimer's disease;distillation;non-contrastive}, + doi={10.1109/JSTSP.2022.3197315}} + +@inproceedings{cho22c_interspeech, + author={Jaejin Cho and Raghavendra Pappagari and Piotr Żelasko and Laureano Moro Velazquez and Jesus Villalba and Najim Dehak}, + title={{Non-contrastive self-supervised learning of utterance-level speech representations}}, + year=2022, + booktitle={Proc. Interspeech 2022}, + pages={4028--4032}, + doi={10.21437/Interspeech.2022-11141} +} +``` + +## Training Data + + - The x-vector network is trained on VoxCeleb2 dev + test with augmentations + - MUSAN noise + - RIR reverberation + +## Test Data + + - Test data is VoxCeleb 1 + - We evaluate the 3 conditions (with cleaned lists): + - VoxCeleb-O (Original): Original VoxCeleb test set with 40 speakers + - VoxCeleb-E (Entire): List using all utterances of VoxCeleb1 + - VoxCeleb-H (Hard): List of hard trials between all utterances of VoxCeleb1, same gender and nationality trials. + +## Usage + + - Run the run_0*.sh scripts in sequence + - By default, the scripts use config global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh + - To use other configs: +```bash +run_xxx_xxxx.sh --config-file global_conf/other_config.sh +```
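As a concrete example, a hypothetical invocation overriding the default config for the DINO training stage; `--config-file` and `--stage` are the options these run scripts typically parse via `parse_options.sh`, but check each script's header for the exact set:

```bash
# Hypothetical invocation (verify the available flags in the script header).
./run_005_train_dino.sh \
    --config-file global_conf/config_fbank80_stmn_lresnet34.v1.2.sh \
    --stage 1
```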
+ - Creates Babble noise from MUSAN speech to be used by the SpeechAugment class. + - Prepares RIRs by compacting them into HDF5 files, to be used by the SpeechAugment class. + + - `run_004_prepare_xvec_train_data.sh` + - Transforms all the audio files that we are going to use to train the x-vector into a common format, e.g., .flac. + - Removes silence from the audio files + - Removes utterances shorter than 4 seconds and speakers with fewer than 8 utterances. + - Creates training and validation lists for x-vector training + + - `run_005_train_dino.sh` + - Trains DINO embeddings + + - `run_006_extract_dino_embeds_cluster_eval.sh` + - Extracts DINO embeddings for Vox2 and Vox1 + - Evaluates SV metrics in Vox1-O/E/H using Cosine Scoring + - Clusters Vox2 Embeddings into pseudo-speakers + - Trains PLDA on Vox2 pseudo-speakers + - Evaluates SV metrics in Vox1-O/E/H using PLDA + + - `run_007_train_xvector.sh` + - Fine-tunes the DINO model in x-vector style using pseudo-labels from the previous step + - First, it fine-tunes the x-vector projection and output layer with the rest of the network frozen + - Second, it fine-tunes the full network + + - `run_008_extract_ft1_xvec_embeds_cluster_eval.sh` + - Extracts X-Vector embeddings for Vox2 and Vox1 + - Evaluates SV metrics in Vox1-O/E/H using Cosine Scoring + - Clusters Vox2 Embeddings into pseudo-speakers + - Trains PLDA on Vox2 pseudo-speakers + - Evaluates SV metrics in Vox1-O/E/H using PLDA + + - `run_009_finetune_xvector_s2.sh` + - Fine-tunes the x-vector model using pseudo-labels from the previous step + - First, it fine-tunes the x-vector projection and output layer with the rest of the network frozen + - Second, it fine-tunes the full network + + - `run_010_extract_ft2_xvec_embeds_cluster_eval.sh` + - Extracts X-Vector embeddings for Vox2 and Vox1 + - Evaluates SV metrics in Vox1-O/E/H using Cosine Scoring + - Clusters Vox2 Embeddings into pseudo-speakers + - Trains PLDA on Vox2 pseudo-speakers + - Evaluates SV metrics in Vox1-O/E/H using PLDA + + +## Results + +### VoxCeleb 1 Original-Clean trial list + +| Config | Model Type | DINO Clustering | X-Vector Clustering | Stage | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | --------------- | ------------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34.v1.2.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 3.96 | 0.276 | 0.423 | +| | | | | | PLDA | 3.18 | 0.182 | 0.273 | +| | | | | FT-1 | Cosine | 1.97 | 0.139 | 0.214 | +| | | | | FT-2 | Cosine | 1.80 | 0.133 | 0.200 | +| config_fbank80_stmn_lresnet34.v1.2.1.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.75 | 0.124 | 0.197 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 4.14 | 0.274 | 0.405 | +| | | | | | PLDA | 4.16 | 0.225 | 0.361 | +| | | | | FT-1 | Cosine | 2.68 | 0.173 | 0.258 | +| | | | | FT-2 | Cosine | 2.57 | 0.151 | 0.244 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.1.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 2.71 | 0.169 | 0.243 | +| config_fbank80_stmn_fwseresnet34.v1.2.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 4.57 | 0.344 | 0.553 | +| | | | | | PLDA | 2.92 | 0.232 | 0.410 | +| | | | | FT-1 | Cosine | 2.11 | 0.135 | 0.223 | +| | | | | FT-1 | PLDA | 1.75 | 0.137 | 0.236 | +| | | | | FT-2 | Cosine | 1.65 | 0.116 | 0.168 | +| | | | | FT-2 | PLDA | 1.67 | 0.137 | 0.193 | +| config_fbank80_stmn_fwseresnet34.v1.2.1.sh |
FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.49 | 0.101 | 0.161 | +| | | | | FT-2 | PLDA | 1.53 | 0.109 | 0.168| + + +### VoxCeleb 1 Entire-Clean trial list + +| Config | Model Type | DINO Clustering | X-Vector Clustering | Stage | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | --------------- | ------------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34.v1.2.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 4.94 | 0.304 | 0.483 | +| | | | | | PLDA | 3.72 | 0.184 | 0.300 | +| | | | | FT-1 | Cosine | 2.35 | 0.136 | 0.217 | +| | | | | FT-2 | Cosine | 2.02 | 0.118 | 0.195 | +| config_fbank80_stmn_lresnet34.v1.2.1.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.98 | 0.116 | 0.185 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 4.61 | 0.293 | 0.455| +| | | | | | PLDA | 3.91 | 0.223 | 0.356 | +| | | | | FT-1 | Cosine | 3.04 | 0.168 | 0.263 | +| | | | | FT-2 | Cosine | 2.83 | 0.155 | 0.248 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.1.sh| ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 3.06 | 0.164 | 0.256 | +| config_fbank80_stmn_fwseresnet34.v1.2.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 5.50 | 0.426 | 0.664 | +| | | | | | PLDA | 3.33 | 0.245 | 0.425 | +| | | | | FT-1 | Cosine | 2.42 | 0.147 | 0.243 | +| | | | | FT-1 | PLDA | 2.03 | 0.144 | 0.255 | +| | | | | FT-2 | Cosine | 1.86 | 0.112 | 0.186 | +| | | | | FT-2 | PLDA | 1.77 | 0.121 | 0.208 | +| config_fbank80_stmn_fwseresnet34.v1.2.1.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.83 | 0.106 | 0.170 | +| | | | | FT-2 | PLDA | 1.68 | 0.109 | 0.188 | + + +### VoxCeleb 1 Hard-Clean trial list + +| Config | Model Type | DINO Clustering | X-Vector Clustering | Stage | Back-end | EER(%) | MinDCF(p=0.05) | MinDCF(p=0.01) | +| ------ | ---------- | ------------- | --------------- | ------------------- | -------- | :----: | :------------: | :------------: | +| config_fbank80_stmn_lresnet34.v1.2.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 8.33 | 0.462 | 0.664 | +| | | | | | PLDA | 5.91 | 0.304 | 0.481 | +| | | | | FT-1 | Cosine | 3.89 | 0.215 | 0.340 | +| | | | | FT-2 | Cosine | 3.44 | 0.192 | 0.303 | +| config_fbank80_stmn_lresnet34.v1.2.1.sh | LResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 3.33 | 0.185 | 0.290 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.sh | ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 8.38 | 0.458 | 0.635 | +| | | | | | PLDA | 6.48 | 0.360 | 0.532 | +| | | | | FT-1 | Cosine | 4.93 | 0.259 | 0.383 | +| | | | | FT-2 | Cosine | 4.73 | 0.251 | 0.375 | +| config_fbank80_stmn_ecapatdnn512x3.v1.2.1.sh| ECAPA-TDNN 512x3 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 4.90 | 0.251 | 0.378 | +| config_fbank80_stmn_fwseresnet34.v1.2.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC+PLDA+AHC | DINO | Cosine | 10.9 | 0.644 | 0.822 | +| | | | | | PLDA | 6.86 | 0.481 | 0.745 | +| | | | | FT-1 | Cosine | 4.35 | 0.25 | 0.393 | +| | | | | FT-1 | PLDA | 4.21 | 0.281 | 0.452 +| | | | | FT-2 | Cosine | 3.37 | 0.194 | 0.309 | +| | | | | FT-2 | PLDA | 3.51 | 0.219 | 0.351 | +| config_fbank80_stmn_fwseresnet34.v1.2.1.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 3.11 | 0.172 | 0.270 | +| | | | | FT-2 | PLDA | 3.15 | 0.186 | 0.294 | + diff --git a/egs/voxceleb/ssl.v1/cmd.sh 
b/egs/voxceleb/ssl.v1/cmd.sh index 040f458b..4efc96e1 100755 --- a/egs/voxceleb/ssl.v1/cmd.sh +++ b/egs/voxceleb/ssl.v1/cmd.sh @@ -14,7 +14,7 @@ if [ "$(hostname -d)" == "cm.gemini" ];then #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" - #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" + export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 40G" export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" diff --git a/egs/voxceleb/ssl.v1/default_config.sh b/egs/voxceleb/ssl.v1/default_config.sh new file mode 120000 index 00000000..f8aa12d5 --- /dev/null +++ b/egs/voxceleb/ssl.v1/default_config.sh @@ -0,0 +1 @@ +global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh \ No newline at end of file diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 2bd01f2d..3bb61fb0 100755 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -18,9 +18,9 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import ( ClassInfo, - Dataset, EnrollmentMap, FeatureSet, + HypDataset, InfoTable, PathLike, RecordingSet, @@ -81,7 +81,7 @@ def add_features( if output_dataset is None: output_dataset = dataset - dataset = Dataset.load(dataset, lazy=True) + dataset = HypDataset.load(dataset, lazy=True) dataset.add_features(features_name, features_file) dataset.save(output_dataset) @@ -128,7 +128,7 @@ def set_recordings( if output_dataset is None: output_dataset = dataset - dataset = Dataset.load(dataset, lazy=True) + dataset = HypDataset.load(dataset, lazy=True) dataset.set_recordings(recordings_file, update_seg_durs) if remove_features is not None: for features_name in remove_features: @@ -161,7 +161,7 @@ def make_from_recordings( rec_df = pd.read_csv(recordings_file) seg_df = rec_df[["id"]] segments = SegmentSet(seg_df) - dataset = Dataset(segments, recordings=recordings_file) + dataset = HypDataset(segments, recordings=recordings_file) dataset.save(output_dataset) @@ -202,7 +202,7 @@ def remove_short_segments( if output_dataset is None: output_dataset = dataset - dataset = Dataset.load(dataset, lazy=True) + dataset = HypDataset.load(dataset, lazy=True) dataset.remove_short_segments(min_length, length_name) dataset.save(output_dataset) @@ -234,7 +234,7 @@ def rebuild_class_idx( if output_dataset is None: output_dataset = dataset - dataset = Dataset.load(dataset, lazy=True) + dataset = HypDataset.load(dataset, lazy=True) dataset.rebuild_class_idx(class_name) dataset.save(output_dataset) @@ -277,7 +277,7 @@ def remove_classes_few_segments( if output_dataset is None: output_dataset = dataset - dataset = Dataset.load(dataset, lazy=True) + dataset = HypDataset.load(dataset, lazy=True) dataset.remove_classes_few_segments(class_name, min_segs, rebuild_idx) dataset.save(output_dataset) @@ -324,7 +324,7 @@ def remove_classes_few_toomany_segments( if output_dataset is None: output_dataset = dataset - dataset = Dataset.load(dataset, lazy=True) + dataset = HypDataset.load(dataset, lazy=True) dataset.remove_classes_few_toomany_segments( class_name, min_segs, max_segs, rebuild_idx ) @@ -394,7 +394,7 @@ def split_train_val( train_dataset: PathLike, val_dataset: PathLike, ): - dataset = Dataset.load(dataset, lazy=True) + 
dataset = HypDataset.load(dataset, lazy=True) train_ds, val_ds = dataset.split_train_val( val_prob, joint_classes, disjoint_classes, min_train_samples, seed ) @@ -433,7 +433,7 @@ def copy( dataset: PathLike, output_dataset: PathLike, ): - dataset = Dataset.load(dataset, lazy=True) + dataset = HypDataset.load(dataset, lazy=True) dataset.save(output_dataset) @@ -502,7 +502,7 @@ def add_cols_to_segments( if output_dataset is None: output_dataset = dataset - dataset = Dataset.load(dataset, lazy=True) + dataset = HypDataset.load(dataset, lazy=True) dataset.add_cols_to_segments( right_table, column_names, @@ -514,6 +514,44 @@ def add_cols_to_segments( dataset.save(output_dataset) +def make_from_lhotse_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--cuts-file", + default=None, + help="lhotse cuts file", + ) + parser.add_argument( + "--recordings-file", + default=None, + help="lhotse recordings set file", + ) + parser.add_argument( + "--supervisions-file", + default=None, + help="lhotse supervisions file", + ) + + +def from_lhotse( + dataset: PathLike, + cuts_file: Optional[PathLike] = None, + recordings_file: Optional[PathLike] = None, + supervisions_file: Optional[PathLike] = None, +): + + assert cuts_file is not None or supervisions_file is not None + dataset_dir = dataset + dataset = HypDataset.from_lhotse( + cuts=cuts_file, recordings=recordings_file, supervisions=supervisions_file + ) + dataset.save(dataset_dir) + + def main(): parser = ArgumentParser(description="Tool to manipulates the Hyperion dataset") parser.add_argument("--cfg", action=ActionConfigFile) diff --git a/hyperion/bin/split_dataset_into_trials_and_cohort.py b/hyperion/bin/split_dataset_into_trials_and_cohort.py index 50c2f1f2..a5935910 100755 --- a/hyperion/bin/split_dataset_into_trials_and_cohort.py +++ b/hyperion/bin/split_dataset_into_trials_and_cohort.py @@ -15,7 +15,7 @@ ) from hyperion.hyp_defs import config_logger -from hyperion.utils import Dataset +from hyperion.utils import HypDataset def main(): @@ -63,7 +63,7 @@ def main(): del args.trials_dir args = namespace_to_dict(args) - dataset = Dataset.load(data_dir) + dataset = HypDataset.load(data_dir) trials_dataset, cohort_dataset = dataset.split_into_trials_and_cohort(**args) trials_dataset.save(trials_dir) cohort_dataset.save(cohort_dir) diff --git a/hyperion/data_prep/musan.py b/hyperion/data_prep/musan.py index abf7a46c..b14785b8 100644 --- a/hyperion/data_prep/musan.py +++ b/hyperion/data_prep/musan.py @@ -2,8 +2,9 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import logging + import glob +import logging import re from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -13,7 +14,7 @@ from jsonargparse import ActionYesNo from tqdm import tqdm -from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils import HypDataset, RecordingSet, SegmentSet from ..utils.misc import PathLike, urlretrieve_progress from .data_prep import DataPrep @@ -95,7 +96,7 @@ def prepare(self): segments = SegmentSet(segments) segments.sort() logging.info("making dataset") - dataset = Dataset( + dataset = HypDataset( segments, recordings=recs, ) diff --git a/hyperion/data_prep/rirs.py b/hyperion/data_prep/rirs.py index 066819a8..accf7bad 100644 --- a/hyperion/data_prep/rirs.py +++ b/hyperion/data_prep/rirs.py @@ -2,8
+2,9 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import logging + import glob +import logging import re from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -13,7 +14,7 @@ from jsonargparse import ActionYesNo from tqdm import tqdm -from ..utils import Dataset, RecordingSet, SegmentSet +from ..utils import HypDataset, RecordingSet, SegmentSet from ..utils.misc import PathLike, urlretrieve_progress from .data_prep import DataPrep @@ -88,16 +89,23 @@ def prepare(self): logging.info("making SegmentsSet") segments = pd.DataFrame( - {"id": rec_ids, "duration": recs.loc[rec_ids, "duration"].values,} + { + "id": rec_ids, + "duration": recs.loc[rec_ids, "duration"].values, + } ) if room_ids is not None: segments["room_id"] = room_ids segments = SegmentSet(segments) segments.sort() logging.info("making dataset") - dataset = Dataset(segments, recordings=recs,) + dataset = HypDataset( + segments, + recordings=recs, + ) logging.info("saving dataset at %s", self.output_dir) dataset.save(self.output_dir) logging.info( - "datasets containts %d segments", len(segments), + "dataset contains %d segments", + len(segments), ) diff --git a/hyperion/data_prep/voxceleb1.py b/hyperion/data_prep/voxceleb1.py index 025fad37..56cf0c59 100644 --- a/hyperion/data_prep/voxceleb1.py +++ b/hyperion/data_prep/voxceleb1.py @@ -2,18 +2,19 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + +import glob import logging import re from concurrent.futures import ThreadPoolExecutor from pathlib import Path -import glob import numpy as np import pandas as pd from jsonargparse import ActionYesNo from tqdm import tqdm -from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils import ClassInfo, HypDataset, RecordingSet, SegmentSet from ..utils.misc import PathLike, urlretrieve_progress from .data_prep import DataPrep @@ -328,7 +329,7 @@ def prepare(self): enrollments, trials = self.make_trials() logging.info("making dataset") - dataset = Dataset( + dataset = HypDataset( segments, classes={"speaker": speakers, "language_est": languages}, recordings=recs, diff --git a/hyperion/data_prep/voxceleb2.py b/hyperion/data_prep/voxceleb2.py index 969f2228..550af3a8 100644 --- a/hyperion/data_prep/voxceleb2.py +++ b/hyperion/data_prep/voxceleb2.py @@ -2,8
(http://www.apache.org/licenses/LICENSE-2.0) """ -import logging + import glob +import logging import re from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -13,7 +14,7 @@ from jsonargparse import ActionYesNo from tqdm import tqdm -from ..utils import ClassInfo, Dataset, RecordingSet, SegmentSet +from ..utils import ClassInfo, HypDataset, RecordingSet, SegmentSet from ..utils.misc import PathLike, urlretrieve_progress from .data_prep import DataPrep @@ -149,7 +150,7 @@ def prepare_track12_dev(self): segments.sort() logging.info("making dataset") - dataset = Dataset( + dataset = HypDataset( segments, recordings=recs, enrollments=enrollments, diff --git a/hyperion/torch/lr_schedulers/cos_lr.py b/hyperion/torch/lr_schedulers/cos_lr.py index b9e7d069..c2ea8ec3 100644 --- a/hyperion/torch/lr_schedulers/cos_lr.py +++ b/hyperion/torch/lr_schedulers/cos_lr.py @@ -3,7 +3,6 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - import logging import math @@ -32,7 +31,6 @@ class CosineLR(LRScheduler): optimizer: Pytorch optimizer object. T: period of the cycle. T_mul: period multiplier, after each cycle the period is multiplied by T_mul. - hold_steps: number of steps until the lr starts decaying. min_lr: minimum learning rate. warmup_steps: number of warm up steps to get the lr from 0 to the maximum lr. warm_restarts: whether or not to do warm restarts. @@ -103,7 +101,7 @@ def get_lr(self, step): else: return self.min_lrs - alpha = self.gamma ** self.num_restarts + alpha = self.gamma**self.num_restarts r = math.pi / self.T return [ @@ -182,7 +180,7 @@ def get_lr(self, step): else: return self.min_lrs - alpha = self.gamma ** self.num_restarts + alpha = self.gamma**self.num_restarts r = math.pi / self.T return [ diff --git a/hyperion/torch/lr_schedulers/lr_scheduler.py b/hyperion/torch/lr_schedulers/lr_scheduler.py index 5008e1be..d609bf26 100644 --- a/hyperion/torch/lr_schedulers/lr_scheduler.py +++ b/hyperion/torch/lr_schedulers/lr_scheduler.py @@ -90,7 +90,7 @@ def load_state_dict(self, state_dict): def get_warmup_lr(self): x = self.step return [ - (base_lr - min_lr) / self.warmup_steps * x + min_lr + (base_lr - min(min_lr, 1e-8)) / self.warmup_steps * x + min(min_lr, 1e-8) for base_lr, min_lr in zip(self.base_lrs, self.min_lrs) ] diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index e8ad5056..9bc51181 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -3,12 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .info_table import InfoTable from .class_info import ClassInfo -from .dataset import Dataset from .enrollment_map import EnrollmentMap from .feature_set import FeatureSet from .hyp_dataclass import HypDataClass +from .hyp_dataset import HypDataset +from .info_table import InfoTable from .kaldi_matrix import KaldiCompressedMatrix, KaldiMatrix from .misc import PathLike from .recording_set import RecordingSet diff --git a/hyperion/utils/dataset.py b/hyperion/utils/hyp_dataset.py similarity index 91% rename from hyperion/utils/dataset.py rename to hyperion/utils/hyp_dataset.py index e485f1a5..dbf268da 100644 --- a/hyperion/utils/dataset.py +++ b/hyperion/utils/hyp_dataset.py @@ -26,7 +26,7 @@ from .trial_ndx import TrialNdx -class Dataset: +class HypDataset: """Class that contains all objects (segments, recordings, features, class_infos) that conform a dataset @@ -390,7 +390,7 @@ def save_changed( self.table_sep = table_sep table_ext = ".tsv" if table_sep == "\t" else ".csv" - dataset_dir, 
dataset_file = Dataset.resolve_dataset_path(dataset_path) + dataset_dir, dataset_file = HypDataset.resolve_dataset_path(dataset_path) dataset = {} file_name = f"segments{table_ext}" dataset["segments"] = file_name @@ -536,7 +536,7 @@ def save_all( self.table_sep = table_sep table_ext = ".tsv" if table_sep == "\t" else ".csv" - dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + dataset_dir, dataset_file = HypDataset.resolve_dataset_path(dataset_path) dataset = {} file_name = f"segments{table_ext}" dataset["segments"] = file_name @@ -647,12 +647,12 @@ def load( sparse_trials: load trial keys using the SparseTrialKey class instead of TrialKey class """ - dataset_dir, dataset_file = Dataset.resolve_dataset_path(dataset_path) + dataset_dir, dataset_file = HypDataset.resolve_dataset_path(dataset_path) with open(dataset_file, "r") as f: dataset = yaml.safe_load(f) assert "segments" in dataset - segments = Dataset.resolve_file_path(dataset_dir, dataset["segments"]) + segments = HypDataset.resolve_file_path(dataset_dir, dataset["segments"]) classes = None recordings = None features = None @@ -661,28 +661,30 @@ def load( if "classes" in dataset: classes = {} for k, v in dataset["classes"].items(): - classes[k] = Dataset.resolve_file_path(dataset_dir, v) + classes[k] = HypDataset.resolve_file_path(dataset_dir, v) if "recordings" in dataset: - recordings = Dataset.resolve_file_path(dataset_dir, dataset["recordings"]) + recordings = HypDataset.resolve_file_path( + dataset_dir, dataset["recordings"] + ) # recordings = {} # for k, v in dataset["recordings"].items(): - # recordings[k] = Dataset.resolve_file_path(dataset_dir, v) + # recordings[k] = HypDataset.resolve_file_path(dataset_dir, v) if "features" in dataset: features = {} for k, v in dataset["features"].items(): - features[k] = Dataset.resolve_file_path(dataset_dir, v) + features[k] = HypDataset.resolve_file_path(dataset_dir, v) if "enrollments" in dataset: enrollments = {} for k, v in dataset["enrollments"].items(): - enrollments[k] = Dataset.resolve_file_path(dataset_dir, v) + enrollments[k] = HypDataset.resolve_file_path(dataset_dir, v) if "trials" in dataset: trials = {} for k, v in dataset["trials"].items(): - trials[k] = Dataset.resolve_file_path(dataset_dir, v) + trials[k] = HypDataset.resolve_file_path(dataset_dir, v) dataset = cls( segments, @@ -991,8 +993,8 @@ def split_into_trials_and_cohort( intra_gender: if True, no cross gender trials are done. Returns: - Dataset used for trials with trial list. - Dataset used for cohort. + HypDataset used for trials with trial list. + HypDataset used for cohort. """ num_tar_trials = num_1k_tar_trials * 1000 if intra_gender: @@ -1258,4 +1260,83 @@ def from_lhotse( recordings: Optional[Union[lhotse.RecordingSet, PathLike]] = None, supervisions: Optional[Union[lhotse.SupervisionSet, PathLike]] = None, ): + """Creates a Hyperion Dataset from a lhotse CutSet or + from a lhotse RecordingSet + SupervisionSet + + Args: + cuts: lhotse CutSet manifest or file + recordings: lhotse RecordingSet manifest or file + supervisions: lhotse SupervisionSet manifest or file. 
+ + Returns: + HypDataset object + """ + assert cuts is not None or supervisions is not None + if cuts is not None: + if isinstance(cuts, (str, Path)): + cuts = lhotse.CutSet.from_file(cuts) + else: + if isinstance(supervisions, (str, Path)): + supervisions = lhotse.SupervisionSet.from_file(supervisions) + + if recordings is not None and isinstance(recordings, (str, Path)): + recordings = lhotse.RecordingSet.from_file(recordings) + + cuts = lhotse.CutSet.from_manifests( + recordings=recordings, supervisions=supervisions + ) + + from lhotse import MonoCut, Recording, SupervisionSegment + + supervision_keys = ["speaker", "gender", "language", "text", "duration"] + recs_df = [] + segs_df = [] + for cut in cuts: + supervision = cut.supervisions[0] + recording = cut.recording + seg_dict = {"id": cut.id} + recording = cut.recording + if recording is not None: + if recording.id != cut.id: + seg_dict["recording_id"] = recording.id + + rec_dict = { + "id": recording.id, + "sampling_rate": recording.sampling_rate, + "duration": recording.duration, + } + source = recording.sources[0] + assert len(recording.sources) == 1 + assert source.type in ["file", "command"] + rec_dict["storage_path"] = source.source + assert recording.transforms is None, f"{recording.transforms}" + recs_df.append(rec_dict) + + for key in supervision_keys: + if hasattr(supervision, key): + val = getattr(supervision, key) + if val is not None: + seg_dict[key] = val + + if supervision.custom is not None: + for key, val in supervision.custom.items(): + if val is not None: + seg_dict[key] = val + + segs_df.append(seg_dict) + + recs_df = pd.DataFrame(recs_df) + segs_df = pd.DataFrame(segs_df) + recordings = RecordingSet(recs_df) + segments = SegmentSet(segs_df) + class_names = ["speaker", "language", "emotion", "gender"] + classes = {} + for key in class_names: + if key in segments: + uniq_classes = np.unique(segments[key]) + classes[key] = pd.DataFrame({"id": uniq_classes}) + + dataset = cls(segments=segments, classes=classes, recordings=recordings) + return dataset + return None
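Editor's note: a hypothetical sketch (not part of the patch) of how the new lhotse interoperability is meant to be used, per the `from_lhotse` signature above; the manifest file names are made up for illustration:

```python
from hyperion.utils import HypDataset

# From a lhotse CutSet manifest file:
dataset = HypDataset.from_lhotse(cuts="data/lhotse/cuts.jsonl.gz")

# Or from RecordingSet + SupervisionSet manifest files:
dataset = HypDataset.from_lhotse(
    recordings="data/lhotse/recordings.jsonl.gz",
    supervisions="data/lhotse/supervisions.jsonl.gz",
)
dataset.save("data/my_dataset")
```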
From dcabebe78e2ba201e56227aa6094942646bbbcfa Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 3 May 2024 14:57:58 -0400 Subject: [PATCH 133/154] added cosine loss to dino --- .../conf/train_fwseresnet34_dino_v1.2.2.yaml | 96 +++++++++++ ...config_fbank80_stmn_fwseresnet34.v1.2.2.sh | 67 +++++++++ hyperion/bin/train_dino_wav2xvector.py | 20 ++- hyperion/torch/losses/__init__.py | 2 +- hyperion/torch/losses/dino_loss.py | 81 +++++++++++ hyperion/torch/models/xvectors/xvector.py | 109 ++++++++++++-- .../torch/trainers/dino_xvector_trainer.py | 134 ++++++++++++++---- .../torch/trainers/xvector_adv_trainer.py | 6 +- .../trainers/xvector_adv_trainer_from_wav.py | 9 +- hyperion/torch/trainers/xvector_trainer.py | 4 +- .../trainers/xvector_trainer_deep_feat_reg.py | 7 +- .../xvector_trainer_deep_feat_reg_from_wav.py | 8 +- .../trainers/xvector_trainer_from_wav.py | 4 +- 13 files changed, 481 insertions(+), 66 deletions(-) create mode 100644 egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml create mode 100644 egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh diff --git a/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml new file mode 100644 index 00000000..6d6e60a9 --- /dev/null +++ b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml @@ -0,0 +1,96 @@ +data: + train: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 + val: + dataset: + teacher_aug_cfg: conf/teacher_reverb_noise_aug.yaml + student_aug_cfg: conf/reverb_noise_aug.yaml + student_chunk_length: 2. + teacher_chunk_length: 4. + num_teacher_chunks: 2 + num_student_chunks: 4 + same_teacher_student_chunks: false + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 12.0 + min_chunk_length: 6.0 + data_loader: + num_workers: 8 +student_model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + dropout_rate: 0.01 + norm_before: false + hid_act: swish + se_r: 4 + head_type: dino + embed_dim: 192 + num_embed_layers: 3 + loss_type: softmax + head_use_norm: true + head_hid_dim: 768 + head_bottleneck_dim: 192 + proj_head_use_norm: true + proj_head_norm_before: false +teacher_model: + xvector: + override_dropouts: true + dropout_rate: 0.0 +dino_loss: + num_classes: 65536 + temp_warmup_epochs: 0 + teacher_temp: 0.04 +cosine_loss: + warmup_epochs: 20 + scale: 0.1 +trainer: + optim: + opt_type: adamw + lr: 0.005 + amsgrad: false + beta1: 0.9 + beta2: 0.99 + weight_decay: 1e-1 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 60000 + hold_steps: 15000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + teacher_optim: + init_momentum: 0.996 + momentum: 1.0 + warmup_steps: 500000 + grad_clip: 15 + use_amp: true + log_interval: 1000 + epochs: 100 + eff_batch_size: 256 + train_mode: full + freeze_output_layer_steps: 1500 diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh new file mode 100644 index 00000000..13a72732 --- /dev/null +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh @@ -0,0 +1,67 @@ +# FW-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34_dino.v1.2.2 + +nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.2.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/teacher_model_ep0034.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0025.pth + +# clustering of dino embeddings +cluster_method=cos_ahc_plda_ahc +cluster_cfg=conf/cluster_lresnet34_v1.2_cos_ahc_plda_ahc.yaml +cluster_name=${cluster_method} +cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name + +# plda +plda_cfg=conf/plda.yaml + +# finetuning stage 1.1 +nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 +nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth + +# finetuning stage 1.2 +nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 +nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name +nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 1.2
+cluster_ft_s1_method=cos_ahc +cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s1_name=${cluster_ft_s1_method} +cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name + +# finetuning stage 2.1 +nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 +nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth + +# finetuning stage 2.2 +nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 +nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name +nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth + +# clustering of ft embeddings from stage 2.2 +cluster_ft_s2_method=cos_ahc +cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml +cluster_ft_s2_name=${cluster_ft_s2_method} +cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name + diff --git a/hyperion/bin/train_dino_wav2xvector.py b/hyperion/bin/train_dino_wav2xvector.py index d1cd108e..cb541f55 100755 --- a/hyperion/bin/train_dino_wav2xvector.py +++ b/hyperion/bin/train_dino_wav2xvector.py @@ -19,7 +19,7 @@ from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import DINOAudioDataset as AD from hyperion.torch.data import SegSamplerFactory -from hyperion.torch.losses import DINOLoss +from hyperion.torch.losses import CosineDINOLoss, DINOLoss from hyperion.torch.metrics import CategoricalAccuracy # from hyperion.torch.models import EfficientNetXVector as EXVec @@ -109,6 +109,21 @@ def init_dino_loss(rank, **kwargs): return loss +def init_cosine_loss(rank, **kwargs): + loss_args = kwargs["cosine_loss"] + if rank == 0: + logging.info(f"cosine loss args={loss_args}") + + if loss_args["scale"] <= 0: + return None + + loss = CosineDINOLoss(**loss_args) + if rank == 0: + logging.info(f"cosine-loss={loss}") + + return loss + + def train_xvec(gpu_id, args): config_logger(args.verbose) del args.verbose @@ -126,6 +141,7 @@ def train_xvec(gpu_id, args): val_loader = init_data(partition="val", **kwargs) dino_loss = init_dino_loss(**kwargs) + cosine_loss = init_cosine_loss(**kwargs) student_model = init_student_xvector(num_classes=dino_loss.num_classes, **kwargs) kwargs["student_model"] = student_model teacher_model = init_teacher_xvector(**kwargs) @@ -138,6 +154,7 @@ def train_xvec(gpu_id, args): student_model, teacher_model, dino_loss, + cosine_loss=cosine_loss, device=device, metrics=metrics, ddp=world_size > 1, @@ -185,6 +202,7 @@ def make_parser(xvec_class): xvec_class.add_class_args(parser, prefix="student_model") xvec_class.add_dino_teacher_args(parser, prefix="teacher_model") DINOLoss.add_class_args(parser, prefix="dino_loss") + CosineDINOLoss.add_class_args(parser, prefix="cosine_loss") Trainer.add_class_args( parser, prefix="trainer", train_modes=xvec_class.valid_train_modes() ) diff --git a/hyperion/torch/losses/__init__.py b/hyperion/torch/losses/__init__.py index 6f68ad45..56ad2a5d 100644 --- a/hyperion/torch/losses/__init__.py +++ b/hyperion/torch/losses/__init__.py @@ -4,4 +4,4 @@ """ from .bce_with_llr import BCEWithLLR -from .dino_loss import DINOLoss +from .dino_loss import CosineDINOLoss, DINOLoss diff --git a/hyperion/torch/losses/dino_loss.py b/hyperion/torch/losses/dino_loss.py index 55f8e846..c5f499c8 100644 --- a/hyperion/torch/losses/dino_loss.py +++ b/hyperion/torch/losses/dino_loss.py @@ -2,6 +2,7 @@ Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0
(http://www.apache.org/licenses/LICENSE-2.0) """ + import logging import torch @@ -162,3 +163,83 @@ def add_class_args(parser, prefix=None): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + +class CosineDINOLoss(nn.Module): + """Cosine Loss to regularize DINO + and enforce DINO embeddings to be suitable for cosine scoring + + """ + + def __init__( + self, + scale: float = 1.0, + warmup_epochs: int = 30, + ): + super().__init__() + self.scale = scale + self.warmup_epochs = warmup_epochs + self.cur_scale = scale + + def update_scale(self, epoch: int): + if epoch < self.warmup_epochs: + self.cur_scale = self.scale * epoch / self.warmup_epochs + logging.info("updating cosine-loss scale=%.3f", self.cur_scale) + else: + self.cur_scale = self.scale + + def forward( + self, + student_embed: torch.Tensor, + teacher_embed: torch.Tensor, + num_student_crops: int, + num_teacher_crops: int, + ): + """ + Cosine scoring between embeddings of the teacher and student networks. + """ + if self.scale == 0: + return torch.zeros(1, device=student_embed.device), torch.zeros(1, device=student_embed.device) + + student_embed = torch.nn.functional.normalize(student_embed, dim=-1) + teacher_embed = torch.nn.functional.normalize(teacher_embed, dim=-1) + student_embed = student_embed.chunk(num_student_crops) + teacher_embed = teacher_embed.detach() + teacher_embed = teacher_embed.chunk(num_teacher_crops) + + total_loss = 0 + n_loss_terms = 0 + for iq, q in enumerate(teacher_embed): + for ip, p in enumerate(student_embed): + if ip == iq and num_teacher_crops > 1: + # we skip cases where student and teacher operate on the same view + continue + loss = 1 - torch.sum(q * p, dim=-1) + total_loss += loss.mean() + n_loss_terms += 1 + total_loss /= n_loss_terms + + return self.cur_scale * total_loss, total_loss + + @staticmethod + def filter_args(**kwargs): + return filter_func_args(CosineDINOLoss.__init__, kwargs) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + parser.add_argument( + "--scale", default=0, type=float, help="Scale of Cosine loss to reg. DINO" + ) + parser.add_argument( + "--warmup-epochs", + default=30, + type=int, + help="warmup epochs for the scale", + ) + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))
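Editor's note: `CosineDINOLoss` above averages `1 - cosine(teacher, student)` over teacher/student crop pairs, skipping pairs where both networks saw the same view, and returns both the scaled and unscaled values. A minimal usage sketch (the batch size, crop counts, and embedding dimension are made up for illustration):

```python
import torch
from hyperion.torch.losses import CosineDINOLoss

loss_fn = CosineDINOLoss(scale=0.1, warmup_epochs=20)
loss_fn.update_scale(epoch=10)  # scale ramps up linearly during warmup

# 2 teacher crops and 4 student crops of a batch of 8 utterances,
# concatenated along the batch dimension, embedding dim 192
teacher_embed = torch.randn(2 * 8, 192)
student_embed = torch.randn(4 * 8, 192)
scaled_loss, loss = loss_fn(
    student_embed, teacher_embed, num_student_crops=4, num_teacher_crops=2
)
```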
diff --git a/hyperion/torch/models/xvectors/xvector.py b/hyperion/torch/models/xvectors/xvector.py index c20f5520..17d77116 100644 --- a/hyperion/torch/models/xvectors/xvector.py +++ b/hyperion/torch/models/xvectors/xvector.py @@ -384,11 +384,15 @@ class logits tensor with shape=(batch, num_classes). x = x[0] x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) + xvector = None if self.proj_head_net is not None: p = self.proj_head_net(p) - y = self.classif_net(p, y) + xvector = p - return y + logits = self.classif_net(p, y) + # return logits + output = XVectorOutput(None, logits, xvector) + return output def forward_hid_feats( self, @@ -432,14 +436,67 @@ def forward_hid_feats( ) if return_logits: h_classif, y_pred = h_classif - output["h_classif"] = h_classif - output["logits"] = y_pred - return output + else: + y_pred = None + + if h_classif is not None: + xvector = h_classif[0] + else: + xvector = None - output["h_classif"] = h_classif + output = XVectorOutput(None, y_pred, xvector, h_enc, h_classif) return output - def extract_embed( + # def forward_hid_feats( + # self, + # x, + # x_lengths=None, + # y=None, + # return_enc_layers=None, + # return_classif_layers=None, + # return_logits=False, + # ): + # """forwards hidden representations in the x-vector network + + # Args: + # x: input features tensor with shape=(batch, in_feats, time). + # x_lengths: time lengths of the features with shape=(batch,). + # y: target classes torch.long tensor with shape=(batch,). + # return_enc_layers: list of integers indicating, which encoder layers + # we should return. If None, no encoder layers are returned. + # return_enc_layers: list of integers indicating, which classification head layers + # we should return. If None, no head layers are returned. + # return_logits: if True, it adds the logits to the output dictionary. + # Returns: + # Dictionary with "logits", "h_enc" (list of hidden encoder layers), + # "h_classif" (list hidden classification head layers). + # """ + # max_in_length = x.size(-1) + # x = self._pre_enc(x) + # h_enc, x = self.encoder_net.forward_hid_feats( + # x, return_enc_layers, return_output=True + # ) + # output = {"h_enc": h_enc} + # if not return_logits and return_classif_layers is None: + # return output + + # x, x_lengths = self._post_enc(x, x_lengths, max_in_length) + # p = self.pool_net(x, x_lengths=x_lengths) + # if self.proj_head_net is not None: + # p = self.proj_head_net(p) + # h_classif = self.classif_net.forward_hid_feats( + # p, y, return_classif_layers, return_logits=return_logits + # ) + # if return_logits: + # h_classif, y_pred = h_classif + # output["h_classif"] = h_classif + # output["logits"] = y_pred + # return output + + # output["h_classif"] = h_classif + # return output + + def extract_embed_impl( self, x, x_lengths=None, chunk_length=0, embed_layer=None, detach_chunks=False ): if embed_layer is None: @@ -447,12 +504,17 @@ def extract_embed( max_in_length = x.size(-1) x = self._pre_enc(x) - x = eval_nnet_by_chunks( - x, self.encoder_net, chunk_length, detach_chunks=detach_chunks - ) + if max_in_length <= chunk_length or chunk_length == 0: + x = self.encoder_net(x, x_lengths=x_lengths) + if isinstance(x, tuple): + x = x[0] + else: + x = eval_nnet_by_chunks( + x, self.encoder_net, chunk_length, detach_chunks=detach_chunks + ) - if x.device != self.device: - x = x.to(self.device) + if x.device != self.device: + x = x.to(self.device) x, x_lengths = self._post_enc(x, x_lengths, max_in_length) p = self.pool_net(x, x_lengths=x_lengths) @@ -462,6 +524,29 @@ def extract_embed( y = self.classif_net.extract_embed(p, embed_layer) return y + def extract_embed( + self, x, x_lengths=None, chunk_length=0, embed_layer=None, detach_chunks=False + ): + + if x.size(-1) <= chunk_length or chunk_length == 0: + return
self.extract_embed_impl(x, x_lengths, 0, embed_layer) + else: + e = [] + for i in range(x.size(0)): + x_i = x[i : i + 1] + if x_lengths is not None: + x_i = x_i[..., : x_lengths[i]] + + e_i = self.extract_embed_impl( + x_i, + chunk_length=chunk_length, + embed_layer=embed_layer, + detach_chunks=detach_chunks, + ) + e.append(e_i) + + return torch.cat(e, dim=0) + def extract_embed_slidwin( self, x, diff --git a/hyperion/torch/trainers/dino_xvector_trainer.py b/hyperion/torch/trainers/dino_xvector_trainer.py index 16a15304..6573c21a 100644 --- a/hyperion/torch/trainers/dino_xvector_trainer.py +++ b/hyperion/torch/trainers/dino_xvector_trainer.py @@ -63,6 +63,7 @@ def __init__( loss, optim, teacher_optim, + cosine_loss=None, epochs=100, exp_path="./train", cur_epoch=0, @@ -96,12 +97,16 @@ def __init__( self.teacher_model = teacher_model self.teacher_optim = teacher_optim self.freeze_output_layer_steps = freeze_output_layer_steps + self.cosine_loss = cosine_loss super().__init__(student_model, **super_args) def prepare_models_for_training(self): super().prepare_models_for_training() self.teacher_model, self.teacher_optimizer = self._prepare_model_for_ema( - self.teacher_model, self.teacher_optim, self.device, self.ddp, + self.teacher_model, + self.teacher_optim, + self.device, + self.ddp, ) def _prepare_model_for_ema(self, model, optim, device, ddp): @@ -168,6 +173,8 @@ def train_epoch(self, data_loader): self.teacher_model.train() self.loss.update_temp(self.cur_epoch) self.loss.train() + if self.cosine_loss is not None: + self.cosine_loss.update_scale(self.cur_epoch) for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) @@ -184,32 +191,54 @@ def train_epoch(self, data_loader): num_teacher_crops = len(teacher_data) teacher_data = torch.cat(teacher_data, dim=0) teacher_out = self.teacher_model(teacher_data) - assert not torch.any(torch.isnan(teacher_out)), "teacher is nan" - assert not torch.any(torch.isinf(teacher_out)), "teacher is inf" + assert not torch.any( + torch.isnan(teacher_out.logits) + ), "teacher is nan" + assert not torch.any( + torch.isinf(teacher_out.logits) + ), "teacher is inf" if num_teacher_crops > 1: student_out1 = self.model(teacher_data) - assert not torch.any(torch.isnan(student_out1)), "s1 is nan" - assert not torch.any(torch.isinf(student_out1)), "s1 is inf" + assert not torch.any(torch.isnan(student_out1.logits)), "s1 is nan" + assert not torch.any(torch.isinf(student_out1.logits)), "s1 is inf" student_data = tensors_subset(data, student_keys, self.device) num_student_crops = len(student_data) student_data = torch.cat(student_data, dim=0) student_out2 = self.model(student_data) - assert not torch.any(torch.isnan(student_out2)), "s2 is nan" - assert not torch.any(torch.isinf(student_out2)), "s2 is inf" + assert not torch.any(torch.isnan(student_out2.logits)), "s2 is nan" + assert not torch.any(torch.isinf(student_out2.logits)), "s2 is inf" if num_teacher_crops > 1: - student_out = torch.cat((student_out1, student_out2), dim=0) + student_out_logits = torch.cat( + (student_out1.logits, student_out2.logits), dim=0 + ) + if self.cosine_loss is not None: + student_out_embeds = torch.cat( + (student_out1.xvector, student_out2.xvector), dim=0 + ) num_student_crops += num_teacher_crops else: - student_out = student_out2 - - loss = ( - self.loss( - student_out, teacher_out, num_student_crops, num_teacher_crops - ) - / self.grad_acc_steps + student_out_logits = student_out2.logits + student_out_embeds = student_out2.xvector + + loss_dino = self.loss( +
student_out_logits, + teacher_out.logits, + num_student_crops, + num_teacher_crops, ) + loss = loss_dino + if self.cosine_loss is not None: + scaled_loss_cosine, loss_cosine = self.cosine_loss( + student_out_embeds, + teacher_out.xvector, + num_student_crops, + num_teacher_crops, + ) + loss = loss_dino + scaled_loss_cosine + + loss = loss / self.grad_acc_steps assert not torch.isnan( loss ), f"loss is nan {batch} {torch.mean(teacher_out)} {torch.mean(student_out1)} {torch.mean(student_out2)}" @@ -229,8 +258,9 @@ def train_epoch(self, data_loader): self.save_checkpoint(partial=True) batch_metrics["loss"] = loss.item() * self.grad_acc_steps - # for k, metric in self.metrics.items(): - # batch_metrics[k] = metric(output, target) + if self.cosine_loss is not None: + batch_metrics["loss_dino"] = loss_dino.item() + batch_metrics["loss_cosine"] = loss_cosine.item() metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -275,31 +305,53 @@ def validation_epoch(self, data_loader, swa_update_bn=False): num_teacher_crops = len(teacher_data) teacher_data = torch.cat(teacher_data, dim=0) teacher_out = self.teacher_model(teacher_data) - assert not torch.any(torch.isnan(teacher_out)), "teacher is nan" - assert not torch.any(torch.isinf(teacher_out)), "teacher is inf" + assert not torch.any(torch.isnan(teacher_out.logits)), "teacher is nan" + assert not torch.any(torch.isinf(teacher_out.logits)), "teacher is inf" if num_teacher_crops > 1: student_out1 = self.model(teacher_data) - assert not torch.any(torch.isnan(student_out1)), "s1 is nan" - assert not torch.any(torch.isinf(student_out1)), "s1 is inf" + assert not torch.any(torch.isnan(student_out1.logits)), "s1 is nan" + assert not torch.any(torch.isinf(student_out1.logits)), "s1 is inf" student_data = tensors_subset(data, student_keys, self.device) num_student_crops = len(student_data) student_data = torch.cat(student_data, dim=0) student_out2 = self.model(student_data) - assert not torch.any(torch.isnan(student_out2)), "s2 is nan" - assert not torch.any(torch.isinf(student_out2)), "s2 is inf" + assert not torch.any(torch.isnan(student_out2.logits)), "s2 is nan" + assert not torch.any(torch.isinf(student_out2.logits)), "s2 is inf" if num_teacher_crops > 1: - student_out = torch.cat((student_out1, student_out2), dim=0) + student_out_logits = torch.cat( + (student_out1.logits, student_out2.logits), dim=0 + ) + if self.cosine_loss is not None: + student_out_embeds = torch.cat( + (student_out1.xvector, student_out2.xvector), dim=0 + ) num_student_crops += num_teacher_crops else: - student_out = student_out2 - - loss = self.loss( - student_out, teacher_out, num_student_crops, num_teacher_crops + student_out_logits = student_out2.logits + student_out_embeds = student_out2.xvector + + loss_dino = self.loss( + student_out_logits, + teacher_out.logits, + num_student_crops, + num_teacher_crops, ) + loss = loss_dino + if self.cosine_loss is not None: + scaled_loss_cosine, loss_cosine = self.cosine_loss( + student_out_embeds, + teacher_out.xvector, + num_student_crops, + num_teacher_crops, + ) + loss = loss_dino + scaled_loss_cosine batch_metrics["loss"] = loss.item() + if self.cosine_loss is not None: + batch_metrics["loss_dino"] = loss_dino.item() + batch_metrics["loss_cosine"] = loss_cosine.item() # for k, metric in self.metrics.items(): # batch_metrics[k] = metric(output, target) @@ -317,17 +369,26 @@ def _old_load_checkpoint(self, checkpoint): ) return super()._load_checkpoint(checkpoint) - def _load_checkpoint(self, checkpoint, 
teacher_checkpoint): + def _load_checkpoint(self, checkpoint, teacher_checkpoint, loss_checkpoint=None): self.teacher_model.load_state_dict(teacher_checkpoint["model_state_dict"]) self.teacher_optimizer.load_state_dict( teacher_checkpoint["optimizer_state_dict"] ) + if loss_checkpoint is not None: + self.loss.load_state_dict(loss_checkpoint["model_state_dict"]) return super()._load_checkpoint(checkpoint) def load_checkpoint(self, epoch, step): checkpoint = self.load_model_checkpoint("model", epoch, step) teacher_checkpoint = self.load_model_checkpoint("teacher_model", epoch, step) - return self._load_checkpoint(checkpoint, teacher_checkpoint) + try: + loss_checkpoint = self.load_model_checkpoint("dino_loss", epoch, step) + except: + logging.warning( + "dino loss checkpoint not found, initial center will be zero-vector" + ) + loss_checkpoint = None + return self._load_checkpoint(checkpoint, teacher_checkpoint, loss_checkpoint) def checkpoint(self, logs=None): checkpoint = super().checkpoint(logs) @@ -357,6 +418,16 @@ def teacher_checkpoint(self, logs=None): return checkpoint + def dino_loss_checkpoint(self, logs=None): + self.loss.train() + checkpoint = { + "epoch": self.cur_epoch, + "batch": self.cur_batch, + "global_step": self.global_step, + "model_state_dict": self.loss.state_dict(), + } + return checkpoint + def save_checkpoint(self, logs=None, partial: bool = False): """Saves a checkpoint of the training status @@ -386,6 +457,9 @@ def save_checkpoint(self, logs=None, partial: bool = False): teacher_checkpoint = self.teacher_checkpoint(logs) self.save_model_checkpoint("teacher_model", teacher_checkpoint, partial=partial) + loss_checkpoint = self.dino_loss_checkpoint() + self.save_model_checkpoint("dino_loss", loss_checkpoint, partial=partial) + @staticmethod def filter_args(**kwargs): args = filter_func_args(DINOXVectorTrainer.__init__, kwargs) diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index b9dd67d7..8603b22a 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -140,7 +140,7 @@ def train_epoch(self, data_loader): with amp.autocast(enabled=self.use_amp): output = self.model(input_data, target) - loss = self.loss(output, target).mean() / self.grad_acc_steps + loss = self.loss(output.logits, target) / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -194,9 +194,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with torch.no_grad(): with amp.autocast(enabled=self.use_amp): output = self.model(data, **self.amp_args) - loss = self.loss(output, target) + loss = self.loss(output.logits, target) - batch_metrics["loss"] = loss.mean().item() + batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 8ece7de2..ccafecdd 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -135,9 +135,6 @@ def train_epoch(self, data_loader): self.model.eval() data_adv = self.attack.generate(input_data, target) max_delta = torch.max(torch.abs(data_adv - data)).item() - # z = torch.abs(data_adv-data) > 100 - # logging.info('zz {} {}'.format(data[z], data_adv[z])) - # logging.info('adv attack max perturbation=%f' % (max_delta)) input_data = data_adv self.model.train() @@ 
-148,7 +145,7 @@ def train_epoch(self, data_loader): with amp.autocast(enabled=self.use_amp): output = self.model(feats, y=target) - loss = self.loss(output, target).mean() / self.grad_acc_steps + loss = self.loss(output.logits, target) / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -202,9 +199,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): feats = self.feat_extractor(input_data) with amp.autocast(enabled=self.use_amp): output = self.model(feats) - loss = self.loss(output, target) + loss = self.loss(output.logits, target) - batch_metrics["loss"] = loss.mean().item() + batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 2902f23d..151993e0 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -122,7 +122,7 @@ def train_epoch(self, data_loader): batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): output = self.model(x, y=target) - loss = self.loss(output, target) / loss_scale + loss = self.loss(output.logits, target) / loss_scale loss_acc += loss.item() if self.use_amp: @@ -180,7 +180,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_size = x.size(0) with amp.autocast(enabled=self.use_amp): output = self.model(x) - loss = self.loss(output, target) / loss_scale + loss = self.loss(output.logits, target) / loss_scale loss_acc += loss.item() batch_metrics["loss"] = loss_acc diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 98bc404d..1c9209f6 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -147,12 +147,10 @@ def train_epoch(self, data_loader): h_enc, h_classif, output = ( outputs["h_enc"], outputs["h_classif"], - outputs["output"], + outputs["logits"], ) - loss = self.loss( - output, target - ).mean() # you need to take the mean here because of the multi-gpu training + loss = self.loss(output, target) batch_metrics["loss-classif"] = loss.item() prior_outputs = self.prior_model( @@ -269,4 +267,3 @@ def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) - # help='trainer options') diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 1005435f..4b1d23ba 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -140,12 +140,12 @@ def train_epoch(self, data_loader): h_enc, h_classif, output = ( outputs["h_enc"], outputs["h_classif"], - outputs["output"], + outputs["logits"], ) loss = self.loss( output, target - ).mean() # you need to take the mean here because of the multi-gpu training + ) batch_metrics["loss-classif"] = loss.item() prior_outputs = self.prior_model( @@ -231,9 +231,9 @@ def validation_epoch(self, data_loader, swa_update_bn=False): feats = self.feat_extractor(input_data) with amp.autocast(enabled=self.use_amp): output = self.model(feats) - loss = self.loss(output, target) + loss = self.loss(output.logits, target) - batch_metrics["loss"] = loss.mean().item()
+ batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index 2a238a06..f46b2109 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -119,7 +119,7 @@ def train_epoch(self, data_loader): with amp.autocast(enabled=self.use_amp): output = self.model(feats, feats_lengths, y=target) - loss = self.loss(output, target).mean() / self.grad_acc_steps + loss = self.loss(output.logits, target) / self.grad_acc_steps if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -173,7 +173,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): feats, feats_lengths = self.feat_extractor(audio) with amp.autocast(enabled=self.use_amp): output = self.model(feats, feats_lengths) - loss = self.loss(output, target) + loss = self.loss(output.logits, target) batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): From 2b707d058eead25d434b7932cd2df306937d6694 Mon Sep 17 00:00:00 2001 From: System User Date: Fri, 3 May 2024 15:05:17 -0400 Subject: [PATCH 134/154] xxx --- ...rain_fwseres2net50s8_xvec_stage1_v1.1.yaml | 78 ------- .../open.v2.8k/run_030_extract_xvectors.sh | 217 ++++++++++++++++++ egs/voxceleb/v1.2/cmd.sh | 2 +- ...train_fwseresnet34pe_xvec_stage1_v3.1.yaml | 78 +++++++ ...rain_idrnd_resnet100_xvec_stage2_v3.2.yaml | 74 ++++++ .../config_fbank80_stmn_fwseresnet34.v3.1.sh | 4 +- ...config_fbank80_stmn_fwseresnet34pe.v3.1.sh | 44 ++++ ...onfig_fbank80_stmn_idrnd_resnet100.v3.2.sh | 45 ++++ egs/voxceleb/v1.2/run_007_eval_be.sh | 4 +- hyperion/torch/layer_blocks/resnet_blocks.py | 3 +- 10 files changed, 465 insertions(+), 84 deletions(-) delete mode 100644 egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.1.yaml create mode 100755 egs/lre22/open.v2.8k/run_030_extract_xvectors.sh create mode 100644 egs/voxceleb/v1.2/conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml create mode 100644 egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34pe.v3.1.sh create mode 100644 egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.2.sh diff --git a/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.1.yaml b/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.1.yaml deleted file mode 100644 index 28b1f641..00000000 --- a/egs/lre22/fixed.v1.8k/conf/train_fwseres2net50s8_xvec_stage1_v1.1.yaml +++ /dev/null @@ -1,78 +0,0 @@ -data: - train: - dataset: - class_names: - - class_id - aug_cfgs: - - conf/reverb_noise_aug.yaml - return_segment_info: - - class_id - sampler: - sampler_type: seg_chunk_sampler - min_batch_size: 24 - max_chunk_length: 3.0 - min_chunk_length: 3.0 - data_loader: - num_workers: 8 - val: - dataset: - class_names: - - class_id - aug_cfgs: - - conf/reverb_noise_aug.yaml - return_segment_info: - - class_id - sampler: - sampler_type: seg_chunk_sampler - min_batch_size: 24 - max_chunk_length: 3.0 - min_chunk_length: 3.0 - data_loader: - num_workers: 8 -feats: fbank64_specaug1_stmn_8k.yaml -model: - resnet_type: fwseres2net50 - in_channels: 1 - in_feats: 64 - in_kernel_size: 3 - in_stride: 1 - no_maxpool: true - res2net_width_factor: 3.25 - res2net_scale: 8 - se_r: 4 - pool_net: - pool_type: ch-wise-att-mean+stddev - inner_feats: 128 - embed_dim: 192 - loss_type: 
subcenter-arc-softmax - num_subcenters: 2 - cos_scale: 30.0 - margin: 0.0 - intertop_margin: 0.0 - margin_warmup_epochs: 3.0 - dropout_rate: 0.1 - norm_before: false - hid_act: swish -trainer: - optim: - opt_type: adam - lr: 0.01 - amsgrad: true - beta1: 0.9 - beta2: 0.95 - weight_decay: 2.0e-05 - lrsched: - lrsch_type: exp_lr - decay_rate: 0.5 - decay_steps: 40000 - hold_steps: 65000 - warmup_steps: 15000 - min_lr: 1.0e-06 - update_lr_on_opt_step: true - use_amp: true - swa_start: 12 - swa_lr: 1e-5 - swa_anneal_epochs: 2 - log_interval: 1000 - epochs: 15 - eff_batch_size: 256 diff --git a/egs/lre22/open.v2.8k/run_030_extract_xvectors.sh b/egs/lre22/open.v2.8k/run_030_extract_xvectors.sh new file mode 100755 index 00000000..d7e2775b --- /dev/null +++ b/egs/lre22/open.v2.8k/run_030_extract_xvectors.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=2 +nnet_stage=1 +config_file=default_config.sh +use_gpu=false +do_tsne=false +split_dev=false +hf_chunk_length=120 #seconds +xvec_chunk_length=120 #seconds +. parse_options.sh || exit 1; +. $config_file + +if [ "$use_gpu" == "true" ];then + xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length" + xvec_cmd="$cuda_eval_cmd --mem 6G" +else + xvec_cmd="$train_cmd --mem 12G" +fi +if [ $nnet_stages -lt $nnet_stage ];then + nnet_stage=$nnet_stages +fi + +if [ $nnet_stage -eq 1 ];then + nnet=$nnet_s1 + nnet_name=$nnet_s1_name +elif [ $nnet_stage -eq 2 ];then + nnet=$nnet_s2 + nnet_name=$nnet_s2_name +elif [ $nnet_stage -eq 3 ];then + nnet=$nnet_s3 + nnet_name=$nnet_s3_name +elif [ $nnet_stage -eq 4 ];then + nnet=$nnet_s4 + nnet_name=$nnet_s4_name +elif [ $nnet_stage -eq 5 ];then + nnet=$nnet_s5 + nnet_name=$nnet_s5_name +elif [ $nnet_stage -eq 6 ];then + nnet=$nnet_s6 + nnet_name=$nnet_s6_name +fi + +xvector_dir=exp/xvectors/$nnet_name + +# if [ $stage -le 1 ]; then +# # Extract xvectors for training +# for name in lre17_proc_audio_no_sil \ +# voxlingua107_codecs_proc_audio_no_sil \ +# babel_sre_proc_audio_no_sil \ +# cv_codecs_proc_audio_no_sil \ +# others_afr_proc_audio_no_sil +# do +# steps_xvec/extract_wav2vec2xvectors.sh \ +# --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ +# --use-bin-vad false \ +# --random-utt-length true --min-utt-length 3 --max-utt-length 30 \ +# $nnet data/${name} \ +# $xvector_dir/${name} +# done +# fi + +if [ $stage -le 2 ]; then + # Extract xvectors for training + for name in lre22_dev + do + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd" --nj 100 ${xvec_args} \ + --use-bin-vad true --num-augs 10 --aug-config conf/reverb_noise_aug.yaml \ + --random-utt-length true --min-utt-length 3 --max-utt-length 30 \ + $nnet data/${name} \ + $xvector_dir/${name}_aug \ + data/${name}_aug + done +fi + + +if [ $stage -le 3 ]; then + # Extracts x-vectors for dev and eval + for name in lre22_dev lre22_eval + do + num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}') + nj=$(($num_spk < 100 ? 
$num_spk:100)) + steps_xvec/extract_wav2vec2xvectors.sh \ + --cmd "$xvec_cmd --mem 6G" --nj $nj ${xvec_args} \ + $nnet data/$name \ + $xvector_dir/$name + done +fi + + +if [ $stage -le 4 ]; then + for name in lre22_dev + do + if [ "$do_tsne" == "true" ] || [ "$split_dev" == "true" ];then + $train_cmd \ + $xvector_dir/$name/tsne/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 + + $train_cmd \ + $xvector_dir/$name/tsne_per_class/tsne.log \ + hyp_utils/conda_env.sh \ + plot_embedding_tsne_per_class.py \ + --train-list data/$name/utt2lang \ + --train-v-file scp:$xvector_dir/$name/xvector.scp \ + --output-dir $xvector_dir/$name/tsne_per_class \ + --pca-var-r 0.975 \ + --lnorm \ + --prob-plot 1. \ + --tsne.metric cosine \ + --tsne.early-exaggeration 12 --tsne.perplexity 30 \ + --do-ahc --cluster-tsne --ahc-thr -5 + + if [ "$split_dev" == "true" ];then + hyp_utils/conda_env.sh \ + local/split_dev.py \ + --segs-file $xvector_dir/$name/tsne_per_class/segments.csv \ + --output-dir ./resources/dev_splits \ + --num-folds 2 + + # delete the split data dirs so they are regenerated later + rm -rf data/lre22_dev_p{1,2} + + fi + fi + done +fi + +if [ $stage -le 5 ]; then + if [ ! -d data/lre22_dev_p1 ];then + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/train_segments.csv \ + > p1.lst + awk -F "," '$1!="id" { print $1}' \ + ./resources/dev_splits/fold_0/test_segments.csv \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev data/lre22_dev_$p + done + fi +fi + +if [ $stage -le 6 ]; then + if [ -d data/lre22_dev_aug ] && [ ! -d data/lre22_dev_aug_p1 ];then + awk -v fsegs=./resources/dev_splits/fold_0/train_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1] +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p1.lst + + awk -v fsegs=./resources/dev_splits/fold_0/test_segments.csv ' +BEGIN{FS=","; +getline; +while(getline < fsegs) +{ + segs[$1]=1; +} +FS=" "; +} +{ if($2 in segs){ print $1}}' data/lre22_dev_aug/augm2clean \ + > p2.lst + + for p in p1 p2 + do + utils/subset_data_dir.sh \ + --utt-list $p.lst \ + data/lre22_dev_aug data/lre22_dev_aug_$p + done + fi +fi + +if [ $stage -le 7 ];then + if [ -f $xvector_dir/lre22_dev_aug/xvector.scp ];then + mkdir -p $xvector_dir/lre22_dev_aug_clean + cat $xvector_dir/lre22_dev/xvector.scp \ + $xvector_dir/lre22_dev_aug/xvector.scp \ + > $xvector_dir/lre22_dev_aug_clean/xvector.scp + + for p in "" _p1 _p2 + do + if [ ! 
-d data/lre22_dev_aug_clean$p ]; then + utils/combine_data.sh \ + data/lre22_dev_aug_clean$p \ + data/lre22_dev$p \ + data/lre22_dev_aug$p + fi + done + fi +fi + +exit diff --git a/egs/voxceleb/v1.2/cmd.sh b/egs/voxceleb/v1.2/cmd.sh index 040f458b..381b14e0 100755 --- a/egs/voxceleb/v1.2/cmd.sh +++ b/egs/voxceleb/v1.2/cmd.sh @@ -14,8 +14,8 @@ if [ "$(hostname -d)" == "cm.gemini" ];then #export train_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G" - #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G" + #export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G" export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G" # export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G" else diff --git a/egs/voxceleb/v1.2/conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml b/egs/voxceleb/v1.2/conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml new file mode 100644 index 00000000..efa601c0 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml @@ -0,0 +1,78 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + num_augs: 4 + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: seg_chunk_sampler + min_batch_size: 64 + max_chunk_length: 2.0 + min_chunk_length: 2.0 + data_loader: + num_workers: 8 +model: + feats: fbank80_specaug1_stmn_16k.yaml + xvector: + resnet_type: fwseresnet34 + in_channels: 1 + in_feats: 80 + in_kernel_size: 3 + in_stride: 1 + no_maxpool: true + pool_net: + pool_type: ch-wise-att-mean+stddev + inner_feats: 128 + embed_dim: 192 + cos_scale: 30.0 + loss_type: subcenter-arc-softmax + num_subcenters: 2 + margin: 0.2 + margin_warmup_epochs: 5.0 + dropout_rate: 0.1 + norm_before: false + hid_act: swish + se_r: 4 + freq_pos_enc: true +trainer: + optim: + opt_type: adam + lr: 0.01 + amsgrad: true + beta1: 0.9 + beta2: 0.99 + weight_decay: 2.0e-05 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 40000 + hold_steps: 65000 + min_lr: 1.0e-05 + warmup_steps: 15000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 35 + eff_batch_size: 256 + target_key: speaker +master_port: 4567 \ No newline at end of file diff --git a/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml new file mode 100644 index 00000000..99fbf196 --- /dev/null +++ b/egs/voxceleb/v1.2/conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml @@ -0,0 +1,74 @@ +data: + train: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 + val: + dataset: + class_names: + - speaker + aug_cfgs: + - conf/reverb_noise_aug.yaml + return_segment_info: + - speaker + 
sampler: + sampler_type: class_weighted_random_seg_chunk_sampler + min_batch_size: 16 + max_chunk_length: 4.0 + min_chunk_length: 4.0 + num_chunks_per_seg_epoch: 6 + class_name: speaker + seg_weight_mode: data-prior + num_hard_prototypes: 8 + data_loader: + num_workers: 8 +model: + xvector: + override_output: true + loss_type: subcenter-arc-softmax + num_subcenters: 2 + cos_scale: 30.0 + margin: 0.3 + margin_warmup_epochs: 0 + intertop_margin: 0.1 + override_dropouts: true + dropout_rate: 0.025 +trainer: + optim: + opt_type: sgd + lr: 1e-4 + momentum: 0.9 + weight_decay: 2e-5 + lrsched: + lrsch_type: exp_lr + decay_rate: 0.5 + decay_steps: 32000 + hold_steps: 16000 + min_lr: 1.0e-6 + warmup_steps: 8000 + update_lr_on_opt_step: true + grad_clip: 250 + use_amp: true + log_interval: 1000 + epochs: 11 + eff_batch_size: 256 + swa_start: 20 + swa_lr: 1e-5 + swa_anneal_epochs: 2 + target_key: speaker + \ No newline at end of file diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh index 12b86dd1..19f90be6 100644 --- a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34.v3.1.sh @@ -26,8 +26,8 @@ nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth # back-end do_plda=false -do_snorm=false #true -do_qmf=false #true +do_snorm=true +do_qmf=true do_voxsrc22=true plda_aug_config=conf/reverb_noise_aug.yaml diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34pe.v3.1.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34pe.v3.1.sh new file mode 100644 index 00000000..62092708 --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_fwseresnet34pe.v3.1.sh @@ -0,0 +1,44 @@ +# Freq-wise-SE ResNet34 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_fwseresnet34pe.v3.1 + +nnet_s1_base_cfg=conf/train_fwseresnet34pe_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name +nnet_s1=$nnet_s1_dir/model_ep0035.pth + +nnet_s2_base_cfg=conf/train_fwseresnet34_xvec_stage2_v3.1.yaml +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/swa_model_ep0016.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.2.sh b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.2.sh new file mode 100644 index 00000000..4dbee17d --- /dev/null +++ b/egs/voxceleb/v1.2/global_conf/config_fbank80_stmn_idrnd_resnet100.v3.2.sh @@ -0,0 +1,45 @@ +# IdRnd ResNet100 + +# acoustic features +feat_config=conf/fbank80_stmn_16k.yaml +feat_type=fbank80_stmn + +#vad +vad_config=conf/vad_16k.yaml + +# x-vector training +nnet_data=voxceleb2cat_train + +# x-vector cfg +nnet_type=resnet +nnet_name=${feat_type}_idrnd_resnet100.v3.1 + +nnet_s1_base_cfg=conf/train_idrnd_resnet100_xvec_stage1_v3.1.yaml +nnet_s1_name=$nnet_name.s1 +nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name 
+nnet_s1=$nnet_s1_dir/model_ep0029.pth + +nnet_s2_base_cfg=conf/train_idrnd_resnet100_xvec_stage2_v3.2.yaml +nnet_name=${feat_type}_idrnd_resnet100.v3.2 +nnet_s2_name=${nnet_name}.s2 +nnet_s2_dir=exp/xvector_nnets/$nnet_s2_name +nnet_s2=$nnet_s2_dir/model_ep0011.pth + +# back-end +do_plda=false +do_snorm=true +do_qmf=true +do_voxsrc22=true + +plda_aug_config=conf/reverb_noise_aug.yaml +plda_num_augs=0 +if [ $plda_num_augs -eq 0 ]; then + plda_data=voxceleb2cat_train +else + plda_data=voxceleb2cat_train_augx${plda_num_augs} +fi +plda_type=splda +lda_dim=200 +plda_y_dim=150 +plda_z_dim=200 + diff --git a/egs/voxceleb/v1.2/run_007_eval_be.sh b/egs/voxceleb/v1.2/run_007_eval_be.sh index 53621488..bd436644 100755 --- a/egs/voxceleb/v1.2/run_007_eval_be.sh +++ b/egs/voxceleb/v1.2/run_007_eval_be.sh @@ -157,7 +157,7 @@ if [ "$do_snorm" == "true" ];then do for((j=1;j<=$num_parts;j++)); do - $train_cmd $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ + $train_cmd --mem 22G $score_cosine_snorm_dir/log/voxsrc22_dev_${i}_${j}.log \ hyp_utils/conda_env.sh \ hyperion-eval-cosine-scoring-backend \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ @@ -278,7 +278,7 @@ if [ "$do_qmf" == "true" ];then do for((j=1;j<=$num_parts;j++)); do - $train_cmd $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ + $train_cmd --mem 22G $score_cosine_qmf_dir/log/voxsrc22_dev_${i}_${j}.log \ hyp_utils/conda_env.sh \ hyperion-eval-cosine-scoring-backend-with-qmf \ --feats-file csv:$xvector_dir/voxsrc22_dev/xvector.csv \ diff --git a/hyperion/torch/layer_blocks/resnet_blocks.py b/hyperion/torch/layer_blocks/resnet_blocks.py index 428d8139..17b6ce25 100644 --- a/hyperion/torch/layer_blocks/resnet_blocks.py +++ b/hyperion/torch/layer_blocks/resnet_blocks.py @@ -41,6 +41,7 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) class FreqPosEnc(nn.Module): def __init__(self, num_feats): + super().__init__() self.pos_enc = nn.Parameter(torch.zeros((num_feats, 1))) def forward(self, x): @@ -185,7 +186,7 @@ def __init__( self.downsample_factor = stride self.pos_enc = None if freq_pos_enc: - self.pos_enc = FreqPosEnc(num_feats) + self.pos_enc = FreqPosEnc(num_feats*stride) @property def out_channels(self): From 5a6240e90cb317bb60f5ead68a74dd01b9ea2a78 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 10 May 2024 19:14:59 -0400 Subject: [PATCH 135/154] asr conformer runs --- README.md | 3 + .../v1/conf/fbank80_specaug1_mn_16k.yaml | 3 + egs/librispeech/v1/conf/sp_unigram_1000.yaml | 9 + egs/librispeech/v1/conf/sp_unigram_512.yaml | 9 + ...mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml | 4 +- ...nk80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh | 12 +- egs/librispeech/v1/run_001_prepare_data.sh | 25 +++ .../v1/run_002_prepare_noises_rirs.sh | 102 ++++++++++ .../v1/run_003_train_tokenizers.sh | 25 +++ egs/librispeech/v1/run_004_train_asr.sh | 47 +++++ .../conf/train_fwseresnet34_dino_v1.2.2.yaml | 3 +- ...config_fbank80_stmn_fwseresnet34.v1.2.2.sh | 10 +- hyperion/bin/hyperion_dataset.py | 58 +++++- hyperion/bin/train_dino_wav2xvector.py | 4 +- hyperion/bin/train_tokenizer.py | 174 ++++++++++++++++ hyperion/bin/train_wav2rnn_transducer.py | 19 +- hyperion/bin/train_wav2xvector.py | 2 +- hyperion/np/augment/speed_augment.py | 4 +- hyperion/torch/data/bucketing_seg_sampler.py | 33 ++-- hyperion/torch/data/dino_audio_dataset.py | 38 ++-- hyperion/torch/data/seg_sampler.py | 107 +++++----- hyperion/torch/data/seg_sampler_factory.py | 42 ++-- .../torch/layer_blocks/transformer_input.py | 3 +- 
hyperion/torch/layers/mvn.py | 4 + hyperion/torch/layers/spec_augment.py | 71 ++++--- hyperion/torch/models/__init__.py | 5 +- hyperion/torch/models/transducer/__init__.py | 9 +- .../torch/models/transducer/rnn_transducer.py | 2 + .../{subsampling.py => subsampling0.py} | 0 .../{transducer.py => transducer0.py} | 0 .../{transformer.py => transformer0.py} | 7 +- .../torch/models/wav2transducer/__init__.py | 7 +- .../wav2transducer/hf_wav2rnn_transducer.py | 98 ++++----- .../hf_wav2vec2rnn_transducer.py | 24 +-- .../wav2conformer_v1_rnn_transducer.py | 73 +++++++ .../wav2transducer/wav2rnn_rnn_transducer.py | 71 +++++++ .../wav2transducer/wav2rnn_transducer.py | 27 +-- .../wav2xvectors/wav2resnet1d_xvector.py | 1 - hyperion/torch/narchs/conformer_encoder_v1.py | 2 +- hyperion/torch/trainers/dvae_trainer.py | 4 +- hyperion/torch/trainers/torch_trainer.py | 2 +- hyperion/torch/trainers/transducer_trainer.py | 5 +- hyperion/torch/trainers/vae_trainer.py | 4 +- hyperion/torch/trainers/vq_dvae_trainer.py | 4 +- hyperion/torch/trainers/vq_vae_trainer.py | 4 +- .../torch/trainers/xvector_adv_trainer.py | 4 +- .../trainers/xvector_adv_trainer_from_wav.py | 4 +- hyperion/torch/trainers/xvector_trainer.py | 4 +- .../trainers/xvector_trainer_deep_feat_reg.py | 2 +- .../xvector_trainer_deep_feat_reg_from_wav.py | 4 +- .../trainers/xvector_trainer_from_wav.py | 4 +- hyperion/torch/utils/masking.py | 13 +- hyperion/utils/class_info.py | 12 +- hyperion/utils/hyp_dataset.py | 187 ++++++++++++++++-- hyperion/utils/info_table.py | 24 ++- hyperion/utils/recording_set.py | 39 ++++ 56 files changed, 1153 insertions(+), 304 deletions(-) create mode 100644 egs/librispeech/v1/conf/sp_unigram_1000.yaml create mode 100644 egs/librispeech/v1/conf/sp_unigram_512.yaml create mode 100755 egs/librispeech/v1/run_002_prepare_noises_rirs.sh create mode 100755 egs/librispeech/v1/run_003_train_tokenizers.sh create mode 100755 egs/librispeech/v1/run_004_train_asr.sh create mode 100644 hyperion/bin/train_tokenizer.py rename hyperion/torch/models/transducer/{subsampling.py => subsampling0.py} (100%) rename hyperion/torch/models/transducer/{transducer.py => transducer0.py} (100%) rename hyperion/torch/models/transducer/{transformer.py => transformer0.py} (98%) create mode 100644 hyperion/torch/models/wav2transducer/wav2conformer_v1_rnn_transducer.py create mode 100644 hyperion/torch/models/wav2transducer/wav2rnn_rnn_transducer.py diff --git a/README.md b/README.md index d56406d7..71a0fbd3 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,9 @@ The full API is described in the documentation page [https://hyperion-ml.readthe conda create --name ${your_env} python=3.11 conda activate ${your_env} conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia +# If using k2 for ASR +wget https://huggingface.co/csukuangfj/k2/resolve/main/ubuntu-cuda/k2-1.24.4.dev20240223+cuda11.8.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +pip install k2-1.24.4.dev20240223+cuda11.8.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ``` For systems with cuda 10.2 driver: diff --git a/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml b/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml index 99f202bb..e6def26c 100644 --- a/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml +++ b/egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml @@ -23,3 +23,6 @@ spec_augment: mask_method: mean mvn: norm_var: false + left_context: 0 + right_context: 0 + diff --git 
a/egs/librispeech/v1/conf/sp_unigram_1000.yaml b/egs/librispeech/v1/conf/sp_unigram_1000.yaml new file mode 100644 index 00000000..2a9b1b1e --- /dev/null +++ b/egs/librispeech/v1/conf/sp_unigram_1000.yaml @@ -0,0 +1,9 @@ +vocab_size: 1000 +model_type: unigram +char_coverage: 1.0 +unk_id: 2 +user_defined_symbols: +- <blk> +- <sos/eos> +uppercase_text: true + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/sp_unigram_512.yaml b/egs/librispeech/v1/conf/sp_unigram_512.yaml new file mode 100644 index 00000000..116e6d22 --- /dev/null +++ b/egs/librispeech/v1/conf/sp_unigram_512.yaml @@ -0,0 +1,9 @@ +vocab_size: 512 +model_type: unigram +char_coverage: 1.0 +unk_id: 2 +user_defined_symbols: +- <blk> +- <sos/eos> +uppercase_text: true + \ No newline at end of file diff --git a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml index ed622adb..a142349b 100644 --- a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml +++ b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml @@ -8,7 +8,7 @@ data: - text sampler: sampler_type: bucketing_seg_sampler - max_batch_length: 625. + max_batch_length: 1500. min_batch_size: 1 drop_last: false data_loader: @@ -20,7 +20,7 @@ data: - text sampler: sampler_type: bucketing_seg_sampler - max_batch_length: 625 + max_batch_length: 1500. min_batch_size: 1 drop_last: true data_loader: diff --git a/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh b/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh index ee8c2b55..62817852 100644 --- a/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh +++ b/egs/librispeech/v1/global_conf/config_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.sh @@ -1,17 +1,19 @@ # Conformer + RNN-T # training data -nnet_train_data=train_960h -nnet_val__data=dev_all +nnet_train_data=librispeech_train-960 +nnet_val_data=librispeech_dev # tokenizer -bpe_model=data/lang_bpe_1000/bpe.model +token_train_data=librispeech_train-960 +token_cfg=conf/sp_unigram_512.yaml +token_dir=data/token_${token_train_data}_unigram_512 +token_model=$token_dir/tokenizer.model # rnn-t cfg nnet_type=conformer_v1_rnn_transducer nnet_name=fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p -nnet_s1_base_cfg=conf/train_${nnet_name}.s1.yaml -nnet_s1_args="" +nnet_s1_cfg=conf/train_${nnet_name}.s1.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/asr_nnets/$nnet_s1_name diff --git a/egs/librispeech/v1/run_001_prepare_data.sh b/egs/librispeech/v1/run_001_prepare_data.sh index 3a4ef221..1ca8b585 100755 --- a/egs/librispeech/v1/run_001_prepare_data.sh +++ b/egs/librispeech/v1/run_001_prepare_data.sh @@ -41,3 +41,28 @@ if [ $stage -le 1 ]; then touch data/lhotse_librispeech/.librispeech.done fi fi + +if [ $stage -le 2 ];then + echo "Stage 2: Convert Manifest to Hyperion Datasets" + for data in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other + do + hyperion-dataset from_lhotse \ + --recordings-file data/lhotse_librispeech/librispeech_recordings_${data}.jsonl.gz \ + --supervisions-file data/lhotse_librispeech/librispeech_supervisions_${data}.jsonl.gz \ + --dataset data/librispeech_${data} + done + +fi + +if [ $stage -le 3 ];then + echo "Stage 3: Merge Librispeech train sets" + hyperion-dataset merge \ + --input-datasets data/librispeech_train-{clean-100,clean-360,other-500} \ + --dataset data/librispeech_train-960 
+ + echo "Stage 3: Merge Librispeech dev sets" + hyperion-dataset merge \ + --input-datasets data/librispeech_dev-{clean,other} \ + --dataset data/librispeech_dev + +fi diff --git a/egs/librispeech/v1/run_002_prepare_noises_rirs.sh b/egs/librispeech/v1/run_002_prepare_noises_rirs.sh new file mode 100755 index 00000000..73c7ed82 --- /dev/null +++ b/egs/librispeech/v1/run_002_prepare_noises_rirs.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +# We prepare the noise files and RIR for online speech augmentation +if [ $stage -le 1 ]; then + for name in noise music speech + do + hyperion-prepare-data musan \ + --corpus-dir $musan_root \ + --subset $name \ + --output-dir data/musan_$name + done +fi + +if [ $stage -le 2 ]; then + # # Prepare to distribute data over multiple machines + # # This only does something at CLSP grid + # hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes + + for name in musan_noise musan_music + do + input_data_dir=data/$name + output_data_dir=data/${name}_proc_audio + output_dir=exp/proc_audio/$name + $train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \ + hyp_utils/conda_env.sh \ + hyperion-preprocess-audio-files \ + --audio-format flac \ + --part-idx JOB --num-parts $nj \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_dir/recordings.JOB.csv + + hyperion-tables cat \ + --table-type recordings \ + --output-file $output_dir/recordings.csv --num-tables $nj + hyperion-dataset set_recordings \ + --dataset $input_data_dir \ + --recordings-file $output_dir/recordings.csv \ + --output-dataset $output_data_dir + + + done +fi + +if [ $stage -le 3 ]; then + # Create Babble noise from MUSAN speech files + for name in musan_speech + do + input_data_dir=data/$name + output_data_dir=data/${name}_babble + output_dir=exp/proc_audio/${name}_babble + $train_cmd $output_dir/log/make_babble_noise_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-make-babble-noise-audio-files \ + --audio-format flac \ + --min-spks 3 --max-spks 10 --num-reuses 5 \ + --recordings-file $input_data_dir/recordings.csv \ + --output-path $output_dir \ + --output-recordings-file $output_data_dir/recordings.csv + hyperion-dataset make_from_recordings \ + --dataset $output_data_dir \ + --recordings-file $output_data_dir/recordings.csv + done +fi + +if [ $stage -le 4 ]; then + if [ ! 
-d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom + hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real + for rirs in rirs_smallroom rirs_mediumroom rirs_real + do + output_dir=exp/rirs/$rirs + data_dir=data/$rirs + $train_cmd $output_dir/log/pack_rirs_${name}.log \ + hyp_utils/conda_env.sh \ + hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \ + --output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1; + hyperion-dataset add_features --dataset $data_dir \ + --features-name rirs --features-file $output_dir/rirs.csv + + done +fi + diff --git a/egs/librispeech/v1/run_003_train_tokenizers.sh b/egs/librispeech/v1/run_003_train_tokenizers.sh new file mode 100755 index 00000000..35ae7da2 --- /dev/null +++ b/egs/librispeech/v1/run_003_train_tokenizers.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright +# 2020 Johns Hopkins University (Author: Jesus Villalba) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +nj=10 +config_file=default_config.sh +. parse_options.sh || exit 1; +. $config_file +. datapath.sh + +if [ $stage -le 1 ];then + $train_cmd \ + $token_dir/train_sp.log \ + hyperion-train-tokenizer sentencepiece \ + --cfg $token_cfg \ + --segments-file data/$token_train_data/segments.csv \ + --tokenizer-path $token_dir + +fi diff --git a/egs/librispeech/v1/run_004_train_asr.sh b/egs/librispeech/v1/run_004_train_asr.sh new file mode 100755 index 00000000..d158689e --- /dev/null +++ b/egs/librispeech/v1/run_004_train_asr.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright +# 2022 Johns Hopkins University (Author: Yen-Ju Lu) +# Apache 2.0. +# +. ./cmd.sh +. ./path.sh +set -e + +stage=1 +ngpu=2 +config_file=default_config.sh +interactive=false +num_workers="" +use_tb=false +use_wandb=false + +. parse_options.sh || exit 1; +. $config_file +. 
datapath.sh + +train_dir=data/${nnet_train_data} +val_dir=data/${nnet_val_data} + +if [ "$interactive" == "true" ];then + export cuda_cmd=run.pl +fi + +# Network Training +if [ $stage -le 1 ]; then + + mkdir -p $nnet_s1_dir/log + $cuda_cmd \ + --gpu $ngpu $nnet_s1_dir/log/train.log \ + hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \ + hyperion-train-wav2rnn-transducer $nnet_type \ + --cfg $nnet_s1_cfg \ + --data.train.dataset.recordings-file $train_dir/recordings.csv \ + --data.train.dataset.segments-file $train_dir/segments.csv \ + --data.train.dataset.bpe-model $token_model \ + --data.val.dataset.recordings-file $val_dir/recordings.csv \ + --data.val.dataset.segments-file $val_dir/segments.csv \ + --trainer.exp-path $nnet_s1_dir $args \ + --num-gpus $ngpu + +fi + diff --git a/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml index 6d6e60a9..37bada36 100644 --- a/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml +++ b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml @@ -87,7 +87,8 @@ trainer: init_momentum: 0.996 momentum: 1.0 warmup_steps: 500000 - grad_clip: 15 + # grad_clip: 15 + grad_clip: 5 use_amp: true log_interval: 1000 epochs: 100 diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh index 13a72732..8a8b58a3 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh @@ -18,7 +18,15 @@ nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.2.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name nnet_s1=$nnet_s1_dir/teacher_model_ep0034.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0025.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0038.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0043.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0044.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0046.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0049.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0054.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0058.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0064.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0067.pth # clustering of dino embeddings cluster_method=cos_ahc_plda_ahc diff --git a/hyperion/bin/hyperion_dataset.py b/hyperion/bin/hyperion_dataset.py index 3bb61fb0..f5db8ada 100755 --- a/hyperion/bin/hyperion_dataset.py +++ b/hyperion/bin/hyperion_dataset.py @@ -38,6 +38,9 @@ "split_train_val", "copy", "add_cols_to_segments", + "merge", + "from_lhotse", + "from_kaldi", ] @@ -514,6 +517,30 @@ def add_cols_to_segments( dataset.save(output_dataset) +def make_merge_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--input-datasets", required=True, nargs="+", help="input datasets" + ) + add_common_args(parser) + return parser + + +def merge(dataset: PathLike, input_datasets: List[PathLike]): + input_dataset_paths = input_datasets + dataset_path = dataset + input_datasets = [] + for dset_file in input_dataset_paths: + input_datasets.append(HypDataset.load(dset_file)) + + dataset = HypDataset.merge(input_datasets) + dataset.save(dataset_path) + + def make_from_lhotse_parser(): parser = ArgumentParser() parser.add_argument("--cfg", action=ActionConfigFile) @@ -535,6 +562,8 @@ def 
make_from_lhotse_parser(): default=None, help="lhotse supervisions file", ) + add_common_args(parser) + return parser def from_lhotse( @@ -545,11 +574,36 @@ def from_lhotse( ): assert cuts_file is not None or supervisions_file is not None - dataset_dir = dataset + dataset_path = dataset dataset = HypDataset.from_lhotse( cuts=cuts_file, recordings=recordings_file, supervisions=supervisions_file ) - dataset.save(dataset) + dataset.save(dataset_path) + + +def make_from_kaldi_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--dataset", required=True, help="""dataset dir or .yaml file""" + ) + parser.add_argument( + "--kaldi-data-dir", + required=True, + help="Kaldi data directory", + ) + add_common_args(parser) + return parser + + +def from_kaldi( + dataset: PathLike, + kaldi_data_dir: PathLike, +): + + dataset_path = dataset + dataset = HypDataset.from_kaldi(kaldi_data_dir) + dataset.save(dataset_path) def main(): diff --git a/hyperion/bin/train_dino_wav2xvector.py b/hyperion/bin/train_dino_wav2xvector.py index cb541f55..88d3a556 100755 --- a/hyperion/bin/train_dino_wav2xvector.py +++ b/hyperion/bin/train_dino_wav2xvector.py @@ -237,8 +237,8 @@ def main(): try: config_file = Path(args_sc.trainer.exp_path) / "config.yaml" parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass + except Exception as err: + logging.warning(f"failed saving {args} to {config_file} with {err}") args_sc.xvec_class = xvec_dict[xvec_type] # torch docs recommend using forkserver diff --git a/hyperion/bin/train_tokenizer.py b/hyperion/bin/train_tokenizer.py new file mode 100644 index 00000000..b3d28923 --- /dev/null +++ b/hyperion/bin/train_tokenizer.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +""" + Copyright 2024 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" +import logging +import os +from pathlib import Path +from typing import Dict, List + +import sentencepiece as spm +from jsonargparse import ( + ActionConfigFile, + ActionParser, + ActionYesNo, + ArgumentParser, + namespace_to_dict, +) + +from hyperion.hyp_defs import config_logger +from hyperion.utils import PathLike, SegmentSet + +tokenizer_list = ["sentencepiece"] + + +def add_common_args(parser): + parser.add_argument( + "--segments-file", + required=True, + help="input segments file with sentence transcriptions", + ) + parser.add_argument( + "--text-column", default="text", help="text column in segments file" + ) + parser.add_argument("--tokenizer-path", required=True, help="tokenizer model dir") + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + ) + + +def train_sentencepiece( + segments_file: PathLike, + text_column: str, + vocab_size: int, + model_type: str, + char_coverage: str, + sentence_size: int, + user_defined_symbols: List[str], + unk_id: int, + sos_id: int, + eos_id: int, + pad_id: int, + unk_piece: str, + sos_piece: str, + eos_piece: str, + pad_piece: str, + uppercase_text: bool, + tokenizer_path: PathLike, +): + + tokenizer_path = Path(tokenizer_path) + tokenizer_path.mkdir(exist_ok=True, parents=True) + + text_file = tokenizer_path / "text" + if not text_file.is_file(): + segments = SegmentSet.load(segments_file) + with open(text_file, "w", encoding="utf-8") as f_text: + for text in segments[text_column]: + if uppercase_text: + text = text.upper() + f_text.write(f"{text}\n") + + model_prefix = tokenizer_path / 
"tokenizer" + model_file = model_prefix.with_suffix(".model") + if not model_file.is_file(): + spm.SentencePieceTrainer.train( + input=text_file, + vocab_size=vocab_size, + model_type=model_type, + model_prefix=str(model_prefix), + input_sentence_size=sentence_size, + character_coverage=char_coverage, + user_defined_symbols=user_defined_symbols, + unk_id=unk_id, + bos_id=sos_id, + eos_id=eos_id, + pad_id=pad_id, + unk_piece=unk_piece, + bos_piece=sos_piece, + eos_piece=eos_piece, + pad_piece=pad_piece, + ) + + generate_sentencepiece_tokens(model_file, tokenizer_path) + + +def generate_sentencepiece_tokens(model_file: PathLike, tokenizer_path: PathLike): + sp = spm.SentencePieceProcessor() + sp.load(str(model_file)) + token2id: Dict[str, int] = {sp.id_to_piece(i): i for i in range(sp.vocab_size())} + with open(tokenizer_path / "tokens.txt", "w", encoding="utf-8") as f: + for sym, i in token2id.items(): + f.write(f"{sym} {i}\n") + + +def make_sentencepiece_parser(): + parser = ArgumentParser() + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument( + "--vocab-size", default=1000, type=int, help="output vocabulary size" + ) + parser.add_argument( + "--model-type", default="unigram", choices=["unigram", "bpe", "char", "word"] + ) + parser.add_argument("--char-coverage", default=1.0, type=float) + parser.add_argument("--sentence-size", default=100000000, type=int) + parser.add_argument( + "--user-defined-symbols", + default=["", ""], + nargs="+", + help="user defined symbols", + ) + parser.add_argument("--unk-id", default=2, type=int) + parser.add_argument("--sos-id", default=-1, type=int) + parser.add_argument("--eos-id", default=-1, type=int) + parser.add_argument("--pad-id", default=-1, type=int) + parser.add_argument("--unk-piece", default="") + parser.add_argument("--sos-piece", default="") + parser.add_argument("--eos-piece", default="") + parser.add_argument("--pad-piece", default="") + parser.add_argument("--uppercase-text", default=True, action=ActionYesNo) + + add_common_args(parser) + return parser + + +def main(): + parser = ArgumentParser(description="Train sentence piece tokenizer") + parser.add_argument("--cfg", action=ActionConfigFile) + + subcommands = parser.add_subcommands() + for subcommand in tokenizer_list: + parser_func = f"make_{subcommand}_parser" + subparser = globals()[parser_func]() + subcommands.add_subcommand(subcommand, subparser) + + args = parser.parse_args() + try: + gpu_id = int(os.environ["LOCAL_RANK"]) + except: + gpu_id = 0 + + subcommand = f"train_{args.subcommand}" + kwargs = namespace_to_dict(args)[args.subcommand] + if gpu_id == 0: + try: + config_file = Path(kwargs["tokenizer_path"]) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) + except Exception as err: + logging.warning(f"failed saving {args} err={err}") + + config_logger(kwargs["verbose"]) + del kwargs["verbose"] + del kwargs["cfg"] + globals()[subcommand](**kwargs) + + +if __name__ == "__main__": + main() diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 6d947d24..ebd23845 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -25,12 +25,13 @@ from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.data import AudioDataset as AD from hyperion.torch.data import SegSamplerFactory -from hyperion.torch.models import Wav2RNNRNNTransducer +from hyperion.torch.models import Wav2ConformerV1RNNTransducer, 
Wav2RNNRNNTransducer from hyperion.torch.trainers import TransducerTrainer as Trainer from hyperion.torch.utils import ddp model_dict = { "rnn_rnn_transducer": Wav2RNNRNNTransducer, + "conformer_v1_rnn_transducer": Wav2ConformerV1RNNTransducer, } @@ -38,6 +39,14 @@ def transducer_collate(batch): audio = [] audio_length = [] target = [] + for record in batch: + audio_length.append(record["x"].shape[0]) + audio_length = torch.as_tensor(audio_length) + if not torch.all(audio_length[:-1] >= audio_length[1:]): + sort_idx = torch.argsort(audio_length, descending=True) + batch = [batch[i] for i in sort_idx] + + audio_length = [] for record in batch: wav = torch.as_tensor(record["x"]) audio.append(wav) @@ -109,7 +118,7 @@ def train_model(gpu_id, args): set_float_cpu("float32") # torch.backends.cudnn.deterministic = True # torch.backends.cudnn.benchmark = False - torch.backends.cudnn.enabled = False + # torch.backends.cudnn.enabled = False ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) @@ -200,7 +209,7 @@ def make_parser(model_class): def main(): - parser = ArgumentParser(description="Train RNN Transducer model from audio files") + parser = ArgumentParser(description="Train Transducer model from audio files") parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() @@ -222,8 +231,8 @@ def main(): try: config_file = Path(args_sc.trainer.exp_path) / "config.yaml" parser.save(args, str(config_file), format="yaml", overwrite=True) - except: - pass + except Exception as err: + logging.warning(f"{err}") args_sc.model_class = model_dict[model_type] # torch docs recommend using forkserver diff --git a/hyperion/bin/train_wav2xvector.py b/hyperion/bin/train_wav2xvector.py index 2c4684c3..bb4a3913 100755 --- a/hyperion/bin/train_wav2xvector.py +++ b/hyperion/bin/train_wav2xvector.py @@ -192,7 +192,7 @@ def main(): config_file = Path(args_sc.trainer.exp_path) / "config.yaml" parser.save(args, str(config_file), format="yaml", overwrite=True) except: - pass + logging.warning(f"failed saving {args} to {config_file}") args_sc.xvec_class = xvec_dict[xvec_type] # torch docs recommend using forkserver diff --git a/hyperion/np/augment/speed_augment.py b/hyperion/np/augment/speed_augment.py index a648190d..95127084 100644 --- a/hyperion/np/augment/speed_augment.py +++ b/hyperion/np/augment/speed_augment.py @@ -96,11 +96,11 @@ def forward(self, x): # change speed r = self.speed_ratios[speed_idx] info = {"speed_ratio": r} - y = time_stretch(x, r) + y = time_stretch(x, rate=r) # print(f"1 r={r} {x.shape} {y.shape}", flush=True) if self.keep_length: if r > 1: - dither = np.max(x) / 2 ** 15 # we add some dither in the padding + dither = np.max(x) / 2**15 # we add some dither in the padding pad_y = dither * np.ones((x.shape[-1] - y.shape[-1],), dtype=y.dtype) y = np.concatenate((y, pad_y), axis=-1) elif r < 1: diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index c890627e..aa02661c 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -7,7 +7,6 @@ import math import numpy as np - import torch import torch.distributed as dist @@ -17,13 +16,15 @@ class BucketingSegSampler(HypSampler): - def __init__(self, - seg_set, - base_sampler=SegSampler, - num_buckets=10, - length_column="duration", - seed=1234, - **base_kwargs): + def __init__( + self, + seg_set, + base_sampler=SegSampler, + num_buckets=10, + length_column="duration", + 
seed=1234, + **base_kwargs + ): super().__init__(shuffle=False, seed=seed) self.seg_set = seg_set self.base_sampler = base_sampler @@ -33,13 +34,12 @@ def __init__(self, self.length_column = length_column self._create_bucket_samplers() self._compute_len() - self.depleted_buckets = torch.zeros((num_buckets, ), dtype=torch.bool) + self.depleted_buckets = torch.zeros((num_buckets,), dtype=torch.bool) def create_buckets(self): sort_idx = np.argsort(self.seg_set[self.length_column].values) sorted_seg_set = self.seg_set.iloc[sort_idx] - cum_lengths = np.cumsum(sorted_seg_set[self.length_column].values, - axis=0) + cum_lengths = np.cumsum(sorted_seg_set[self.length_column].values, axis=0) bucket_length = cum_lengths[-1] / self.num_buckets buckets = [] for i in range(self.num_buckets): @@ -67,9 +67,9 @@ def _compute_len(self): for i in range(self.num_buckets): self._len += len(self.bucket_samplers[i]) - def set_epoch(self, epoch): + def set_epoch(self, epoch, batch=0): for i in range(self.num_buckets): - self.bucket_samplers[i].set_epoch(epoch) + self.bucket_samplers[i].set_epoch(epoch, batch) def __iter__(self): super().__iter__() @@ -88,10 +88,9 @@ def __next__(self): raise StopIteration while True: - bucket_idx = torch.randint(low=0, - high=self.num_buckets, - size=(1, ), - generator=self.rng).item() + bucket_idx = torch.randint( + low=0, high=self.num_buckets, size=(1,), generator=self.rng + ).item() if self.depleted_buckets[bucket_idx]: continue diff --git a/hyperion/torch/data/dino_audio_dataset.py b/hyperion/torch/data/dino_audio_dataset.py index bb0a93a5..15eaca4b 100644 --- a/hyperion/torch/data/dino_audio_dataset.py +++ b/hyperion/torch/data/dino_audio_dataset.py @@ -236,12 +236,12 @@ def _split_audio_into_teacher_student_chunks(self, x, duration, fs): x_student, student_samples, ) = self._split_audio_into_teacher_student_disjoint(x, duration, fs) - assert ( - len(x_teacher) >= 64000 and len(x_teacher) <= 136000 - ), f"{len(x_teacher)}, {len(x_student)} {len(x)} {duration*fs}, {teacher_samples}, {student_samples}" - assert ( - len(x_student) >= 32000 and len(x_student) <= 136000 - ), f"{len(x_teacher)}, {len(x_student)}, {len(x)} {duration*fs}, {teacher_samples}, {student_samples}" + # assert ( + # len(x_teacher) >= 64000 and len(x_teacher) <= 136000 + # ), f"{len(x_teacher)}, {len(x_student)} {len(x)} {duration*fs}, {teacher_samples}, {student_samples}" + # assert ( + # len(x_student) >= 32000 and len(x_student) <= 136000 + # ), f"{len(x_teacher)}, {len(x_student)}, {len(x)} {duration*fs}, {teacher_samples}, {student_samples}" xs_teacher = self._split_audio_into_chunks( x_teacher, teacher_samples, @@ -254,14 +254,14 @@ def _split_audio_into_teacher_student_chunks(self, x, duration, fs): int(fs * self.student_chunk_length), self.num_student_chunks, ) - for xx in xs_teacher: - assert ( - len(xx) >= 64000 and len(xx) <= 72000 - ), f"{[len(t) for t in xs_teacher]} {len(x_teacher)} {len(x)}" - for xx in xs_student: - assert ( - len(xx) >= 32000 and len(xx) <= 40000 - ), f"{[len(t) for t in xs_student]} {len(x_student)} {len(x)}" + # for xx in xs_teacher: + # assert ( + # len(xx) >= 64000 and len(xx) <= 72000 + # ), f"{[len(t) for t in xs_teacher]} {len(x_teacher)} {len(x)}" + # for xx in xs_student: + # assert ( + # len(xx) >= 32000 and len(xx) <= 40000 + # ), f"{[len(t) for t in xs_student]} {len(x_student)} {len(x)}" return xs_teacher, xs_student @@ -284,16 +284,6 @@ def __getitem__(self, segment): ) data.update(x_augs_teacher) data.update(x_augs_student) - # print(data, flush=True) - # 
for ll in [ - # "x_teacher_0", - # "x_teacher_1", - # "x_student_0", - # "x_student_1", - # "x_student_2", - # "x_student_3", - # ]: - # print("zzz ", ll, data[ll].shape, flush=True) seg_info = self._get_segment_info(seg_id) data.update(seg_info) return data diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 39d1eed2..a280c87e 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -7,10 +7,10 @@ import math import numpy as np -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from ...utils.misc import filter_func_args from .hyp_sampler import HypSampler @@ -24,6 +24,7 @@ def __init__( length_name="duration", shuffle=False, drop_last=False, + sort_by_length=True, seed=1234, ): super().__init__(shuffle=shuffle, seed=seed) @@ -33,21 +34,20 @@ def __init__( self.max_batch_length = max_batch_length self.var_batch_size = max_batch_length is not None self.length_name = length_name + self.sort_by_length = sort_by_length if self.var_batch_size: - avg_batch_size = max_batch_length / np.mean( - self.seg_set[self.length_name]) + avg_batch_size = max_batch_length / np.mean(self.seg_set[self.length_name]) else: avg_batch_size = min_batch_size self.avg_batch_size = avg_batch_size if drop_last: - self._len = int( - len(self.seg_set) / (avg_batch_size * self.world_size)) + self._len = int(len(self.seg_set) / (avg_batch_size * self.world_size)) else: self._len = int( - math.ceil( - (len(self.seg_set) // self.world_size) / avg_batch_size)) + math.ceil((len(self.seg_set) // self.world_size) / avg_batch_size) + ) self._permutation = None @@ -55,8 +55,9 @@ def __len__(self): return self._len def _shuffle_segs(self): - self._permutation = torch.randperm(len(self.seg_set), - generator=self.rng).numpy() + self._permutation = torch.randperm( + len(self.seg_set), generator=self.rng + ).numpy() def __iter__(self): super().__iter__() @@ -82,37 +83,49 @@ def __next__(self): else: idx = self.start - max_length = max(max_length, self.seg_set.iloc[idx, - column_idx]) + max_length = max(max_length, self.seg_set.iloc[idx, column_idx]) if max_length * (batch_size + 1) > self.max_batch_length: break idxs.append(idx) self.start = (self.start + self.world_size) % len(self.seg_set) batch_size += 1 - if (self.max_batch_size is not None - and batch_size >= self.max_batch_size): + if ( + self.max_batch_size is not None + and batch_size >= self.max_batch_size + ): break - assert len( - idxs - ) >= 1, f"increase max_batch_length {self.max_batch_length} >= {max_length}" + assert ( + len(idxs) >= 1 + ), f"increase max_batch_length {self.max_batch_length} >= {max_length}" else: - stop = min(self.start + self.world_size * self.min_batch_size, - len(self.seg_set)) + stop = min( + self.start + self.world_size * self.min_batch_size, len(self.seg_set) + ) if self.shuffle: - idxs = self._permutation[self.start:stop:self.world_size] + idxs = self._permutation[self.start : stop : self.world_size] else: idxs = slice(self.start, stop, self.world_size) self.start += self.world_size * self.min_batch_size + ids = self.seg_set.iloc[idxs].id.values + if self.sort_by_length: + lengths = self.seg_set.loc[ids, self.length_name].values + sort_idx = np.argsort(lengths)[::-1] + ids = ids[sort_idx] + if "chunk_start" in self.seg_set: - chunks = self.seg_set.iloc[idxs] - seg_ids = [(id, s, d) for id, s, d in zip( - chunks.seg_id, chunks.chunk_start, chunks[self.length_name])] + chunks 
= self.seg_set.loc[ids] + seg_ids = [ + (id, s, d) + for id, s, d in zip( + chunks.seg_id, chunks.chunk_start, chunks[self.length_name] + ) + ] else: - seg_ids = self.seg_set.iloc[idxs].id.values + seg_ids = ids if self.batch == 0: logging.info("batch 0 seg_ids=%s", str(seg_ids[:10])) @@ -122,18 +135,19 @@ def __next__(self): @staticmethod def filter_args(**kwargs): + return filter_func_args(SegSampler.__init__, kwargs, skip={"seg_set"}) - valid_args = ( - "min_batch_size", - "max_batch_size", - "max_batch_length", - "length_name", - "shuffle", - "drop_last", - "seed", - ) + # valid_args = ( + # "min_batch_size", + # "max_batch_size", + # "max_batch_length", + # "length_name", + # "shuffle", + # "drop_last", + # "seed", + # ) - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -151,31 +165,28 @@ def add_class_args(parser, prefix=None): "--max-batch-size", type=int, default=None, - help= - ("maximum batch size per gpu, if None, estimated from max_batch_length" - ), + help=( + "maximum batch size per gpu, if None, estimated from max_batch_length" + ), ) parser.add_argument( "--max-batch-duration", type=float, default=None, - help= - ("maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" - ), + help=( + "maximum accumulated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), ) parser.add_argument( - "--drop-last", - action=ActionYesNo, - help="drops the last batch of the epoch", + "--drop-last", action=ActionYesNo, help="drops the last batch of the epoch", ) parser.add_argument( "--shuffle", action=ActionYesNo, - help= - "shuffles the segments or chunks at the beginning of the epoch", + help="shuffles the segments or chunks at the beginning of the epoch", ) parser.add_argument( @@ -188,9 +199,7 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--length-name", default="duration", - help= - "which column in the segment table indicates the duration of the file", + help="which column in the segment table indicates the duration of the file", ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index 35973f50..8f6501b5 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -2,6 +2,7 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging from typing import Optional, Union @@ -9,15 +10,13 @@ from .audio_dataset import AudioDataset from .bucketing_seg_sampler import BucketingSegSampler -from .class_weighted_seg_chunk_sampler import \ - ClassWeightedRandomSegChunkSampler +from .class_weighted_seg_chunk_sampler import ClassWeightedRandomSegChunkSampler from .feat_seq_dataset import FeatSeqDataset from .seg_chunk_sampler import SegChunkSampler from .seg_sampler import SegSampler sampler_dict = { - "class_weighted_random_seg_chunk_sampler": - ClassWeightedRandomSegChunkSampler, + "class_weighted_random_seg_chunk_sampler": ClassWeightedRandomSegChunkSampler, "seg_sampler": SegSampler, "seg_chunk_sampler": SegChunkSampler, "bucketing_seg_sampler": BucketingSegSampler, @@ -28,6 +27,7 @@ class SegSamplerFactory(object): """Factory class to create different types of samplers for sequencial data like audio or acoustic features. """ + @staticmethod def create( dataset: Union[AudioDataset, FeatSeqDataset], @@ -91,6 +91,7 @@ def filter_args(**kwargs): "batch_size", "shuffle", "drop_last", + "sort_by_length", "seed", ) @@ -113,8 +114,7 @@ def add_class_args(parser, prefix=None): "--base-sampler-type", choices=["seg_sampler", "bucketing_seg_sampler"], default="seg_sampler", - help= - "base sampler used for seg_chunk_sampler or bucketing_seg_sampler", + help="base sampler used for seg_chunk_sampler or bucketing_seg_sampler", ) parser.add_argument( @@ -141,9 +141,9 @@ def add_class_args(parser, prefix=None): "--max-batch-size", type=int, default=None, - help= - ("maximum batch size per gpu, if None, estimated from max_batch_length" - ), + help=( + "maximum batch size per gpu, if None, estimated from max_batch_length" + ), ) parser.add_argument( @@ -157,9 +157,9 @@ def add_class_args(parser, prefix=None): "--max-batch-length", type=float, default=None, - help= - ("maximum accumlated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" - ), + help=( + "maximum accumulated duration of the batch, if None estimated from the min/max_batch_size and min/max_chunk_lengths" + ), ) parser.add_argument( @@ -225,8 +225,7 @@ def add_class_args(parser, prefix=None): "--shuffle", action=ActionYesNo, - help= - "shuffles the segments or chunks at the beginning of the epoch", + help="shuffles the segments or chunks at the beginning of the epoch", ) parser.add_argument( "--seed", @@ -238,16 +237,19 @@ def add_class_args(parser, prefix=None): parser.add_argument( "--length-name", default="duration", - help= - "which column in the segment table indicates the duration of the segment", + help="which column in the segment table indicates the duration of the segment", ) parser.add_argument( "--class-name", default="class_id", - help= - "which column in the segment table indicates the class of the segment", + help="which column in the segment table indicates the class of the segment", + ) + parser.add_argument( + "--sort-by-length", + default=True, + action=ActionYesNo, + help="sort sequences in the batch by duration", ) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/layer_blocks/transformer_input.py b/hyperion/torch/layer_blocks/transformer_input.py index 6c5de188..e55071b9 100644 --- a/hyperion/torch/layer_blocks/transformer_input.py +++ b/hyperion/torch/layer_blocks/transformer_input.py @@ -2,6 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import math import torch @@ -80,7 +81,7 @@ def forward(self, x, x_mask=None): if x_mask is None: return x, None - return x, x_mask[:, :, :: self.stride] + return x, x_mask[..., :: self.stride] class TransformerConv1dSubsampler(nn.Module): diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py index a46ce20d..5a92e89a 100644 --- a/hyperion/torch/layers/mvn.py +++ b/hyperion/torch/layers/mvn.py @@ -2,6 +2,7 @@ Copyright 2020 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import torch import torch.nn as nn from jsonargparse import ActionParser, ActionYesNo, ArgumentParser 
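The mvn.py hunks above and below add length-aware normalization to MeanVarianceNorm: the sequence lengths are turned into a boolean mask of valid frames, and the statistics are computed over those frames only, which is why the masking helper called in the second hunk now receives ndim=x.dim(). A minimal sketch of the underlying idea, illustrative only and not the hyperion API, assuming inputs of shape (batch, time, feats):

    import torch

    def masked_mean_norm(x: torch.Tensor, x_lengths: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, feats); x_lengths: (batch,) valid frames per sequence
        time_idx = torch.arange(x.size(1), device=x.device)
        # mask: (batch, time, 1), True on valid frames, False on padding
        mask = (time_idx.unsqueeze(0) < x_lengths.unsqueeze(1)).unsqueeze(-1)
        num_valid = x_lengths.view(-1, 1, 1).to(x.dtype)
        # per-sequence mean over valid frames; padded frames contribute zero
        mean = (x * mask).sum(dim=1, keepdim=True) / num_valid
        # subtract the mean and keep the padding at zero
        return (x - mean) * mask

Without the mask, the padded frames of shorter sequences would bias the mean toward zero, so normalization would depend on how a sequence happened to be batched.
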
@@ -53,6 +54,8 @@ def forward(self, x, x_lengths=None, x_mask=None): Args: x: feature tensor. + x_lengths: lengths of x sequences + x_mask: mask of valid frames, if present, x_lengths is ignored. Returns: Normalized feature tensor. @@ -69,6 +72,7 @@ def forward(self, x, x_lengths=None, x_mask=None): x_lengths, max_length, dtype=x.dtype, + ndim=x.dim(), none_if_all_max=True, ) diff --git a/hyperion/torch/layers/spec_augment.py b/hyperion/torch/layers/spec_augment.py index f4e03842..9ef71f5f 100644 --- a/hyperion/torch/layers/spec_augment.py +++ b/hyperion/torch/layers/spec_augment.py @@ -2,13 +2,15 @@ Copyright 2021 Johns Hopkins University (Author: Jesus Villalba, Nanxin Chen) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import logging -from jsonargparse import ActionParser, ArgumentParser +import logging import torch import torch.nn as nn import torch.nn.functional as nnf +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser + +from ...utils.misc import filter_func_args count = 0 @@ -22,8 +24,9 @@ class AxisMasker(nn.Module): max_width: maximum width of the mask. min_num_mask: minimum number of masks. max_num_mask: maximum number of masks. - dim: axis where we apply the mask - mask_value: masking value + dim: axis where we apply the mask. + mask_value: masking value. + use_num_masks_percentage: if True, num_masks are per 100 frames, if False they are absolute. """ def __init__( @@ -35,6 +38,7 @@ def __init__( dim=-1, mask_method="constant", mask_value=0, + use_num_masks_percentage=False, ): super().__init__() assert min_width >= 0 @@ -44,17 +48,22 @@ def __init__( self.min_width = min_width self.max_width = max_width + if not use_num_masks_percentage: + min_num_masks = int(min_num_masks) + max_num_masks = int(max_num_masks) + self.min_num_masks = min_num_masks self.max_num_masks = max_num_masks self.dim = dim self.mask_method = mask_method self.mask_value = mask_value + self.use_num_masks_percentage = use_num_masks_percentage def __repr__(self): s = ( "{}(min_width={}, max_width={}, " "min_num_masks={}, max_num_masks={}, " - "dim={}, mask_method={}, mask_value={})" + "dim={}, mask_method={}, mask_value={}, use_num_masks_percentage={})" ).format( self.__class__.__name__, self.min_width, @@ -64,6 +73,7 @@ def __repr__(self): self.dim, self.mask_method, self.mask_value, + self.use_num_masks_percentage, ) return s @@ -86,9 +96,16 @@ def forward(self, x): batch_size = x.shape[0] masked_dim_length = x.shape[self.dim] + if self.use_num_masks_percentage: + min_num_masks = int(round(self.min_num_masks * masked_dim_length / 100)) + max_num_masks = int(round(self.max_num_masks * masked_dim_length / 100)) + else: + min_num_masks = self.min_num_masks + max_num_masks = self.max_num_masks + # select how many masks num_masks = torch.randint( - self.min_num_masks, self.max_num_masks + 1, size=(1,), device=x.device + min_num_masks, max_num_masks + 1, size=(1,), device=x.device )[0] # (batch, num_mask, 1) widths = torch.randint( @@ -156,7 +173,7 @@ def forward(self, x, x_lengths=None): Args: x: spectrogram shape= (batch, *, time, freq) - lengths: time lengths of the sequences. + x_lengths: time lengths of the sequences. 
Returns: warped spectrogram shape = (batch, *, time, freq) """ @@ -184,11 +201,10 @@ def forward(self, x, x_lengths=None): if dim == -1 or x_lengths is None: warp_length = x.shape[-2] else: - warp_length = int(x.shape[-2] * torch.min(x_lengths)) + warp_length = int(torch.min(x_lengths)) center = torch.randint(self.window, warp_length - self.window, (1,))[0] warped = torch.randint(center - self.window, center + self.window, (1,))[0] + 1 - # (batch, C, warped, freq) left = nnf.interpolate( x[:, :, :center], (warped, x.shape[3]), mode=self.mode, align_corners=False @@ -210,6 +226,9 @@ def forward(self, x, x_lengths=None): if dim == -1: x = x.transpose(-1, -2) + if ndim == 3: + x = x.squeeze(1) + x = x.view(in_shape) return x @@ -231,6 +250,7 @@ class SpecAugment(nn.Module): time_max_width: maximum width of the time mask. time_min_num_mask: minimum number of time masks. time_max_num_mask: maximum number of time masks. + time_use_num_masks_percentage: if True, num_masks are per 100 frames, if False they are absolute. freq_mask_prob: probability of applying frequency masking. freq_min_width: minimum width of the frequency mask. freq_max_width: maximum width of the frequency mask. @@ -249,6 +269,7 @@ def __init__( time_mask_max_width=100, time_mask_min_num_masks=1, time_mask_max_num_masks=2, + time_use_num_masks_percentage=False, freq_mask_prob=0, freq_mask_min_width=0, freq_mask_max_width=20, @@ -287,6 +308,7 @@ def __init__( dim=-2, mask_method=mask_method, mask_value=mask_value, + use_num_masks_percentage=time_use_num_masks_percentage, ) if self.freq_mask_prob > 0: @@ -368,26 +390,7 @@ def filter_args(**kwargs): Returns: Dictionary with SpecAugment options. """ - valid_args = ( - "time_warp_prob", - "time_warp_window", - "time_warp_mode", - "time_mask_prob", - "time_mask_max_width", - "time_mask_min_width", - "time_mask_max_num_masks", - "time_mask_min_num_masks", - "freq_mask_prob", - "freq_mask_max_width", - "freq_mask_min_width", - "freq_mask_max_num_masks", - "freq_mask_min_num_masks", - "mask_value", - "mask_method", - ) - - d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - return d + return filter_func_args(SpecAugment.__init__, kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -437,16 +440,22 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( "--time-mask-min-num-masks", - type=int, + type=float, default=1, help="min. number of time mask", ) parser.add_argument( "--time-mask-max-num-masks", - type=int, + type=float, default=2, help="max. 
number of time mask", ) + parser.add_argument( + "--time-use-num-masks-percentage", + default=False, + action=ActionYesNo, + help="if True, num_masks are per 100 frames, if False they are absolute.", + ) parser.add_argument( "--freq-mask-prob", diff --git a/hyperion/torch/models/__init__.py b/hyperion/torch/models/__init__.py index 7292dbad..77a2543f 100644 --- a/hyperion/torch/models/__init__.py +++ b/hyperion/torch/models/__init__.py @@ -7,11 +7,12 @@ from .transducer import RNNRNNTransducer, RNNTransducer from .vae.vae import VAE from .vae.vq_vae import VQVAE -from .wav2transducer import ( +from .wav2transducer import ( # HFWav2Vec2Transducer, HFWav2Vec2ConformerV1RNNTransducer, HFWav2Vec2RNNRNNTransducer, HFWav2Vec2RNNTransducer, - HFWav2Vec2Transducer, + Wav2ConformerV1RNNTransducer, + Wav2RNNRNNTransducer, ) from .wav2xvectors import ( HFHubert2ConformerV1XVector, diff --git a/hyperion/torch/models/transducer/__init__.py b/hyperion/torch/models/transducer/__init__.py index 984e15ec..331e3ef0 100644 --- a/hyperion/torch/models/transducer/__init__.py +++ b/hyperion/torch/models/transducer/__init__.py @@ -7,8 +7,9 @@ from .conformer_v1_rnn_transducer import ConformerV1RNNTransducer from .rnn_rnn_transducer import RNNRNNTransducer from .rnn_transducer import RNNTransducer, RNNTransducerOutput -from .transducer import Transducer -#from .conformer import Conformer -#from .decoder import Decoder -#from .joiner import Joiner +# from .transducer import Transducer + +# from .conformer import Conformer +# from .decoder import Decoder +# from .joiner import Joiner diff --git a/hyperion/torch/models/transducer/rnn_transducer.py b/hyperion/torch/models/transducer/rnn_transducer.py index b8e7fe74..a9fa5830 100644 --- a/hyperion/torch/models/transducer/rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_transducer.py @@ -44,6 +44,8 @@ def __init__( self, encoder: Union[TorchModel, None], decoder: Union[Dict, RNNTransducerDecoder], + ctc_weight: float = 0.0, + rnnt_weight: float = 1.0, ): super().__init__() if encoder is not None: diff --git a/hyperion/torch/models/transducer/subsampling.py b/hyperion/torch/models/transducer/subsampling0.py similarity index 100% rename from hyperion/torch/models/transducer/subsampling.py rename to hyperion/torch/models/transducer/subsampling0.py diff --git a/hyperion/torch/models/transducer/transducer.py b/hyperion/torch/models/transducer/transducer0.py similarity index 100% rename from hyperion/torch/models/transducer/transducer.py rename to hyperion/torch/models/transducer/transducer0.py diff --git a/hyperion/torch/models/transducer/transformer.py b/hyperion/torch/models/transducer/transformer0.py similarity index 98% rename from hyperion/torch/models/transducer/transformer.py rename to hyperion/torch/models/transducer/transformer0.py index a354b5f5..0beb405f 100644 --- a/hyperion/torch/models/transducer/transformer.py +++ b/hyperion/torch/models/transducer/transformer0.py @@ -20,10 +20,11 @@ import torch import torch.nn as nn + from hyperion.utils.text import make_pad_mask from .encoder_interface import EncoderInterface -from .subsampling import Conv2dSubsampling, VggSubsampling +from .subsampling0 import Conv2dSubsampling, VggSubsampling class Transformer(EncoderInterface): @@ -250,9 +251,7 @@ def _get_activation_fn(activation: str): elif activation == "gelu": return nn.functional.gelu - raise RuntimeError( - "activation should be relu/gelu, not {}".format(activation) - ) + raise RuntimeError("activation should be relu/gelu, not {}".format(activation)) 
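To make the new percentage semantics above concrete: with --time-use-num-masks-percentage, the min/max mask counts are rates per 100 frames that AxisMasker converts to absolute counts at run time. A standalone re-statement of that conversion (a hypothetical helper, not part of the patch):

def mask_count_range(min_per_100, max_per_100, num_frames):
    # mirrors AxisMasker.forward: per-100-frame rates -> absolute counts
    min_masks = int(round(min_per_100 * num_frames / 100))
    max_masks = int(round(max_per_100 * num_frames / 100))
    return min_masks, max_masks

print(mask_count_range(0.5, 1.0, 400))  # (2, 4): draw between 2 and 4 time masks for 400 frames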
class PositionalEncoding(nn.Module): diff --git a/hyperion/torch/models/wav2transducer/__init__.py b/hyperion/torch/models/wav2transducer/__init__.py index 71e82b98..e57b36ff 100644 --- a/hyperion/torch/models/wav2transducer/__init__.py +++ b/hyperion/torch/models/wav2transducer/__init__.py @@ -4,8 +4,9 @@ """ -from .hf_wav2vec2_transducer import HFWav2Vec2Transducer -from .hf_wav2vec2conformer_v1_rnn_transducer import \ - HFWav2Vec2ConformerV1RNNTransducer +# from .hf_wav2vec2_transducer import HFWav2Vec2Transducer +from .hf_wav2vec2conformer_v1_rnn_transducer import HFWav2Vec2ConformerV1RNNTransducer from .hf_wav2vec2rnn_rnn_transducer import HFWav2Vec2RNNRNNTransducer from .hf_wav2vec2rnn_transducer import HFWav2Vec2RNNTransducer +from .wav2conformer_v1_rnn_transducer import Wav2ConformerV1RNNTransducer +from .wav2rnn_rnn_transducer import Wav2RNNRNNTransducer diff --git a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py index 1d16675c..c4f65ba6 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2rnn_transducer.py @@ -2,15 +2,15 @@ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import contextlib import logging from dataclasses import dataclass from typing import Dict, List, Union -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...torch_model import TorchModel from ...utils import remove_silence @@ -18,7 +18,7 @@ class HFWav2RNNTransducer(TorchModel): - """Abstract Base class for x-vector models that use a Hugging Face Model as feature extractor. + """Abstract Base class for RNN-T transducer models that use a Hugging Face Model as feature extractor. Attributes: hf_feats: hugging face model wrapper object. @@ -29,11 +29,13 @@ class HFWav2RNNTransducer(TorchModel): than one layer is used. """ - def __init__(self, - hf_feats: TorchModel, - transducer: Union[Dict, TorchModel], - feat_fusion_start: int = 0, - feat_fusion_method: str = "weighted-avg"): + def __init__( + self, + hf_feats: TorchModel, + transducer: Union[Dict, TorchModel], + feat_fusion_start: int = 0, + feat_fusion_method: str = "weighted-avg", + ): super().__init__() self.hf_feats = hf_feats @@ -66,12 +68,9 @@ def _make_fuser(self): self.feat_fuser = nn.Parameter(torch.zeros(num_layers)) elif self.feat_fusion_method == "linear": self.feat_fuser = nn.Linear(num_layers, 1, bias=False) - self.feat_fuser.weight.data = torch.ones(1, - num_layers) / num_layers + self.feat_fuser.weight.data = torch.ones(1, num_layers) / num_layers elif self.feat_fusion_method == "cat": - self.feat_fuser = nn.Linear(num_layers * layer_dim, - layer_dim, - bias=False) + self.feat_fuser = nn.Linear(num_layers * layer_dim, layer_dim, bias=False) def _fuse_hid_feats(self, hid_feats): """Fuses the hidden features from the Wav2Vec model. 
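For intuition, the "weighted-avg" fusion set up in _make_fuser above reduces to a softmax-weighted sum over the wav2vec hidden layers; a self-contained sketch (layer count and shapes invented for illustration):

import torch
import torch.nn as nn

num_layers, batch, time, dim = 13, 2, 50, 768
hid_feats = [torch.randn(batch, time, dim) for _ in range(num_layers)]

feat_fuser = nn.Parameter(torch.zeros(num_layers))  # one learnable weight per layer
stacked = torch.stack(hid_feats, dim=-1)            # (batch, time, dim, num_layers)
norm_weights = torch.softmax(feat_fuser, dim=-1)    # uniform at init, learned during training
fused = torch.sum(stacked * norm_weights, dim=-1)   # (batch, time, dim)
assert fused.shape == (batch, time, dim)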
@@ -86,7 +85,7 @@ def _fuse_hid_feats(self, hid_feats): # There is only one layer of features return hid_feats[0] - hid_feats = hid_feats[self.feat_fusion_start:] + hid_feats = hid_feats[self.feat_fusion_start :] if self.feat_fusion_method == "weighted-avg": hid_feats = torch.stack(hid_feats, dim=-1) norm_weights = nn.functional.softmax(self.feat_fuser, dim=-1) @@ -102,14 +101,14 @@ def _fuse_hid_feats(self, hid_feats): return feats - def forward_feats(self, - x, - x_lengths, - return_feat_layers=None, - chunk_length=0, - detach_chunks=False): - return_hid_states = (False if return_feat_layers is None - and self.feat_fusion_method == "last" else True) + def forward_feats( + self, x, x_lengths, return_feat_layers=None, chunk_length=0, detach_chunks=False + ): + return_hid_states = ( + False + if return_feat_layers is None and self.feat_fusion_method == "last" + else True + ) with self._hf_context: hf_output = self.hf_feats( x, @@ -131,7 +130,8 @@ def forward_feats(self, # add hidden feats from wav2vec to the output. We transpose to be (batch, C, time) # as the hidden features of the x-vector encoder. hid_feats = [ - f.transpose(1, 2) for i, f in enumerate(hid_feats) + f.transpose(1, 2) + for i, f in enumerate(hid_feats) if i in return_feat_layers ] else: @@ -167,7 +167,8 @@ def forward( "h_feats" (wav2vec features) """ feats, hid_feats, feat_lengths = self.forward_feats( - x, x_lengths, return_feat_layers) + x, x_lengths, return_feat_layers + ) feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) output = self.transducer( @@ -181,13 +182,15 @@ def forward( return output - def infer(self, - x: torch.Tensor, - x_lengths: torch.Tensor, - decoding_method="time_sync_beam_search", - beam_width: int = 5, - max_sym_per_frame: int = 3, - max_sym_per_utt: int = 1000): + def infer( + self, + x: torch.Tensor, + x_lengths: torch.Tensor, + decoding_method="time_sync_beam_search", + beam_width: int = 5, + max_sym_per_frame: int = 3, + max_sym_per_utt: int = 1000, + ): """ ASR tokens inference Args: @@ -204,12 +207,14 @@ def infer(self, feats = feats.permute(0, 2, 1) # (N, C, T) ->(N, T, C) - y = self.transducer.infer(feats, - feat_lengths, - decoding_method=decoding_method, - beam_width=beam_width, - max_sym_per_frame=max_sym_per_frame, - max_sym_per_utt=max_sym_per_utt) + y = self.transducer.infer( + feats, + feat_lengths, + decoding_method=decoding_method, + beam_width=beam_width, + max_sym_per_frame=max_sym_per_frame, + max_sym_per_utt=max_sym_per_utt, + ) return y def freeze_feat_fuser(self): @@ -265,11 +270,11 @@ def _train(self, train_mode: str): if train_mode in ["full", "frozen"]: super()._train(train_mode) elif train_mode in [ - "ft-transducer", - "hf-feats-frozen", - "ft-transducer-nograd", - "hf-feats-frozen-nograd", - "hf-feat-extractor-frozen", + "ft-transducer", + "hf-feats-frozen", + "ft-transducer-nograd", + "hf-feats-frozen-nograd", + "hf-feat-extractor-frozen", ]: self.hf_feats.train() self.transducer._train("full") @@ -340,8 +345,10 @@ def add_class_args(parser, prefix=None, skip=set()): "--feat-fusion-method", default="weighted-avg", choices=["weighted-avg", "linear", "cat", "last"], - help=("method to fuse the hidden layers from the wav2vec model " - "in [weighted-avg, linear, cat, last]"), + help=( + "method to fuse the hidden layers from the wav2vec model " + "in [weighted-avg, linear, cat, last]" + ), ) if prefix is not None: @@ -359,8 +366,7 @@ def add_infer_args(parser, prefix=None): RNNTransducer.add_infer_args(parser) if prefix is not None: - outer_parser.add_argument("--" + 
prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def filter_infer_args(**kwargs): diff --git a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py index fe82f734..dac8c776 100644 --- a/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/hf_wav2vec2rnn_transducer.py @@ -2,13 +2,13 @@ Copyright 2022 Johns Hopkins University (Author: Yen-Ju Lu) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging from typing import Dict, Optional, Union -from jsonargparse import ActionParser, ArgumentParser - import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...tpm import HFWav2Vec2 from ..transducer import RNNTransducer @@ -44,19 +44,7 @@ def __init__( else: assert isinstance(hf_feats, HFWav2Vec2) - # if isinstance(transducer, dict): - # transducer["decoder"]["in_feats"] = hf_feats.hidden_size - # transducer["joiner"]["in_feats"] = hf_feats.hidden_size - # if "class_name" in transducer: - # del transducer["class_name"] - # transducer = Transducer(**transducer) - # else: - # assert isinstance(transducer, Transducer) - # assert transducer.decoder.in_feats == hf_feats.hidden_size - # assert transducer.joiner.in_feats == hf_feats.hidden_size - - super().__init__(hf_feats, transducer, feat_fusion_start, - feat_fusion_method) + super().__init__(hf_feats, transducer, feat_fusion_start, feat_fusion_method) @staticmethod def filter_args(**kwargs): @@ -78,8 +66,7 @@ def add_class_args(parser, prefix=None): HFWav2RNNTransducer.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) @staticmethod def filter_finetune_args(**kwargs): @@ -100,5 +87,4 @@ def add_finetune_args(parser, prefix=None): RNNTransducer.add_finetune_args(parser, prefix="transducer") if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/wav2conformer_v1_rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2conformer_v1_rnn_transducer.py new file mode 100644 index 00000000..330aea3b --- /dev/null +++ b/hyperion/torch/models/wav2transducer/wav2conformer_v1_rnn_transducer.py @@ -0,0 +1,73 @@ +""" + Copyright 2024 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import ConformerV1RNNTransducer +from .wav2rnn_transducer import Wav2RNNTransducer + + +class Wav2ConformerV1RNNTransducer(Wav2RNNTransducer): + """Class for RNN-T with ConformerV1 Encoder and acoustic feature input + + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + transducer: Transducer configuration dictionary or object. + """ + + def __init__( + self, + feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, ConformerV1RNNTransducer], + ): + + if isinstance(transducer, dict): + if "class_name" in transducer: + del transducer["class_name"] + + transducer = ConformerV1RNNTransducer(**transducer) + else: + assert isinstance(transducer, ConformerV1RNNTransducer) + + super().__init__(feats, transducer) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2RNNTransducer.add_class_args(parser) + ConformerV1RNNTransducer.add_class_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = ConformerV1RNNTransducer.filter_finetune_args( + **kwargs["transducer"] + ) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + ConformerV1RNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/wav2rnn_rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2rnn_rnn_transducer.py new file mode 100644 index 00000000..25890d78 --- /dev/null +++ b/hyperion/torch/models/wav2transducer/wav2rnn_rnn_transducer.py @@ -0,0 +1,71 @@ +""" + Copyright 2024 Johns Hopkins University (Author: Yen-Ju Lu) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +import logging +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser + +from ...tpm import HFWav2Vec2 +from ..transducer import RNNRNNTransducer +from .wav2rnn_transducer import Wav2RNNTransducer + + +class Wav2RNNRNNTransducer(Wav2RNNTransducer): + """Class for RNN-T with LSTM encoder and acoustic feature input + + Attributes: + feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. + transducer: Transducer configuration dictionary or object. + """ + + def __init__( + self, + feats: Union[Dict, HFWav2Vec2], + transducer: Union[Dict, RNNRNNTransducer], + ): + + if isinstance(transducer, dict): + if "class_name" in transducer: + del transducer["class_name"] + + transducer = RNNRNNTransducer(**transducer) + else: + assert isinstance(transducer, RNNRNNTransducer) + + super().__init__(feats, transducer) + + @staticmethod + def add_class_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + Wav2RNNTransducer.add_class_args(parser) + RNNRNNTransducer.add_class_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + + @staticmethod + def filter_finetune_args(**kwargs): + base_args = {} + child_args = RNNRNNTransducer.filter_finetune_args(**kwargs["transducer"]) + base_args["transducer"] = child_args + return base_args + + @staticmethod + def add_finetune_args(parser, prefix=None): + if prefix is not None: + outer_parser = parser + parser = ArgumentParser(prog="") + + RNNRNNTransducer.add_finetune_args(parser, prefix="transducer") + + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py index 4b2f235b..bce8e368 100644 --- a/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py +++ b/hyperion/torch/models/wav2transducer/wav2rnn_transducer.py @@ -2,13 +2,18 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging from typing import Dict, Optional, Tuple, Union -from jsonargparse import ActionParser, ArgumentParser +try: + import k2 +except ModuleNotFoundError: + from ...utils import dummy_k2 as k2 import torch import torch.nn as nn +from jsonargparse import ActionParser, ArgumentParser from ...narchs import AudioFeatsMVN from ...torch_model import TorchModel @@ -16,11 +21,12 @@ class Wav2RNNTransducer(TorchModel): - """Base class for models that integrate the acoustic feature extractor and and x-vector model that takes acoustic features as input. + """Base class for models that integrate the acoustic feature extractor and an + RNN-T Transducer that takes acoustic features as input. Attributes: feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. - xvector: x-vector model object.
+ transducer: RNN-T transducer model """ def __init__(self, feats, transducer): @@ -29,7 +35,7 @@ def __init__(self, feats, transducer): if isinstance(feats, dict): feats = AudioFeatsMVN.filter_args(**feats) - feats["trans"] = True + feats["trans"] = False feats = AudioFeatsMVN(**feats) else: assert isinstance(feats, AudioFeatsMVN) @@ -43,7 +49,7 @@ def forward( x_lengths: torch.Tensor, y: k2.RaggedTensor, vad_samples: Optional[torch.Tensor] = None, - vad_feats: Optional[torch.Tensor] = None + vad_feats: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: if vad_samples is not None: @@ -59,17 +65,17 @@ def set_train_mode(self, mode): def get_config(self): feat_cfg = self.feats.get_config() - xvector_cfg = self.xvector.get_config() + xvector_cfg = self.transducer.get_config() config = { "feats": feat_cfg, - "xvector": xvector_cfg, + "transducer": xvector_cfg, } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) @staticmethod - def filter_args(*kwargs): + def filter_args(**kwargs): """Filters Wav2XVector class arguments from arguments dictionary. Args: @@ -80,7 +86,7 @@ def filter_args(*kwargs): """ valid_args = ( "feats", - "xvector", + "transducer", ) return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @@ -100,5 +106,4 @@ def add_class_args(parser, prefix=None): AudioFeatsMVN.add_class_args(parser, prefix="feats") if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py index aa01850f..5a8b14b8 100644 --- a/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py +++ b/hyperion/torch/models/wav2xvectors/wav2resnet1d_xvector.py @@ -19,7 +19,6 @@ class Wav2ResNet1dXVector(Wav2XVector): ResNet1dXVector extractor. Attributes: - Attributes: feats: feature extractor object of class AudioFeatsMVN or dictionary of options to instantiate AudioFeatsMVN object. xvector: ResNet1dXVector configuration dictionary or object. 
""" diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index ff36096b..72f50f82 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -317,7 +317,7 @@ def forward( Tensor with mask if return_mask is True """ max_in_length = x.size(self.in_time_dim) - x_mask = self._make_masks(x, x_lengths, x_mask) + x_mask = self._make_masks(max_in_length, x_lengths, x_mask) x, x_mask = self._forward_input(x, x_mask) if isinstance(x, tuple): x, pos_emb = x diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index 10bc2edc..6b391912 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -135,7 +135,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_target=target, return_x_mean=True) elbo = output["elbo"].mean() @@ -190,7 +190,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 8bbdcb47..bb0df6b6 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -1051,7 +1051,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): help="use mixed precision training", ) parser.add_argument( - "--amp-dtype", default=AMPDType.FLOAT16, choices=AMPDType.choices() + "--amp-dtype", default=AMPDType.FLOAT16.value, choices=AMPDType.choices() ) parser.add_argument( "--cpu-offload", diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 808cce3e..3c52b8bf 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -8,6 +8,7 @@ from collections import OrderedDict as ODict import torch +import torch.cuda.amp as amp import torch.nn as nn import torchaudio from jsonargparse import ActionParser, ArgumentParser @@ -118,7 +119,7 @@ def train_epoch(self, data_loader): ) batch_size = input_data.shape[0] - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_lengths=input_lengths, y=target) loss = output.loss loss = loss.mean() / self.grad_acc_steps @@ -183,7 +184,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): # data, target = data.to(self.device), target.to(self.device) # batch_size = data.shape[0] - with self.amp_autocast(): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_lengths=input_lengths, y=target) for k, v in output.items(): diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index dbf5dfdd..27d485ff 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -136,7 +136,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, 
batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_target=target, return_x_mean=True) elbo = output["elbo"].mean() loss = -elbo / self.grad_acc_steps @@ -191,7 +191,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index 43aa59a5..1488f5e5 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -101,7 +101,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_target=target, return_x_mean=True) loss = output["loss"] / self.grad_acc_steps x_hat = output["x_mean"] @@ -152,7 +152,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index 64db2e64..2331a2b8 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -101,7 +101,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_target=target, return_x_mean=True) loss = output["loss"] x_hat = output["x_mean"] @@ -153,7 +153,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): for batch, data in enumerate(data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, x_target=target, return_x_mean=True) x_hat = output["x_mean"] diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index 8603b22a..9d5a8bae 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -138,7 +138,7 @@ def train_epoch(self, data_loader): self.optimizer.zero_grad() - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(input_data, target) loss = self.loss(output.logits, target) / self.grad_acc_steps @@ -192,7 +192,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): self.model.train() with torch.no_grad(): - with amp.autocast(enabled=self.use_amp): + with 
amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(data, **self.amp_args) loss = self.loss(output.logits, target) diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index ccafecdd..f63c532b 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -143,7 +143,7 @@ def train_epoch(self, data_loader): with torch.no_grad(): feats = self.feat_extractor(input_data) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(feats, y=target) loss = self.loss(output.logits, target) / self.grad_acc_steps @@ -197,7 +197,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with torch.no_grad(): feats = self.feat_extractor(input_data) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(feats) loss = self.loss(output.logits, target) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 151993e0..15c5bd42 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -120,7 +120,7 @@ def train_epoch(self, data_loader): batch_keys = [aug_key, self.target_key] x, target = tensors_subset(data, batch_keys, self.device) batch_size = x.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(x, y=target) loss = self.loss(output.logits, target) / loss_scale loss_acc += loss.item() @@ -178,7 +178,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_keys = [aug_key, self.target_key] x, target = tensors_subset(data, batch_keys, self.device) batch_size = x.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(x) loss = self.loss(output.logits, target) / loss_scale loss_acc += loss.item() diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 1c9209f6..f230372c 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -136,7 +136,7 @@ def train_epoch(self, data_loader): input_data, target = tensors_subset(data, batch_keys, self.device) batch_size = input_data.size(0) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): outputs = self.model( input_data, y=target, diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 4b1d23ba..98c74af3 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -129,7 +129,7 @@ def train_epoch(self, data_loader): with torch.no_grad(): feats = self.feat_extractor(input_data) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): outputs = self.model( feats, y=target, @@ -229,7 +229,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_size = input_data.size(0) feats = self.feat_extractor(input_data) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): 
output = self.model(feats) loss = self.loss(output.logits, target) diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index f46b2109..ada74bb6 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -117,7 +117,7 @@ def train_epoch(self, data_loader): with torch.no_grad(): feats, feats_lengths = self.feat_extractor(audio) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(feats, feats_lengths, y=target) loss = self.loss(output.logits, target) / self.grad_acc_steps @@ -171,7 +171,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_size = audio.size(0) feats, feats_lengths = self.feat_extractor(audio) - with amp.autocast(enabled=self.use_amp): + with amp.autocast(enabled=self.use_amp, dtype=self.amp_dtype): output = self.model(feats, feats_lengths) loss = self.loss(output.logits, target) diff --git a/hyperion/torch/utils/masking.py b/hyperion/torch/utils/masking.py index c7095b31..1a240976 100644 --- a/hyperion/torch/utils/masking.py +++ b/hyperion/torch/utils/masking.py @@ -21,7 +21,7 @@ def scale_seq_lengths(lengths, max_out_length, max_in_length=None): def seq_lengths_to_mask( - lengths, max_length=None, dtype=None, time_dim=1, none_if_all_max=False + lengths, max_length=None, dtype=None, time_dim=1, ndim=None, none_if_all_max=False ): """Creates a binary masks indicating the valid values in a sequence. @@ -33,9 +33,11 @@ def seq_lengths_to_mask( return a view of the mask which will adapt to the shape of the tensor where we want to apply the mask. This has to be a positive integer. + ndim: number of dimensions in the mask tensor, if None, it is equal to time_dim + 1. + none_if_all_max: if True and all lengths are equal to max. length, it returns None Returns: - Binary mask with shape=(batch,...,max_length) or None + Binary mask with shape=(batch,...,max_length,...) 
or None """ if lengths is None: return None @@ -54,9 +56,12 @@ def seq_lengths_to_mask( # compute mask shape=(batch, max_length) mask = idx.unsqueeze(0) < lengths.unsqueeze(1) + if ndim is None: + ndim = time_dim + 1 + # view to match the tensor where we want to apply the mask - if time_dim > 1: - shape = [1] * (time_dim + 1) + if ndim > 1: + shape = [1] * ndim shape[0] = lengths.size(0) shape[time_dim] = -1 mask = mask.view(*shape) diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py index 4d4dd55a..3cb03659 100644 --- a/hyperion/utils/class_info.py +++ b/hyperion/utils/class_info.py @@ -2,6 +2,8 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + +import logging from pathlib import Path import numpy as np @@ -92,13 +94,19 @@ def cat(cls, tables): """ df_list = [table.df for table in tables] df = pd.concat(df_list) - assert df["id"].is_unique, """there are duplicated ids in original tables""" + if not df["id"].is_unique: + logging.warning( + """there are duplicated ids in original tables, + removing duplicated rows""" + ) + df.drop_duplicates(subset="id", keep="first", inplace=True) + if not df["class_idx"].is_unique: logging.warning( """class_idx in concat tables are not unique, we will assign new class_idx""" ) - df["class_idx"].drop(columns=["class_idx"], inplace=True) + df.drop(columns=["class_idx"], inplace=True) return cls(df) def filter( diff --git a/hyperion/utils/hyp_dataset.py b/hyperion/utils/hyp_dataset.py index dbf268da..ba137b65 100644 --- a/hyperion/utils/hyp_dataset.py +++ b/hyperion/utils/hyp_dataset.py @@ -67,14 +67,14 @@ def __init__( self._segments_path = Path(segments) self._classes, self._classes_paths = self._parse_dict_args(classes, ClassInfo) - - if isinstance(recordings, RecordingSet): - self._recordings = recordings - self._recordings_path = None - else: - assert isinstance(recordings, (str, Path)) - self._recordings = None - self._recordings_path = Path(recordings) + if recordings is not None: + if isinstance(recordings, RecordingSet): + self._recordings = recordings + self._recordings_path = None + else: + assert isinstance(recordings, (str, Path)) + self._recordings = None + self._recordings_path = Path(recordings) # self._recordings, self._recordings_paths = self._parse_dict_args( # recordings, RecordingSet @@ -183,8 +183,8 @@ def recordings(self, keep_loaded: bool = True): def features_keys(self): if self._features is not None: return self._features.keys() - elif self._features_path is not None: - return self._features_path.keys() + elif self._features_paths is not None: + return self._features_paths.keys() else: return {} @@ -857,7 +857,7 @@ def add_cols_to_segments( elif right_table in self.features_keys(): right_table = self.features_value(right_table) elif right_table in self.classes_keys(): - right_table = self.classes_value + right_table = self.classes_value(right_table) else: raise ValueError("%s not found", right_table) @@ -1254,6 +1254,64 @@ def split_train_val( return train_ds, val_ds + @classmethod + def merge(cls, datasets): + segments = [] + for dset in datasets: + segs_dset = dset.segments(keep_loaded=False) + if segs_dset is not None: + segments.append(segs_dset) + + segments = SegmentSet.cat(segments) + dataset = cls(segments) + + classes_keys = [] + for dset in datasets: + classes_dset = list(dset.classes_keys()) + classes_keys.extend(classes_dset) + + classes_keys = list(set(classes_keys)) + for key in classes_keys: + classes = [] + for dset 
in datasets: + if key in dset.classes_keys(): + classes_key = dset.classes_value(key, keep_loaded=False) + classes.append(classes_key) + + classes = ClassInfo.cat(classes) + dataset.add_classes(classes_name=key, classes=classes) + + recordings = [] + for dset in datasets: + recs_i = dset.recordings(keep_loaded=False) + if recs_i is not None: + recordings.append(recs_i) + + if recordings: + recordings = RecordingSet.cat(recordings) + dataset.set_recordings(recordings) + + features_keys = [] + for dset in datasets: + features_dset = list(dset.features_keys()) + features_keys.extend(features_dset) + + features_keys = list(set(features_keys)) + for key in features_keys: + features = [] + for dset in datasets: + if key in dset.features_keys(): + features_key = dset.features_value(key, keep_loaded=False) + features.append(features_key) + + features = FeatureSet.cat(features) + dataset.add_features(features_name=key, features=features) + + # TODO: merge enrollments and trials + # Usually you don't need that + return dataset + + @classmethod def from_lhotse( cls, cuts: Optional[Union[lhotse.CutSet, PathLike]] = None, @@ -1288,7 +1346,14 @@ def from_lhotse( from lhotse import MonoCut, Recording, SupervisionSegment - supervision_keys = ["speaker", "gender", "language", "text", "duration"] + supervision_keys = [ + "speaker", + "gender", + "language", + "emotion", + "text", + "duration", + ] recs_df = [] segs_df = [] for cut in cuts: @@ -1297,16 +1362,16 @@ def from_lhotse( seg_dict = {"id": cut.id} recording = cut.recording if recording is not None: - if recording.id != cut.id: - seg_dict["recording_id"] = recording.id + # if recording.id != cut.id: + # seg_dict["recording_id"] = recording.id rec_dict = { - "id": recording.id, + "id": cut.id, "sampling_rate": recording.sampling_rate, "duration": recording.duration, } source = recording.sources[0] - assert len(recording.source) == 1 + assert len(recording.sources) == 1 assert source.type in ["file", "command"] rec_dict["storage_path"] = source.source assert recording.transforms is None, f"{recording.transforms}" @@ -1323,7 +1388,7 @@ def from_lhotse( if val is not None: seg_dict[key] = val - segs_df = seg_dict + segs_df.append(seg_dict) recs_df = pd.DataFrame(recs_df) segs_df = pd.DataFrame(segs_df) @@ -1334,9 +1399,93 @@ def from_lhotse( for key in class_names: if key in segments: uniq_classes = np.unique(segments[key]) - classes[key] = pd.DataFrame({"id": uniq_classes}) + classes[key] = ClassInfo(pd.DataFrame({"id": uniq_classes})) + + if not classes: + classes = None dataset = cls(segments=segments, classes=classes, recordings=recordings) return dataset - return None + @classmethod + def from_kaldi( + cls, + kaldi_data_dir: PathLike, + ): + """Creates a Hyperion Dataset from a Kaldi data dir + + Args: + kaldi_data_dir: Kaldi data directory + + Returns: + HypDataset object + """ + kaldi_data_dir = Path(kaldi_data_dir) + + kaldi_files = ["utt2lang", "utt2dur", "utt2text"] + attributes = ["language", "duration", "text"] + + k_file = kaldi_data_dir / "utt2spk" + from .utt2info import Utt2Info + + utt2spk = Utt2Info.load(k_file) + df_segs = pd.DataFrame({"id": utt2spk.key, "speaker": utt2spk.info}) + segments = SegmentSet(df_segs) + del utt2spk + + for k_file, att in zip(kaldi_files, attributes): + k_file = kaldi_data_dir / k_file + if k_file.is_file(): + u2i = Utt2Info.load(k_file) + segments.loc[u2i.key, att] = u2i.info + + k_file = kaldi_data_dir / "spk2gender" + if k_file.is_file(): + segments["gender"] = "N/A" + s2g = Utt2Info.load(k_file) + for spk in s2g.key: + g = s2g[spk] + segments.loc[segments["speaker"] == spk, "gender"] = g + + kaldi_files = ["feats.scp", "vad.scp"] + attributes = ["feats", "vad"] + features = None + from .scp_list import SCPList + + for k_file, att in zip(kaldi_files, attributes): + k_file = kaldi_data_dir / k_file + if k_file.is_file(): + scp = SCPList.load(k_file) + feats_dict = {"id": scp.key, "storage_path": scp.file_path} + if scp.offset is not None: + feats_dict["storage_byte"] = scp.offset + df_feats = pd.DataFrame(feats_dict) + if features is None: + features = {} + features[att] = FeatureSet(df_feats) + + recordings = None + k_file = kaldi_data_dir / "wav.scp" + if k_file.is_file(): + scp = SCPList.load(k_file) + wav_dict = {"id": scp.key, "storage_path": scp.file_path} + df_recs = pd.DataFrame(wav_dict) + recordings = RecordingSet(df_recs) + recordings.get_durations() + if "duration" not in segments: + segments["duration"] = recordings.loc[segments["id"], "duration"] + + class_names = ["speaker", "language", "emotion", "gender"] + classes = {} + for key in class_names: + if key in segments: + uniq_classes = np.unique(segments[key]) + classes[key] = ClassInfo(pd.DataFrame({"id": uniq_classes})) + + if not classes: + classes = None + + dataset = cls( + segments=segments, classes=classes, recordings=recordings, features=features + ) + return dataset diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index ea03f058..a813a467 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -12,6 +12,7 @@ import numpy as np import pandas as pd +from pandas.api.types import infer_dtype from .list_utils import split_list, split_list_group_by_key @@ -25,10 +26,15 @@ class InfoTable: """ def __init__(self, df): - self.df = df assert "id" in df, f"info_table={df}" + self.df = df + self.fix_dtypes() self.df.set_index("id", drop=False, inplace=True) + def fix_dtypes(self): + if infer_dtype(self.df.id) != "string": + self.df.loc[:, "id"] = self.df["id"].apply(str) + def copy(self): """Makes a copy of the object.""" return deepcopy(self) @@ -145,7 +151,19 @@ def load(cls, file_path, sep=None, name="class_id"): if sep is None: sep = "\t" if ".tsv" in ext else "," - df = pd.read_csv(file_path, sep=sep) + fixed_dtypes = { + "id": str, + "speaker": str, + "language": str, + "gender": str, + "duration": float, + "storage_path": str, + "storage_byte": int, + "num_frames": int, + "video_ids": str, + "language_est": str, + } + df = pd.read_csv(file_path, sep=sep, dtype=fixed_dtypes) return cls(df) @@ -213,7 +231,7 @@ def filter( iindex: filters the table based on integer index with pandas command: df.iloc[iiindex], used if predicate and items are None columns: columns to keep of remove.
- by: column id to use with itmes criterion + by: column id to use with items criterion keep: if True, the criterion is used to keep rows, if False it is used to remove rows diff --git a/hyperion/utils/recording_set.py b/hyperion/utils/recording_set.py index 8346315c..b266e514 100644 --- a/hyperion/utils/recording_set.py +++ b/hyperion/utils/recording_set.py @@ -3,6 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ +import logging from pathlib import Path import numpy as np @@ -59,3 +60,41 @@ def load(cls, file_path, sep=None): return cls(df) return super().load(file_path, sep) + + @staticmethod + def _get_durations(recordings, i, n): + from ..io import SequentialAudioReader as AR + + durations = [] + fss = [] + with AR(recordings, part_idx=i + 1, num_parts=n) as reader: + for data in reader: + key, x, fs = data + duration = x.shape[0] / fs + fss.append(fs) + durations.append(duration) + + return fss, durations + + def get_durations(self, num_threads: int = 16): + + import itertools + from concurrent.futures import ThreadPoolExecutor + + from tqdm import tqdm + + futures = [] + num_threads = min(num_threads, len(self.df)) + logging.info("submitting threads...") + with ThreadPoolExecutor(max_workers=num_threads) as pool: + for i in tqdm(range(num_threads)): + future = pool.submit(RecordingSet._get_durations, self, i, num_threads) + futures.append(future) + + logging.info("waiting for threads...") + res = [f.result() for f in tqdm(futures)] + fss = list(itertools.chain(*[r[0] for r in res])) + durations = list(itertools.chain(*[r[1] for r in res])) + + self.df["duration"] = durations + self.df["sample_freq"] = fss From ad0561e8d4cba96d74e6a315afc72f3a4cdcec4a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 13 May 2024 17:17:51 -0400 Subject: [PATCH 136/154] add max_batches arg to samplers --- ...mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml | 4 +- ...config_fbank80_stmn_fwseresnet34.v1.2.2.sh | 3 + hyperion/torch/data/bucketing_seg_sampler.py | 30 ++--- .../data/class_weighted_embed_sampler.py | 68 ++++++----- .../data/class_weighted_seg_chunk_sampler.py | 112 ++++++++++-------- hyperion/torch/data/embed_sampler.py | 52 +++++--- hyperion/torch/data/hyp_sampler.py | 9 +- hyperion/torch/data/seg_chunk_sampler.py | 42 ++++--- hyperion/torch/data/seg_sampler.py | 37 ++++-- hyperion/torch/trainers/torch_trainer.py | 6 +- 10 files changed, 226 insertions(+), 137 deletions(-) diff --git a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml index a142349b..683f85ee 100644 --- a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml +++ b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml @@ -62,7 +62,9 @@ trainer: min_lr: 1e-6 warmup_steps: 25000 update_lr_on_opt_step: true - grad_clip: 100 + # grad_clip: 100 + # grad_clip: 20 + grad_clip: 1 use_amp: true log_interval: 1000 epochs: 120 diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh index 8a8b58a3..e56d97cc 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh @@ -27,6 +27,9 @@ nnet_s1=$nnet_s1_dir/teacher_model_ep0054.pth nnet_s1=$nnet_s1_dir/teacher_model_ep0058.pth nnet_s1=$nnet_s1_dir/teacher_model_ep0064.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0067.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0071.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0077.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0083.pth # clustering of dino embeddings cluster_method=cos_ahc_plda_ahc diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index aa02661c..e73e7e44 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -5,11 +5,13 @@ import logging import math +from typing import Optional, Type import numpy as np import torch import torch.distributed as dist +from ...utils import SegmentSet from .hyp_sampler import HypSampler from .seg_sampler import SegSampler @@ -18,14 +20,17 @@ class BucketingSegSampler(HypSampler): def __init__( self, - seg_set, - base_sampler=SegSampler, - num_buckets=10, - length_column="duration", - seed=1234, + seg_set: SegmentSet, + base_sampler: Type[HypSampler] = SegSampler, + num_buckets: int = 10, + length_column: str = "duration", + max_batches_per_epoch: Optional[int] = None, + seed: int = 1234, **base_kwargs ): - super().__init__(shuffle=False, seed=seed) + super().__init__( + max_batches_per_epoch=max_batches_per_epoch, shuffle=False, seed=seed + ) self.seg_set = seg_set self.base_sampler = base_sampler self.base_kwargs = base_kwargs @@ -67,6 +72,9 @@ def _compute_len(self): for i in range(self.num_buckets): self._len += len(self.bucket_samplers[i]) + if self.max_batches_per_epoch is not None: + self._len = min(self._len, self.max_batches_per_epoch) + def set_epoch(self, epoch, batch=0): for i in range(self.num_buckets): self.bucket_samplers[i].set_epoch(epoch, batch) @@ -120,12 +128,4 @@ def avg_batch_size(self): @staticmethod def filter_args(**kwargs): - - valid_args = ( - "num_buckets", - "length_column", - "shuffle", - "seed", - ) - - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return kwargs diff --git a/hyperion/torch/data/class_weighted_embed_sampler.py b/hyperion/torch/data/class_weighted_embed_sampler.py index edf1c00d..264e561c 100644 --- a/hyperion/torch/data/class_weighted_embed_sampler.py +++ b/hyperion/torch/data/class_weighted_embed_sampler.py @@ -6,13 +6,15 @@ import logging import math import time +from typing import Optional import numpy as np import pandas as pd -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from ...utils import ClassInfo +from ...utils.misc import filter_func_args from .hyp_sampler import HypSampler @@ -20,18 +22,21 @@ class ClassWeightedEmbedSampler(HypSampler): def __init__( self, embed_set, - class_info, - batch_size=1, - num_embeds_per_class=1, - weight_exponent=1.0, - weight_mode="custom", - num_hard_prototypes=0, - affinity_matrix=None, - class_name="class_id", - shuffle=False, - seed=1234, + class_info: ClassInfo, + batch_size: int = 1, + num_embeds_per_class: int = 1, + weight_exponent: float = 1.0, + weight_mode: str = "custom", + num_hard_prototypes: int = 0, + affinity_matrix: Optional[torch.Tensor] = None, + class_name: str = "class_id", + max_batches_per_epoch: Optional[int] = None, + shuffle: bool = False, + seed: int = 1234, ): - super().__init__(shuffle=shuffle, seed=seed) + super().__init__( + max_batches_per_epoch=max_batches_per_epoch, shuffle=shuffle, seed=seed + ) self.class_name = class_name self.embed_set = embed_set self.class_info = class_info @@ -70,6 +75,8 @@ def _compute_len(self): self._len = int(
math.ceil(len(self.embed_set) / self.avg_batch_size / self.world_size) ) + if self.max_batches_per_epoch is not None: + self._len = min(self._len, self.max_batches_per_epoch) def __len__(self): return self._len @@ -147,7 +154,9 @@ def _compute_num_classes_per_batch(self): num_classes /= self.num_hard_prototypes self.num_classes_per_batch = int(math.ceil(num_classes)) - def _get_class_weights(self,): + def _get_class_weights( + self, + ): return torch.as_tensor(self.class_info["weights"].values) def _sample_classes(self): @@ -208,19 +217,19 @@ def __next__(self): @staticmethod def filter_args(**kwargs): - - valid_args = ( - "batch_size", - "num_embeds_per_class", - "weight_exponent", - "weight_mode", - "num_hard_prototypes", - "class_name", - "shuffle", - "seed", - ) - - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return filter_func_args(ClassWeightedEmbedSampler.__init__, kwargs) + # valid_args = ( + # "batch_size", + # "num_embeds_per_class", + # "weight_exponent", + # "weight_mode", + # "num_hard_prototypes", + # "class_name", + # "shuffle", + # "seed", + # ) + + # return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -229,7 +238,10 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") parser.add_argument( - "--batch-size", type=int, default=1, help=("batch size per gpu"), + "--batch-size", + type=int, + default=1, + help=("batch size per gpu"), ) parser.add_argument( diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 6ee00307..0bb78901 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -6,42 +6,47 @@ import logging import math import time +from typing import Optional, Union import numpy as np import pandas as pd -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from ...utils import ClassInfo, SegmentSet +from ...utils.misc import filter_func_args from .hyp_sampler import HypSampler class ClassWeightedRandomSegChunkSampler(HypSampler): def __init__( self, - seg_set, - class_info, - min_chunk_length, - max_chunk_length=None, - min_batch_size=1, - max_batch_size=None, - max_batch_length=None, - num_chunks_per_seg_epoch="auto", - num_segs_per_class=1, - num_chunks_per_seg=1, - weight_exponent=1.0, - weight_mode="custom", - seg_weight_mode="uniform", - num_hard_prototypes=0, - affinity_matrix=None, - class_name="class_id", - length_name="duration", - shuffle=False, - iters_per_epoch=None, - batch_size=None, - seed=1234, + seg_set: SegmentSet, + class_info: ClassInfo, + min_chunk_length: int, + max_chunk_length: Optional[int] = None, + min_batch_size: int = 1, + max_batch_size: Optional[int] = None, + max_batch_length: Optional[int] = None, + num_chunks_per_seg_epoch: Union[str, int] = "auto", + num_segs_per_class: int = 1, + num_chunks_per_seg: int = 1, + weight_exponent: float = 1.0, + weight_mode: str = "custom", + seg_weight_mode: str = "uniform", + num_hard_prototypes: int = 0, + affinity_matrix: Optional[torch.Tensor] = None, + class_name: str = "class_id", + length_name: str = "duration", + max_batches_per_epoch: Optional[int] = None, + shuffle: bool = False, + iters_per_epoch: Optional[int] = None, + batch_size: Optional[int] = None, + seed: int = 1234, ):
super().__init__( + max_batches_per_epoch=max_batches_per_epoch, shuffle=shuffle, seed=seed + ) self.class_name = class_name self.length_name = length_name self.seg_set = seg_set @@ -148,6 +153,8 @@ def _compute_len(self): / self.world_size ) ) + if self.max_batches_per_epoch is not None: + self._len = min(self._len, self.max_batches_per_epoch) def __len__(self): return self._len @@ -284,7 +291,10 @@ def _get_class_weights(self, chunk_length): def _sample_classes(self, num_classes, chunk_length): weights = self._get_class_weights(chunk_length) row_idx = torch.multinomial( - weights, num_samples=num_classes, replacement=True, generator=self.rng, + weights, + num_samples=num_classes, + replacement=True, + generator=self.rng, ).numpy() class_ids = self.class_info.iloc[row_idx].id.values @@ -417,29 +427,30 @@ def __next__(self): @staticmethod def filter_args(**kwargs): + return filter_func_args(ClassWeightedRandomSegChunkSampler.__init__, kwargs) + + # valid_args = ( + # "min_chunk_length", + # "max_chunk_length", + # "min_batch_size", + # "max_batch_size", + # "max_batch_length", + # "num_chunks_per_seg_epoch", + # "num_segs_per_class", + # "num_chunks_per_seg", + # "weight_exponent", + # "weight_mode", + # "seg_weight_mode", + # "num_hard_prototypes", + # "class_name", + # "length_name", + # "iters_per_epoch", + # "batch_size", + # "shuffle", + # "seed", + # ) - valid_args = ( - "min_chunk_length", - "max_chunk_length", - "min_batch_size", - "max_batch_size", - "max_batch_length", - "num_chunks_per_seg_epoch", - "num_segs_per_class", - "num_chunks_per_seg", - "weight_exponent", - "weight_mode", - "seg_weight_mode", - "num_hard_prototypes", - "class_name", - "length_name", - "iters_per_epoch", - "batch_size", - "shuffle", - "seed", - ) - - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -545,6 +556,13 @@ def add_class_args(parser, prefix=None): help=("number of hard prototype classes per batch"), ) + parser.add_argument( + "--max-batches-per-epoch", + type=int, + default=None, + help=("Max. 
batches per epoch"), + ) + parser.add_argument( "--shuffle", action=ActionYesNo, diff --git a/hyperion/torch/data/embed_sampler.py b/hyperion/torch/data/embed_sampler.py index 65adcba6..251ba917 100644 --- a/hyperion/torch/data/embed_sampler.py +++ b/hyperion/torch/data/embed_sampler.py @@ -5,20 +5,29 @@ import logging import math +from typing import Optional import numpy as np -from jsonargparse import ActionParser, ActionYesNo, ArgumentParser - import torch +from jsonargparse import ActionParser, ActionYesNo, ArgumentParser +from ...utils.misc import filter_func_args from .hyp_sampler import HypSampler class EmbedSampler(HypSampler): def __init__( - self, embed_set, batch_size=1, shuffle=False, drop_last=False, seed=1234, + self, + embed_set, + batch_size: int = 1, + max_batches_per_epoch: Optional[int] = None, + shuffle: bool = False, + drop_last: bool = False, + seed: int = 1234, ): - super().__init__(shuffle=shuffle, seed=seed) + super().__init__( + max_batches_per_epoch=max_batches_per_epoch, shuffle=shuffle, seed=seed + ) self.embed_set = embed_set self.batch_size = batch_size self.avg_batch_size = batch_size @@ -29,6 +38,9 @@ def __init__( else: self._len = int(math.ceil(num_batches)) + if self.max_batches_per_epoch is not None: + self._len = min(self._len, self.max_batches_per_epoch) + self._permutation = None def __len__(self): @@ -72,15 +84,15 @@ def __next__(self): @staticmethod def filter_args(**kwargs): + return filter_func_args(EmbedSampler.__init__, kwargs) + # valid_args = ( + # "batch_size", + # "shuffle", + # "drop_last", + # "seed", + # ) - valid_args = ( - "batch_size", - "shuffle", - "drop_last", - "seed", - ) - - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + # return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): @@ -89,11 +101,23 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") parser.add_argument( - "--batch-size", type=int, default=1, help=("minimum batch size per gpu"), + "--batch-size", + type=int, + default=1, + help=("minimum batch size per gpu"), + ) + + parser.add_argument( + "--drop-last", + action=ActionYesNo, + help="drops the last batch of the epoch", ) parser.add_argument( - "--drop-last", action=ActionYesNo, help="drops the last batch of the epoch", + "--max-batches-per-epoch", + type=int, + default=None, + help=("Max. 
batches per epoch"), ) parser.add_argument( diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py index f8d0862b..61a922db 100644 --- a/hyperion/torch/data/hyp_sampler.py +++ b/hyperion/torch/data/hyp_sampler.py @@ -1,5 +1,6 @@ import logging import math +from typing import Optional import numpy as np import torch @@ -9,13 +10,19 @@ class HypSampler(Sampler): - def __init__(self, shuffle=False, seed=1234): + def __init__( + self, + max_batches_per_epoch: Optional[int] = None, + shuffle: bool = False, + seed: int = 1234, + ): super().__init__(None) self.epoch = 0 self.batch = 0 self.init_batch = 0 self.shuffle = shuffle self.seed = seed + self.max_batches_per_epoch = max_batches_per_epoch try: rank = dist.get_rank() diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py index da47c8ac..345ec287 100644 --- a/hyperion/torch/data/seg_chunk_sampler.py +++ b/hyperion/torch/data/seg_chunk_sampler.py @@ -5,6 +5,7 @@ import logging import math +from typing import Optional, Type import numpy as np import pandas as pd @@ -12,7 +13,8 @@ import torch.distributed as dist from jsonargparse import ActionParser, ArgumentParser -from ...utils.segment_set import SegmentSet +from ...utils import SegmentSet +from ...utils.misc import filter_func_args from .hyp_sampler import HypSampler from .seg_sampler import SegSampler @@ -20,13 +22,14 @@ class SegChunkSampler(HypSampler): def __init__( self, - seg_set, - min_chunk_length, - max_chunk_length=None, - base_sampler=SegSampler, - length_name="duration", - shuffle=False, - seed=1234, + seg_set: SegmentSet, + min_chunk_length: int, + max_chunk_length: Optional[int] = None, + base_sampler: Type[HypSampler] = SegSampler, + length_name: str = "duration", + max_batches_per_epoch: Optional[int] = None, + shuffle: bool = False, + seed: int = 1234, **base_kwargs, ): super().__init__(shuffle=shuffle, seed=seed) @@ -141,12 +144,17 @@ def __next__(self): @staticmethod def filter_args(**kwargs): - valid_args = ( - "min_chunk_length", - "max_chunk_length", - "length_name", - "shuffle", - "seed", - ) - - return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + valid_args = filter_func_args(SegChunkSampler.__init__, kwargs) + base_args = filter_func_args(SegSampler.__init__, kwargs) + valid_args.update(base_args) + return valid_args + + # valid_args = ( + # "min_chunk_length", + # "max_chunk_length", + # "length_name", + # "shuffle", + # "seed", + # ) + + # return dict((k, kwargs[k]) for k in valid_args if k in kwargs) diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index a280c87e..5d988092 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -5,29 +5,32 @@ import logging import math +from typing import Optional import numpy as np import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser from ...utils.misc import filter_func_args +from ...utils import SegmentSet from .hyp_sampler import HypSampler class SegSampler(HypSampler): def __init__( self, - seg_set, - min_batch_size=1, - max_batch_size=None, - max_batch_length=None, - length_name="duration", - shuffle=False, - drop_last=False, - sort_by_length=True, - seed=1234, + seg_set: SegmentSet, + min_batch_size:int=1, + max_batch_size:Optional[int]=None, + max_batch_length:Optional[int]=None, + length_name:str="duration", + max_batches_per_epoch: Optional[int]=None, + shuffle:bool=False, + drop_last:bool=False, + sort_by_length:bool=True, + 
seed:int=1234, ): - super().__init__(shuffle=shuffle, seed=seed) + super().__init__(max_batches_per_epoch=max_batches_per_epoch,shuffle=shuffle, seed=seed) self.seg_set = seg_set self.min_batch_size = min_batch_size self.max_batch_size = max_batch_size @@ -49,6 +52,9 @@ def __init__( math.ceil((len(self.seg_set) // self.world_size) / avg_batch_size) ) + if self.max_batches_per_epoch is not None: + self._len = min(self._len, self.max_batches_per_epoch) + self._permutation = None def __len__(self): @@ -180,7 +186,16 @@ def add_class_args(parser, prefix=None): ) parser.add_argument( - "--drop-last", action=ActionYesNo, help="drops the last batch of the epoch", + "--drop-last", + action=ActionYesNo, + help="drops the last batch of the epoch", + ) + + parser.add_argument( + "--max-batches-per-epoch", + type=int, + default=None, + help=("Max. batches per epoch"), ) parser.add_argument( diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index bb0df6b6..4d8adcf4 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -744,7 +744,7 @@ def save_model_checkpoint( self, model_name: str, checkpoint: Dict[str, Any], partial: bool = False ): if partial: - file_path = "%s/%s_ep%04d_step%10d.pth" % ( + file_path = "%s/%s_ep%04d_step%010d.pth" % ( self.exp_path, model_name, self.cur_epoch, @@ -784,7 +784,7 @@ def old_save_checkpoint(self, logs=None, partial: bool = False): checkpoint = self.checkpoint(logs) if partial: - file_path = "%s/model_ep%04d_step%08d.pth" % ( + file_path = "%s/model_ep%04d_step%010d.pth" % ( self.exp_path, self.cur_epoch, self.global_step, @@ -1023,7 +1023,7 @@ def add_class_args(parser, prefix=None, train_modes=None, skip=set()): help="use tensorboard logger", ) parser.add_argument( - "--use-wandb", action="store_true", default=False, help="use wandb logger" + "--use-wandb", action=ActionYesNo, default=False, help="use wandb logger" ) parser.add_argument("--wandb.project", default=None, help="wandb project name") parser.add_argument("--wandb.group", default=None, help="wandb group name") From 91407ad32842e9b83ea6463c31aea1f5e21c85f5 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 13 May 2024 19:01:13 -0400 Subject: [PATCH 137/154] fix --- .../data/class_weighted_embed_sampler.py | 2 +- .../data/class_weighted_seg_chunk_sampler.py | 2 +- hyperion/torch/data/embed_sampler_factory.py | 16 ++++++++++-- hyperion/torch/data/seg_sampler.py | 26 ++++++++++--------- hyperion/torch/data/seg_sampler_factory.py | 8 ++++++ 5 files changed, 38 insertions(+), 16 deletions(-) diff --git a/hyperion/torch/data/class_weighted_embed_sampler.py b/hyperion/torch/data/class_weighted_embed_sampler.py index 264e561c..708e12ed 100644 --- a/hyperion/torch/data/class_weighted_embed_sampler.py +++ b/hyperion/torch/data/class_weighted_embed_sampler.py @@ -30,7 +30,7 @@ def __init__( num_hard_prototypes: int = 0, affinity_matrix: Optional[torch.Tensor] = None, class_name: str = "class_id", - max_batches_per_epoch: Optiona[int] = None, + max_batches_per_epoch: Optional[int] = None, shuffle: bool = False, seed: int = 1234, ): diff --git a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py index 0bb78901..7cadfee2 100644 --- a/hyperion/torch/data/class_weighted_seg_chunk_sampler.py +++ b/hyperion/torch/data/class_weighted_seg_chunk_sampler.py @@ -6,7 +6,7 @@ import logging import math import time -from typing import Optional +from typing import 
Optional, Union import numpy as np import pandas as pd diff --git a/hyperion/torch/data/embed_sampler_factory.py b/hyperion/torch/data/embed_sampler_factory.py index aea35ddf..6ead9daf 100644 --- a/hyperion/torch/data/embed_sampler_factory.py +++ b/hyperion/torch/data/embed_sampler_factory.py @@ -2,6 +2,7 @@ Copyright 2022 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import logging from typing import Optional, Union @@ -32,7 +33,7 @@ def create( """Functions that creates a sampler based on a dataset, sampler_type and sampler arguments. Args: - dataset: embeddings dataset object containing the data info + dataset: embeddings dataset object containing the data info sampler_type: string indicating the sampler type. """ @@ -60,6 +61,7 @@ def filter_args(**kwargs): "weight_mode", "num_hard_prototypes", "class_name", + "max_batches_per_epoch", "shuffle", "seed", ) @@ -73,7 +75,10 @@ def add_class_args(parser, prefix=None): parser = ArgumentParser(prog="") parser.add_argument( - "--batch-size", type=int, default=1, help=("batch size per gpu"), + "--batch-size", + type=int, + default=1, + help=("batch size per gpu"), ) parser.add_argument( @@ -102,6 +107,13 @@ def add_class_args(parser, prefix=None): help=("number of hard prototype classes per batch"), ) + parser.add_argument( + "--max-batches-per-epoch", + type=int, + default=None, + help=("Max. batches per epoch"), + ) + parser.add_argument( "--shuffle", action=ActionYesNo, diff --git a/hyperion/torch/data/seg_sampler.py b/hyperion/torch/data/seg_sampler.py index 5d988092..bb3a37ac 100644 --- a/hyperion/torch/data/seg_sampler.py +++ b/hyperion/torch/data/seg_sampler.py @@ -11,8 +11,8 @@ import torch from jsonargparse import ActionParser, ActionYesNo, ArgumentParser -from ...utils.misc import filter_func_args from ...utils import SegmentSet +from ...utils.misc import filter_func_args from .hyp_sampler import HypSampler @@ -20,17 +20,19 @@ class SegSampler(HypSampler): def __init__( self, seg_set: SegmentSet, - min_batch_size:int=1, - max_batch_size:Optional[int]=None, - max_batch_length:Optional[int]=None, - length_name:str="duration", - max_batches_per_epoch: Optional[int]=None, - shuffle:bool=False, - drop_last:bool=False, - sort_by_length:bool=True, - seed:int=1234, + min_batch_size: int = 1, + max_batch_size: Optional[int] = None, + max_batch_length: Optional[int] = None, + length_name: str = "duration", + max_batches_per_epoch: Optional[int] = None, + shuffle: bool = False, + drop_last: bool = False, + sort_by_length: bool = True, + seed: int = 1234, ): - super().__init__(max_batches_per_epoch=max_batches_per_epoch,shuffle=shuffle, seed=seed) + super().__init__( + max_batches_per_epoch=max_batches_per_epoch, shuffle=shuffle, seed=seed + ) self.seg_set = seg_set self.min_batch_size = min_batch_size self.max_batch_size = max_batch_size @@ -191,7 +193,7 @@ def add_class_args(parser, prefix=None): help="drops the last batch of the epoch", ) - parser.add_argument( + parser.add_argument( "--max-batches-per-epoch", type=int, default=None, diff --git a/hyperion/torch/data/seg_sampler_factory.py b/hyperion/torch/data/seg_sampler_factory.py index 8f6501b5..8a37344d 100644 --- a/hyperion/torch/data/seg_sampler_factory.py +++ b/hyperion/torch/data/seg_sampler_factory.py @@ -89,6 +89,7 @@ def filter_args(**kwargs): "length_name", "iters_per_epoch", "batch_size", + "max_batches_per_epoch", "shuffle", "drop_last", "sort_by_length", @@ -222,6 +223,13 @@ def add_class_args(parser, 
prefix=None): help="drops the last batch of the epoch", ) + parser.add_argument( + "--max-batches-per-epoch", + type=int, + default=None, + help=("Max. batches per epoch"), + ) + parser.add_argument( "--shuffle", action=ActionYesNo, From 6a88ee547c6423228b2a36a711a2d9a567d8270d Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 14 May 2024 08:33:01 -0400 Subject: [PATCH 138/154] fix --- ...config_fbank80_stmn_fwseresnet34.v1.2.2.sh | 1 + hyperion/bin/train_wav2rnn_transducer.py | 5 ++- hyperion/torch/data/audio_dataset.py | 39 +++++++++++++++++++ hyperion/torch/data/bucketing_seg_sampler.py | 2 +- 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh index e56d97cc..846e85f5 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh @@ -30,6 +30,7 @@ nnet_s1=$nnet_s1_dir/teacher_model_ep0067.pth nnet_s1=$nnet_s1_dir/teacher_model_ep0071.pth nnet_s1=$nnet_s1_dir/teacher_model_ep0077.pth nnet_s1=$nnet_s1_dir/teacher_model_ep0083.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0088.pth # clustering of dino embeddings cluster_method=cos_ahc_plda_ahc diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index ebd23845..5a5e5717 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -90,7 +90,10 @@ def init_data(partition, rank, num_gpus, **kwargs): {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} ) data_loader = torch.utils.data.DataLoader( - dataset, batch_sampler=sampler, **largs, collate_fn=transducer_collate + dataset, + batch_sampler=sampler, + **largs, + collate_fn=dataset.get_collator(), # collate_fn=transducer_collate ) return data_loader diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 83f314e1..62317d2b 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -12,6 +12,11 @@ import pandas as pd # import k2 +try: + import k2 +except: + from ..utils.dummy_k2 import k2 + import sentencepiece as spm import torch import torch.distributed as dist @@ -374,6 +379,40 @@ def __getitem__(self, segment): data.update(seg_info) return data + @staticmethod + def collate(self, batch): + from torch.nn.utils.rnn import pad_sequence + + audio = [] + audio_length = [] + target = [] + for record in batch: + audio_length.append(record["x"].shape[0]) + audio_length = torch.as_tensor(audio_length) + if not torch.all(audio_length[:-1] >= audio_length[1:]): + sort_idx = torch.argsort(audio_length, descending=True) + batch = [batch[i] for i in sort_idx] + + audio_length = [] + for record in batch: + wav = torch.as_tensor(record["x"]) + audio.append(wav) + audio_length.append(wav.shape[0]) + target.append(record["text"]) + audio = pad_sequence(audio) + + audio_length = torch.as_tensor(audio_length) + target = k2.RaggedTensor(target) + batch = { + "x": torch.transpose(audio, 0, 1), + "x_lengths": audio_length, + "text": target, + } + return batch + + def get_collator(self): + return lambda batch: AudioDataset(self, batch) + @staticmethod def filter_args(**kwargs): args = filter_func_args(AudioDataset.__init__, kwargs) diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index 
e73e7e44..f5db3a22 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -29,7 +29,7 @@ def __init__( **base_kwargs ): super().__init__( - max_batches_per_epoch=max_batches_per_epoch, maxshuffle=False, seed=seed + max_batches_per_epoch=max_batches_per_epoch, shuffle=False, seed=seed ) self.seg_set = seg_set self.base_sampler = base_sampler From f9f539f53705bbca31dc487eacfa87cf828082f8 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 14 May 2024 09:02:46 -0400 Subject: [PATCH 139/154] fix --- hyperion/torch/data/audio_dataset.py | 4 ++-- hyperion/torch/data/bucketing_seg_sampler.py | 2 +- hyperion/torch/data/seg_chunk_sampler.py | 12 +++++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 62317d2b..e6c7b128 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -399,8 +399,8 @@ def collate(self, batch): audio.append(wav) audio_length.append(wav.shape[0]) target.append(record["text"]) - audio = pad_sequence(audio) + audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) batch = { @@ -411,7 +411,7 @@ def collate(self, batch): return batch def get_collator(self): - return lambda batch: AudioDataset(self, batch) + return lambda batch: AudioDataset.collate(self, batch) @staticmethod def filter_args(**kwargs): diff --git a/hyperion/torch/data/bucketing_seg_sampler.py b/hyperion/torch/data/bucketing_seg_sampler.py index f5db3a22..64d2928c 100644 --- a/hyperion/torch/data/bucketing_seg_sampler.py +++ b/hyperion/torch/data/bucketing_seg_sampler.py @@ -33,7 +33,7 @@ def __init__( ) self.seg_set = seg_set self.base_sampler = base_sampler - self.base_kwargs = base_kwargs + self.base_kwargs = base_sampler.filter_args(**base_kwargs) self.base_kwargs["seed"] = seed self.num_buckets = num_buckets self.length_column = length_column diff --git a/hyperion/torch/data/seg_chunk_sampler.py b/hyperion/torch/data/seg_chunk_sampler.py index 345ec287..e6c78775 100644 --- a/hyperion/torch/data/seg_chunk_sampler.py +++ b/hyperion/torch/data/seg_chunk_sampler.py @@ -45,9 +45,10 @@ def __init__( if "subbase_sampler" in base_kwargs: base_kwargs["base_sampler"] = base_kwargs.pop("subbase_sampler") - self.base_kwargs = base_kwargs + self.base_kwargs = base_sampler.filter_args(**base_kwargs) self.base_kwargs["seed"] = seed self.base_kwargs["shuffle"] = shuffle + self.base_kwargs["max_batches_per_epoch"] = max_batches_per_epoch self.__iter__() self.avg_batch_size = self._seg_sampler.avg_batch_size @@ -144,10 +145,11 @@ def __next__(self): @staticmethod def filter_args(**kwargs): - valid_args = filter_func_args(SegChunkSampler.__init__, kwargs) - base_args = filter_func_args(SegSampler.__init__, kwargs) - valid_args.update(base_args) - return valid_args + return kwargs + # valid_args = filter_func_args(SegChunkSampler.__init__, kwargs) + # base_args = filter_func_args(SegSampler.__init__, kwargs) + # valid_args.update(base_args) + # return valid_args # valid_args = ( # "min_chunk_length", From 6a733eeb82a6868f0465dba09c312a75b42f4f73 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 14 May 2024 09:11:34 -0400 Subject: [PATCH 140/154] fix --- hyperion/torch/data/audio_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index e6c7b128..40221fcb 100644 --- 
a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -15,7 +15,7 @@ try: import k2 except: - from ..utils.dummy_k2 import k2 + import ..utils.dummy_k2 as k2 import sentencepiece as spm import torch From cdb1ebbba765faae3ebf5b3f68e08287abafff17 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 14 May 2024 09:14:52 -0400 Subject: [PATCH 141/154] fix --- hyperion/torch/data/audio_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 40221fcb..e0c498e7 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -15,7 +15,7 @@ try: import k2 except: - import ..utils.dummy_k2 as k2 + from ..torch.utils import dummy_k2 as k2 import sentencepiece as spm import torch From 49c50c9161708fa02281b928d1ce07cbbc796d5b Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 14 May 2024 09:20:06 -0400 Subject: [PATCH 142/154] fix --- hyperion/torch/data/audio_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index e0c498e7..905b8533 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -15,7 +15,7 @@ try: import k2 except: - from ..torch.utils import dummy_k2 as k2 + from ..utils import dummy_k2 as k2 import sentencepiece as spm import torch From e03dc8cb0744c2d4a78b4906b38718af20a4ca05 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 15 May 2024 10:43:23 -0400 Subject: [PATCH 143/154] added first tokenizers --- ...mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml | 8 + egs/librispeech/v1/run_004_train_asr.sh | 3 +- ...config_fbank80_stmn_fwseresnet34.v1.2.2.sh | 1 + hyperion/bin/train_tokenizer.py | 6 +- hyperion/bin/train_wav2rnn_transducer.py | 55 +- hyperion/torch/data/audio_dataset.py | 155 ++++- hyperion/torch/data/hyp_sampler.py | 5 + hyperion/torch/models/tvector/__init__.py | 8 - .../torch/models/tvector/resnet_tvector.py | 196 ------ hyperion/torch/models/tvector/tvector.py | 567 ------------------ hyperion/torch/tokenizers/__init__.py | 7 + hyperion/torch/tokenizers/hyp_tokenizer.py | 44 ++ hyperion/torch/tokenizers/sp_tokenizer.py | 93 +++ hyperion/torch/trainers/transducer_trainer.py | 1 - hyperion/torch/utils/__init__.py | 7 +- hyperion/torch/utils/collation.py | 199 ++++-- hyperion/torch/utils/vad_utils.py | 4 +- 17 files changed, 494 insertions(+), 865 deletions(-) delete mode 100644 hyperion/torch/models/tvector/__init__.py delete mode 100644 hyperion/torch/models/tvector/resnet_tvector.py delete mode 100644 hyperion/torch/models/tvector/tvector.py create mode 100644 hyperion/torch/tokenizers/__init__.py create mode 100644 hyperion/torch/tokenizers/hyp_tokenizer.py create mode 100644 hyperion/torch/tokenizers/sp_tokenizer.py diff --git a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml index 683f85ee..baea17ab 100644 --- a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml +++ b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml @@ -4,6 +4,10 @@ data: wav_scale: 1 aug_cfgs: - conf/speed_reverb_noise10-20dB_aug.yaml + tokenizer_mappings: + - text->text + tokenizer_files: + - data/token_librispeech_train-960_unigram_512/tokenizer.yaml return_segment_info: - text sampler: @@ -16,6 +20,10 @@ data: 
val: dataset: wav_scale: 1 + tokenizer_mappings: + - text->text + tokenizer_files: + - data/token_librispeech_train-960_unigram_512/tokenizer.yaml return_segment_info: - text sampler: diff --git a/egs/librispeech/v1/run_004_train_asr.sh b/egs/librispeech/v1/run_004_train_asr.sh index d158689e..33b68ed2 100755 --- a/egs/librispeech/v1/run_004_train_asr.sh +++ b/egs/librispeech/v1/run_004_train_asr.sh @@ -37,11 +37,10 @@ if [ $stage -le 1 ]; then --cfg $nnet_s1_cfg \ --data.train.dataset.recordings-file $train_dir/recordings.csv \ --data.train.dataset.segments-file $train_dir/segments.csv \ - --data.train.dataset.bpe-model $token_model \ --data.val.dataset.recordings-file $val_dir/recordings.csv \ --data.val.dataset.segments-file $val_dir/segments.csv \ --trainer.exp-path $nnet_s1_dir $args \ --num-gpus $ngpu - + #--data.train.dataset.bpe-model $token_model \ fi diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh index 846e85f5..7aebfd69 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh @@ -31,6 +31,7 @@ nnet_s1=$nnet_s1_dir/teacher_model_ep0071.pth nnet_s1=$nnet_s1_dir/teacher_model_ep0077.pth nnet_s1=$nnet_s1_dir/teacher_model_ep0083.pth nnet_s1=$nnet_s1_dir/teacher_model_ep0088.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0094.pth # clustering of dino embeddings cluster_method=cos_ahc_plda_ahc diff --git a/hyperion/bin/train_tokenizer.py b/hyperion/bin/train_tokenizer.py index b3d28923..cd8ab9cf 100644 --- a/hyperion/bin/train_tokenizer.py +++ b/hyperion/bin/train_tokenizer.py @@ -62,6 +62,7 @@ def train_sentencepiece( uppercase_text: bool, tokenizer_path: PathLike, ): + from hyperion.torch.tokenizers import SPTokenizer tokenizer_path = Path(tokenizer_path) tokenizer_path.mkdir(exist_ok=True, parents=True) @@ -96,7 +97,10 @@ def train_sentencepiece( pad_piece=pad_piece, ) - generate_sentencepiece_tokens(model_file, tokenizer_path) + tokenizer = SPTokenizer.load(model_file) + tokenizer.save(model_file.with_suffix(".yaml")) + + # generate_sentencepiece_tokens(model_file, tokenizer_path) def generate_sentencepiece_tokens(model_file: PathLike, tokenizer_path: PathLike): diff --git a/hyperion/bin/train_wav2rnn_transducer.py b/hyperion/bin/train_wav2rnn_transducer.py index 5a5e5717..14fc8db3 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -98,13 +98,27 @@ def init_data(partition, rank, num_gpus, **kwargs): return data_loader -def init_model(blank_id, vocab_size, rank, model_class, **kwargs): +# def init_model(blank_id, vocab_size, rank, model_class, **kwargs): +# model_args = model_class.filter_args(**kwargs["model"]) +# if rank == 0: +# logging.info("model network args={}".format(model_args)) +# # TODO: check model_args +# model_args["transducer"]["decoder"]["blank_id"] = blank_id +# model_args["transducer"]["decoder"]["vocab_size"] = vocab_size +# model = model_class(**model_args) +# if rank == 0: +# logging.info("model={}".format(model)) +# return model + + +def init_model(rank, model_class, tokenizers, **kwargs): model_args = model_class.filter_args(**kwargs["model"]) if rank == 0: logging.info("model network args={}".format(model_args)) - # TODO: check model_args - model_args["transducer"]["decoder"]["blank_id"] = blank_id - model_args["transducer"]["decoder"]["vocab_size"] = vocab_size + + tokenizer = 
list(tokenizers.items())[0][1] + model_args["transducer"]["decoder"]["blank_id"] = tokenizer.blank_id + model_args["transducer"]["decoder"]["vocab_size"] = tokenizer.vocab_size model = model_class(**model_args) if rank == 0: logging.info("model={}".format(model))
@@ -129,9 +143,14 @@ def train_model(gpu_id, args): train_loader = init_data(partition="train", **kwargs) val_loader = init_data(partition="val", **kwargs) + # model = init_model( + # train_loader.dataset.sp.piece_to_id("<blk>"), + # train_loader.dataset.sp.get_piece_size(), + # **kwargs, + # ) + model = init_model( - train_loader.dataset.sp.piece_to_id("<blk>"), - train_loader.dataset.sp.get_piece_size(), + tokenizers=train_loader.dataset.tokenizers, **kwargs, )
@@ -180,22 +199,28 @@ def make_parser(model_class): data_parser.add_argument("--val", action=ActionParser(parser=val_parser)) parser.add_argument("--data", action=ActionParser(parser=data_parser)) - parser.add_argument( - "--data.train.dataset.text_file", - type=str, - ) + # parser.add_argument( + # "--data.train.dataset.text_file", + # type=str, + # ) - parser.add_argument("--data.val.dataset.text_file", type=str) + # parser.add_argument("--data.val.dataset.text_file", type=str) - parser.add_argument( - "--data.train.dataset.bpe_model", - type=str, - ) + # parser.add_argument( + # "--data.train.dataset.bpe_model", + # type=str, + # ) parser.link_arguments( "data.train.data_loader.num_workers", "data.val.data_loader.num_workers" ) + parser.link_arguments( + "data.train.dataset.tokenizer_mappings", "data.val.dataset.tokenizer_mappings" + ) + parser.link_arguments( + "data.train.dataset.tokenizer_files", "data.val.dataset.tokenizer_files" + ) parser.link_arguments("data.train.dataset.bpe_model", "data.val.dataset.bpe_model") model_class.add_class_args(parser, prefix="model")
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index 905b8533..d555a118 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py
@@ -6,6 +6,7 @@ import logging import math import time +from collections import OrderedDict from typing import Dict, List, Optional import numpy as np
@@ -27,11 +28,12 @@ from ...io import RandomAccessAudioReader as AR from ...np.augment import SpeechAugment from ...np.preprocessing import Resampler -from ...utils.class_info import ClassInfo +from ...utils import ClassInfo, SegmentSet from ...utils.misc import filter_func_args -from ...utils.segment_set import SegmentSet from ...utils.text import read_text +from ..tokenizers import HypTokenizer from ..torch_defs import floatstr_torch +from ..utils import collate_seqs_1d, collate_seqs_nd, list_of_dicts_to_list class AudioDataset(Dataset):
@@ -42,9 +44,11 @@ class AudioDataset(Dataset): segments_file: segments manifest file (kaldi .scp or pandas .csv) class_names: list with the names of the types of classes in the datasets, e.g., speaker, language class_files: list of class info files - time_durs_file: (deprecated) segment to duration in secs file, if durations are not in segments_file - bpe_model: bpe model for the text label - text_file: text file with words labels for each utterances + tokenizer_mappings: list mapping the segment_set fields to the tokenizer name + that should be used with them, e.g., text->text-1, + this argument has to be in sync with tokenizer_files. + tokenizer_files: list of tokenizer configuration files + this argument has to be in sync with tokenizer_mappings. aug_cfgs: list of augmentation configuration files num_augs: number of augmentations per segment and augmentation type num_aug_mix: number of AugMix augmentations per segment
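The two new arguments are parallel lists: each "field->tokenizer" mapping routes one segment-set column through the tokenizer loaded from the matching config file. A minimal usage sketch, assuming AudioDataset is exported from hyperion.torch.data; the manifest paths are illustrative and the tokenizer file is the one from the librispeech config above:

from hyperion.torch.data import AudioDataset

dataset = AudioDataset(
    recordings_file="data/train/recordings.csv",
    segments_file="data/train/segments.csv",
    # route the "text" column through the tokenizer named "text"
    tokenizer_mappings=["text->text"],
    tokenizer_files=["data/token_librispeech_train-960_unigram_512/tokenizer.yaml"],
    return_segment_info=["text"],
)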
@@ -55,6 +59,9 @@ class AudioDataset(Dataset): wav_scale: make waves to be in [-wav_scale, wav_scale] is_val: is validation dataset. seed: random seed + time_durs_file: (deprecated) segment to duration in secs file, if durations are not in segments_file + text_file: (deprecated) text file with word labels for each utterance. + bpe_model: (deprecated) bpe model for the text label. """ def __init__(
@@ -63,9 +70,8 @@ def __init__( recordings_file: str, segments_file: str, class_names: Optional[List[str]] = None, class_files: Optional[List[str]] = None, - bpe_model: Optional[str] = None, - text_file: Optional[str] = None, - time_durs_file: Optional[str] = None, + tokenizer_mappings: Optional[List[str]] = None, + tokenizer_files: Optional[List[str]] = None, aug_cfgs: Optional[List[str]] = None, num_augs: int = 1, num_aug_mix: int = 0,
@@ -76,6 +82,9 @@ def __init__( wav_scale: float = 1, is_val: bool = False, seed: int = 112358, + time_durs_file: Optional[str] = None, + text_file: Optional[str] = None, + bpe_model: Optional[str] = None, ): super().__init__() try:
@@ -110,6 +119,9 @@ def __init__( logging.info("loading class-info files") self._load_class_infos(class_names, class_files, is_val) + logging.info("loading tokenizers") + self._load_tokenizers(tokenizer_mappings, tokenizer_files) + if bpe_model is not None: logging.info("loading bpe models") self._load_bpe_model(bpe_model, is_val)
@@ -161,7 +173,7 @@ def _load_text_infos(self, text_file, is_val): self.seg_set["text"] = text.loc[self.seg_set["id"]].text def _load_class_infos(self, class_names, class_files, is_val): - self.class_info = {} + self.class_info = OrderedDict() if class_names is None: assert class_files is None return
@@ -185,6 +197,27 @@ def _load_class_infos(self, class_names, class_files, is_val): "%s class: %s not present in dataset", name, c_id ) + def _load_tokenizers(self, tokenizer_mappings, tokenizer_files): + self.tokenizers = OrderedDict() + self.tokenizers_to_infos = OrderedDict() + if tokenizer_mappings is None: + assert tokenizer_files is None + return + + assert len(tokenizer_mappings) == len(tokenizer_files) + tokenizer_names = [] + for mapping in tokenizer_mappings: + info_name, tokenizer_name = mapping.split("->", maxsplit=1) + self.tokenizers_to_infos[tokenizer_name] = info_name + tokenizer_names.append(tokenizer_name) + + for name, file in zip(tokenizer_names, tokenizer_files): + assert self.tokenizers_to_infos[name] in self.seg_set, f"field {self.tokenizers_to_infos[name]} not present in the segment set" + if self.rank == 0: + logging.info("loading tokenizer file %s", file) + tokenizer = HypTokenizer.auto_load(file) + self.tokenizers[name] = tokenizer + def _create_augmenters(self, aug_cfgs): self.augmenters = [] self.reverb_context = 0
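A minimal sketch of the mapping resolution that _load_tokenizers performs, reusing the text->text-1 example from the docstring (the tokenizer file path is a placeholder):

mappings = ["text->text-1"]
files = ["exp/tokenizers/text-1/tokenizer.yaml"]
tokenizers_to_infos = {}
for mapping in mappings:
    info_name, tokenizer_name = mapping.split("->", maxsplit=1)
    tokenizers_to_infos[tokenizer_name] = info_name
# tokenizers_to_infos == {"text-1": "text"}: requesting "text-1" in
# return_segment_info reads the "text" column and encodes it with that tokenizer.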
@@ -244,9 +277,6 @@ def _parse_segment_item(self, segment): else: seg_id, start, duration = segment, 0, 0 - # if "start" in self.seg_set: - # start += self.seg_set.loc[seg_id].start - return seg_id, start, duration def _read_audio(self, seg_id, start, duration):
@@ -260,18 +290,6 @@ def _read_audio(self, seg_id, start, duration): x, fs = self.r.read([seg_id], time_offset=start, time_durs=read_duration) return x[0].astype(floatstr_torch(), copy=False), fs[0] - # def _read_audio0(self, seg_id, start, duration): - # # how much extra audio we need to load to - # # calculate the reverb of the first part of the audio - # reverb_context = min(self.reverb_context, start) - # start -= reverb_context - # read_duration = duration + reverb_context - - # # read audio - # recording_id = self.seg_set.recording_ids(seg_id) - # x, fs = self.r.read([recording_id], time_offset=start, time_durs=read_duration) - # return x[0].astype(floatstr_torch(), copy=False), fs[0] def _apply_aug_mix(self, x, x_augs, aug_idx): x_aug_mix = {} alpha_d = (self.aug_mix_alpha,) * len(x_augs)
@@ -328,6 +346,11 @@ def _get_segment_info(self, seg_id): seg_info = {} # converts the class_ids to integers for info_name in self.return_segment_info: + tokenizer_name = "" + if info_name in self.tokenizers_to_infos: + tokenizer_name = info_name + info_name = self.tokenizers_to_infos[tokenizer_name] + seg_info_i = self.seg_set.loc[seg_id, info_name] if info_name in self.class_info: # if the type of information is a class-id # convert from id to integer class_info = self.class_info[info_name] seg_info_i = class_info.loc[seg_info_i, "class_idx"] - - if info_name == "text": + elif tokenizer_name in self.tokenizers: + seg_info_i = self.tokenizers[tokenizer_name].encode(seg_info_i) + elif info_name == "text": seg_info_i = self.sp.encode(seg_info_i, out_type=int) seg_info[info_name] = seg_info_i
@@ -381,6 +405,66 @@ def __getitem__(self, segment): data.update(seg_info) return data + @staticmethod + def collate(self, batch): + + # sort batch by the length of x + audio_lengths = [] + for record in batch: + audio_lengths.append(record["x"].shape[0]) + audio_lengths = torch.as_tensor(audio_lengths) + if not torch.all(audio_lengths[:-1] >= audio_lengths[1:]): + sort_idx = torch.argsort(audio_lengths, descending=True) + batch = [batch[i] for i in sort_idx] + + del audio_lengths + + def _is_list_of_tensors(x): + return isinstance(x[0], (torch.Tensor, np.ndarray)) + + def _is_list_of_items(x): + return isinstance(x[0], (int, float)) + + def _is_list_of_strs(x): + return isinstance(x[0], str) + + def _is_list_of_strlists(x): + return isinstance(x[0], list) and isinstance(x[0][0], str) + + def _is_list_of_intlists(x): + return isinstance(x[0], list) and isinstance(x[0][0], int) + + output_batch = {} + batch_keys = batch[0].keys() + for key in batch_keys: + item_list = list_of_dicts_to_list(batch, key) + if key == "id": + # these are the segment ids + output_batch[key] = item_list + elif (key == "x" or key[:2] == "x_") and _is_list_of_tensors(item_list): + # these are input audios + data, data_lengths = collate_seqs_1d(item_list) + output_batch[key] = data + output_batch[f"{key}_lengths"] = data_lengths + elif _is_list_of_items(item_list): + # these should be things like class ids + output_batch[key] = torch.as_tensor(item_list) + elif _is_list_of_tensors(item_list): + # other tensor data + data, data_lengths = collate_seqs_nd(item_list) + output_batch[key] = data + output_batch[f"{key}_lengths"] = data_lengths + elif _is_list_of_intlists(item_list): + # we assume k2 ragged tensor for now + output_batch[key] = k2.RaggedTensor(item_list) + elif _is_list_of_strs(item_list): + # we just leave them as they are + output_batch[key] = item_list + else: + raise TypeError(f"we don't know how to collate this data={item_list}") + + return output_batch
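collate_seqs_1d and collate_seqs_nd come from hyperion.torch.utils (extended by this patch series); their implementation is not shown here, so the following is a self-contained sketch of the 1-d contract that the collate method above relies on:

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_seqs_1d_sketch(seqs):
    # pad variable-length 1-d tensors into (batch, max_len) and keep the lengths
    lengths = torch.as_tensor([s.shape[0] for s in seqs])
    return pad_sequence(seqs, batch_first=True), lengths

x, x_lengths = collate_seqs_1d_sketch([torch.randn(16000), torch.randn(8000)])
# x.shape == torch.Size([2, 16000]); x_lengths == tensor([16000, 8000])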
+ + @staticmethod + def collate_old(self, batch): from torch.nn.utils.rnn import pad_sequence audio = [] audio_length = [] target = [] for record in batch: audio_length.append(record["x"].shape[0]) audio_length = torch.as_tensor(audio_length) if not torch.all(audio_length[:-1] >= audio_length[1:]): sort_idx = torch.argsort(audio_length, descending=True) batch = [batch[i] for i in sort_idx] audio_length = [] for record in batch: wav = torch.as_tensor(record["x"]) audio.append(wav) audio_length.append(wav.shape[0]) target.append(record["text"]) audio = pad_sequence(audio) audio_length = torch.as_tensor(audio_length) target = k2.RaggedTensor(target) batch = { "x": torch.transpose(audio, 0, 1), "x_lengths": audio_length, "text": target, } return batch def get_collator(self): return lambda batch: AudioDataset.collate(self, batch) @staticmethod def filter_args(**kwargs): args = filter_func_args(AudioDataset.__init__, kwargs)
@@ -454,6 +538,25 @@ def add_class_args(parser, prefix=None, skip=set()): help="list of class info files", ) + parser.add_argument( + "--tokenizer-mappings", + default=None, + nargs="+", + help="""list mapping the segment_set fields to the tokenizer name + that should be used with them, e.g., text->text-1, + this argument has to be in sync with tokenizer_files. + """, + ) + + parser.add_argument( + "--tokenizer-files", + default=None, + nargs="+", + help="""list of tokenizer configuration files + this argument has to be in sync with tokenizer_mappings. + """, + ) + parser.add_argument( "--time-durs-file", default=None,
diff --git a/hyperion/torch/data/hyp_sampler.py b/hyperion/torch/data/hyp_sampler.py index 61a922db..30010f5e 100644 --- a/hyperion/torch/data/hyp_sampler.py +++ b/hyperion/torch/data/hyp_sampler.py
@@ -1,3 +1,8 @@ +""" + Copyright 2023 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + import logging import math from typing import Optional
diff --git a/hyperion/torch/models/tvector/__init__.py b/hyperion/torch/models/tvector/__init__.py deleted file mode 100644 index 36999146..00000000 --- a/hyperion/torch/models/tvector/__init__.py +++ /dev/null
@@ -1,8 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -from .resnet_tvector import ResNetTVector -# t-vectors -from .tvector import TVector
diff --git a/hyperion/torch/models/tvector/resnet_tvector.py b/hyperion/torch/models/tvector/resnet_tvector.py deleted file mode 100644 index c84a38fc..00000000 --- a/hyperion/torch/models/tvector/resnet_tvector.py +++ /dev/null
@@ -1,196 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import logging -from argparse import Namespace - -import torch -import torch.nn as nn - -from ..narchs import ResNetFactory as RNF -from .xvector import XVector - - -class ResNetXVector(XVector): - def __init__( - self, - in_feats, - num_classes, - resnet_cfg=Namespace( - resnet_type="resnet34", - in_channels=1, - conv_channels=64, - base_channels=64, - in_kernel_size=7, - in_stride=1, - zero_init_residual=False, - groups=1, - replace_stride_with_dilation=None, - do_maxpool=False, - hid_act={"name": "relu", "inplace": True}, - dropout_rate=0, - norm_layer=None, - use_norm=True, - norm_before=True, - in_norm=False, - se_r=16, - res2net_scale=4, - res2net_width_factor=1, - ), - conformer_cfg=Namespace( - d_model=256, - num_heads=4, - num_blocks=6, - attype="scaled-dot-prod-v1", - atcontext=25, - conv_repeats=1, - conv_kernel_sizes=31, - conv_strides=1, - ff_type="linear", - d_ff=2048, - ff_kernel_size=1, - dropourate=0.1, - pos_dropourate=0.1, - att_dropout_rate=0.0, - in_layer_type="conv2d-sub", - rel_pos_enc=True, - causal_pos_enc=False, - no_pos_enc=False, - hid_act="swish", - conv_norm_layer=None, - se_r=None, - ff_macaron=True, - red_lnorms=False, - concat_after=False, - ), - pool_net="mean+stddev", - head_cfg=Namespace( - embed_dim=256, - num_embed_layers=1, - head_hid_act={"name": "relu", "inplace": True}, - loss_type="arc-softmax", - s=64, - margin=0.3, - margin_warmup_epochs=0, - num_subcenters=2, - norm_layer=None, - use_norm=True, - norm_before=True, - dropout_rate=0, - embed_layer=0, - ), - ): - - logging.info("making %s encoder network" % (resnet_type)) - if isinstance(resnet_cfg, Namespace): - resnet_cfg = var(resnet_cfg) - - self.resnet_type = resnet_cfg["resnet_type"] - encoder_net = RNF.create(**resnet_cfg) - - super().__init__( - encoder_net, - num_classes, - conformer_cfg=conformer_cfg, - pool_net=pool_net, - head_cfg=head_cfg, - in_feats=in_feats, - proj_feats=None, - ) - - @property - def in_channels(self): - 
return self.encoder_net.in_channels - - @property - def conv_channels(self): - return self.encoder_net.conv_channels - - @property - def base_channels(self): - return self.encoder_net.base_channels - - @property - def in_kernel_size(self): - return self.encoder_net.in_kernel_size - - @property - def in_stride(self): - return self.encoder_net.in_stride - - @property - def zero_init_residual(self): - return self.encoder_net.zero_init_residual - - @property - def groups(self): - return self.encoder_net.groups - - @property - def replace_stride_with_dilation(self): - return self.encoder_net.replace_stride_with_dilation - - @property - def do_maxpool(self): - return self.encoder_net.do_maxpool - - @property - def in_norm(self): - return self.encoder_net.in_norm - - @property - def se_r(self): - return self.encoder_net.se_r - - @property - def res2net_scale(self): - return self.encoder_net.res2net_scale - - @property - def res2net_width_factor(self): - return self.encoder_net.res2net_width_factor - - def get_config(self): - - base_config = super().get_config() - del base_config["encoder_cfg"] - enc_cfg = self.encoder_net.get_config() - del enc_cfg["block"] - del enc_cfg["out_units"] - del enc_cfg["out_act"] - enc_cfg["resnet_type"] = self.resnet_type - - base_config["resnet_cfg"] = enc_cfg - - return base_config - - @classmethod - def load(cls, file_path=None, cfg=None, state_dict=None): - - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - - model = cls(**cfg) - if state_dict is not None: - model.load_state_dict(state_dict) - - return model - - def filter_args(prefix=None, **kwargs): - - base_args = XVector.filter_args(prefix, **kwargs) - child_args = RNF.filter_args(prefix, **kwargs) - - base_args.update(child_args) - return base_args - - @staticmethod - def add_argparse_args(parser, prefix=None): - - XVector.add_argparse_args(parser, prefix) - if prefix is None: - prefix = "resnet" - else: - prefix = prefix + "-resnet" - RNF.add_argparse_args(parser, prefix) diff --git a/hyperion/torch/models/tvector/tvector.py b/hyperion/torch/models/tvector/tvector.py deleted file mode 100644 index a46fc324..00000000 --- a/hyperion/torch/models/tvector/tvector.py +++ /dev/null @@ -1,567 +0,0 @@ -""" - Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) - Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -""" - -import logging - -from jsonargparse import ActionParser, ArgumentParser - -import torch -import torch.nn as nn - -from ...narchs import ClassifHead, ConformerEncoderV1, TorchNALoader -from ..layer_blocks import TDNNBlock -from ..layers import GlobalPool1dFactory as PF -from ..torch_model import TorchModel -from ..utils import eval_nnet_by_chunks - - -class TXVector(TorchModel): - """x-Vector base class""" - - def __init__( - self, - encoder_net, - num_classes, - conformer_net={}, - pool_net="mean+stddev", - classif_net={}, - in_feats=None, - ): - - super().__init__() - - # encoder network - self.encoder_net = encoder_net - - # infer input and output shapes of encoder network - in_shape = self.encoder_net.in_shape() - if len(in_shape) == 3: - # encoder based on 1d conv or transformer - in_feats = in_shape[1] - out_shape = self.encoder_net.out_shape(in_shape) - enc_feats = out_shape[1] - elif len(in_shape) == 4: - # encoder based in 2d convs - assert ( - in_feats is not None - ), "in_feats dimension must be given to calculate pooling dimension" - in_shape = list(in_shape) - in_shape[2] = in_feats - out_shape = self.encoder_net.out_shape(tuple(in_shape)) - 
enc_feats = out_shape[1] * out_shape[2] - - self.in_feats = in_feats - - logging.info("encoder input shape={}".format(in_shape)) - logging.info("encoder output shape={}".format(out_shape)) - - # create conformer net - if isinstance(conformer_net, nn.Module): - self.conformer_net = conformer_net - else: - logging.info("making conformer net") - conformer_net["in_layer_type"] = "linear" - self.conformer_net = ConformerEncoderV1( - enc_feats, in_time_dim=1, out_time_dim=1, **conformer_net - ) - - d_model = self.conformer_net.d_model - self.pool_net = self._make_pool_net(pool_cfg, d_model) - pool_feats = int(d_model * self.pool_net.size_multiplier) - logging.info("infer pooling dimension %d", pool_feats) - - # create classification head - if isinstance(classif_net, nn.Module): - self.classif_net = classif_net - else: - logging.info("making classification head net") - self.classif_net = ClassifHead(pool_feats, num_classes, **head_cfg) - - @property - def pool_feats(self): - return self.classif_net.in_feats - - @property - def num_classes(self): - return self.classif_net.num_classes - - @property - def embed_dim(self): - return self.classif_net.embed_dim - - @property - def num_embed_layers(self): - return self.classif_net.num_embed_layers - - @property - def s(self): - return self.classif_net.s - - @property - def margin(self): - return self.classif_net.margin - - @property - def margin_warmup_epochs(self): - return self.classif_net.margin_warmup_epochs - - @property - def num_subcenters(self): - return self.classif_net.num_subcenters - - @property - def loss_type(self): - return self.classif_net.loss_type - - def _make_pool_net(self, pool_net, enc_feats=None): - """Makes the pooling block - - Args: - pool_net: str or dict to pass to the pooling factory create function - enc_feats: dimension of the features coming from the encoder - - Returns: - GlobalPool1d object - """ - if isinstance(pool_net, str): - pool_net = {"pool_type": pool_net} - - if isinstance(pool_net, dict): - if enc_feats is not None: - pool_net["in_feats"] = enc_feats - - return PF.create(**pool_net) - elif isinstance(pool_net, nn.Module): - return pool_net - else: - raise Exception("Invalid pool_net argument") - - def update_loss_margin(self, epoch): - """Updates the value of the margin in AAM/AM-softmax losses - given the epoch number - - Args: - epoch: epoch which is about to start - """ - self.classif_net.update_margin(epoch) - - def _pre_enc(self, x): - if self.encoder_net.in_dim() == 4 and x.dim() == 3: - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - return x - - def _post_enc(self, x): - if self.encoder_net.out_dim() == 4: - x = x.view(x.size(0), -1, x.size(-1)) - - if self.proj is not None: - x = self.proj(x) - - return x - - def forward( - self, - x, - y=None, - enc_layers=None, - classif_layers=None, - return_output=True, - use_amp=False, - ): - if enc_layers is None and classif_layers is None: - return self.forward_output(x, y) - - h = self.forward_hid_feats(x, y, enc_layers, classif_layers, return_output) - output = {} - if enc_layers is not None: - if classif_layers is None: - output["h_enc"] = h - else: - output["h_enc"] = h[0] - else: - output["h_enc"] = [] - if classif_layers is not None: - output["h_classif"] = h[1] - else: - output["h_classif"] = [] - if return_output: - output["output"] = h[2] - return output - - def forward_output(self, x, y=None): - """Forward function - - Args: - x: input features tensor with shape=(batch, in_feats, time) - y: target classes torch.long tensor with shape=(batch,) - - 
Returns: - class posteriors tensor with shape=(batch, num_classes) - """ - if self.encoder_net.in_dim() == 4 and x.dim() == 3: - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - - x = self.encoder_net(x) - x = self.conformer_net(x) - - if self.encoder_net.out_dim() == 4: - x = x.view(x.size(0), -1, x.size(-1)) - - p = self.pool_net(x) - y = self.classif_net(p, y) - return y - - def forward_hid_feats( - self, - x, - y=None, - enc_layers=None, - conf_layers=None, - classif_layers=None, - return_output=False, - ): - """forwards hidden representations in the x-vector network""" - - if self.encoder_net.in_dim() == 4 and x.dim() == 3: - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - - h_enc, x = self.encoder_net.forward_hid_feats(x, enc_layers, return_output=True) - - h_conf, x = self.conformer_net.forward_hid_feats( - x, conf_layers, return_output=True - ) - - if not return_output and classif_layers is None: - return h_enc - - if self.encoder_net.out_dim() == 4: - x = x.view(x.size(0), -1, x.size(-1)) - - if self.proj is not None: - x = self.proj(x) - - p = self.pool_net(x) - h_classif = self.classif_net.forward_hid_feats( - p, y, classif_layers, return_output=return_output - ) - if return_output: - h_classif, y = h_classif - return h_enc, h_classif, y - - return h_enc, h_classif - - def extract_embed(self, x, chunk_length=0, embed_layer=None, detach_chunks=False): - if embed_layer is None: - embed_layer = self.embed_layer - - x = self._pre_enc(x) - # if self.encoder_net.in_dim() == 4 and x.dim() == 3: - # x = x.view(x.size(0), 1, x.size(1), x.size(2)) - x = eval_nnet_by_chunks( - x, self.encoder_net, chunk_length, detach_chunks=detach_chunks - ) - - if x.device != self.device: - x = x.to(self.device) - - x = self._post_enc(x) - - # if self.encoder_net.out_dim() == 4: - # x = x.view(x.size(0), -1, x.size(-1)) - - # if self.proj is not None: - # x = self.proj(x) - - p = self.pool_net(x) - y = self.classif_net.extract_embed(p, embed_layer) - return y - - def extract_embed_slidwin( - self, - x, - win_length, - win_shift, - snip_edges=False, - feat_frame_length=None, - feat_frame_shift=None, - chunk_length=0, - embed_layer=None, - detach_chunks=False, - ): - - if feat_frame_shift is not None: - # assume win_length/shift are in secs, transform to frames - # pass feat times from msecs to secs - feat_frame_shift = feat_frame_shift / 1000 - feat_frame_length = feat_frame_length / 1000 - - # get length and shift in number of feature frames - win_shift = win_shift / feat_frame_shift # this can be a float - win_length = ( - win_length - feat_frame_length + feat_frame_shift - ) / feat_frame_shift - assert win_shift > 0.5, "win-length should be longer than feat-frame-length" - - if embed_layer is None: - embed_layer = self.embed_layer - - in_time = x.size(-1) - x = self._pre_enc(x) - x = eval_nnet_by_chunks( - x, self.encoder_net, chunk_length, detach_chunks=detach_chunks - ) - - if x.device != self.device: - x = x.to(self.device) - - x = self._post_enc(x) - pin_time = x.size(-1) # time dim before pooling - downsample_factor = float(pin_time) / in_time - p = self.pool_net.forward_slidwin( - x, - downsample_factor * win_length, - downsample_factor * win_shift, - snip_edges=snip_edges, - ) - # (batch, pool_dim, time) - - p = p.transpose(1, 2).contiguous().view(-1, p.size(1)) - y = ( - self.classif_net.extract_embed(p, embed_layer) - .view(x.size(0), -1, self.embed_dim) - .transpose(1, 2) - .contiguous() - ) - - return y - - def compute_slidwin_timestamps( - self, - num_windows, - win_length, - win_shift, - 
snip_edges=False, - feat_frame_length=25, - feat_frame_shift=10, - feat_snip_edges=False, - ): - - P = self.compute_slidwin_left_padding( - win_length, - win_shift, - snip_edges, - feat_frame_length, - feat_frame_shift, - feat_snip_edges, - ) - - tstamps = ( - torch.as_tensor( - [ - [i * win_shift, i * win_shift + win_length] - for i in range(num_windows) - ] - ) - - P - ) - tstamps[tstamps < 0] = 0 - return tstamps - - def compute_slidwin_left_padding( - self, - win_length, - win_shift, - snip_edges=False, - feat_frame_length=25, - feat_frame_shift=10, - feat_snip_edges=False, - ): - - # pass feat times from msecs to secs - feat_frame_shift = feat_frame_shift / 1000 - feat_frame_length = feat_frame_length / 1000 - - # get length and shift in number of feature frames - H = win_shift / feat_frame_shift - L = (win_length - feat_frame_length + feat_frame_shift) / feat_frame_shift - assert L > 0.5, "win-length should be longer than feat-frame-length" - - # compute left padding in case of snip_edges is False - if snip_edges: - P1 = 0 - else: - Q = ( - L - H - ) / 2 # left padding in frames introduced by x-vector sliding window - P1 = ( - Q * feat_frame_shift - ) # left padding in secs introduced by x-vector sliding window - - if feat_snip_edges: - # left padding introduced when computing acoustic feats - P2 = 0 - else: - P2 = (feat_frame_length - feat_frame_shift) / 2 - - # total left padding - return P1 + P2 - - def get_config(self): - - enc_cfg = self.encoder_net.get_config() - pool_cfg = PF.get_config(self.pool_net) - conformer_cfg = self.conformer_net.get_config() - classif_cfg = self.classif_net.get_config() - - config = { - "encoder_cfg": enc_cfg, - "num_classes": self.num_classes, - "conformer_net": self.conformer_cfg, - "pool_net": pool_cfg, - "classif_net": self.classif_cfg, - "in_feats": self.in_feats, - } - - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - encoder_net = TorchNALoader.load_from_cfg(cfg=cfg["encoder_cfg"]) - - for k in "encoder_cfg": - del cfg[k] - - model = cls(encoder_net, **cfg) - if state_dict is not None: - model.load_state_dict(state_dict) - - return model - - def rebuild_output_layer( - self, - num_classes=None, - loss_type="arc-softmax", - s=64, - margin=0.3, - margin_warmup_epochs=10, - ): - if (self.num_classes is not None and self.num_classes != num_classes) or ( - self.loss_type != loss_type - ): - # if we change the number of classes or the loss-type - # we need to reinitiate the last layer - self.classif_net.rebuild_output_layer( - num_classes, loss_type, s, margin, margin_warmup_epochs - ) - return - - # otherwise we just change the values of s, margin and margin_warmup - self.classif_net.set_margin(margin) - self.classif_net.set_margin_warmup_epochs(margin_warmup_epochs) - self.classif_net.set_s(s) - - def freeze_preembed_layers(self): - self.encoder_net.freeze() - if self.proj is not None: - self.proj.freeze() - - for param in self.pool_net.parameters(): - param.requires_grad = False - - layer_list = [l for l in range(self.embed_layer)] - self.classif_net.freeze_layers(layer_list) - - def train_mode(self, mode="ft-embed-affine"): - if mode == "ft-full" or mode == "train": - self.train() - return - - self.encoder_net.eval() - self.conformer_net.eval() - self.pool_net.eval() - self.classif_net.train() - layer_list = [l for l in range(self.embed_layer)] - 
self.classif_net.put_layers_in_eval_mode(layer_list) - - @staticmethod - def filter_args(**kwargs): - - valid_args = ("num_classes", "in_feats") - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - # get arguments for conformer - conformer_args = ConformerEncoderV1.filter_args(**kwargs["conformer_net"]) - args["corformer_net"] = conformer_args - # get arguments for pooling - pool_args = PF.filter_args(**kwargs["pool_net"]) - args["pool_net"] = pool_args - # get arguments for classif head - classif_args = ClassifHead.filter_args(**kwargs["classif_net"]) - args["classif_net"] = classif_args - - return args - - @staticmethod - def add_class_args(parser, prefix=None): - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - CoformerEncoderV1.add_class_args(parser, prefix="conformer_net") - PF.add_class_args( - parser, prefix="pool_net", skip=["dim", "in_feats", "keepdim"] - ) - ClassifHead.add_class_args(parser, prefix="classif_net") - if prefix is not None: - outer_parser.add_argument( - "--" + prefix, - action=ActionParser(parser=parser), - help="xvector options", - ) - - @staticmethod - def filter_finetune_args(**kwargs): - valid_args = ("loss_type", "s", "margin", "margin_warmup_epochs") - args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - return args - - @staticmethod - def add_finetune_args(parser, prefix=None): - if prefix is not None: - outer_parser = parser - parser = ArgumentParser(prog="") - - parser.add_argument( - "--loss-type", - default="arc-softmax", - choices=["softmax", "arc-softmax", "cos-softmax", "subcenter-arc-softmax"], - help="loss type: softmax, arc-softmax, cos-softmax, subcenter-arc-softmax", - ) - - parser.add_argument("--s", default=64, type=float, help="scale for arcface") - - parser.add_argument( - "--margin", default=0.3, type=float, help="margin for arcface, cosface,..." 
- ) - parser.add_argument( - "--margin-warmup-epochs", - default=10, - type=float, - help="number of epoch until we set the final margin", - ) - - parser.add_argument( - "--num-subcenters", - default=2, - type=float, - help="number of subcenters in subcenter losses", - ) - - if prefix is not None: - outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/tokenizers/__init__.py b/hyperion/torch/tokenizers/__init__.py new file mode 100644 index 00000000..42afcaf1 --- /dev/null +++ b/hyperion/torch/tokenizers/__init__.py @@ -0,0 +1,7 @@ +""" + Copyright 2024 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from .hyp_tokenizer import HypTokenizer +from .sp_tokenizer import SPTokenizer diff --git a/hyperion/torch/tokenizers/hyp_tokenizer.py b/hyperion/torch/tokenizers/hyp_tokenizer.py new file mode 100644 index 00000000..0d6e9efb --- /dev/null +++ b/hyperion/torch/tokenizers/hyp_tokenizer.py @@ -0,0 +1,44 @@ +""" + Copyright 2024 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from pathlib import Path + +import yaml + +from ...utils.misc import PathLike + + +class HypTokenizer: + """Base class for tokenizers in Hyperion""" + + registry = {} + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + HypTokenizer.registry[cls.__name__] = cls + + def normalize(self, text): + return text + + def encode(self, x): + pass + + def decode(self, x): + pass + + @staticmethod + def auto_load(file_path: PathLike): + file_path = Path(file_path) + with open(file_path, "r") as f: + cfg = yaml.safe_load(f) + + class_name = cfg["class_name"] + del cfg["class_name"] + if class_name in HypTokenizer.registry: + class_obj = HypTokenizer.registry[class_name] + else: + raise Exception("unknown object with class_name=%s" % (class_name)) + + return class_obj.load(file_path) diff --git a/hyperion/torch/tokenizers/sp_tokenizer.py b/hyperion/torch/tokenizers/sp_tokenizer.py new file mode 100644 index 00000000..c3fa35f9 --- /dev/null +++ b/hyperion/torch/tokenizers/sp_tokenizer.py @@ -0,0 +1,93 @@ +""" + Copyright 2024 Johns Hopkins University (Author: Jesus Villalba) + Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +""" + +from pathlib import Path + +from typing import Dict + +import sentencepiece as spm +import yaml + +from ...utils.misc import PathLike +from .hyp_tokenizer import HypTokenizer + + +class SPTokenizer(HypTokenizer): + """Sentence Piece Tokenizer""" + + def __init__( + self, sp_model: spm.SentencePieceProcessor, uppercase_text: bool = True + ): + super().__init__() + self.sp_model = sp_model + self.uppercase_text = uppercase_text + self.blank_id = self.sp_model.piece_to_id("") + self.vocab_size = self.sp_model.get_piece_size() + self._token2id = None + + @property + def token2id(self): + if self._token2id is not None: + return self._token2id + + token2id: Dict[str, int] = { + self.sp_model.id_to_piece(i): i for i in range(self.sp_model.vocab_size()) + } + self._token2id = token2id + return token2id + + def normalize(self, text): + if self.uppercase_text: + text = text.upper() + return text + + def encode(self, text): + return self.sp_model.encode(text, out_type=int) + + def decode(self, tokens): + return self.sp_model.decode(tokens) + + def save(self, file_path: PathLike, sp_model_prefix: str = "tokenizer"): + file_path = Path(file_path) + if file_path.suffix != ".yaml": + output_dir = file_path
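The new `HypTokenizer` base class uses `__init_subclass__` as a class registry, so any subclass becomes discoverable by name as soon as it is defined, and `auto_load` dispatches on the `class_name` field stored in the tokenizer's YAML file. A minimal sketch of the mechanics with a hypothetical subclass (`CharTokenizer` is invented purely for illustration):

```python
from hyperion.torch.tokenizers import HypTokenizer

class CharTokenizer(HypTokenizer):
    """Toy character tokenizer, defined only to show the registry mechanics."""

    def encode(self, text):
        return [ord(c) for c in text]

    def decode(self, tokens):
        return "".join(chr(t) for t in tokens)

# Defining the subclass is enough; __init_subclass__ registered it by name.
assert "CharTokenizer" in HypTokenizer.registry
# HypTokenizer.auto_load("tok.yaml") reads cfg["class_name"] from the YAML
# and calls HypTokenizer.registry[cfg["class_name"]].load("tok.yaml").
```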
+ file_path = output_dir / (sp_model_prefix + ".yaml") + else: + output_dir = file_path.parent + + output_dir.mkdir(parents=True, exist_ok=True) + sp_model_file = sp_model_prefix + ".model" + sp_tokens_file = sp_model_prefix + ".tokens" + cfg = { + "class_name": self.__class__.__name__, + "sp_model": sp_model_file, + "sp_tokens": sp_tokens_file, + "uppercase_text": self.uppercase_text, + } + with open(file_path, "w") as f: + yaml.dump(cfg, f) + + with open(output_dir / sp_tokens_file, "w", encoding="utf-8") as f: + for sym, i in self.token2id.items(): + f.write(f"{sym} {i}\n") + + @classmethod + def load(cls, file_path: PathLike): + file_path = Path(file_path) + if file_path.suffix == ".model": + sp_model = spm.SentencePieceProcessor() + sp_model.load(str(file_path)) + return cls(sp_model) + + with open(file_path, "r") as f: + cfg = yaml.safe_load(f) + + sp_model_file = Path(cfg["sp_model"]) + if not sp_model_file.is_file(): + sp_model_file = file_path.parent / sp_model_file + assert sp_model_file.is_file(), f"{sp_model_file} not found" + + sp_model = spm.SentencePieceProcessor() + sp_model.load(str(sp_model_file)) + return cls(sp_model) diff --git a/hyperion/torch/trainers/transducer_trainer.py b/hyperion/torch/trainers/transducer_trainer.py index 3c52b8bf..c9cbb60b 100644 --- a/hyperion/torch/trainers/transducer_trainer.py +++ b/hyperion/torch/trainers/transducer_trainer.py @@ -102,7 +102,6 @@ def train_epoch(self, data_loader): metric_acc = MetricAcc(device=self.device) batch_metrics = ODict() self.model.train() - self.sp = data_loader.dataset.sp for batch, data in enumerate(data_loader): self.loggers.on_batch_begin(batch) diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index 610a43e9..cbfab5ed 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -3,7 +3,12 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from .collation import collate_seq_1d, collate_seq_2d, collate_seq_nd +from .collation import ( + collate_seqs_1d, + collate_seqs_2d, + collate_seqs_nd, + list_of_dicts_to_list, +) from .data_parallel import TorchDataParallel from .ddp import FairFullyShardedDDP, FairShardedDDP, TorchDDP from .devices import ( diff --git a/hyperion/torch/utils/collation.py b/hyperion/torch/utils/collation.py index 25b3790b..2b18a87a 100644 --- a/hyperion/torch/utils/collation.py +++ b/hyperion/torch/utils/collation.py @@ -5,9 +5,21 @@ import torch import torch.nn as nn +from torch.nn.utils.rnn import pad_sequence -def collate_seq_1d(x, pad_value=0): +def list_of_dicts_to_list(list_of_dicts, key): + """Takes a list of dictionaries and a key, + and returns a list of the items corresponding to the key + """ + output = [] + for item in list_of_dicts: + output.append(item[key]) + + return output + + +def collate_seqs_1d(x, pad_value=0): """Combines a list/tuple of vectors with different lengths into a single tensor. @@ -18,17 +30,20 @@ def collate_seq_1d(x, pad_value=0): 2D tensor with shape (num_vectors, max_vector_length). 1D long tensor containing the vector lengths. 
""" - max_length = max([x_i.size(0) for x_i in x]) - y = pad_value * torch.ones(len(x), max_length, dtype=x[0].dtype, device=x[0].device) - y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) - for i, x_i in enumerate(x): - y[i, : x_i.size(0)] = x_i - y_lengths[i] = x_i.size(0) + if not isinstance(x[0], torch.Tensor): + x = [torch.from_numpy(x_i) for x_i in x] + + assert x[0].dim() == 1 + x_lengths = [] + for x_i in x: + x_lengths.append(x_i.size(0)) - return y, y_lengths + x_lengths = torch.as_tensor(x_lengths) + x = pad_sequence(x, batch_first=True, padding_value=pad_value) + return x, x_lengths -def collate_seq_2d(x, pad_value=0, pad_dim=-1): +def collate_seqs_2d(x, pad_value=0, pad_dim=0): """Combines a list/tuple of matrices with different sizes in one of the dimensions into a single 3d tensor. Combines performing padding on the dimension which is not constant. @@ -41,24 +56,28 @@ def collate_seq_2d(x, pad_value=0, pad_dim=-1): 3D tensor with shape (num_vectors, max_length, feat_dim) or (num_vectors, feat_dim, length). 1D long tensor containing the dimensions lengths. """ - max_length = max([x_i.size(pad_dim) for x_i in x]) - y_size = list(x[0].size()) - y_size[pad_dim] = max_length - y = pad_value * torch.ones(*y_size, dtype=x[0].dtype, device=x[0].device) - y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) - if pad_dim == -1 or pad_dim == 1: - for i, x_i in enumerate(x): - y[i, :, : x_i.size(pad_dim)] = x_i - y_lengths[i] = x_i.size(pad_dim) - else: - for i, x_i in enumerate(x): - y[i, : x_i.size(pad_dim)] = x_i - y_lengths[i] = x_i.size(pad_dim) - - return y, y_lengths - - -def collate_seq_nd(x, pad_value=0, pad_dim=-1): + if not isinstance(x[0], torch.Tensor): + x = [torch.from_numpy(x_i) for x_i in x] + assert x[0].dim() == 2 + if pad_dim < 0: + pad_dim = 2 + pad_dim + + if pad_dim != 0: + x = [x_i.transpose(pad_dim, 0) for x_i in x] + + x_lengths = [] + for x_i in x: + x_lengths.append(x_i.size(0)) + + x_lengths = torch.as_tensor(x_lengths) + x = pad_sequence(x, batch_first=True, padding_value=pad_value) + if pad_dim != 0: + x = x.transpose(1, pad_dim + 1) + + return x, x_lengths + + +def collate_seqs_nd(x, pad_value=0, pad_dim=0): """Combines a list/tuple of N-d tensors with different sizes in one of the dimensions into a single (N+1)-d tensor. Combines performing padding on the dimension which is not constant. @@ -68,25 +87,113 @@ def collate_seq_nd(x, pad_value=0, pad_dim=-1): pad_dim: padding dimension. Returns: - (N+1)-D combined tensor. + (N+1)-d combined tensor. 1D long tensor containing the dimensions lengths. 
""" + if not isinstance(x[0], torch.Tensor): + x = [torch.from_numpy(x_i) for x_i in x] + if x[0].dim() == 1: - return collate_seq_1d(x) - - if x[0].dim() == 2: - return collate_seq_2d(x) - - # here the general case - max_length = max([x_i.size(pad_dim) for x_i in x]) - y_trans_size = list(x[0].transpose(0, pad_dim).size()) - y = pad_value * torch.ones(*y_trans_size, dtype=x[0].dtype, device=x[0].device) - y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) - for i, x_i in enumerate(x): - y[i, : x_i.size(pad_dim)] = x_i.transpose(0, pad_dim) - y_lengths[i] = x_i.size(pad_dim) - - if pad_dim > 0: - pad_dim = pad_dim + 1 - y = y.transpose(1, pad_dim).contiguous() - return y, y_lengths + return collate_seqs_1d(x, pad_value=pad_value) + + if pad_dim < 0: + pad_dim = x[0].dim() + pad_dim + + if pad_dim != 0: + x = [x_i.transpose(pad_dim, 0) for x_i in x] + + x_lengths = [] + for x_i in x: + x_lengths.append(x_i.size(0)) + + x_lengths = torch.as_tensor(x_lengths) + x = pad_sequence(x, batch_first=True, padding_value=pad_value) + if pad_dim != 0: + x = x.transpose(1, pad_dim + 1) + + return x, x_lengths + + +# def collate_seq_1d(x, pad_value=0): +# """Combines a list/tuple of vectors with different lengths +# into a single tensor. + +# Args: +# x: input lits/tuple of vectors. + +# Returns: +# 2D tensor with shape (num_vectors, max_vector_length). +# 1D long tensor containing the vector lengths. +# """ +# max_length = max([x_i.size(0) for x_i in x]) +# y = pad_value * torch.ones(len(x), max_length, dtype=x[0].dtype, device=x[0].device) +# y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) +# for i, x_i in enumerate(x): +# y[i, : x_i.size(0)] = x_i +# y_lengths[i] = x_i.size(0) + +# return y, y_lengths + + +# def collate_seq_2d(x, pad_value=0, pad_dim=-1): +# """Combines a list/tuple of matrices with different sizes in one of +# the dimensions into a single 3d tensor. +# Combines performing padding on the dimension which is not constant. + +# Args: +# x: input lits/tuple of matrices. +# pad_dim: padding dimension. + +# Returns: +# 3D tensor with shape (num_vectors, max_length, feat_dim) or (num_vectors, feat_dim, length). +# 1D long tensor containing the dimensions lengths. +# """ +# max_length = max([x_i.size(pad_dim) for x_i in x]) +# y_size = list(x[0].size()) +# y_size[pad_dim] = max_length +# y = pad_value * torch.ones(*y_size, dtype=x[0].dtype, device=x[0].device) +# y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) +# if pad_dim == -1 or pad_dim == 1: +# for i, x_i in enumerate(x): +# y[i, :, : x_i.size(pad_dim)] = x_i +# y_lengths[i] = x_i.size(pad_dim) +# else: +# for i, x_i in enumerate(x): +# y[i, : x_i.size(pad_dim)] = x_i +# y_lengths[i] = x_i.size(pad_dim) + +# return y, y_lengths + + +# def collate_seq_nd(x, pad_value=0, pad_dim=-1): +# """Combines a list/tuple of N-d tensors with different sizes in one of +# the dimensions into a single (N+1)-d tensor. +# Combines performing padding on the dimension which is not constant. + +# Args: +# x: input lits/tuple of matrices. +# pad_dim: padding dimension. + +# Returns: +# (N+1)-D combined tensor. +# 1D long tensor containing the dimensions lengths. 
+# """ +# if x[0].dim() == 1: +# return collate_seq_1d(x) + +# if x[0].dim() == 2: +# return collate_seq_2d(x) + +# # here the general case +# max_length = max([x_i.size(pad_dim) for x_i in x]) +# y_trans_size = list(x[0].transpose(0, pad_dim).size()) +# y = pad_value * torch.ones(*y_trans_size, dtype=x[0].dtype, device=x[0].device) +# y_lengths = torch.empty(len(x), dtype=torch.long, device=x[0].device) +# for i, x_i in enumerate(x): +# y[i, : x_i.size(pad_dim)] = x_i.transpose(0, pad_dim) +# y_lengths[i] = x_i.size(pad_dim) + +# if pad_dim > 0: +# pad_dim = pad_dim + 1 +# y = y.transpose(1, pad_dim).contiguous() +# return y, y_lengths diff --git a/hyperion/torch/utils/vad_utils.py b/hyperion/torch/utils/vad_utils.py index a47b92ef..4dc11ff7 100644 --- a/hyperion/torch/utils/vad_utils.py +++ b/hyperion/torch/utils/vad_utils.py @@ -6,7 +6,7 @@ import torch import torch.nn as nn -from .collation import collate_seq_nd +from .collation import collate_seqs_nd def remove_silence(x, vad, x_lengths=None, time_dim=1, tol=0): @@ -52,7 +52,7 @@ def remove_silence(x, vad, x_lengths=None, time_dim=1, tol=0): for i in range(x.size(0)): y.append(x[i, vad[i]]) - y, y_lengths = collate_seq_nd(y, pad_dim=0) + y, y_lengths = collate_seqs_nd(y, pad_dim=0) if trans: y = y.transpose(1, time_dim).contigous() From 5fb459b46dc8194b9247be0f9cbe21bbc64da9e9 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Wed, 15 May 2024 17:04:59 -0400 Subject: [PATCH 144/154] rnnt decoder renamed --- README.md | 5 ++- ...mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml | 5 +-- hyperion/bin/train_wav2rnn_transducer.py | 4 +- .../transducer/conformer_v1_rnn_transducer.py | 8 ++-- .../models/transducer/rnn_rnn_transducer.py | 14 +++--- .../torch/models/transducer/rnn_transducer.py | 44 ++++++++++--------- 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 71a0fbd3..6f7a8490 100644 --- a/README.md +++ b/README.md @@ -30,13 +30,14 @@ The full API is described in the documentation page [https://hyperion-ml.readthe ``` conda create --name ${your_env} python=3.11 conda activate ${your_env} -conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia +# We used PyTorch 2.0.1, other versions may work too +conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 -c pytorch -c nvidia # If using k2 for ASR wget https://huggingface.co/csukuangfj/k2/resolve/main/ubuntu-cuda/k2-1.24.4.dev20240223+cuda11.8.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl pip install k2-1.24.4.dev20240223+cuda11.8.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ``` -For systems with cuda 10.2 driver: +For older systems with cuda 10.2 driver: ``` conda create --name ${your_env} python=3.10 conda activate ${your_env} diff --git a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml index baea17ab..b0def8fc 100644 --- a/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml +++ b/egs/librispeech/v1/conf/train_fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p.s1.yaml @@ -44,7 +44,7 @@ model: num_blocks: 16 d_ff: 576 in_layer_type: conv2d-sub - decoder: + rnnt_decoder: rnnt_loss: k2_pruned simple_loss_scale: 0.2 predictor: @@ -71,8 +71,7 @@ trainer: warmup_steps: 25000 update_lr_on_opt_step: true # grad_clip: 100 - # grad_clip: 20 - grad_clip: 1 + grad_clip: 20 use_amp: true log_interval: 1000 epochs: 120 diff 
--git a/hyperion/bin/train_wav2rnn_transducer.py index 14fc8db3..6dc314ad 100755 --- a/hyperion/bin/train_wav2rnn_transducer.py +++ b/hyperion/bin/train_wav2rnn_transducer.py @@ -117,8 +117,8 @@ def init_model(rank, model_class, tokenizers, **kwargs): logging.info("model network args={}".format(model_args)) tokenizer = list(tokenizers.items())[0][1] - model_args["transducer"]["decoder"]["blank_id"] = tokenizer.blank_id - model_args["transducer"]["decoder"]["vocab_size"] = tokenizer.vocab_size + model_args["transducer"]["rnnt_decoder"]["blank_id"] = tokenizer.blank_id + model_args["transducer"]["rnnt_decoder"]["vocab_size"] = tokenizer.vocab_size model = model_class(**model_args) if rank == 0: logging.info("model={}".format(model)) diff --git a/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py b/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py index 89173eff..cf8bb91f 100644 --- a/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py +++ b/hyperion/torch/models/transducer/conformer_v1_rnn_transducer.py @@ -28,13 +28,13 @@ class ConformerV1RNNTransducer(RNNTransducer): """ - def __init__(self, encoder, decoder): + def __init__(self, encoder, rnnt_decoder): if isinstance(encoder, dict): encoder = ConformerEncoderV1(**encoder) else: assert isinstance(encoder, ConformerEncoderV1) - super().__init__(encoder, decoder) + super().__init__(encoder, rnnt_decoder) @staticmethod def filter_args(**kwargs): @@ -57,11 +57,11 @@ def add_class_args(parser, prefix=None, skip=set()): def change_config( self, encoder, - decoder, + rnnt_decoder, ): logging.info("changing transducer encoder config") self.encoder.change_config(**encoder) - super().chage_config(**decoder) + super().change_config(**rnnt_decoder) @staticmethod def filter_finetune_args(**kwargs): diff --git a/hyperion/torch/models/transducer/rnn_rnn_transducer.py b/hyperion/torch/models/transducer/rnn_rnn_transducer.py index 02d0c482..46438dbc 100644 --- a/hyperion/torch/models/transducer/rnn_rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_rnn_transducer.py @@ -28,13 +28,13 @@ class RNNRNNTransducer(RNNTransducer): """ - def __init__(self, encoder, decoder): + def __init__(self, encoder, rnnt_decoder): if isinstance(encoder, dict): encoder = RNNEncoder(**encoder) else: assert isinstance(encoder, RNNEncoder) - super().__init__(encoder, decoder) + super().__init__(encoder, rnnt_decoder) @staticmethod def filter_args(**kwargs): @@ -53,17 +53,16 @@ def add_class_args(parser, prefix=None, skip=set()): RNNEncoder.add_class_args(parser, prefix="encoder", skip=skip) RNNTransducer.add_class_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) def change_config( self, encoder, - decoder, + rnnt_decoder, ): logging.info("changing transducer encoder config") self.encoder.change_config(**encoder) - super().chage_config(**decoder) + super().change_config(**rnnt_decoder) @staticmethod def filter_finetune_args(**kwargs): @@ -82,5 +81,4 @@ def add_finetune_args(parser, prefix=None): RNNTransducer.add_finetune_args(parser) if prefix is not None: - outer_parser.add_argument("--" + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/transducer/rnn_transducer.py index
a9fa5830..c951818d 100644 --- a/hyperion/torch/models/transducer/rnn_transducer.py +++ b/hyperion/torch/models/transducer/rnn_transducer.py @@ -37,28 +37,28 @@ class RNNTransducer(TorchModel): Attributes: encoder: Encoder network module - decoder: RNN-T Decoder config. dictionary or module. + rnnt_decoder: RNN-T Decoder config. dictionary or module. """ def __init__( self, encoder: Union[TorchModel, None], - decoder: Union[Dict, RNNTransducerDecoder], - ctc_weight: float = 0.0, + rnnt_decoder: Union[Dict, RNNTransducerDecoder], rnnt_weight: float = 1.0, + ctc_weight: float = 0.0, ): super().__init__() if encoder is not None: assert isinstance(encoder, TorchModel) - if isinstance(decoder, dict): + if isinstance(rnnt_decoder, dict): if encoder is not None: - decoder["in_feats"] = encoder.out_shape()[-1] - decoder = RNNTransducerDecoder(**decoder) + rnnt_decoder["in_feats"] = encoder.out_shape()[-1] + rnnt_decoder = RNNTransducerDecoder(**rnnt_decoder) else: - assert isinstance(decoder, RNNTransducerDecoder) + assert isinstance(rnnt_decoder, RNNTransducerDecoder) self.encoder = encoder - self.decoder = decoder + self.rnnt_decoder = rnnt_decoder def forward( self, @@ -89,7 +89,7 @@ def forward( x, x_lengths = self.encoder(x, x_lengths) assert torch.all(x_lengths > 0) - dec_output = self.decoder(x, x_lengths, y) + dec_output = self.rnnt_decoder(x, x_lengths, y) output = RNNTransducerOutput(*dec_output) return output @@ -125,7 +125,7 @@ def infer( y = [] for i in range(batch_size): x_i = x[i : i + 1, : x_lengths[i]] - y_i = self.decoder.decode( + y_i = self.rnnt_decoder.decode( x_i, method=decoding_method, beam_width=beam_width, @@ -166,11 +166,11 @@ def get_config(self): enc_cfg = self.encoder.get_config() del enc_cfg["class_name"] - dec_cfg = self.decoder.get_config() + dec_cfg = self.rnnt_decoder.get_config() del dec_cfg["class_name"] config = { "encoder": enc_cfg, - "decoder": dec_cfg, + "rnnt_decoder": dec_cfg, } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) @@ -179,8 +179,8 @@ def get_config(self): def filter_args(**kwargs): # get arguments for pooling args = {} - decoder_args = RNNTransducerDecoder.filter_args(**kwargs["decoder"]) - args["decoder"] = decoder_args + rnnt_decoder_args = RNNTransducerDecoder.filter_args(**kwargs["rnnt_decoder"]) + args["rnnt_decoder"] = rnnt_decoder_args return args @staticmethod @@ -189,23 +189,25 @@ def add_class_args(parser, prefix=None, skip=set()): outer_parser = parser parser = ArgumentParser(prog="") - RNNTransducerDecoder.add_class_args(parser, prefix="decoder") + RNNTransducerDecoder.add_class_args(parser, prefix="rnnt_decoder") if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) def change_config( self, - decoder: Dict, + rnnt_decoder: Dict, ): - logging.info("changing decoder config") - self.decoder.change_config(**decoder) + logging.info("changing rnnt_decoder config") + self.rnnt_decoder.change_config(**rnnt_decoder) @staticmethod def filter_finetune_args(**kwargs): args = {} - decoder_args = RNNTransducerDecoder.filter_finetune_args(**kwargs["decoder"]) - args["decoder"] = decoder_args + rnnt_decoder_args = RNNTransducerDecoder.filter_finetune_args( + **kwargs["rnnt_decoder"] + ) + args["rnnt_decoder"] = rnnt_decoder_args return args @staticmethod @@ -214,7 +216,7 @@ def add_finetune_args(parser, prefix=None): outer_parser = parser parser = ArgumentParser(prog="") - RNNTransducerDecoder.add_finetune_args(parser, prefix="decoder") + 
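With this rename, everything that used to address the decoder configuration under the `decoder` key now uses `rnnt_decoder`, from the YAML configs down to the CLI argument groups. A minimal sketch of the pattern used in `train_wav2rnn_transducer.py` above; the literal values are stand-ins for `tokenizer.blank_id` and `tokenizer.vocab_size`:

```python
# Illustrative values only; the training script fills these from the tokenizer.
model_args = {"transducer": {"rnnt_decoder": {"rnnt_loss": "k2_pruned"}}}
model_args["transducer"]["rnnt_decoder"]["blank_id"] = 0
model_args["transducer"]["rnnt_decoder"]["vocab_size"] = 500
```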
RNNTransducerDecoder.add_finetune_args(parser, prefix="rnnt_decoder") if prefix is not None: outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) From dae863314fd925e12ab4ed876ec5f4d055dbbeb8 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 24 May 2024 11:17:52 -0400 Subject: [PATCH 145/154] some fixes --- .../conf/train_fwseresnet34_dino_v1.2.2.yaml | 2 +- ...config_fbank80_stmn_ecapatdnn512x3.v1.2.sh | 4 +-- ...config_fbank80_stmn_fwseresnet34.v1.2.1.sh | 4 +-- ...config_fbank80_stmn_fwseresnet34.v1.2.2.sh | 30 +++++-------------- .../config_fbank80_stmn_fwseresnet34.v1.2.sh | 4 +-- .../config_fbank80_stmn_lresnet34.v1.2.sh | 4 +-- hyperion/np/feats/feature_windows.py | 2 +- hyperion/torch/data/audio_dataset.py | 3 ++ .../torch/trainers/xvector_adv_trainer.py | 4 +-- .../trainers/xvector_adv_trainer_from_wav.py | 4 +-- hyperion/torch/trainers/xvector_trainer.py | 4 +-- .../xvector_trainer_deep_feat_reg_from_wav.py | 4 +-- .../trainers/xvector_trainer_from_wav.py | 4 +-- hyperion/utils/hyp_dataset.py | 11 +++++++ hyperion/utils/info_table.py | 6 +++- 15 files changed, 47 insertions(+), 43 deletions(-) diff --git a/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml index 37bada36..a12e05f0 100644 --- a/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml +++ b/egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml @@ -80,7 +80,7 @@ trainer: decay_rate: 0.5 decay_steps: 60000 hold_steps: 15000 - min_lr: 1.0e-05 + min_lr: 1.0e-04 warmup_steps: 15000 update_lr_on_opt_step: true teacher_optim: diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh index de643f1e..0ecf904d 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_ecapatdnn512x3.v1.2.sh @@ -43,7 +43,7 @@ nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 1.2 cluster_ft_s1_method=cos_ahc_plda_ahc cluster_ft_s1_cfg=conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml -cluster_ft_s1_name=${cluster_method} +cluster_ft_s1_name=${cluster_ft_s1_method} cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name @@ -62,7 +62,7 @@ nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 1.2 cluster_ft_s2_method=cos_ahc_plda_ahc cluster_ft_s2_cfg=conf/cluster_ecapatdnn512x3_v1.2_ft1_cos_ahc_plda_ahc.yaml -cluster_ft_s2_name=${cluster_method} +cluster_ft_s2_name=${cluster_ft_s2_method} cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh index 102fbaef..c4f5c8c7 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.1.sh @@ -43,7 +43,7 @@ nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 1.2 cluster_ft_s1_method=cos_ahc cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml -cluster_ft_s1_name=${cluster_method_ft_s1_method} +cluster_ft_s1_name=${cluster_ft_s1_method} cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name # finetuning stage 2.1 @@ -61,6 +61,6 @@ 
nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 2.2 cluster_ft_s2_method=cos_ahc cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml -cluster_ft_s2_name=${cluster_method_ft_s2_method} +cluster_ft_s2_name=${cluster_ft_s2_method} cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh index 7aebfd69..e3ba0c3a 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh @@ -17,21 +17,7 @@ nnet_name=${feat_type}_fwseresnet34_dino.v1.2.2 nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.2.yaml nnet_s1_name=$nnet_name.s1 nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name -nnet_s1=$nnet_s1_dir/teacher_model_ep0034.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0038.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0043.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0044.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0046.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0049.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0054.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0058.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0064.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0067.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0071.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0077.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0083.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0088.pth -nnet_s1=$nnet_s1_dir/teacher_model_ep0094.pth +nnet_s1=$nnet_s1_dir/teacher_model_ep0100.pth # clustering of dino embeddings cluster_method=cos_ahc_plda_ahc @@ -43,13 +29,13 @@ cluster_dir=exp/clustering/$nnet_s1_name/$cluster_name plda_cfg=conf/plda.yaml # finetuning stage 1.1 -nnet_ft_s1_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s1_1_base_cfg=conf/train_fwseresnet34_xvec_stage1.1_v1.2.2.yaml nnet_ft_s1_1_name=$nnet_name.s1.ft.s1.1 nnet_ft_s1_1_dir=exp/xvector_nnets/$nnet_ft_s1_1_name -nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0030.pth +nnet_ft_s1_1=$nnet_ft_s1_1_dir/model_ep0025.pth # finetuning stage 1.2 -nnet_ft_s1_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s1_2_base_cfg=conf/train_fwseresnet34_xvec_stage1.2_v1.2.2.yaml nnet_ft_s1_2_name=$nnet_name.s1.ft.s1.2 nnet_ft_s1_2_dir=exp/xvector_nnets/$nnet_ft_s1_2_name nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth @@ -57,17 +43,17 @@ nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 1.2 cluster_ft_s1_method=cos_ahc cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml -cluster_ft_s1_name=${cluster_method_ft_s1_method} +cluster_ft_s1_name=${cluster_ft_s1_method} cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name # finetuning stage 2.1 -nnet_ft_s2_1_base_cfg=conf/train_lresnet34_xvec_stage1.1_v1.2.yaml +nnet_ft_s2_1_base_cfg=conf/train_fwseresnet34_xvec_stage1.1_v1.2.2.yaml nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth # finetuning stage 2.2 -nnet_ft_s2_2_base_cfg=conf/train_lresnet34_xvec_stage1.2_v1.2.yaml +nnet_ft_s2_2_base_cfg=conf/train_fwseresnet34_xvec_stage1.2_v1.2.2.yaml nnet_ft_s2_2_name=$nnet_name.s1.ft.s2.2 nnet_ft_s2_2_dir=exp/xvector_nnets/$nnet_ft_s2_2_name nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth @@ -75,6 +61,6 @@ nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth # clustering of ft 
embeddings from stage 2.2 cluster_ft_s2_method=cos_ahc cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc.yaml -cluster_ft_s2_name=${cluster_method_ft_s2_method} +cluster_ft_s2_name=${cluster_ft_s2_method} cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh index b3a6e963..9fecaa96 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.sh @@ -43,7 +43,7 @@ nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 1.2 cluster_ft_s1_method=cos_ahc_plda_ahc cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml -cluster_ft_s1_name=${cluster_method} +cluster_ft_s1_name=${cluster_ft_s1_method} cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name # finetuning stage 2.1 @@ -61,6 +61,6 @@ nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 2.2 cluster_ft_s2_method=cos_ahc_plda_ahc cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml -cluster_ft_s2_name=${cluster_method} +cluster_ft_s2_name=${cluster_ft_s2_method} cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh index 788b3b4b..4d02e22d 100644 --- a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_lresnet34.v1.2.sh @@ -43,7 +43,7 @@ nnet_ft_s1_2=$nnet_ft_s1_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 1.2 cluster_ft_s1_method=cos_ahc_plda_ahc cluster_ft_s1_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml -cluster_ft_s1_name=${cluster_method} +cluster_ft_s1_name=${cluster_ft_s1_method} cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name # finetuning stage 2.1 @@ -61,5 +61,5 @@ nnet_ft_s2_2=$nnet_ft_s2_2_dir/model_ep0070.pth # clustering of ft embeddings from stage 2.2 cluster_ft_s2_method=cos_ahc_plda_ahc cluster_ft_s2_cfg=conf/cluster_lresnet34_v1.2_ft1_cos_ahc_plda_ahc.yaml -cluster_ft_s2_name=${cluster_method} +cluster_ft_s2_name=${cluster_ft_s2_method} cluster_ft_s2_dir=exp/clustering/$nnet_ft_s2_2_name/$cluster_ft_s2_name diff --git a/hyperion/np/feats/feature_windows.py b/hyperion/np/feats/feature_windows.py index 000831ac..ef8fe7b4 100644 --- a/hyperion/np/feats/feature_windows.py +++ b/hyperion/np/feats/feature_windows.py @@ -6,7 +6,7 @@ import logging import numpy as np -from scipy.signal import blackman, hamming, hann +from scipy.signal.windows import blackman, hamming, hann from ...hyp_defs import float_cpu diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py index d555a118..9d8bebc6 100644 --- a/hyperion/torch/data/audio_dataset.py +++ b/hyperion/torch/data/audio_dataset.py @@ -183,6 +183,9 @@ def _load_class_infos(self, class_names, class_files, is_val): assert ( name in self.seg_set ), f"class_name {name} not present in the segment set" + self.seg_set.convert_col_to_str( + name + ) # make sure that class ids are strings if self.rank == 0: logging.info("loading class-info file %s", file) table = ClassInfo.load(file) diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py
b/hyperion/torch/trainers/xvector_adv_trainer.py index 9d5a8bae..12ff506a 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -154,7 +154,7 @@ def train_epoch(self, data_loader): batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -198,7 +198,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index f63c532b..01676300 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -159,7 +159,7 @@ def train_epoch(self, data_loader): batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -203,7 +203,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index 15c5bd42..e8a91bb0 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ b/hyperion/torch/trainers/xvector_trainer.py @@ -137,7 +137,7 @@ def train_epoch(self, data_loader): batch_metrics["loss"] = loss_acc * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -185,7 +185,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics["loss"] = loss_acc for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 98c74af3..3d1a8ccf 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -193,7 +193,7 @@ def train_epoch(self, data_loader): self.save_checkpoint(partial=True) for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -235,7 +235,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics["loss"] = loss.item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py 
b/hyperion/torch/trainers/xvector_trainer_from_wav.py index ada74bb6..2f1fd18a 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -133,7 +133,7 @@ def train_epoch(self, data_loader): batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics @@ -177,7 +177,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): - batch_metrics[k] = metric(output, target) + batch_metrics[k] = metric(output.logits, target) metric_acc.update(batch_metrics, batch_size) diff --git a/hyperion/utils/hyp_dataset.py b/hyperion/utils/hyp_dataset.py index ba137b65..dda4231e 100644 --- a/hyperion/utils/hyp_dataset.py +++ b/hyperion/utils/hyp_dataset.py @@ -95,6 +95,16 @@ def __init__( self.sparse_trials = sparse_trials self.table_sep = table_sep self._files_to_delete = [] + self.fix_segments_dtypes() + + def fix_segments_dtypes(self): + if self._segments is not None: + self._fix_segments_dtypes(self._segments) + + def _fix_segments_dtypes(self, segments): + # ids in class_infos should be strings in segment set columns + for k in self.classes_keys(): + segments.convert_col_to_str(k) def get_dataset_files(self): file_paths = [] @@ -149,6 +159,7 @@ def segments(self, keep_loaded: bool = True): if self._segments is None: assert self._segments_path is not None segments = SegmentSet.load(self._segments_path, sep=self.table_sep) + self._fix_segments_dtypes(segments) if keep_loaded: self._segments = segments return segments diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index a813a467..ad8d3d68 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -33,7 +33,11 @@ def __init__(self, df): def fix_dtypes(self): if infer_dtype(self.df.id) != "string": - self.df[:, "id"] = self.df["id"].apply(str) + self.df.loc[:, "id"] = self.df["id"].apply(str) + + def convert_col_to_str(self, column): + if infer_dtype(self.df[column]) != "string": + self.df.loc[:, column] = self.df[column].apply(str) def copy(self): """Makes a copy of the object.""" From b62824af7e3eac17883fae7cddc3f2b8c9d3a815 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Fri, 24 May 2024 11:36:48 -0400 Subject: [PATCH 146/154] some fixes --- egs/voxceleb/ssl.v1/README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/egs/voxceleb/ssl.v1/README.md b/egs/voxceleb/ssl.v1/README.md index 03b2e1c4..0d12c86b 100644 --- a/egs/voxceleb/ssl.v1/README.md +++ b/egs/voxceleb/ssl.v1/README.md @@ -137,6 +137,12 @@ run_xxx_xxxx.sh --config-file global_conf/other_config.sh | | | | | FT-2 | PLDA | 1.67 | 0.137 | 0.193 | | config_fbank80_stmn_fwseresnet34.v1.2.1.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.49 | 0.101 | 0.161 | | | | | | FT-2 | PLDA | 1.53 | 0.109 | 0.168| +| config_fbank80_stmn_fwseresnet34.v1.2.2.sh | FW-SE ResNet34 / 0.1 x Cos Reg. 
| Cos+AHC+PLDA+AHC | Cos+AHC | DINO | Cosine | 3.96 | 0.232 | 0.358 | +| | | | | | PLDA | 4.04 | 0.185 | 0.291 | +| | | | | FT-1 | Cosine | 2.03 | 0.125 | 0.203 | +| | | | | FT-1 | PLDA | 2.44 | 0.149 | 0.231 | +| | | | | FT-2 | Cosine | +| | | | | FT-2 | PLDA | ### VoxCeleb 1 Entire-Clean trial list @@ -161,6 +167,12 @@ run_xxx_xxxx.sh --config-file global_conf/other_config.sh | | | | | FT-2 | PLDA | 1.77 | 0.121 | 0.208 | | config_fbank80_stmn_fwseresnet34.v1.2.1.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 1.83 | 0.106 | 0.170 | | | | | | FT-2 | PLDA | 1.68 | 0.109 | 0.188 | +| config_fbank80_stmn_fwseresnet34.v1.2.2.sh | FW-SE ResNet34 / 0.1 x Cos Reg. | Cos+AHC+PLDA+AHC | Cos+AHC | DINO | Cosine | 4.31 | 0.250 | 0.387 | +| | | | | | PLDA | 4.32 | 0.166 | 0.263 | +| | | | | FT-1 | Cosine | 2.61 | 0.138 | 0.210 | +| | | | | FT-1 | PLDA | 2.72 | 0.1366 | 0.216 | +| | | | | FT-2 | Cosine | +| | | | | FT-2 | PLDA | ### VoxCeleb 1 Hard-Clean trial list @@ -185,4 +197,10 @@ run_xxx_xxxx.sh --config-file global_conf/other_config.sh | | | | | FT-2 | PLDA | 3.51 | 0.219 | 0.351 | | config_fbank80_stmn_fwseresnet34.v1.2.1.sh | FW-SE ResNet34 | Cos+AHC+PLDA+AHC | Cos+AHC | FT-2 | Cosine | 3.11 | 0.172 | 0.270 | | | | | | FT-2 | PLDA | 3.15 | 0.186 | 0.294 | +| config_fbank80_stmn_fwseresnet34.v1.2.2.sh | FW-SE ResNet34 / 0.1 x Cos Reg. | Cos+AHC+PLDA+AHC | Cos+AHC | DINO | Cosine | 7.41 | 0.377 | 0.526 | +| | | | | | PLDA | 5.95 | 0.269 | 0.438 | +| | | | | FT-1 | Cosine | 4.38 | 0.222 | 0.337 | +| | | | | FT-1 | PLDA | 4.68 | 0.237 | 0.375 | +| | | | | FT-2 | Cosine | +| | | | | FT-2 | PLDA | From d7540bc0552b001b79c998f0121fdbbe9d301588 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 27 May 2024 13:22:01 -0400 Subject: [PATCH 147/154] updated readme --- egs/voxceleb/ssl.v1/README.md | 15 ++++++++------- .../config_fbank80_stmn_fwseresnet34.v1.2.2.sh | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/egs/voxceleb/ssl.v1/README.md b/egs/voxceleb/ssl.v1/README.md index 0d12c86b..73b1d039 100644 --- a/egs/voxceleb/ssl.v1/README.md +++ b/egs/voxceleb/ssl.v1/README.md @@ -141,8 +141,9 @@ run_xxx_xxxx.sh --config-file global_conf/other_config.sh | | | | | | PLDA | 4.04 | 0.185 | 0.291 | | | | | | FT-1 | Cosine | 2.03 | 0.125 | 0.203 | | | | | | FT-1 | PLDA | 2.44 | 0.149 | 0.231 | -| | | | | FT-2 | Cosine | -| | | | | FT-2 | PLDA | +| | | | | FT-2 | Cosine | 1.88 | 0.115 | 0.198 | +| | | | | FT-2 | PLDA | 2.57 | 0.147 | 0.234 | + ### VoxCeleb 1 Entire-Clean trial list @@ -171,8 +172,9 @@ run_xxx_xxxx.sh --config-file global_conf/other_config.sh | | | | | | PLDA | 4.32 | 0.166 | 0.263 | | | | | | FT-1 | Cosine | 2.61 | 0.138 | 0.210 | | | | | | FT-1 | PLDA | 2.72 | 0.1366 | 0.216 | -| | | | | FT-2 | Cosine | -| | | | | FT-2 | PLDA | +| | | | | FT-2 | Cosine | 2.41 | 0.121 | 0.193 | +| | | | | FT-2 | PLDA | 2.82 | 0.140 | 0.219 | + ### VoxCeleb 1 Hard-Clean trial list @@ -201,6 +203,5 @@ run_xxx_xxxx.sh --config-file global_conf/other_config.sh | | | | | | PLDA | 5.95 | 0.269 | 0.438 | | | | | | FT-1 | Cosine | 4.38 | 0.222 | 0.337 | | | | | | FT-1 | PLDA | 4.68 | 0.237 | 0.375 | -| | | | | FT-2 | Cosine | -| | | | | FT-2 | PLDA | - +| | | | | FT-2 | Cosine | 4.07 | 0.197 | 0.301 | +| | | | | FT-2 | PLDA | 4.75 | 0.229 | 0.352 | diff --git a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh index e3ba0c3a..11aab111 100644 --- 
a/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh +++ b/egs/voxceleb/ssl.v1/global_conf/config_fbank80_stmn_fwseresnet34.v1.2.2.sh @@ -50,7 +50,7 @@ cluster_ft_s1_dir=exp/clustering/$nnet_ft_s1_2_name/$cluster_ft_s1_name nnet_ft_s2_1_base_cfg=conf/train_fwseresnet34_xvec_stage1.1_v1.2.2.yaml nnet_ft_s2_1_name=$nnet_name.s1.ft.s2.1 nnet_ft_s2_1_dir=exp/xvector_nnets/$nnet_ft_s2_1_name -nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0030.pth +nnet_ft_s2_1=$nnet_ft_s2_1_dir/model_ep0025.pth # finetuning stage 2.2 nnet_ft_s2_2_base_cfg=conf/train_fwseresnet34_xvec_stage1.2_v1.2.2.yaml From 955fb02a7aa5be25bda05a48333c2274332edc59 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 10 Jun 2024 18:03:07 -0400 Subject: [PATCH 148/154] fix kmeans call in mixture models --- hyperion/np/pdfs/mixtures/gmm.py | 13 +++++++------ hyperion/np/pdfs/mixtures/gmm_diag_cov.py | 4 ++-- hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py | 9 +++++---- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/hyperion/np/pdfs/mixtures/gmm.py b/hyperion/np/pdfs/mixtures/gmm.py index 7b080dae..934c6749 100644 --- a/hyperion/np/pdfs/mixtures/gmm.py +++ b/hyperion/np/pdfs/mixtures/gmm.py @@ -2,6 +2,7 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import h5py import numpy as np import scipy.linalg as la @@ -150,8 +151,8 @@ def _initialize_kmeans(self, num_comp, x): self.Lambda[0] = invert_pdmat(S, return_inv=True)[-1] return - kmeans = KMeans(num_clusters=num_comp) - loss, cluster_index = kmeans.fit(x, epochs=100) + kmeans = KMeans(num_clusters=num_comp, epochs=100) + loss, cluster_index = kmeans.fit(x) self.mu = kmeans.mu self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) @@ -253,7 +254,7 @@ def split_comp(self, K=2): """ num_comp = self.num_comp * K pi = np.repeat(self.pi, K) / K - Lambda = np.repeat(self.Lambda, K, axis=0) * (K ** 2) + Lambda = np.repeat(self.Lambda, K, axis=0) * (K**2) mu = np.repeat(self.mu, K, axis=0) for g in range(self.num_comp): @@ -400,7 +401,7 @@ def load_from_kaldi(cls, file_path): x_dim = len(fields) eta1 = np.zeros((num_comp, x_dim), dtype=float_cpu()) eta2 = np.zeros( - (num_comp, int((x_dim ** 2 + 3 * x_dim) / 2)), + (num_comp, int((x_dim**2 + 3 * x_dim) / 2)), dtype=float_cpu(), ) @@ -436,7 +437,7 @@ def _validate_Lambda(self): def _validate_eta(self): assert self.eta.shape[0] == self.num_comp - assert self.eta.shape[1] == (self.x_dim ** 2 + 3 * self.x_dim) / 2 + assert self.eta.shape[1] == (self.x_dim**2 + 3 * self.x_dim) / 2 def validate(self): """Validates the parameters of the distribution.""" @@ -454,7 +455,7 @@ def validate(self): def compute_eta(mu, Lambda): """Computes nat param. 
from mean and precision.""" x_dim = mu.shape[-1] - eta_dim = int((x_dim ** 2 + 3 * x_dim) / 2) + eta_dim = int((x_dim**2 + 3 * x_dim) / 2) eta = np.zeros((mu.shape[0], eta_dim), dtype=float_cpu()) for k in range(mu.shape[0]): eta[k] = Normal.compute_eta(mu[k], Lambda[k]) diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py index ecc7bad7..a5135190 100644 --- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py @@ -121,8 +121,8 @@ def _initialize_kmeans(self, num_comp, x): self.Lambda = 1 / np.std(x, axis=0, keepdims=True) ** 2 return - kmeans = KMeans(num_clusters=num_comp) - loss, cluster_index = kmeans.fit(x, epochs=100) + kmeans = KMeans(num_clusters=num_comp, epochs=100) + loss, cluster_index = kmeans.fit(x) self.mu = kmeans.mu self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) diff --git a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py index 6ef7c891..d696bbac 100644 --- a/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py +++ b/hyperion/np/pdfs/mixtures/gmm_tied_diag_cov.py @@ -2,6 +2,7 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ + import h5py import numpy as np from scipy.special import erf @@ -83,8 +84,8 @@ def _initialize_kmeans(self, num_comp, x): self.Lambda = 1 / np.std(x, axis=0, keepdims=True) ** 2 return - kmeans = KMeans(num_clusters=num_comp) - loss, cluster_index = kmeans.fit(x, epochs=100) + kmeans = KMeans(num_clusters=num_comp, epochs=100) + loss, cluster_index = kmeans.fit(x) self.mu = kmeans.mu self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) @@ -93,7 +94,7 @@ def _initialize_kmeans(self, num_comp, x): r = cluster_index == k self.pi[k] = np.sum(r) / x.shape[0] delta = x[r] - self.mu[k] - C += np.sum(delta ** 2, axis=0) + C += np.sum(delta**2, axis=0) self.Lambda = x.shape[0] / C @@ -111,7 +112,7 @@ def Mstep(self, N, u_x): self.mu = F / N[:, None] if self.update_Lambda: - S = S / N[:, None] - self.mu ** 2 + S = S / N[:, None] - self.mu**2 S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0) S = np.maximum(S, S_floor) Spool = np.sum(N[:, None] * S, axis=0) / np.sum(N) From b8fe5cb79013ac2022450d314f7c9848bae40f9a Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Mon, 10 Jun 2024 18:22:26 -0400 Subject: [PATCH 149/154] np.bool -> bool --- ...vec_cosine_scoring_from_transfer_adv_test_wav.py | 6 ++---- .../bin/generate_adv_attacks_xvector_classif.py | 13 +++---------- hyperion/bin/generate_adv_attacks_xvector_verif.py | 2 +- hyperion/np/classifiers/greedy_fusion.py | 2 +- hyperion/np/diarization/diar_ahc_plda.py | 4 ++-- hyperion/np/pdfs/jfa/jfa_total.py | 2 +- hyperion/utils/info_table.py | 2 +- hyperion/utils/rttm.py | 2 +- hyperion/utils/vad_utils.py | 4 ++-- 9 files changed, 14 insertions(+), 23 deletions(-) diff --git a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py index a6f8efa4..5cd4b864 100755 --- a/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py +++ b/hyperion/bin/eval_xvec_cosine_scoring_from_transfer_adv_test_wav.py @@ -243,7 +243,7 @@ def eval_cosine_scoring( vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + vad = torch.as_tensor(vad.astype(bool, copy=False), dtype=torch.bool).to( device ) 
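This patch tracks NumPy's deprecation cycle: the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24, where accessing it raises an `AttributeError`, so masks are now built and cast with the builtin `bool` instead. A minimal sketch of the replacement pattern:

```python
import numpy as np
import torch

vad = np.array([1, 0, 1, 1], dtype=bool)  # np.bool would fail on NumPy >= 1.24
vad_t = torch.as_tensor(vad.astype(bool, copy=False), dtype=torch.bool)
```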
model.vad_t = vad @@ -361,9 +361,7 @@ def main(): parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/generate_adv_attacks_xvector_classif.py b/hyperion/bin/generate_adv_attacks_xvector_classif.py index 4d0e762a..6f36e3d3 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_classif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_classif.py @@ -208,18 +208,13 @@ def generate_attacks( vad = v_reader.read([key])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + vad = torch.as_tensor(vad.astype(bool, copy=False), dtype=torch.bool).to( device ) model.vad = vad logging.info( "utt %s detected %d/%d (%.2f %%) speech frames" - % ( - key, - speech_frames, - tot_frames, - speech_frames / tot_frames * 100, - ) + % (key, speech_frames, tot_frames, speech_frames / tot_frames * 100,) ) t2 = time.time() @@ -329,9 +324,7 @@ def main(): parser.add_argument("--vad", dest="vad_spec", default=None) parser.add_argument( - "--vad-path-prefix", - default=None, - help=("scp file_path prefix for vad"), + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad"), ) parser.add_argument("--model-path", required=True) diff --git a/hyperion/bin/generate_adv_attacks_xvector_verif.py b/hyperion/bin/generate_adv_attacks_xvector_verif.py index f858ea22..ae78ea5b 100755 --- a/hyperion/bin/generate_adv_attacks_xvector_verif.py +++ b/hyperion/bin/generate_adv_attacks_xvector_verif.py @@ -217,7 +217,7 @@ def generate_attacks( vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + vad = torch.as_tensor(vad.astype(bool, copy=False), dtype=torch.bool).to( device ) model.vad_t = vad diff --git a/hyperion/np/classifiers/greedy_fusion.py b/hyperion/np/classifiers/greedy_fusion.py index f03a05a0..646af8d3 100644 --- a/hyperion/np/classifiers/greedy_fusion.py +++ b/hyperion/np/classifiers/greedy_fusion.py @@ -226,7 +226,7 @@ def fit(self, x, class_ids, sample_weights=None): num_cands = len(cand_systems) cand_min_dcf = np.zeros((num_cands,), dtype=float_cpu()) cand_act_dcf = np.zeros((num_cands,), dtype=float_cpu()) - all_pos = np.zeros((num_cands,), dtype=np.bool) + all_pos = np.zeros((num_cands,), dtype=bool) cand_weights = [] for j in range(num_cands): system_idx_ij = np.concatenate( diff --git a/hyperion/np/diarization/diar_ahc_plda.py b/hyperion/np/diarization/diar_ahc_plda.py index 4bfbc06b..7bffa633 100644 --- a/hyperion/np/diarization/diar_ahc_plda.py +++ b/hyperion/np/diarization/diar_ahc_plda.py @@ -66,7 +66,7 @@ def _plot_score_hist(scores, output_file, thr=None, gmm=None): output_dir = Path(output_file).parent output_dir.mkdir(parents=True, exist_ok=True) - mask = np.triu(np.ones(scores.shape, dtype=np.bool), 1) + mask = np.triu(np.ones(scores.shape, dtype=bool), 1) scores_r = scores[mask].ravel() _, bins, _ = plt.hist( @@ -96,7 +96,7 @@ def _plot_score_hist(scores, output_file, thr=None, gmm=None): @staticmethod def _unsup_gmm_calibration(scores): """Performs unsupervised calibration on the scores by training a GMM.""" - mask = np.triu(np.ones(scores.shape, dtype=np.bool), 1) + mask = np.triu(np.ones(scores.shape, dtype=bool), 1) scores_r = 
scores[mask].ravel()[:, None] # N x 1 gmm_1c = GMM(num_comp=1) gmm_1c.fit(scores_r, epochs=1) diff --git a/hyperion/np/pdfs/jfa/jfa_total.py b/hyperion/np/pdfs/jfa/jfa_total.py index 6e2b79e3..97450e0e 100644 --- a/hyperion/np/pdfs/jfa/jfa_total.py +++ b/hyperion/np/pdfs/jfa/jfa_total.py @@ -281,7 +281,7 @@ def TT(self): def _upptr(self): """Upper triangular mask.""" if self.__upptr is None: - self.__upptr = np.triu(np.ones(self.y_dim, dtype=np.bool)) + self.__upptr = np.triu(np.ones(self.y_dim, dtype=bool)) return self.__upptr @staticmethod diff --git a/hyperion/utils/info_table.py b/hyperion/utils/info_table.py index ad8d3d68..70ec49a0 100644 --- a/hyperion/utils/info_table.py +++ b/hyperion/utils/info_table.py @@ -347,7 +347,7 @@ def get_loc(self, keys): if isinstance(loc, int): return loc - if isinstance(loc, np.ndarray) and loc.dtype == np.bool: + if isinstance(loc, np.ndarray) and loc.dtype == bool: return np.nonzero(loc)[0] return list(range(loc.start, loc.stop, loc.step)) diff --git a/hyperion/utils/rttm.py b/hyperion/utils/rttm.py index c691fc17..db7c0fae 100644 --- a/hyperion/utils/rttm.py +++ b/hyperion/utils/rttm.py @@ -636,7 +636,7 @@ def get_bin_sample_mask_for_spk( tend[tend > max_samples] = max_samples - vad = np.zeros((max_samples,), dtype=np.bool) + vad = np.zeros((max_samples,), dtype=bool) for i, j in zip(tbeg, tend): if j > i: vad[i:j] = True diff --git a/hyperion/utils/vad_utils.py b/hyperion/utils/vad_utils.py index 2d68bc5c..4f3f980e 100644 --- a/hyperion/utils/vad_utils.py +++ b/hyperion/utils/vad_utils.py @@ -135,7 +135,7 @@ def vad_timestamps_to_bin( if max_frames is not None and num_frames < max_frames: num_frames = max_frames - vad = np.zeros((num_frames,), dtype=np.bool) + vad = np.zeros((num_frames,), dtype=bool) frame_start = np.ceil( (in_timestamps[:, 0] - (pad + frame_center)) / frame_shift ).astype(dtype=np.int) @@ -242,7 +242,7 @@ def intersect_segment_timestamps_with_vad(in_timestamps, vad_timestamps): vad_start = vad_timestamps[:, 0] vad_end = vad_timestamps[:, 1] num_vad_segs = len(vad_start) - speech_idx = np.zeros((in_timestamps.shape[0],), dtype=np.bool) + speech_idx = np.zeros((in_timestamps.shape[0],), dtype=bool) out_timestamps = [] out_timestamps2speech_segs = [] count_speech = 0 From ad9de1c5f449219856e57a6f37410b1bcc1d45c8 Mon Sep 17 00:00:00 2001 From: Jesus Villalba Date: Tue, 11 Jun 2024 09:13:47 -0400 Subject: [PATCH 150/154] debug gmm --- hyperion/np/clustering/kmeans.py | 4 ++-- hyperion/np/pdfs/mixtures/exp_family_mixture.py | 3 +++ hyperion/np/pdfs/mixtures/gmm_diag_cov.py | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/hyperion/np/clustering/kmeans.py b/hyperion/np/clustering/kmeans.py index 82d257d1..59983cae 100644 --- a/hyperion/np/clustering/kmeans.py +++ b/hyperion/np/clustering/kmeans.py @@ -188,14 +188,14 @@ def _compute_centroids(self, x, index): for k in range(self.num_clusters): r = index == k if np.sum(r) > 0: - mu[k] = np.mean(x[index == k], axis=0) + mu[k] = np.mean(x[r], axis=0) return mu @staticmethod def _compute_centroid(x, index, k): r = index == k if np.sum(r) > 0: - return np.mean(x[index == k], axis=0) + return np.mean(x[r], axis=0) else: return None diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py index d1cf7f68..91ca19a2 100644 --- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py @@ -2,6 +2,7 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 
diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
index d1cf7f68..91ca19a2 100644
--- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py
+++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
@@ -2,6 +2,7 @@
  Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
+
 import logging
 
 import numpy as np
@@ -104,6 +105,8 @@ def fit(
             )
             elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val)
 
+        print(self.log_prob(x, mode="nat"), self.log_prob(x, mode="std"))
+
         if x_val is None:
             return elbo, elbo / x.shape[0]
         else:
diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
index a5135190..f8fab693 100644
--- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
+++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
@@ -180,9 +180,11 @@ def Mstep(self, N, u_x):
 
         if self.update_Lambda:
             S = S / N[:, None] - self.mu**2
+            print("1", S)
             S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0)
             S_floor = np.maximum(S_floor, 1e-10)
             S = np.maximum(S, S_floor)
+            print("2", S)
             print(np.min(S))
             self.Lambda = 1 / S
             self._Sigma = S

From e27af1b86cea33d190a9368b98814352d9b2d21b Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Tue, 11 Jun 2024 09:26:56 -0400
Subject: [PATCH 151/154] debug gmm

---
 hyperion/np/pdfs/mixtures/exp_family_mixture.py | 6 +++++-
 hyperion/np/pdfs/mixtures/gmm_diag_cov.py       | 9 +++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
index 91ca19a2..6fd2a5b1 100644
--- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py
+++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
@@ -105,7 +105,11 @@ def fit(
             )
             elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val)
 
-        print(self.log_prob(x, mode="nat"), self.log_prob(x, mode="std"))
+        print(
+            elbo[epoch],
+            np.mean(self.log_prob(x, mode="nat")),
+            np.mean(self.log_prob(x, mode="std")),
+        )
 
         if x_val is None:
             return elbo, elbo / x.shape[0]
diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
index f8fab693..29c3a571 100644
--- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
+++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
@@ -180,12 +180,9 @@ def Mstep(self, N, u_x):
 
         if self.update_Lambda:
             S = S / N[:, None] - self.mu**2
-            print("1", S)
-            S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0)
-            S_floor = np.maximum(S_floor, 1e-10)
-            S = np.maximum(S, S_floor)
-            print("2", S)
-            print(np.min(S))
+            # S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0)
+            # S_floor = np.maximum(S_floor, 1e-10)
+            # S = np.maximum(S, S_floor)
             self.Lambda = 1 / S
             self._Sigma = S
             self._cholLambda = None
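PATCH 151 disables the variance floor to expose the numerical problem being debugged; PATCH 153 below restores it. The floor matters because in the M-step above `Lambda = 1 / S`, so a component that collapses onto very few points can drive a variance to numerical zero and its precision to infinity. A minimal sketch of the flooring step, assuming `S` has shape `(num_comp, x_dim)` and `N` holds the per-component soft counts:

```python
import numpy as np

def floor_variances(S, N, var_floor, min_N, abs_floor=1e-10):
    # Floor each dimension at var_floor times the average variance of
    # well-populated components, with a tiny absolute floor as a backstop,
    # so the precisions 1/S stay finite.
    S_floor = var_floor * np.mean(S[N > min_N], axis=0)
    S_floor = np.maximum(S_floor, abs_floor)
    return np.maximum(S, S_floor)
```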
From 44f7abba12dfc83806982ad9a15eb19e11890e87 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Tue, 11 Jun 2024 09:44:03 -0400
Subject: [PATCH 152/154] debug gmm

---
 hyperion/np/pdfs/mixtures/exp_family_mixture.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
index 6fd2a5b1..01181b61 100644
--- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py
+++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
@@ -106,7 +106,7 @@ def fit(
 
         print(
-            elbo[epoch],
+            elbo[epoch] / x.shape[0],
             np.mean(self.log_prob(x, mode="nat")),
             np.mean(self.log_prob(x, mode="std")),
         )
@@ -210,7 +210,6 @@ def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None):
         N = np.sum(z, axis=0)
         acc_u_x = np.dot(z.T, u_x)
-        # L_z=gmm.ElnP_z_w(N,gmm.lnw)-gmm.Elnq_z(z);
         return N, acc_u_x
 
     def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size):
@@ -473,8 +472,8 @@ def sum_suff_stats(self, N, u_x):
           Accumalted N and u_x.
         """
         assert len(N) == len(u_x)
-        acc_N = N[1]
-        acc_u_x = u_x[1]
+        acc_N = N[0]
+        acc_u_x = u_x[0]
         for i in range(1, len(N)):
             acc_N += N[i]
             acc_u_x += u_x[i]
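Besides normalizing the debug print by the number of samples, this patch fixes an off-by-one in `sum_suff_stats`: seeding the reduction with `N[1]`/`u_x[1]` dropped the first batch of statistics and double-counted the second. A minimal sketch of the corrected accumulation, with defensive copies added here (an assumption, not in the patch) so the in-place `+=` does not mutate the caller's first batch:

```python
import numpy as np

def sum_suff_stats(N, u_x):
    """Sum per-batch sufficient statistics into single accumulators."""
    assert len(N) == len(u_x)
    acc_N = N[0].copy()      # seed from element 0, not element 1
    acc_u_x = u_x[0].copy()
    for i in range(1, len(N)):
        acc_N += N[i]
        acc_u_x += u_x[i]
    return acc_N, acc_u_x
```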
From 025a1376bee1bccfae88bbc3eff2f03c2b1b7643 Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Tue, 11 Jun 2024 09:56:04 -0400
Subject: [PATCH 153/154] debug gmm

---
 hyperion/np/pdfs/mixtures/exp_family_mixture.py | 6 ------
 hyperion/np/pdfs/mixtures/gmm_diag_cov.py       | 6 +++---
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/hyperion/np/pdfs/mixtures/exp_family_mixture.py b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
index 01181b61..e1355dc5 100644
--- a/hyperion/np/pdfs/mixtures/exp_family_mixture.py
+++ b/hyperion/np/pdfs/mixtures/exp_family_mixture.py
@@ -105,12 +105,6 @@ def fit(
             )
             elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val)
 
-        print(
-            elbo[epoch] / x.shape[0],
-            np.mean(self.log_prob(x, mode="nat")),
-            np.mean(self.log_prob(x, mode="std")),
-        )
-
         if x_val is None:
             return elbo, elbo / x.shape[0]
         else:
diff --git a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
index 29c3a571..c3985aef 100644
--- a/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
+++ b/hyperion/np/pdfs/mixtures/gmm_diag_cov.py
@@ -180,9 +180,9 @@ def Mstep(self, N, u_x):
 
         if self.update_Lambda:
             S = S / N[:, None] - self.mu**2
-            # S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0)
-            # S_floor = np.maximum(S_floor, 1e-10)
-            # S = np.maximum(S, S_floor)
+            S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0)
+            S_floor = np.maximum(S_floor, 1e-10)
+            S = np.maximum(S, S_floor)
             self.Lambda = 1 / S
             self._Sigma = S
             self._cholLambda = None

From 9303c0d4f141ae249d05a7f4cfbe811216ee422d Mon Sep 17 00:00:00 2001
From: Jesus Villalba
Date: Sat, 20 Jul 2024 12:34:41 -0400
Subject: [PATCH 154/154] remove nan assert from glob pool

---
 README.md                            | 17 +------------
 hyperion/torch/data/audio_dataset.py |  2 +-
 hyperion/torch/layers/global_pool.py | 33 ++++++++++++++--------------
 hyperion/utils/class_info.py         |  2 +-
 hyperion/utils/trial_key.py          |  4 ++--
 5 files changed, 22 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 6f7a8490..04f4d269 100644
--- a/README.md
+++ b/README.md
@@ -52,27 +52,12 @@ conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit
 git clone https://github.com/hyperion-ml/hyperion.git
 ```
 
-- You can choose to install hyperion in the environment
+- Then install hyperion in the environment
 ```bash
 cd hyperion
 pip install -e .
 ```
 
-- Or add the hyperion toolkit to the PYTHONPATH envirnoment variable
-  This option will allow you to share the same environment if you are working with several hyperion branches
-  at the same time, while installing it requires to have an enviroment per branch.
-  For this, you need to install the requirements
-```bash
-cd hyperion
-pip install -r requirements.txt
-```
-Then add these lines to your `~/.bashrc` or to each script that uses hyperion
-```bash
-HYP_ROOT= #substitute this by your hyperion location
-export PYTHONPATH=${HYP_ROOT}:$PYTHONPATH
-export PATH=${HYP_ROOT}/bin:$PATH
-```
-
 ## Recipes
 
 There are recipes for several tasks in the `./egs` directory.
diff --git a/hyperion/torch/data/audio_dataset.py b/hyperion/torch/data/audio_dataset.py
index 9d8bebc6..a8e45bda 100644
--- a/hyperion/torch/data/audio_dataset.py
+++ b/hyperion/torch/data/audio_dataset.py
@@ -153,7 +153,7 @@ def _load_legacy_durations(self, time_durs_file):
         time_durs = SegmentSet.load(time_durs_file)
         self.seg_set["duration"] = time_durs.loc[
             self.seg_set["id"]
-        ].class_id.values.astype(np.float, copy=False)
+        ].class_id.values.astype(float, copy=False)
 
     def _load_bpe_model(self, bpe_model, is_val):
         if self.rank == 0:
diff --git a/hyperion/torch/layers/global_pool.py b/hyperion/torch/layers/global_pool.py
index aa14f743..f4174e3d 100644
--- a/hyperion/torch/layers/global_pool.py
+++ b/hyperion/torch/layers/global_pool.py
@@ -2,6 +2,7 @@
  Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
  Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 """
+
 import logging
 import math
 
@@ -201,7 +202,7 @@ def forward(self, x, x_lengths=None, weights=None):
             # this can produce slightly negative variance when relu6 saturates in all time steps
             # add 1e-5 for stability
             s = torch.sqrt(
-                torch.mean(delta ** 2, dim=self.dim, keepdim=False).clamp(min=SQRT_EPS)
+                torch.mean(delta**2, dim=self.dim, keepdim=False).clamp(min=SQRT_EPS)
             )
 
             mus = torch.cat((mu, s), dim=1)
@@ -214,7 +215,7 @@ def forward(self, x, x_lengths=None, weights=None):
         wbar = torch.mean(weights, dim=self.dim, keepdim=True)
         mu = xbar / wbar
         delta = x - mu
-        var = torch.mean(weights * delta ** 2, dim=self.dim, keepdim=True) / wbar
+        var = torch.mean(weights * delta**2, dim=self.dim, keepdim=True) / wbar
         s = torch.sqrt(var.clamp(min=SQRT_EPS))
         mu = mu.squeeze(self.dim)
         s = s.squeeze(self.dim)
@@ -254,9 +255,9 @@ def _forward_slidwin_int(self, x, win_length, win_shift, snip_edges):
 
         c_x = torch.cumsum(x, dim=-1).view(-1, x.shape[-1])
         m_x = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length
-        c_x = torch.cumsum(x ** 2, dim=-1).view(-1, x.shape[-1])
+        c_x = torch.cumsum(x**2, dim=-1).view(-1, x.shape[-1])
         m_x2 = (c_x[:, win_shift:] - c_x[:, :-win_shift]) / win_length
-        s_x = torch.sqrt(m_x2 - m_x ** 2).clamp(min=SQRT_EPS)
+        s_x = torch.sqrt(m_x2 - m_x**2).clamp(min=SQRT_EPS)
 
         mus = self._post_slidwin(m_x, s_x, out_shape)
         return mus
@@ -265,7 +266,7 @@ def _forward_slidwin_float(self, x, win_length, win_shift, snip_edges):
         x, out_shape = self._pre_slidwin(x, win_length, win_shift, snip_edges)
         num_frames = out_shape[-1]
         c_x = torch.cumsum(x, dim=-1).view(-1, x.shape[-1])
-        c_x2 = torch.cumsum(x ** 2, dim=-1).view(-1, x.shape[-1])
+        c_x2 = torch.cumsum(x**2, dim=-1).view(-1, x.shape[-1])
 
         # xx = x.view(-1, x.shape[-1])
         # print(xx.shape[1])
@@ -309,7 +310,7 @@ def _forward_slidwin_float(self, x, win_length, win_shift, snip_edges):
 
             k += win_shift
 
-        var_x = (m_x2 - m_x ** 2).clamp(min=SQRT_EPS)
+        var_x = (m_x2 - m_x**2).clamp(min=SQRT_EPS)
         s_x = torch.sqrt(var_x)
         # idx = torch.isnan(s_x) #.any(dim=1)
         # if torch.sum(idx) > 0:
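The global_pool hunks above are mostly black-style reformatting (`x ** 2` to `x**2`), but they sit next to the file's real NaN defense: every variance is clamped before a sqrt or log, because with relu-like activations E[x^2] - E[x]^2 can come out slightly negative in floating point, and one NaN poisons the pooled embedding (the hard asserts are commented out in the hunks that follow). A minimal sketch of the pattern, assuming `SQRT_EPS` is a small constant such as 1e-5:

```python
import torch

SQRT_EPS = 1e-5  # assumed value; plays the role of the module's constant

def mean_std_pool(x, dim=-1):
    # Mean/std pooling over time with the variance clamped before sqrt,
    # so tiny negative values from floating-point cancellation cannot
    # produce NaNs in the pooled statistics.
    mu = torch.mean(x, dim=dim)
    delta = x - mu.unsqueeze(dim)
    var = torch.mean(delta**2, dim=dim).clamp(min=SQRT_EPS)
    return torch.cat((mu, torch.sqrt(var)), dim=-1)
```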
@@ -400,14 +401,14 @@ def forward(self, x, x_lengths=None, weights=None):
         weights = self._standardize_weights(x, x_lengths, weights)
         if weights is None:
             mu = torch.mean(x, dim=self.dim, keepdim=self.keepdim)
-            x2bar = torch.mean(x ** 2, dim=self.dim, keepdim=self.keepdim)
+            x2bar = torch.mean(x**2, dim=self.dim, keepdim=self.keepdim)
             logvar = torch.log(x2bar - mu * mu + 1e-5)  # for stability in case var=0
             return torch.cat((mu, logvar), dim=-1)
 
         xbar = torch.mean(weights * x, dim=self.dim, keepdim=self.keepdim)
         wbar = torch.mean(weights, dim=self.dim, keepdim=self.keepdim)
         mu = xbar / wbar
-        x2bar = torch.mean(weights * x ** 2, dim=self.dim, keepdim=self.keepdim) / wbar
+        x2bar = torch.mean(weights * x**2, dim=self.dim, keepdim=self.keepdim) / wbar
         var = (x2bar - mu * mu).clamp(min=1e-5)
         logvar = torch.log(var)
 
@@ -444,7 +445,7 @@ def __init__(
         if dist_pow == 1:
             self.dist_f = lambda x: torch.norm(x, p=2, dim=-1)
         else:
-            self.dist_f = lambda x: torch.sum(x ** 2, dim=-1)
+            self.dist_f = lambda x: torch.sum(x**2, dim=-1)
 
         self.size_multiplier = num_comp
 
@@ -503,7 +504,7 @@ def forward(self, x, x_lengths=None, weights=None):
 
         delta = x - self.mu  # (batch, time, num_comp, feat_dim)
         dist = self.dist_f(delta)  # (batch, time, num_comp)
-        llk = -self.prec ** 2 * dist + self.bias
+        llk = -self.prec**2 * dist + self.bias
         r = nnf.softmax(llk, dim=-1)  # (batch, time, num_comp)
         if weights is not None:
             r *= weights
@@ -778,9 +779,9 @@ def forward(self, x, x_lengths=None, weights=None):
         # x = (batch, feat_dim, time)
         weights = self._standardize_weights(x, x_lengths, weights)  # (batch, 1, time)
         x_inner = self.conv1(x)  # (batch, inner_dim, time)
-        assert not torch.any(
-            torch.isnan(x_inner)
-        ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(x))} {torch.mean(x)} {torch.sum(torch.isinf(x))} {x.size()}"
+        # assert not torch.any(
+        #     torch.isnan(x_inner)
+        # ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(x))} {torch.mean(x)} {torch.sum(torch.isinf(x))} {x.size()}"
         # assert not torch.any(
         #     torch.isinf(x_inner)
         # ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(x))}"
@@ -788,9 +789,9 @@ def forward(self, x, x_lengths=None, weights=None):
         if self.use_global_context:
             global_mus = self.stats_pool(x, weights=weights)
             x_inner = x_inner + self.lin_global(global_mus).unsqueeze(-1)
-        assert not torch.any(
-            torch.isnan(x_inner)
-        ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(global_mus))}"
+        # assert not torch.any(
+        #     torch.isnan(x_inner)
+        # ), f"xinner is nan {torch.sum(torch.isnan(x_inner))} {torch.sum(torch.isnan(global_mus))}"
         # assert not torch.any(
         #     torch.isinf(x_inner)
         # ), f"xinner is inf {torch.sum(torch.isinf(x_inner))} {torch.sum(torch.isinf(global_mus))}"
diff --git a/hyperion/utils/class_info.py b/hyperion/utils/class_info.py
index 3cb03659..b3a08178 100644
--- a/hyperion/utils/class_info.py
+++ b/hyperion/utils/class_info.py
@@ -76,7 +76,7 @@ def load(cls, file_path, sep=None):
                 sep=" ",
                 header=None,
                 names=["id"],
-                dtype={"id": np.str},
+                dtype={"id": str},
             )
 
         return cls(df)
diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py
index 5d8019b6..539a049d 100644
--- a/hyperion/utils/trial_key.py
+++ b/hyperion/utils/trial_key.py
@@ -12,7 +12,7 @@
 import pandas as pd
 
 # from .list_utils import *
-from .list_utils import sort, intersect, ismember, split_list, list2ndarray
+from .list_utils import intersect, ismember, list2ndarray, sort, split_list
 
 from .trial_ndx import TrialNdx
 
@@ -421,7 +421,7 @@ def filter(self, model_set, seg_set, keep=True):
             assert np.all(f)
 
         model_set = self.model_set[mod_idx]
-        set_set = self.seg_set[seg_idx]
+        seg_set = self.seg_set[seg_idx]
         ix = np.ix_(mod_idx, seg_idx)
         tar = self.tar[ix]
         non = self.non[ix]
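The trial_key.py change at the end fixes a silent typo: the filtered segment list was assigned to `set_set`, so `seg_set` kept its unfiltered value while `tar` and `non` had already been subset, leaving the key internally inconsistent. A minimal sketch of the intended filtering, assuming the sets are numpy arrays of ids and using plain index lookups in place of the toolkit's `ismember` helper:

```python
import numpy as np

def filter_key(model_set, seg_set, tar, non, keep_models, keep_segs):
    # Indices of the kept models/segments in the original ordering.
    mod_idx = np.array([np.flatnonzero(model_set == m)[0] for m in keep_models])
    seg_idx = np.array([np.flatnonzero(seg_set == s)[0] for s in keep_segs])
    ix = np.ix_(mod_idx, seg_idx)  # outer product of row/column indices
    # Filter the ids and the tar/non matrices consistently.
    return model_set[mod_idx], seg_set[seg_idx], tar[ix], non[ix]
```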