[Bug] XTTS Training with LJSpeech leads to weird inference output #362

Description

@regedavid

Describe the bug

I tried training the XTTS model on the LJSpeech dataset, using the XTTS v2 training script from the recipes folder.
However, when doing inference I don't get any speech, only a loud metallic buzzing noise. Inference with the original model works fine. I also tried training with a custom tokenizer and custom mel_stats, but the issue persists. I am sharing my training script, inference script, and config.json file below. Thank you in advance.

To Reproduce

Training Script

import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig
from TTS.tts.models.xtts import XttsAudioConfig
from TTS.utils.manage import ModelManager

# Logging parameters
RUN_NAME = "GPT_XTTS_v2.0_LJSpeech_FT"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None
output_path = os.path.dirname(os.path.abspath(__file__))
# Set here the path where the checkpoints will be saved. Default: ./run/training/
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")

# Training Parameters
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training, set this to False
START_WITH_EVAL = True  # if True, training starts with an evaluation pass
BATCH_SIZE = 32  # set here the batch size
GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.
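# With the values above, one optimizer step covers BATCH_SIZE * GRAD_ACUMM_STEPS = 32 * 84 = 2688 samples.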

# Define here the dataset that you want to use for fine-tuning.
config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path=os.path.join(output_path, "../LJSpeech-1.1/"),
    meta_file_train="metadata.csv",
    language="en",
)

# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]

# Define the path where the XTTS v2.0 files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


# DVAE files
DVAE_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/dvae.pth"
MEL_NORM_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)


# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth"

# XTTS transfer-learning parameters: provide the path to the XTTS model checkpoint that you want to fine-tune.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file

# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
    )


# Speaker reference for generating test sentences during training
SPEAKER_REFERENCE = [
    os.path.join(output_path, "../LJSpeech-1.1/wavs/LJ001-0002.wav")  # speaker reference to be used in training test sentences
]
LANGUAGE = config_dataset.language

TOKENIZER_FILE_LJS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tokenizer_ljs.json')
MEL_NORM_FILE_LJS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mel_mean_ljs.pth')
XTTS_CHECKPOINT = os.path.join(OUT_PATH, 'GPT_XTTS_v2.0_LJSpeech_FT-April-02-2025_09+37PM-0000000/model.pth')
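# Note: the three paths above are the ones actually passed to GPTArgs below; in
# particular, XTTS_CHECKPOINT is reassigned here, so fine-tuning starts from the
# checkpoint of an earlier run (April-02) rather than the downloaded v2.0 model.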

def main():
    # init args and config
    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=255995,  # ~11.6 seconds
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE_LJS,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
        tokenizer_file=TOKENIZER_FILE_LJS,
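        # audio-token bookkeeping: 1026 tokens = 1024 DVAE codebook entries plus the start (1024) and stop (1025) ids below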
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )
    # define audio config
    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
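    # sample_rate/dvae_sample_rate apply to the training audio and the DVAE; output_sample_rate is the 24 kHz decoder output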
    # training parameters config
    config = GPTTrainerConfig(
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="""
            GPT XTTS training
            """,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=int(BATCH_SIZE / 2),
        num_loader_workers=4,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=1000,
        save_step=10000,
        save_n_checkpoints=0,
        save_checkpoints=False,
        # target_loss="loss",
        print_eval=False,
        # Optimizer values as in Tortoise: PyTorch AdamW, modified so weight decay is not applied to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=5e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # adjusted accordingly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
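        # 50000*18 = 900000, 150000*18 = 2700000, 300000*18 = 5400000 -- the milestone values that appear in the saved config.json below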
        test_sentences=[
            {
                "text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "This cake is great. It's so delicious and moist.",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
        ],
        languages=[LANGUAGE],
    )

    # init the model from config
    model = GPTTrainer.init_from_config(config)

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the trainer and 🚀
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    trainer.fit()


if __name__ == "__main__":
    main()

Inference Script

import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Get the directory of the current file
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TRAINING_DIR = os.path.join(CURRENT_DIR, "run/training")
TRAIN_CHECKPOINT_DIR = os.path.join(TRAINING_DIR, "GPT_XTTS_v2.0_LJSpeech_FT-April-03-2025_11+52AM-0000000")
# Add here the xtts_config path
CONFIG_PATH = os.path.join(TRAIN_CHECKPOINT_DIR, 'config.json')
# Add here the vocab file that you have used to train the model
TOKENIZER_PATH = os.path.join(CURRENT_DIR, 'tokenizer_ljs.json')
# Add here the checkpoint that you want to do inference with
XTTS_CHECKPOINT = os.path.join(TRAIN_CHECKPOINT_DIR, 'best_model.pth')
# Get the parent directory of the current file
PARENT_DIR = os.path.dirname(CURRENT_DIR)
SPEAKER_REFERENCE = os.path.join(PARENT_DIR, "LJSpeech-1.1/wavs/LJ001-0002.wav")  # speaker reference used for voice cloning at inference
# output wav path
OUTPUT_WAV_PATH = "xtts-ft_3.wav"

print("Loading model...")
config = XttsConfig()
config.load_json(CONFIG_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENIZER_PATH, use_deepspeed=False)
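# note: the model is not moved to the GPU here (no model.cuda()), so inference runs on the CPU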

print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])

print("Inference...")
out = model.inference(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.7, # Add custom parameters here
)
torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)
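
For reference, the working baseline mentioned above ("inference with the original model worked fine") can be reproduced through the same API. This is only a sketch: ORIG_DIR mirrors CHECKPOINTS_OUT_PATH from the training script, and the stock config.json is assumed to come from the XTTS-v2 release files, not from this training run.

# Hypothetical baseline check with the original XTTS v2.0 files
ORIG_DIR = os.path.join(TRAINING_DIR, "XTTS_v2.0_original_model_files")
ORIG_CONFIG = os.path.join(ORIG_DIR, "config.json")  # assumed: stock config from the XTTS-v2 release

base_config = XttsConfig()
base_config.load_json(ORIG_CONFIG)
base_model = Xtts.init_from_config(base_config)
base_model.load_checkpoint(
    base_config,
    checkpoint_path=os.path.join(ORIG_DIR, "model.pth"),
    vocab_path=os.path.join(ORIG_DIR, "vocab.json"),  # stock tokenizer, not tokenizer_ljs.json
    use_deepspeed=False,
)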

config.json file

{
    "output_path": "/projects/coqui/LJTraining/run/training",
    "logger_uri": null,
    "run_name": "GPT_XTTS_v2.0_LJSpeech_FT",
    "project_name": "XTTS_trainer",
    "run_description": "\n            GPT XTTS training\n            ",
    "print_step": 50,
    "plot_step": 100,
    "model_param_stats": false,
    "wandb_entity": null,
    "dashboard_logger": "tensorboard",
    "save_on_interrupt": true,
    "log_model_step": 1000,
    "save_step": 10000,
    "save_n_checkpoints": 0,
    "save_checkpoints": false,
    "save_all_best": false,
    "save_best_after": 0,
    "target_loss": null,
    "print_eval": false,
    "test_delay_epochs": 0,
    "run_eval": true,
    "run_eval_steps": null,
    "distributed_backend": "nccl",
    "distributed_url": "tcp://localhost:54321",
    "mixed_precision": false,
    "precision": "fp16",
    "epochs": 1000,
    "batch_size": 32,
    "eval_batch_size": 16,
    "grad_clip": 0.0,
    "scheduler_after_epoch": true,
    "lr": 5e-06,
    "optimizer": "AdamW",
    "optimizer_params": {
        "betas": [
            0.9,
            0.96
        ],
        "eps": 1e-08,
        "weight_decay": 0.01
    },
    "lr_scheduler": "MultiStepLR",
    "lr_scheduler_params": {
        "milestones": [
            900000,
            2700000,
            5400000
        ],
        "gamma": 0.5,
        "last_epoch": -1
    },
    "use_grad_scaler": false,
    "allow_tf32": false,
    "cudnn_enable": true,
    "cudnn_deterministic": false,
    "cudnn_benchmark": false,
    "training_seed": 1,
    "model": "xtts",
    "num_loader_workers": 4,
    "num_eval_loader_workers": 0,
    "use_noise_augment": false,
    "audio": {
        "sample_rate": 22050,
        "output_sample_rate": 24000,
        "dvae_sample_rate": 22050
    },
    "use_phonemes": false,
    "phonemizer": null,
    "phoneme_language": null,
    "compute_input_seq_cache": false,
    "text_cleaner": null,
    "enable_eos_bos_chars": false,
    "test_sentences_file": "",
    "phoneme_cache_path": null,
    "characters": null,
    "add_blank": false,
    "batch_group_size": 48,
    "loss_masking": null,
    "min_audio_len": 1,
    "max_audio_len": Infinity,
    "min_text_len": 1,
    "max_text_len": Infinity,
    "compute_f0": false,
    "compute_energy": false,
    "compute_linear_spec": false,
    "precompute_num_workers": 0,
    "start_by_longest": false,
    "shuffle": false,
    "drop_last": false,
    "datasets": [
        {
            "formatter": "",
            "dataset_name": "",
            "path": "",
            "meta_file_train": "",
            "ignored_speakers": null,
            "language": "",
            "phonemizer": "",
            "meta_file_val": "",
            "meta_file_attn_mask": ""
        }
    ],
    "test_sentences": [
        {
            "text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "speaker_wav": [
                "/projects/coqui/LJTraining/../LJSpeech-1.1/wavs/LJ001-0002.wav"
            ],
            "language": "en"
        },
        {
            "text": "This cake is great. It's so delicious and moist.",
            "speaker_wav": [
                "/projects/coqui/LJTraining/../LJSpeech-1.1/wavs/LJ001-0002.wav"
            ],
            "language": "en"
        }
    ],
    "eval_split_max_size": 256,
    "eval_split_size": 0.01,
    "use_speaker_weighted_sampler": false,
    "speaker_weighted_sampler_alpha": 1.0,
    "use_language_weighted_sampler": false,
    "language_weighted_sampler_alpha": 1.0,
    "use_length_weighted_sampler": false,
    "length_weighted_sampler_alpha": 1.0,
    "model_args": {
        "gpt_batch_size": 1,
        "enable_redaction": false,
        "kv_cache": true,
        "gpt_checkpoint": "",
        "clvp_checkpoint": null,
        "decoder_checkpoint": null,
        "num_chars": 255,
        "tokenizer_file": "/projects/coqui/LJTraining/tokenizer_ljs.json",
        "gpt_max_audio_tokens": 605,
        "gpt_max_text_tokens": 402,
        "gpt_max_prompt_tokens": 70,
        "gpt_layers": 30,
        "gpt_n_model_channels": 1024,
        "gpt_n_heads": 16,
        "gpt_number_text_tokens": 21657,
        "gpt_start_text_token": 0,
        "gpt_stop_text_token": 1,
        "gpt_num_audio_tokens": 1026,
        "gpt_start_audio_token": 1024,
        "gpt_stop_audio_token": 1025,
        "gpt_code_stride_len": 1024,
        "gpt_use_masking_gt_prompt_approach": true,
        "gpt_use_perceiver_resampler": true,
        "input_sample_rate": 22050,
        "output_sample_rate": 24000,
        "output_hop_length": 256,
        "decoder_input_dim": 1024,
        "d_vector_dim": 512,
        "cond_d_vector_in_each_upsampling_layer": true,
        "duration_const": 102400,
        "min_conditioning_length": 66150,
        "max_conditioning_length": 132300,
        "gpt_loss_text_ce_weight": 0.01,
        "gpt_loss_mel_ce_weight": 1.0,
        "debug_loading_failures": false,
        "max_wav_length": 255995,
        "max_text_length": 200,
        "mel_norm_file": "/projects/coqui/LJTraining/mel_mean_ljs.pth",
        "dvae_checkpoint": "/projects/coqui/LJTraining/run/training/XTTS_v2.0_original_model_files/dvae.pth",
        "xtts_checkpoint": "/projects/coqui/LJTraining/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-02-2025_09+37PM-0000000/model.pth",
        "vocoder": ""
    },
    "model_dir": null,
    "languages": [
        "en"
    ],
    "temperature": 0.85,
    "length_penalty": 1.0,
    "repetition_penalty": 2.0,
    "top_k": 50,
    "top_p": 0.85,
    "num_gpt_outputs": 1,
    "gpt_cond_len": 12,
    "gpt_cond_chunk_len": 4,
    "max_ref_len": 10,
    "sound_norm_refs": false,
    "optimizer_wd_only_on_weights": true,
    "weighted_loss_attrs": {},
    "weighted_loss_multipliers": {},
    "github_branch": "inside_docker"
}
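
Since the run uses a custom tokenizer, here is a minimal sanity-check sketch for the config above. It assumes tokenizer_ljs.json is a HuggingFace tokenizers JSON file (the same format as the stock vocab.json) and only compares its vocabulary size against gpt_number_text_tokens; the file paths are placeholders.

import json

from tokenizers import Tokenizer

# Hypothetical check: the GPT text-embedding table must cover the tokenizer vocabulary
tok = Tokenizer.from_file("tokenizer_ljs.json")
with open("config.json") as f:  # the saved training config shown above
    cfg = json.load(f)

print("tokenizer vocab size:  ", tok.get_vocab_size())
print("gpt_number_text_tokens:", cfg["model_args"]["gpt_number_text_tokens"])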

Expected behavior

No response

Logs

Environment

{
    "CUDA": {
        "GPU": [
            "NVIDIA H100 80GB HBM3"
        ],
        "available": true,
        "version": "12.4"
    },
    "Packages": {
        "PyTorch_debug": false,
        "PyTorch_version": "2.7.0.dev20250226+cu124",
        "TTS": "0.26.0",
        "numpy": "1.26.4"
    },
    "System": {
        "OS": "Linux",
        "architecture": [
            "64bit",
            ""
        ],
        "processor": "x86_64",
        "python": "3.11.11",
        "version": "#1 SMP Thu Aug 8 17:47:08 UTC 2024"
    }
}

Additional context

No response