[Bug] XTTS Training with LJSpeech leads to weird inference output #362

Description

@regedavid

Describe the bug

I tried training the XTTS model on the LJSpeech dataset, using the XTTS v2 training script from the recipes folder.
However, when doing inference I don't get any speech, only a loud metallic buzzing noise. Inference with the original model works fine. I also tried training with a custom tokenizer and custom mel_stats, but the issue persists. I am sharing my training script, inference script, and config.json file below. Thank you in advance.

To Reproduce

Training Script

import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig
from TTS.tts.models.xtts import XttsAudioConfig
from TTS.utils.manage import ModelManager

# Logging parameters
RUN_NAME = "GPT_XTTS_v2.0_LJSpeech_FT"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None
output_path = os.path.dirname(os.path.abspath(__file__))
# Set here the path where the checkpoints will be saved. Default: ./run/training/
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")

# Training Parameters
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training, set this to False
START_WITH_EVAL = True  # if True, training starts with an evaluation pass
BATCH_SIZE = 32  # set here the batch size
GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.
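# With the values above, one optimizer step covers BATCH_SIZE * GRAD_ACUMM_STEPS = 32 * 84 = 2688 samples.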

# Define here the dataset that you want to use for fine-tuning.
config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path=os.path.join(output_path, "../LJSpeech-1.1/"),
    meta_file_train="metadata.csv",
    language="en",
)

# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]

# Define the path where the XTTS v2.0 files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


# DVAE files
DVAE_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/dvae.pth"
MEL_NORM_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)


# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth"

# XTTS transfer-learning parameters: provide the path to the XTTS model checkpoint that you want to fine-tune.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file

# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
    )


# Speaker reference for generating test sentences during training
SPEAKER_REFERENCE = [
    os.path.join(output_path, "../LJSpeech-1.1/wavs/LJ001-0002.wav")  # speaker reference to be used in training test sentences
]
LANGUAGE = config_dataset.language

TOKENIZER_FILE_LJS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tokenizer_ljs.json')
MEL_NORM_FILE_LJS = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mel_mean_ljs.pth')
XTTS_CHECKPOINT = os.path.join(OUT_PATH, 'GPT_XTTS_v2.0_LJSpeech_FT-April-02-2025_09+37PM-0000000/model.pth')
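# Note: the three paths above are the ones actually passed to GPTArgs below; in
# particular, XTTS_CHECKPOINT is reassigned here, so fine-tuning starts from the
# checkpoint of an earlier run (April-02) rather than the downloaded v2.0 model.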

def main():
    # init args and config
    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=255995,  # ~11.6 seconds
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE_LJS,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
        tokenizer_file=TOKENIZER_FILE_LJS,
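        # audio-token bookkeeping: 1026 tokens = 1024 DVAE codebook entries plus the start (1024) and stop (1025) ids below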
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )
    # define audio config
    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
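    # sample_rate/dvae_sample_rate apply to the training audio and the DVAE; output_sample_rate is the 24 kHz decoder output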
    # training parameters config
    config = GPTTrainerConfig(
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="""
            GPT XTTS training
            """,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=int(BATCH_SIZE / 2),
        num_loader_workers=4,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=1000,
        save_step=10000,
        save_n_checkpoints=0,
        save_checkpoints=False,
        # target_loss="loss",
        print_eval=False,
        # Optimizer values as in Tortoise: PyTorch AdamW, modified so weight decay is not applied to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=5e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # adjusted accordingly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
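        # 50000*18 = 900000, 150000*18 = 2700000, 300000*18 = 5400000 -- the milestone values that appear in the saved config.json below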
        test_sentences=[
            {
                "text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "This cake is great. It's so delicious and moist.",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
        ],
        languages=[LANGUAGE],
    )

    # init the model from config
    model = GPTTrainer.init_from_config(config)

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the trainer and 🚀
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    trainer.fit()


if __name__ == "__main__":
    main()

Inference Script

import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Get the directory of the current file
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TRAINING_DIR = os.path.join(CURRENT_DIR, "run/training")
TRAIN_CHECKPOINT_DIR = os.path.join(TRAINING_DIR, "GPT_XTTS_v2.0_LJSpeech_FT-April-03-2025_11+52AM-0000000")
# Add here the xtts_config path
CONFIG_PATH = os.path.join(TRAIN_CHECKPOINT_DIR, 'config.json')
# Add here the vocab file that you have used to train the model
TOKENIZER_PATH = os.path.join(CURRENT_DIR, 'tokenizer_ljs.json')
# Add here the checkpoint that you want to do inference with
XTTS_CHECKPOINT = os.path.join(TRAIN_CHECKPOINT_DIR, 'best_model.pth')
# Get the parent directory of the current file
PARENT_DIR = os.path.dirname(CURRENT_DIR)
SPEAKER_REFERENCE = os.path.join(PARENT_DIR, "LJSpeech-1.1/wavs/LJ001-0002.wav")  # speaker reference used for voice cloning at inference
# output wav path
OUTPUT_WAV_PATH = "xtts-ft_3.wav"

print("Loading model...")
config = XttsConfig()
config.load_json(CONFIG_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENIZER_PATH, use_deepspeed=False)
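# note: the model is not moved to the GPU here (no model.cuda()), so inference runs on the CPU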

print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])

print("Inference...")
out = model.inference(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.7, # Add custom parameters here
)
torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)
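
For reference, the working baseline mentioned above ("inference with the original model worked fine") can be reproduced through the same API. This is only a sketch: ORIG_DIR mirrors CHECKPOINTS_OUT_PATH from the training script, and the stock config.json is assumed to come from the XTTS-v2 release files, not from this training run.

# Hypothetical baseline check with the original XTTS v2.0 files
ORIG_DIR = os.path.join(TRAINING_DIR, "XTTS_v2.0_original_model_files")
ORIG_CONFIG = os.path.join(ORIG_DIR, "config.json")  # assumed: stock config from the XTTS-v2 release

base_config = XttsConfig()
base_config.load_json(ORIG_CONFIG)
base_model = Xtts.init_from_config(base_config)
base_model.load_checkpoint(
    base_config,
    checkpoint_path=os.path.join(ORIG_DIR, "model.pth"),
    vocab_path=os.path.join(ORIG_DIR, "vocab.json"),  # stock tokenizer, not tokenizer_ljs.json
    use_deepspeed=False,
)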

config.json file

{
    "output_path": "/projects/coqui/LJTraining/run/training",
    "logger_uri": null,
    "run_name": "GPT_XTTS_v2.0_LJSpeech_FT",
    "project_name": "XTTS_trainer",
    "run_description": "\n            GPT XTTS training\n            ",
    "print_step": 50,
    "plot_step": 100,
    "model_param_stats": false,
    "wandb_entity": null,
    "dashboard_logger": "tensorboard",
    "save_on_interrupt": true,
    "log_model_step": 1000,
    "save_step": 10000,
    "save_n_checkpoints": 0,
    "save_checkpoints": false,
    "save_all_best": false,
    "save_best_after": 0,
    "target_loss": null,
    "print_eval": false,
    "test_delay_epochs": 0,
    "run_eval": true,
    "run_eval_steps": null,
    "distributed_backend": "nccl",
    "distributed_url": "tcp://localhost:54321",
    "mixed_precision": false,
    "precision": "fp16",
    "epochs": 1000,
    "batch_size": 32,
    "eval_batch_size": 16,
    "grad_clip": 0.0,
    "scheduler_after_epoch": true,
    "lr": 5e-06,
    "optimizer": "AdamW",
    "optimizer_params": {
        "betas": [
            0.9,
            0.96
        ],
        "eps": 1e-08,
        "weight_decay": 0.01
    },
    "lr_scheduler": "MultiStepLR",
    "lr_scheduler_params": {
        "milestones": [
            900000,
            2700000,
            5400000
        ],
        "gamma": 0.5,
        "last_epoch": -1
    },
    "use_grad_scaler": false,
    "allow_tf32": false,
    "cudnn_enable": true,
    "cudnn_deterministic": false,
    "cudnn_benchmark": false,
    "training_seed": 1,
    "model": "xtts",
    "num_loader_workers": 4,
    "num_eval_loader_workers": 0,
    "use_noise_augment": false,
    "audio": {
        "sample_rate": 22050,
        "output_sample_rate": 24000,
        "dvae_sample_rate": 22050
    },
    "use_phonemes": false,
    "phonemizer": null,
    "phoneme_language": null,
    "compute_input_seq_cache": false,
    "text_cleaner": null,
    "enable_eos_bos_chars": false,
    "test_sentences_file": "",
    "phoneme_cache_path": null,
    "characters": null,
    "add_blank": false,
    "batch_group_size": 48,
    "loss_masking": null,
    "min_audio_len": 1,
    "max_audio_len": Infinity,
    "min_text_len": 1,
    "max_text_len": Infinity,
    "compute_f0": false,
    "compute_energy": false,
    "compute_linear_spec": false,
    "precompute_num_workers": 0,
    "start_by_longest": false,
    "shuffle": false,
    "drop_last": false,
    "datasets": [
        {
            "formatter": "",
            "dataset_name": "",
            "path": "",
            "meta_file_train": "",
            "ignored_speakers": null,
            "language": "",
            "phonemizer": "",
            "meta_file_val": "",
            "meta_file_attn_mask": ""
        }
    ],
    "test_sentences": [
        {
            "text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "speaker_wav": [
                "/projects/coqui/LJTraining/../LJSpeech-1.1/wavs/LJ001-0002.wav"
            ],
            "language": "en"
        },
        {
            "text": "This cake is great. It's so delicious and moist.",
            "speaker_wav": [
                "/projects/coqui/LJTraining/../LJSpeech-1.1/wavs/LJ001-0002.wav"
            ],
            "language": "en"
        }
    ],
    "eval_split_max_size": 256,
    "eval_split_size": 0.01,
    "use_speaker_weighted_sampler": false,
    "speaker_weighted_sampler_alpha": 1.0,
    "use_language_weighted_sampler": false,
    "language_weighted_sampler_alpha": 1.0,
    "use_length_weighted_sampler": false,
    "length_weighted_sampler_alpha": 1.0,
    "model_args": {
        "gpt_batch_size": 1,
        "enable_redaction": false,
        "kv_cache": true,
        "gpt_checkpoint": "",
        "clvp_checkpoint": null,
        "decoder_checkpoint": null,
        "num_chars": 255,
        "tokenizer_file": "/projects/coqui/LJTraining/tokenizer_ljs.json",
        "gpt_max_audio_tokens": 605,
        "gpt_max_text_tokens": 402,
        "gpt_max_prompt_tokens": 70,
        "gpt_layers": 30,
        "gpt_n_model_channels": 1024,
        "gpt_n_heads": 16,
        "gpt_number_text_tokens": 21657,
        "gpt_start_text_token": 0,
        "gpt_stop_text_token": 1,
        "gpt_num_audio_tokens": 1026,
        "gpt_start_audio_token": 1024,
        "gpt_stop_audio_token": 1025,
        "gpt_code_stride_len": 1024,
        "gpt_use_masking_gt_prompt_approach": true,
        "gpt_use_perceiver_resampler": true,
        "input_sample_rate": 22050,
        "output_sample_rate": 24000,
        "output_hop_length": 256,
        "decoder_input_dim": 1024,
        "d_vector_dim": 512,
        "cond_d_vector_in_each_upsampling_layer": true,
        "duration_const": 102400,
        "min_conditioning_length": 66150,
        "max_conditioning_length": 132300,
        "gpt_loss_text_ce_weight": 0.01,
        "gpt_loss_mel_ce_weight": 1.0,
        "debug_loading_failures": false,
        "max_wav_length": 255995,
        "max_text_length": 200,
        "mel_norm_file": "/projects/coqui/LJTraining/mel_mean_ljs.pth",
        "dvae_checkpoint": "/projects/coqui/LJTraining/run/training/XTTS_v2.0_original_model_files/dvae.pth",
        "xtts_checkpoint": "/projects/coqui/LJTraining/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-02-2025_09+37PM-0000000/model.pth",
        "vocoder": ""
    },
    "model_dir": null,
    "languages": [
        "en"
    ],
    "temperature": 0.85,
    "length_penalty": 1.0,
    "repetition_penalty": 2.0,
    "top_k": 50,
    "top_p": 0.85,
    "num_gpt_outputs": 1,
    "gpt_cond_len": 12,
    "gpt_cond_chunk_len": 4,
    "max_ref_len": 10,
    "sound_norm_refs": false,
    "optimizer_wd_only_on_weights": true,
    "weighted_loss_attrs": {},
    "weighted_loss_multipliers": {},
    "github_branch": "inside_docker"
}
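
Since the run uses a custom tokenizer, here is a minimal sanity-check sketch for the config above. It assumes tokenizer_ljs.json is a HuggingFace tokenizers JSON file (the same format as the stock vocab.json) and only compares its vocabulary size against gpt_number_text_tokens; the file paths are placeholders.

import json

from tokenizers import Tokenizer

# Hypothetical check: the GPT text-embedding table must cover the tokenizer vocabulary
tok = Tokenizer.from_file("tokenizer_ljs.json")
with open("config.json") as f:  # the saved training config shown above
    cfg = json.load(f)

print("tokenizer vocab size:  ", tok.get_vocab_size())
print("gpt_number_text_tokens:", cfg["model_args"]["gpt_number_text_tokens"])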

Expected behavior

No response

Logs

Environment

{
    "CUDA": {
        "GPU": [
            "NVIDIA H100 80GB HBM3"
        ],
        "available": true,
        "version": "12.4"
    },
    "Packages": {
        "PyTorch_debug": false,
        "PyTorch_version": "2.7.0.dev20250226+cu124",
        "TTS": "0.26.0",
        "numpy": "1.26.4"
    },
    "System": {
        "OS": "Linux",
        "architecture": [
            "64bit",
            ""
        ],
        "processor": "x86_64",
        "python": "3.11.11",
        "version": "#1 SMP Thu Aug 8 17:47:08 UTC 2024"
    }
}

Additional context

No response