import torch
from torch import no_grad, LongTensor
import argparse
- from models_infer import spectrogram_torch
+ from mel_processing import spectrogram_torch
import utils
from models_infer import SynthesizerTrn
import gradio as gr
- import torchaudio
+ import librosa
import webbrowser
device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -20,15 +20,16 @@ def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
    original_speaker_id = speaker_ids[original_speaker]
    target_speaker_id = speaker_ids[target_speaker]

-     audio = torch.tensor(audio).type(torch.float32)
-     audio = audio.squeeze().unsqueeze(0)
-     audio = audio / max(-audio.min(), audio.max()) / 0.99
+     audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+     if len(audio.shape) > 1:
+         audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != hps.data.sampling_rate:
-         audio = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=22050)(audio)
+         audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
    with no_grad():
        y = torch.FloatTensor(audio)
        y = y / max(-y.min(), y.max()) / 0.99
        y = y.to(device)
+         y = y.unsqueeze(0)
        spec = spectrogram_torch(y, hps.data.filter_length,
                                 hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                 center=False).to(device)
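
The commit replaces the torch/torchaudio preprocessing with a numpy/librosa path: integer PCM from Gradio is scaled to float32 via np.iinfo, stereo input is downmixed with librosa.to_mono, and resampling now targets hps.data.sampling_rate instead of a hard-coded 22050. The batch dimension is added only after peak normalization, since spectrogram_torch is called on the batched tensor. Below is a minimal standalone sketch of that path, assuming Gradio's audio component returns a (sampling_rate, np.ndarray) tuple of integer PCM and that numpy is imported as np elsewhere in the file; the name preprocess_audio and the target_sr parameter are illustrative stand-ins for the code inside vc_fn and hps.data.sampling_rate.

import numpy as np
import librosa
import torch

def preprocess_audio(sampling_rate: int, audio: np.ndarray, target_sr: int = 22050) -> torch.Tensor:
    # Scale integer PCM (e.g. int16 from Gradio) to float32 in [-1, 1].
    # As in the diff, this assumes an integer dtype; np.iinfo raises on float input.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # librosa.to_mono expects (channels, samples); Gradio delivers (samples, channels),
    # hence the transpose before downmixing.
    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # Resample to the model's rate only when the input rate differs.
    if sampling_rate != target_sr:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=target_sr)
    y = torch.FloatTensor(audio)
    y = y / max(-y.min(), y.max()) / 0.99  # peak normalization, matching the diff
    return y.unsqueeze(0)                  # (1, T) batch shape fed to spectrogram_torch

Ordering matters here: librosa.resample operates on float numpy arrays, so the int-to-float conversion and mono downmix happen before resampling, and only then is the array wrapped in a torch tensor for normalization and the spectrogram call.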