Skip to content

Commit 5e27552

Browse files
committed
feat: 调整webui支持流式推理(新增流式UI)
1 parent 7fc2161 commit 5e27552

File tree

2 files changed

+357
-112
lines changed

2 files changed

+357
-112
lines changed

GPT_SoVITS/inference_stream.py

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
import os
2+
import tempfile
3+
import gradio as gr
4+
from pydub import AudioSegment
5+
from tools.i18n.i18n import I18nAuto
6+
from GPT_SoVITS.inference_webui import (
7+
get_weights_names,
8+
custom_sort_key,
9+
change_choices,
10+
change_gpt_weights,
11+
change_sovits_weights,
12+
get_tts_wav,
13+
)
14+
15+
16+
# Translation helper used to wrap every user-facing label in this module.
i18n = I18nAuto()

# Checkpoint paths: taken from the environment variables of the same name,
# falling back to the bundled pretrained models when they are not set.
gpt_path = os.getenv(
    "gpt_path",
    "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
)
sovits_path = os.getenv(
    "sovits_path",
    "GPT_SoVITS/pretrained_models/s2G488k.pth",
)

# Weight names offered in the two model dropdowns (SoVITS first, GPT second).
SoVITS_names, GPT_names = get_weights_names()
24+
25+
# Sample inputs offered below the form. Each row is
# [text language label, text to synthesise], matching the order of the
# components the examples are bound to.
EXAMPLES = [
    [
        "中文",
        "根据过年的传说,远古时代有一隻凶残年兽,每到岁末就会从海底跑出来吃人。"
        + "人们为了不被年兽吃掉,家家户户都会祭拜祖先祈求平安,也会聚在一起吃一顿丰盛的晚餐。"
        + "后来人们发现年兽害怕红色、噪音与火光,便开始在当天穿上红衣、门上贴上红纸、燃烧爆竹声,藉此把年兽赶走。"
        + "而这些之后也成为过年吃团圆饭、穿红衣、放鞭炮、贴春联的过年习俗。",
    ],
    [
        "英文",
        "Give every man thy ear, but few thy voice; Take each man's censure, but reserve thy judgment."
        + " This is a line spoken by 'the tedious old fool' Polonius - chief counsellor to the villain Claudius in Act 1, Scene 3 of Shakespeare's Hamlet."
        # Leading space added: the fragments are concatenated, and without it
        # the text read "Hamlet.The implication".
        + " The implication is that it's important to be a good listener and accept criticism but not be judgmental.",
    ],
]
40+
41+
42+
def get_streaming_tts_wav(
    ref_wav_path,
    prompt_text,
    prompt_language,
    text,
    text_language,
    how_to_cut,
    top_k,
    top_p,
    temperature,
):
    """Run streaming synthesis and yield one WAV file path per audio chunk.

    Every argument is forwarded unchanged, by keyword, to ``get_tts_wav``
    together with ``stream=True``. Each chunk that call produces is written
    to its own WAV file and the file path is yielded, so the function can be
    used as a generator callback for a streaming ``gr.Audio`` output.

    Yields:
        str: Path of the WAV file holding the next chunk.

    Note:
        The chunk files are not deleted here, because the consumer reads
        them after they are yielded. They are left in the temp directory.
    """
    chunks = get_tts_wav(
        ref_wav_path=ref_wav_path,
        prompt_text=prompt_text,
        prompt_language=prompt_language,
        text=text,
        text_language=text_language,
        how_to_cut=how_to_cut,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        stream=True,
    )

    # Write each request's chunks into a freshly created private directory.
    # A fixed "<tmpdir>/<i>.wav" name is shared by every request, so
    # overlapping or consecutive syntheses would overwrite files that may
    # still be being streamed to another client.
    output_dir = tempfile.mkdtemp(prefix="gpt_sovits_stream_")
    audio_format = "wav"
    for index, chunk in enumerate(chunks, start=1):
        chunk_path = os.path.join(output_dir, f"{index}.{audio_format}")
        # NOTE(review): assumes each chunk is raw 16-bit mono PCM at 32 kHz,
        # as implied by the parameters below - confirm against get_tts_wav.
        segment = AudioSegment(chunk, frame_rate=32000, sample_width=2, channels=1)
        segment.export(chunk_path, format=audio_format)
        yield chunk_path
75+
76+
77+
def main():
    """Build the streaming-inference Gradio demo and launch it.

    The page contains model selection dropdowns, the reference audio/text
    inputs, the text to synthesise with its language and splitting mode,
    the sampling sliders, a streaming audio output and a set of example
    inputs. Clicking the synthesis button runs ``get_streaming_tts_wav`` and
    streams its results into the audio component.

    The server is launched on 0.0.0.0:8080 with the queue enabled, and a
    browser tab is opened.

    NOTE(review): the container nesting (which widgets share a row) was
    reconstructed from the grouping of the statements - confirm the layout
    against the running UI.
    """
    # Both language dropdowns offer the same localised labels; build the
    # list once instead of repeating it.
    language_choices = [
        i18n("中文"),
        i18n("英文"),
        i18n("日文"),
        i18n("中英混合"),
        i18n("日英混合"),
        i18n("多语种混合"),
    ]

    # The dropdown choices are localised, so the example rows must carry the
    # localised language label too, otherwise the example value would not
    # match any choice under a non-Chinese locale.
    # NOTE(review): assumes i18n returns its argument unchanged when no
    # translation applies - confirm against I18nAuto.
    localized_examples = [[i18n(language), sample] for language, sample in EXAMPLES]

    with gr.Blocks(title="GPT-SoVITS Streaming Demo") as app:
        gr.Markdown(
            value=i18n(
                "流式输出演示,分句推理后推送到组件中。由于目前bytes模式的限制,采用<a href='https://github.com/gradio-app/gradio/blob/gradio%404.17.0/demo/stream_audio_out/run.py'>stream_audio_out</a>中临时文件的方案输出分句。这种方式相比bytes,会增加wav文件解析的延迟。"
            ),
        )

        # --- Model selection -------------------------------------------
        with gr.Row():
            GPT_dropdown = gr.Dropdown(
                label=i18n("GPT模型列表"),
                choices=sorted(GPT_names, key=custom_sort_key),
                value=gpt_path,
                interactive=True,
            )
            SoVITS_dropdown = gr.Dropdown(
                label=i18n("SoVITS模型列表"),
                choices=sorted(SoVITS_names, key=custom_sort_key),
                value=sovits_path,
                interactive=True,
            )
            refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
            refresh_button.click(
                fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]
            )
            # Selecting a different entry reloads the corresponding weights.
            SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
            GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])

        def load_text(file):
            """Return the UTF-8 text content of the uploaded file."""
            # Only the ``name`` attribute (a path) of the uploaded object is
            # used. The handle gets its own name so the parameter is not
            # rebound inside the ``with`` statement.
            with open(file.name, "r", encoding="utf-8") as handle:
                return handle.read()

        # --- Reference audio and its transcript ------------------------
        gr.Markdown(value=i18n("*请上传并填写参考信息"))
        with gr.Row():
            inp_ref = gr.Audio(
                label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath"
            )
            prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
            prompt_language = gr.Dropdown(
                label=i18n("参考音频的语种"),
                choices=list(language_choices),
                value=i18n("中文"),
            )
            load_button = gr.UploadButton(i18n("加载参考文本"), variant="secondary")
            # The uploaded file's content is placed in the transcript box.
            load_button.upload(load_text, load_button, prompt_text)

        # --- Text to synthesise ----------------------------------------
        with gr.Row():
            text = gr.Textbox(label=i18n("需要合成的文本"), value="", interactive=True)
            text_language = gr.Dropdown(
                label=i18n("需要合成的语种"),
                choices=list(language_choices),
                value=i18n("中文"),
            )
            how_to_cut = gr.Radio(
                label=i18n("怎么切"),
                choices=[
                    i18n("不切"),
                    i18n("凑四句一切"),
                    i18n("凑50字一切"),
                    i18n("按中文句号。切"),
                    i18n("按英文句号.切"),
                    i18n("按标点符号切"),
                ],
                value=i18n("按标点符号切"),
                interactive=True,
            )

        # --- Sampling parameters and the trigger button ----------------
        with gr.Row():
            top_k = gr.Slider(
                minimum=1,
                maximum=100,
                step=1,
                label=i18n("top_k"),
                value=5,
                interactive=True,
            )
            top_p = gr.Slider(
                minimum=0,
                maximum=1,
                step=0.05,
                label=i18n("top_p"),
                value=1,
                interactive=True,
            )
            temperature = gr.Slider(
                minimum=0,
                maximum=1,
                step=0.05,
                label=i18n("temperature"),
                value=1,
                interactive=True,
            )
            inference_button = gr.Button(i18n("合成语音"), variant="primary")

        # --- Streaming output ------------------------------------------
        with gr.Group():
            with gr.Row():
                audio_file = gr.Audio(
                    value=None,
                    label=i18n("输出的语音"),
                    streaming=True,
                    autoplay=True,
                    interactive=False,
                    show_label=True,
                )

        # Each path yielded by the generator is pushed to the audio output.
        text_msgs = inference_button.click(
            get_streaming_tts_wav,
            [
                inp_ref,
                prompt_text,
                prompt_language,
                text,
                text_language,
                how_to_cut,
                top_k,
                top_p,
                temperature,
            ],
            [audio_file],
        )
        # After the click handler completes, make sure the text box is editable.
        text_msgs.then(lambda: gr.update(interactive=True), None, [text], queue=False)

        with gr.Row():
            gr.Examples(
                localized_examples,
                [text_language, text],
                cache_examples=False,
                # Selecting an example only fills the inputs; the user still
                # has to press the synthesis button.
                run_on_click=False,
            )

    app.queue().launch(
        server_name="0.0.0.0",
        inbrowser=True,
        share=False,
        server_port=8080,
        quiet=True,
    )
227+
228+
229+
# Launch the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)