diff --git a/GPT_SoVITS/inference_stream.py b/GPT_SoVITS/inference_stream.py
new file mode 100644
index 000000000..042e26804
--- /dev/null
+++ b/GPT_SoVITS/inference_stream.py
@@ -0,0 +1,383 @@
+import tempfile, io, wave
+import gradio as gr
+import uvicorn
+import argparse
+from fastapi import FastAPI
+from fastapi.responses import StreamingResponse
+from pydub import AudioSegment
+from tools.i18n.i18n import I18nAuto
+from GPT_SoVITS.inference_webui import (
+ get_weights_names,
+ custom_sort_key,
+ change_choices,
+ change_gpt_weights,
+ change_sovits_weights,
+ get_tts_wav,
+)
+
+api_app = FastAPI()
+i18n = I18nAuto()
+
+# API mode Usage: python GPT_SoVITS/inference_stream.py --api
+parser = argparse.ArgumentParser(description="GPT-SoVITS Streaming API")
+parser.add_argument(
+ "-api",
+ "--api",
+ action="store_true",
+ default=False,
+ help="是否开启API模式(不开启则是WebUI模式)",
+)
+parser.add_argument(
+ "-s",
+ "--sovits_path",
+ type=str,
+ default="GPT_SoVITS/pretrained_models/s2G488k.pth",
+ help="SoVITS模型路径",
+)
+parser.add_argument(
+ "-g",
+ "--gpt_path",
+ type=str,
+ default="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
+ help="GPT模型路径",
+)
+parser.add_argument(
+ "-rw",
+ "--ref_wav",
+ type=str,
+ default="./example/archive_ruanmei_8.wav",
+ help="参考音频路径",
+)
+parser.add_argument(
+ "-rt",
+ "--prompt_text",
+ type=str,
+ default="我听不惯现代乐,听戏却极易入迷,琴弦拨动,时间便流往过去。",
+ help="参考音频文本",
+)
+parser.add_argument(
+ "-rl",
+ "--prompt_language",
+ type=str,
+ default=i18n("中文"),
+ help="参考音频语种",
+)
+
+args = parser.parse_args()
+
+sovits_path = args.sovits_path
+gpt_path = args.gpt_path
+SoVITS_names, GPT_names = get_weights_names()
+
+EXAMPLES = [
+ [
+ i18n("中文"),
+ "根据过年的传说,远古时代有一隻凶残年兽,每到岁末就会从海底跑出来吃人。"
+ + "人们为了不被年兽吃掉,家家户户都会祭拜祖先祈求平安,也会聚在一起吃一顿丰盛的晚餐。"
+ + "后来人们发现年兽害怕红色、噪音与火光,便开始在当天穿上红衣、门上贴上红纸、燃烧爆竹声,藉此把年兽赶走。"
+ + "而这些之后也成为过年吃团圆饭、穿红衣、放鞭炮、贴春联的过年习俗。",
+ ],
+ [
+ i18n("中文"),
+ "在父母的葬礼上,她穿一身全黑的丧服。她依旧把头发束得很好,如墨的发丝遮掩着她的神情。她没有掉一滴眼泪。"
+ + "直到夜色降临,在实验室的屏幕上,数据螺旋制成的层层几何花纹在戏声中变换、舒展、流动,而将那花纹层层剥去后,是她万般呵护、小心制作的秘密。"
+ + "阖眼的父亲与母亲,二者冰冷如沉睡般的面容。是辜负。他们没能遵守和外婆的约定。我也没能履行保护父母的承诺,同样辜负了他们。"
+ + "唯有科学…不会辜负。她说。少女希望父母平等,便将自己从前的昵称抹去,将名字从二人姓氏中各取一。"
+ + "自那以后,每当有人问及家,她的眼中总是闪过一丝迷茫,她似乎把一切都忘了。在这样的状态中,她开始了忽视时间规律的演算。"
+ + "由于沉迷科研,她的进食毫不规律,兴致起来便研究几日几夜,到了极限便昏昏睡去。很快,她只借着火萤微弱的光,在每个夜晚收获研究的进展。"
+ + "她愈对已有的生命法则置若罔闻,她的前进就愈加迅速;她完全不在乎公式,接着她漠视了生命的意义。"
+ + "她只去观察、用双手揣摩,将数据握在手里感受,接着就编纂出新的物种规律。于是,在她的实验室中,那些蕨类植物与花愈发生长地茂盛,"
+ + "它们以肉眼可见的速度长高,接着充斥了所有空间。在那花叶开合的缝隙中——笼罩着父母清冷而素净的、由数据汇聚而成的面庞。"
+ + "在沉睡的父母将要睁开双眼的时刻——她几乎摧毁了整个星球原本的物种衍变规律,但她仍然在向着自己的目标前进。"
+ + "直到她从研究中抬起头,猛烈地望向天空:智识的瞥视降临到了她的身上。",
+ ],
+ [
+ i18n("中文"),
+ "神霄折戟录其二"
+ +"「嗯,好吃。」被附体的未央变得温柔了许多,也冷淡了很多。"
+ + "她拿起弥耳做的馅饼,小口小口吃了起来。第一口被烫到了,还很可爱地吐着舌头吸气。"
+ + "「我一下子有点接受不了, 需要消化消化。」用一只眼睛作为代价维持降灵的弥耳自己也拿了一个馅饼,「你再说一 遍?」"
+ + "「当年所谓的陨铁其实是神戟。它被凡人折断,铸成魔剑九柄。这一把是雾海魔剑。 加上他们之前已经收集了两柄。「然后你是?」"
+ + "「我是曾经的天帝之女,名字已经忘了。我司掌审判与断罪,用你们的话说,就是刑律。」"
+ + "因为光禄寺执掌祭祀典礼的事情,所以仪式、祝词什么的,弥耳被老爹逼得倒是能倒背如流。同时因为尽是接触怪力乱神,弥耳也是知道一些小]道的。 神明要是被知道了真正的秘密名讳,就只能任人驱使了。眼前这位未必是忘了。"
+ + "「所以朝廷是想重铸神霄之戟吗?」弥耳说服自己接受了这个设定,追问道。"
+ + "「我不知道。这具身体的主人并不知道别的事。她只是很愤怒,想要证明自己。」未央把手放在了胸口上。"
+ +"「那接下来,我是应该弄个什么送神仪式把你送走吗?」弥耳摸了摸绷带下已经失去功能的眼睛,「然后我的眼睛也会回来?」"
+ ],
+]
+
+
+# from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py
+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
+ # This will create a wave header then append the frame input
+ # It should be first on a streaming wav file
+ # Other frames better should not have it (else you will hear some artifacts each chunk start)
+ wav_buf = io.BytesIO()
+ with wave.open(wav_buf, "wb") as vfout:
+ vfout.setnchannels(channels)
+ vfout.setsampwidth(sample_width)
+ vfout.setframerate(sample_rate)
+ vfout.writeframes(frame_input)
+
+ wav_buf.seek(0)
+ return wav_buf.read()
+
+
+def get_streaming_tts_wav(
+ ref_wav_path,
+ prompt_text,
+ prompt_language,
+ text,
+ text_language,
+ how_to_cut,
+ top_k,
+ top_p,
+ temperature,
+ ref_free,
+    byte_stream=False,
+):
+ chunks = get_tts_wav(
+ ref_wav_path=ref_wav_path,
+ prompt_text=prompt_text,
+ prompt_language=prompt_language,
+ text=text,
+ text_language=text_language,
+ how_to_cut=how_to_cut,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ ref_free=ref_free,
+ stream=True,
+ )
+
+ if byte_stream:
+ yield wave_header_chunk()
+ for chunk in chunks:
+ yield chunk
+ else:
+ # Send chunk files
+ i = 0
+        fmt = "wav"
+        for chunk in chunks:
+            i += 1
+            file = f"{tempfile.gettempdir()}/{i}.{fmt}"
+            segment = AudioSegment(chunk, frame_rate=32000, sample_width=2, channels=1)
+            segment.export(file, format=fmt)
+            yield file
+
+
+def webui():
+ with gr.Blocks(title="GPT-SoVITS Streaming Demo") as app:
+ gr.Markdown(
+ value=i18n(
+ "流式输出演示,分句推理后推送到组件中。由于目前bytes模式的限制,采用stream_audio_out中临时文件的方案输出分句。这种方式相比bytes,会增加wav文件解析的延迟。"
+ ),
+ )
+
+ gr.Markdown(value=i18n("模型切换"))
+ with gr.Row():
+ GPT_dropdown = gr.Dropdown(
+ label=i18n("GPT模型列表"),
+ choices=sorted(GPT_names, key=custom_sort_key),
+ value=gpt_path,
+ interactive=True,
+ )
+ SoVITS_dropdown = gr.Dropdown(
+ label=i18n("SoVITS模型列表"),
+ choices=sorted(SoVITS_names, key=custom_sort_key),
+ value=sovits_path,
+ interactive=True,
+ )
+ refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+ refresh_button.click(
+ fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]
+ )
+ SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
+ GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
+
+ gr.Markdown(value=i18n("*请上传并填写参考信息"))
+ with gr.Row():
+ inp_ref = gr.Audio(
+ label=i18n("请上传3~10秒内参考音频,超过会报错!"), value=args.ref_wav, type="filepath"
+ )
+ with gr.Column():
+ ref_text_free = gr.Checkbox(
+ label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"),
+ value=False,
+ interactive=True,
+ show_label=True,
+ )
+ gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT"))
+ prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value=args.prompt_text)
+ prompt_language = gr.Dropdown(
+ label=i18n("参考音频的语种"),
+ choices=[
+ i18n("中文"),
+ i18n("英文"),
+ i18n("日文"),
+ i18n("中英混合"),
+ i18n("日英混合"),
+ i18n("多语种混合"),
+ ],
+ value=args.prompt_language,
+ )
+
+ def load_text(file):
+ with open(file.name, "r", encoding="utf-8") as file:
+ return file.read()
+
+ load_button = gr.UploadButton(i18n("加载参考文本"), variant="secondary")
+ load_button.upload(load_text, load_button, prompt_text)
+
+ gr.Markdown(
+ value=i18n(
+ "*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。"
+ )
+ )
+ with gr.Row():
+ text = gr.Textbox(
+ label=i18n("需要合成的文本"), value="", lines=5, interactive=True
+ )
+ text_language = gr.Dropdown(
+ label=i18n("需要合成的语种"),
+ choices=[
+ i18n("中文"),
+ i18n("英文"),
+ i18n("日文"),
+ i18n("中英混合"),
+ i18n("日英混合"),
+ i18n("多语种混合"),
+ ],
+ value=i18n("中文"),
+ )
+ how_to_cut = gr.Radio(
+ label=i18n("怎么切"),
+ choices=[
+ i18n("不切"),
+ i18n("凑四句一切"),
+ i18n("凑50字一切"),
+ i18n("按中文句号。切"),
+ i18n("按英文句号.切"),
+ i18n("按标点符号切"),
+ ],
+ value=i18n("按标点符号切"),
+ interactive=True,
+ )
+
+ gr.Markdown(value=i18n("* 参数设置"))
+ with gr.Row():
+ with gr.Column():
+ top_k = gr.Slider(
+ minimum=1,
+ maximum=100,
+ step=1,
+ label=i18n("top_k"),
+ value=5,
+ interactive=True,
+ )
+ top_p = gr.Slider(
+ minimum=0,
+ maximum=1,
+ step=0.05,
+ label=i18n("top_p"),
+ value=1,
+ interactive=True,
+ )
+ temperature = gr.Slider(
+ minimum=0,
+ maximum=1,
+ step=0.05,
+ label=i18n("temperature"),
+ value=1,
+ interactive=True,
+ )
+ inference_button = gr.Button(i18n("合成语音"), variant="primary")
+
+ gr.Markdown(value=i18n("* 结果输出(等待第2句推理结束后会自动播放)"))
+ with gr.Row():
+ audio_file = gr.Audio(
+ value=None,
+ label=i18n("输出的语音"),
+ streaming=True,
+ autoplay=True,
+ interactive=False,
+ show_label=True,
+ )
+
+ inference_button.click(
+ get_streaming_tts_wav,
+ [
+ inp_ref,
+ prompt_text,
+ prompt_language,
+ text,
+ text_language,
+ how_to_cut,
+ top_k,
+ top_p,
+ temperature,
+ ref_text_free,
+ ],
+ [audio_file],
+ ).then(lambda: gr.update(interactive=True), None, [text], queue=False)
+
+ with gr.Row():
+ gr.Examples(
+ EXAMPLES,
+ [text_language, text],
+ cache_examples=False,
+ run_on_click=False, # Will not work , user should submit it
+ )
+
+ app.queue().launch(
+ server_name="0.0.0.0",
+ inbrowser=True,
+ share=False,
+ server_port=8080,
+ quiet=True,
+ )
+
+
+@api_app.get("/")
+async def tts(
+ text: str, # 必选参数
+ language: str = i18n("中文"),
+ top_k: int = 5,
+ top_p: float = 1,
+ temperature: float = 1,
+):
+ ref_wav_path = args.ref_wav
+ prompt_text = args.prompt_text
+ prompt_language = args.prompt_language
+ how_to_cut = i18n("按标点符号切")
+
+ return StreamingResponse(
+ get_streaming_tts_wav(
+ ref_wav_path=ref_wav_path,
+ prompt_text=prompt_text,
+ prompt_language=prompt_language,
+ text=text,
+ text_language=language,
+ how_to_cut=how_to_cut,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ ref_free=False,
+ byte_stream=True,
+ ),
+ media_type="audio/x-wav",
+ )
+
+
+def api():
+ uvicorn.run(
+ app="inference_stream:api_app", host="127.0.0.1", port=8080, reload=True
+ )
+
+
+if __name__ == "__main__":
+ # 模式选择,默认是webui模式
+ if not args.api:
+ webui()
+ else:
+ api()
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index d2f3f9491..891c36a7a 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -306,7 +306,7 @@ def merge_short_text_in_array(texts, threshold):
result[len(result) - 1] += text
return result
-def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False):
+def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free=False, stream=False):
if prompt_text is None or len(prompt_text) == 0:
ref_free = True
t0 = ttime()
@@ -364,68 +364,81 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
texts = merge_short_text_in_array(texts, 5)
audio_opt = []
if not ref_free:
- phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language)
+ phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language)
+ else:
+ phones1, bert1 = None, None
for text in texts:
# 解决输入目标文本的空行导致报错的问题
if (len(text.strip()) == 0):
continue
- if (text[-1] not in splits): text += "。" if text_language != "en" else "."
- print(i18n("实际输入的目标文本(每句):"), text)
- phones2,bert2,norm_text2=get_phones_and_bert(text, text_language)
- print(i18n("前端处理后的文本(每句):"), norm_text2)
- if not ref_free:
- bert = torch.cat([bert1, bert2], 1)
- all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0)
- else:
- bert = bert2
- all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0)
-
- bert = bert.to(device).unsqueeze(0)
- all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
- prompt = prompt_semantic.unsqueeze(0).to(device)
- t2 = ttime()
- with torch.no_grad():
- # pred_semantic = t2s_model.model.infer(
- pred_semantic, idx = t2s_model.model.infer_panel(
- all_phoneme_ids,
- all_phoneme_len,
- None if ref_free else prompt,
- bert,
- # prompt_phone_len=ph_offset,
- top_k=top_k,
- top_p=top_p,
- temperature=temperature,
- early_stop_num=hz * max_sec,
- )
- t3 = ttime()
- # print(pred_semantic.shape,idx)
- pred_semantic = pred_semantic[:, -idx:].unsqueeze(
- 0
- ) # .unsqueeze(0)#mq要多unsqueeze一次
- refer = get_spepc(hps, ref_wav_path) # .to(device)
- if is_half == True:
- refer = refer.half().to(device)
- else:
- refer = refer.to(device)
- # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
- audio = (
- vq_model.decode(
- pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
- )
- .detach()
- .cpu()
- .numpy()[0, 0]
- ) ###试试重建不带上prompt部分
- max_audio=np.abs(audio).max()#简单防止16bit爆音
- if max_audio>1:audio/=max_audio
+ audio = get_tts_chunk(ref_wav_path, text, text_language, bert1, phones1, prompt_semantic,
+ top_k, top_p, temperature, ref_free, t0, t1)
audio_opt.append(audio)
audio_opt.append(zero_wav)
- t4 = ttime()
+ if (stream):
+ # 流式模式下每句返回一次
+ yield (np.concatenate([audio, zero_wav], 0) * 32768).astype(np.int16).tobytes()
+
+ if (not stream):
+ # 非流式最终合并后返回
+ yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(
+ np.int16
+ )
+
+def get_tts_chunk(ref_wav_path, text, text_language, bert1, phones1, prompt_semantic, top_k, top_p, temperature, ref_free, t0, t1):
+ if (text[-1] not in splits): text += "。" if text_language != "en" else "."
+ print(i18n("实际输入的目标文本(每句):"), text)
+ phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language)
+ print(i18n("前端处理后的文本(每句):"), norm_text2)
+ if not ref_free:
+ bert = torch.cat([bert1, bert2], 1)
+ all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0)
+ else:
+ bert = bert2
+ all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0)
+
+ bert = bert.to(device).unsqueeze(0)
+ all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
+ prompt = prompt_semantic.unsqueeze(0).to(device)
+ t2 = ttime()
+ with torch.no_grad():
+ # pred_semantic = t2s_model.model.infer(
+ pred_semantic, idx = t2s_model.model.infer_panel(
+ all_phoneme_ids,
+ all_phoneme_len,
+ None if ref_free else prompt,
+ bert,
+ # prompt_phone_len=ph_offset,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ early_stop_num=hz * max_sec,
+ )
+ t3 = ttime()
+ # print(pred_semantic.shape,idx)
+ pred_semantic = pred_semantic[:, -idx:].unsqueeze(
+ 0
+ ) # .unsqueeze(0)#mq要多unsqueeze一次
+ refer = get_spepc(hps, ref_wav_path) # .to(device)
+ if is_half == True:
+ refer = refer.half().to(device)
+ else:
+ refer = refer.to(device)
+ # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
+ audio = (
+ vq_model.decode(
+ pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
+ )
+ .detach()
+ .cpu()
+ .numpy()[0, 0]
+ ) ###试试重建不带上prompt部分
+ max_audio=np.abs(audio).max()#简单防止16bit爆音
+ if max_audio>1:audio/=max_audio
+ t4 = ttime()
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
- yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(
- np.int16
- )
+ return audio
def split(todo_text):
@@ -543,75 +556,79 @@ def get_weights_names():
SoVITS_names, GPT_names = get_weights_names()
-with gr.Blocks(title="GPT-SoVITS WebUI") as app:
- gr.Markdown(
- value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
- )
- with gr.Group():
- gr.Markdown(value=i18n("模型切换"))
- with gr.Row():
- GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True)
- SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True)
- refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
- refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
- SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
- GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
- gr.Markdown(value=i18n("*请上传并填写参考信息"))
- with gr.Row():
- inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
- with gr.Column():
- ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
- gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
- prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
- prompt_language = gr.Dropdown(
- label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
- )
- gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
- with gr.Row():
- text = gr.Textbox(label=i18n("需要合成的文本"), value="")
- text_language = gr.Dropdown(
- label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
- )
- how_to_cut = gr.Radio(
- label=i18n("怎么切"),
- choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
- value=i18n("凑四句一切"),
- interactive=True,
- )
+def main():
+ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
+ gr.Markdown(
+ value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
+ )
+ with gr.Group():
+ gr.Markdown(value=i18n("模型切换"))
+ with gr.Row():
+ GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True)
+ SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True)
+ refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+ refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
+ SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
+ GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
+ gr.Markdown(value=i18n("*请上传并填写参考信息"))
with gr.Row():
- gr.Markdown("gpt采样参数(无参考文本时不要太低):")
- top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
- top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
- temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
- inference_button = gr.Button(i18n("合成语音"), variant="primary")
- output = gr.Audio(label=i18n("输出的语音"))
-
- inference_button.click(
- get_tts_wav,
- [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free],
- [output],
+ inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
+ with gr.Column():
+ ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
+ gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
+ prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
+ prompt_language = gr.Dropdown(
+ label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+ )
+ gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
+ with gr.Row():
+ text = gr.Textbox(label=i18n("需要合成的文本"), value="")
+ text_language = gr.Dropdown(
+ label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+ )
+ how_to_cut = gr.Radio(
+ label=i18n("怎么切"),
+ choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
+ value=i18n("凑四句一切"),
+ interactive=True,
+ )
+ with gr.Row():
+ gr.Markdown("gpt采样参数(无参考文本时不要太低):")
+ top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
+ top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
+ temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
+ inference_button = gr.Button(i18n("合成语音"), variant="primary")
+ output = gr.Audio(label=i18n("输出的语音"))
+
+ inference_button.click(
+ get_tts_wav,
+ [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free],
+ [output],
+ )
+
+ gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
+ with gr.Row():
+ text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
+ button1 = gr.Button(i18n("凑四句一切"), variant="primary")
+ button2 = gr.Button(i18n("凑50字一切"), variant="primary")
+ button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
+ button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
+ button5 = gr.Button(i18n("按标点符号切"), variant="primary")
+ text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
+ button1.click(cut1, [text_inp], [text_opt])
+ button2.click(cut2, [text_inp], [text_opt])
+ button3.click(cut3, [text_inp], [text_opt])
+ button4.click(cut4, [text_inp], [text_opt])
+ button5.click(cut5, [text_inp], [text_opt])
+ gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))
+
+ app.queue(max_size=1022).launch(
+ server_name="0.0.0.0",
+ inbrowser=True,
+ share=is_share,
+ server_port=infer_ttswebui,
+ quiet=True,
)
- gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
- with gr.Row():
- text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
- button1 = gr.Button(i18n("凑四句一切"), variant="primary")
- button2 = gr.Button(i18n("凑50字一切"), variant="primary")
- button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
- button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
- button5 = gr.Button(i18n("按标点符号切"), variant="primary")
- text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
- button1.click(cut1, [text_inp], [text_opt])
- button2.click(cut2, [text_inp], [text_opt])
- button3.click(cut3, [text_inp], [text_opt])
- button4.click(cut4, [text_inp], [text_opt])
- button5.click(cut5, [text_inp], [text_opt])
- gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))
-
-app.queue(concurrency_count=511, max_size=1022).launch(
- server_name="0.0.0.0",
- inbrowser=True,
- share=is_share,
- server_port=infer_ttswebui,
- quiet=True,
-)
+if __name__ == '__main__':
+ main()
diff --git a/example/archive_ruanmei_8.lab b/example/archive_ruanmei_8.lab
new file mode 100644
index 000000000..d7782ad02
--- /dev/null
+++ b/example/archive_ruanmei_8.lab
@@ -0,0 +1 @@
+我听不惯现代乐,听戏却极易入迷,琴弦拨动,时间便流往过去。
\ No newline at end of file
diff --git a/example/archive_ruanmei_8.wav b/example/archive_ruanmei_8.wav
new file mode 100644
index 000000000..d16052a71
Binary files /dev/null and b/example/archive_ruanmei_8.wav differ