diff --git a/GPT_SoVITS/inference_stream.py b/GPT_SoVITS/inference_stream.py
new file mode 100644
index 000000000..042e26804
--- /dev/null
+++ b/GPT_SoVITS/inference_stream.py
@@ -0,0 +1,383 @@
+import tempfile, io, wave
+import gradio as gr
+import uvicorn
+import argparse
+from fastapi import FastAPI
+from fastapi.responses import StreamingResponse
+from pydub import AudioSegment
+from tools.i18n.i18n import I18nAuto
+from GPT_SoVITS.inference_webui import (
+    get_weights_names,
+    custom_sort_key,
+    change_choices,
+    change_gpt_weights,
+    change_sovits_weights,
+    get_tts_wav,
+)
+
+api_app = FastAPI()
+i18n = I18nAuto()
+
+# API mode usage: python GPT_SoVITS/inference_stream.py --api
+parser = argparse.ArgumentParser(description="GPT-SoVITS Streaming API")
+parser.add_argument(
+    "-api",
+    "--api",
+    action="store_true",
+    default=False,
+    help="Enable API mode (otherwise the WebUI mode is used)",
+)
+parser.add_argument(
+    "-s",
+    "--sovits_path",
+    type=str,
+    default="GPT_SoVITS/pretrained_models/s2G488k.pth",
+    help="Path to the SoVITS model",
+)
+parser.add_argument(
+    "-g",
+    "--gpt_path",
+    type=str,
+    default="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
+    help="Path to the GPT model",
+)
+parser.add_argument(
+    "-rw",
+    "--ref_wav",
+    type=str,
+    default="./example/archive_ruanmei_8.wav",
+    help="Path to the reference audio",
+)
+parser.add_argument(
+    "-rt",
+    "--prompt_text",
+    type=str,
+    default="我听不惯现代乐,听戏却极易入迷,琴弦拨动,时间便流往过去。",
+    help="Transcript of the reference audio",
+)
+parser.add_argument(
+    "-rl",
+    "--prompt_language",
+    type=str,
+    default=i18n("中文"),
+    help="Language of the reference audio",
+)
+
+args = parser.parse_args()
+
+sovits_path = args.sovits_path
+gpt_path = args.gpt_path
+SoVITS_names, GPT_names = get_weights_names()
+
+EXAMPLES = [
+    [
+        i18n("中文"),
+        "根据过年的传说,远古时代有一隻凶残年兽,每到岁末就会从海底跑出来吃人。"
+        "人们为了不被年兽吃掉,家家户户都会祭拜祖先祈求平安,也会聚在一起吃一顿丰盛的晚餐。"
+        "后来人们发现年兽害怕红色、噪音与火光,便开始在当天穿上红衣、门上贴上红纸、燃烧爆竹声,藉此把年兽赶走。"
+        "而这些之后也成为过年吃团圆饭、穿红衣、放鞭炮、贴春联的过年习俗。",
+    ],
+    [
+        i18n("中文"),
+        "在父母的葬礼上,她穿一身全黑的丧服。她依旧把头发束得很好,如墨的发丝遮掩着她的神情。她没有掉一滴眼泪。"
+        "直到夜色降临,在实验室的屏幕上,数据螺旋制成的层层几何花纹在戏声中变换、舒展、流动,而将那花纹层层剥去后,是她万般呵护、小心制作的秘密。"
+        "阖眼的父亲与母亲,二者冰冷如沉睡般的面容。是辜负。他们没能遵守和外婆的约定。我也没能履行保护父母的承诺,同样辜负了他们。"
+        "唯有科学…不会辜负。她说。少女希望父母平等,便将自己从前的昵称抹去,将名字从二人姓氏中各取一。"
+        "自那以后,每当有人问及家,她的眼中总是闪过一丝迷茫,她似乎把一切都忘了。在这样的状态中,她开始了忽视时间规律的演算。"
+        "由于沉迷科研,她的进食毫不规律,兴致起来便研究几日几夜,到了极限便昏昏睡去。很快,她只借着火萤微弱的光,在每个夜晚收获研究的进展。"
+        "她愈对已有的生命法则置若罔闻,她的前进就愈加迅速;她完全不在乎公式,接着她漠视了生命的意义。"
+        "她只去观察、用双手揣摩,将数据握在手里感受,接着就编纂出新的物种规律。于是,在她的实验室中,那些蕨类植物与花愈发生长地茂盛,"
+        "它们以肉眼可见的速度长高,接着充斥了所有空间。在那花叶开合的缝隙中——笼罩着父母清冷而素净的、由数据汇聚而成的面庞。"
+        "在沉睡的父母将要睁开双眼的时刻——她几乎摧毁了整个星球原本的物种衍变规律,但她仍然在向着自己的目标前进。"
+        "直到她从研究中抬起头,猛烈地望向天空:智识的瞥视降临到了她的身上。",
+    ],
+    [
+        i18n("中文"),
+        "神霄折戟录其二"
+        "「嗯,好吃。」被附体的未央变得温柔了许多,也冷淡了很多。"
+        "她拿起弥耳做的馅饼,小口小口吃了起来。第一口被烫到了,还很可爱地吐着舌头吸气。"
+        "「我一下子有点接受不了,需要消化消化。」用一只眼睛作为代价维持降灵的弥耳自己也拿了一个馅饼,「你再说一遍?」"
+        "「当年所谓的陨铁其实是神戟。它被凡人折断,铸成魔剑九柄。这一把是雾海魔剑。加上他们之前已经收集了两柄。」「然后你是?」"
+        "「我是曾经的天帝之女,名字已经忘了。我司掌审判与断罪,用你们的话说,就是刑律。」"
+        "因为光禄寺执掌祭祀典礼的事情,所以仪式、祝词什么的,弥耳被老爹逼得倒是能倒背如流。同时因为尽是接触怪力乱神,弥耳也是知道一些小道的。神明要是被知道了真正的秘密名讳,就只能任人驱使了。眼前这位未必是忘了。"
+        "「所以朝廷是想重铸神霄之戟吗?」弥耳说服自己接受了这个设定,追问道。"
+        "「我不知道。这具身体的主人并不知道别的事。她只是很愤怒,想要证明自己。」未央把手放在了胸口上。"
+        "「那接下来,我是应该弄个什么送神仪式把你送走吗?」弥耳摸了摸绷带下已经失去功能的眼睛,「然后我的眼睛也会回来?」",
+    ],
+]
+
+
+# from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py
+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
+    # This creates a WAV header and appends the frame input.
+    # It must come first in a streaming WAV file.
+    # Subsequent frames should not include it (otherwise you will hear
+    # artifacts at the start of each chunk).
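+    # Editorial note: writing zero frames below yields a standard 44-byte
+    # RIFF/WAVE header whose size fields describe an empty file; most players
+    # tolerate this and keep reading past the stated length, which is what
+    # makes chunked playback over one continuous stream work in practice.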
+    wav_buf = io.BytesIO()
+    with wave.open(wav_buf, "wb") as vfout:
+        vfout.setnchannels(channels)
+        vfout.setsampwidth(sample_width)
+        vfout.setframerate(sample_rate)
+        vfout.writeframes(frame_input)
+
+    wav_buf.seek(0)
+    return wav_buf.read()
+
+
+def get_streaming_tts_wav(
+    ref_wav_path,
+    prompt_text,
+    prompt_language,
+    text,
+    text_language,
+    how_to_cut,
+    top_k,
+    top_p,
+    temperature,
+    ref_free,
+    byte_stream=True,
+):
+    chunks = get_tts_wav(
+        ref_wav_path=ref_wav_path,
+        prompt_text=prompt_text,
+        prompt_language=prompt_language,
+        text=text,
+        text_language=text_language,
+        how_to_cut=how_to_cut,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        ref_free=ref_free,
+        stream=True,
+    )
+
+    if byte_stream:
+        yield wave_header_chunk()
+        for chunk in chunks:
+            yield chunk
+    else:
+        # Send per-sentence chunk files instead of raw bytes
+        i = 0
+        fmt = "wav"
+        for chunk in chunks:
+            i += 1
+            file = f"{tempfile.gettempdir()}/{i}.{fmt}"
+            segment = AudioSegment(chunk, frame_rate=32000, sample_width=2, channels=1)
+            segment.export(file, format=fmt)
+            yield file
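+
+
+# Consumption sketch (editorial note; names as defined above): with
+# byte_stream=True the first item is a WAV header and every subsequent item is
+# raw 16-bit / 32 kHz PCM, ready for an HTTP streaming response; with
+# byte_stream=False each item is the path of a per-sentence temporary .wav
+# file, which suits gr.Audio(streaming=True) below.
+#
+#   for item in get_streaming_tts_wav(
+#       args.ref_wav, args.prompt_text, args.prompt_language,
+#       "要合成的文本", i18n("中文"), i18n("按标点符号切"),
+#       top_k=5, top_p=1, temperature=1, ref_free=False,
+#   ):
+#       ...  # write bytes to a socket, or hand the file path to the UI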
label=i18n("top_k"), + value=5, + interactive=True, + ) + top_p = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("top_p"), + value=1, + interactive=True, + ) + temperature = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("temperature"), + value=1, + interactive=True, + ) + inference_button = gr.Button(i18n("合成语音"), variant="primary") + + gr.Markdown(value=i18n("* 结果输出(等待第2句推理结束后会自动播放)")) + with gr.Row(): + audio_file = gr.Audio( + value=None, + label=i18n("输出的语音"), + streaming=True, + autoplay=True, + interactive=False, + show_label=True, + ) + + inference_button.click( + get_streaming_tts_wav, + [ + inp_ref, + prompt_text, + prompt_language, + text, + text_language, + how_to_cut, + top_k, + top_p, + temperature, + ref_text_free, + ], + [audio_file], + ).then(lambda: gr.update(interactive=True), None, [text], queue=False) + + with gr.Row(): + gr.Examples( + EXAMPLES, + [text_language, text], + cache_examples=False, + run_on_click=False, # Will not work , user should submit it + ) + + app.queue().launch( + server_name="0.0.0.0", + inbrowser=True, + share=False, + server_port=8080, + quiet=True, + ) + + +@api_app.get("/") +async def tts( + text: str, # 必选参数 + language: str = i18n("中文"), + top_k: int = 5, + top_p: float = 1, + temperature: float = 1, +): + ref_wav_path = args.ref_wav + prompt_text = args.prompt_text + prompt_language = args.prompt_language + how_to_cut = i18n("按标点符号切") + + return StreamingResponse( + get_streaming_tts_wav( + ref_wav_path=ref_wav_path, + prompt_text=prompt_text, + prompt_language=prompt_language, + text=text, + text_language=language, + how_to_cut=how_to_cut, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ref_free=False, + byte_stream=True, + ), + media_type="audio/x-wav", + ) + + +def api(): + uvicorn.run( + app="inference_stream:api_app", host="127.0.0.1", port=8080, reload=True + ) + + +if __name__ == "__main__": + # 模式选择,默认是webui模式 + if not args.api: + webui() + else: + api() diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index d2f3f9491..891c36a7a 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -306,7 +306,7 @@ def merge_short_text_in_array(texts, threshold): result[len(result) - 1] += text return result -def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False): +def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free=False, stream=False): if prompt_text is None or len(prompt_text) == 0: ref_free = True t0 = ttime() @@ -364,68 +364,81 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, texts = merge_short_text_in_array(texts, 5) audio_opt = [] if not ref_free: - phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language) + phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language) + else: + phones1, bert1 = None, None for text in texts: # 解决输入目标文本的空行导致报错的问题 if (len(text.strip()) == 0): continue - if (text[-1] not in splits): text += "。" if text_language != "en" else "." 
- print(i18n("实际输入的目标文本(每句):"), text) - phones2,bert2,norm_text2=get_phones_and_bert(text, text_language) - print(i18n("前端处理后的文本(每句):"), norm_text2) - if not ref_free: - bert = torch.cat([bert1, bert2], 1) - all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0) - else: - bert = bert2 - all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0) - - bert = bert.to(device).unsqueeze(0) - all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) - prompt = prompt_semantic.unsqueeze(0).to(device) - t2 = ttime() - with torch.no_grad(): - # pred_semantic = t2s_model.model.infer( - pred_semantic, idx = t2s_model.model.infer_panel( - all_phoneme_ids, - all_phoneme_len, - None if ref_free else prompt, - bert, - # prompt_phone_len=ph_offset, - top_k=top_k, - top_p=top_p, - temperature=temperature, - early_stop_num=hz * max_sec, - ) - t3 = ttime() - # print(pred_semantic.shape,idx) - pred_semantic = pred_semantic[:, -idx:].unsqueeze( - 0 - ) # .unsqueeze(0)#mq要多unsqueeze一次 - refer = get_spepc(hps, ref_wav_path) # .to(device) - if is_half == True: - refer = refer.half().to(device) - else: - refer = refer.to(device) - # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0] - audio = ( - vq_model.decode( - pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer - ) - .detach() - .cpu() - .numpy()[0, 0] - ) ###试试重建不带上prompt部分 - max_audio=np.abs(audio).max()#简单防止16bit爆音 - if max_audio>1:audio/=max_audio + audio = get_tts_chunk(ref_wav_path, text, text_language, bert1, phones1, prompt_semantic, + top_k, top_p, temperature, ref_free, t0, t1) audio_opt.append(audio) audio_opt.append(zero_wav) - t4 = ttime() + if (stream): + # 流式模式下每句返回一次 + yield (np.concatenate([audio, zero_wav], 0) * 32768).astype(np.int16).tobytes() + + if (not stream): + # 非流式最终合并后返回 + yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype( + np.int16 + ) + +def get_tts_chunk(ref_wav_path, text, text_language, bert1, phones1, prompt_semantic, top_k, top_p, temperature, ref_free, t0, t1): + if (text[-1] not in splits): text += "。" if text_language != "en" else "." 
+ print(i18n("实际输入的目标文本(每句):"), text) + phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language) + print(i18n("前端处理后的文本(每句):"), norm_text2) + if not ref_free: + bert = torch.cat([bert1, bert2], 1) + all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0) + else: + bert = bert2 + all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0) + + bert = bert.to(device).unsqueeze(0) + all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) + prompt = prompt_semantic.unsqueeze(0).to(device) + t2 = ttime() + with torch.no_grad(): + # pred_semantic = t2s_model.model.infer( + pred_semantic, idx = t2s_model.model.infer_panel( + all_phoneme_ids, + all_phoneme_len, + None if ref_free else prompt, + bert, + # prompt_phone_len=ph_offset, + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=hz * max_sec, + ) + t3 = ttime() + # print(pred_semantic.shape,idx) + pred_semantic = pred_semantic[:, -idx:].unsqueeze( + 0 + ) # .unsqueeze(0)#mq要多unsqueeze一次 + refer = get_spepc(hps, ref_wav_path) # .to(device) + if is_half == True: + refer = refer.half().to(device) + else: + refer = refer.to(device) + # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0] + audio = ( + vq_model.decode( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer + ) + .detach() + .cpu() + .numpy()[0, 0] + ) ###试试重建不带上prompt部分 + max_audio=np.abs(audio).max()#简单防止16bit爆音 + if max_audio>1:audio/=max_audio + t4 = ttime() print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) - yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype( - np.int16 - ) + return audio def split(todo_text): @@ -543,75 +556,79 @@ def get_weights_names(): SoVITS_names, GPT_names = get_weights_names() -with gr.Blocks(title="GPT-SoVITS WebUI") as app: - gr.Markdown( - value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
-    )
-    with gr.Group():
-        gr.Markdown(value=i18n("模型切换"))
-        with gr.Row():
-            GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True)
-            SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True)
-            refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
-            refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
-            SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
-            GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
-    gr.Markdown(value=i18n("*请上传并填写参考信息"))
-    with gr.Row():
-        inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
-        with gr.Column():
-            ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
-            gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
-            prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
-            prompt_language = gr.Dropdown(
-                label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
-            )
-    gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
-    with gr.Row():
-        text = gr.Textbox(label=i18n("需要合成的文本"), value="")
-        text_language = gr.Dropdown(
-            label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
-        )
-        how_to_cut = gr.Radio(
-            label=i18n("怎么切"),
-            choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
-            value=i18n("凑四句一切"),
-            interactive=True,
-        )
+def main():
+    with gr.Blocks(title="GPT-SoVITS WebUI") as app:
+        gr.Markdown(
+            value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. 如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
+        )
+        with gr.Group():
+            gr.Markdown(value=i18n("模型切换"))
+            with gr.Row():
+                GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True)
+                SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True)
+                refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+                refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
+                SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
+                GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
+            gr.Markdown(value=i18n("*请上传并填写参考信息"))
             with gr.Row():
-        gr.Markdown("gpt采样参数(无参考文本时不要太低):")
-        top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
-        top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
-        temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
-        inference_button = gr.Button(i18n("合成语音"), variant="primary")
-        output = gr.Audio(label=i18n("输出的语音"))
-
-    inference_button.click(
-        get_tts_wav,
-        [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free],
-        [output],
+                inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
+                with gr.Column():
+                    ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
+                    gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
+                    prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
+                    prompt_language = gr.Dropdown(
+                        label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+                    )
+            gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
+            with gr.Row():
+                text = gr.Textbox(label=i18n("需要合成的文本"), value="")
+                text_language = gr.Dropdown(
+                    label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+                )
+                how_to_cut = gr.Radio(
+                    label=i18n("怎么切"),
+                    choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切")],
+                    value=i18n("凑四句一切"),
+                    interactive=True,
+                )
+            with gr.Row():
+                gr.Markdown("gpt采样参数(无参考文本时不要太低):")
+                top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
+                top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
+                temperature = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True)
+                inference_button = gr.Button(i18n("合成语音"), variant="primary")
+                output = gr.Audio(label=i18n("输出的语音"))
+
+            inference_button.click(
+                get_tts_wav,
+                [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free],
+                [output],
+            )
+
+            gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
+            with gr.Row():
+                text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
+                button1 = gr.Button(i18n("凑四句一切"), variant="primary")
+                button2 = gr.Button(i18n("凑50字一切"), variant="primary")
+                button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
+                button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
+                button5 = gr.Button(i18n("按标点符号切"), variant="primary")
+                text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
+                button1.click(cut1, [text_inp], [text_opt])
+                button2.click(cut2, [text_inp], [text_opt])
+                button3.click(cut3, [text_inp], [text_opt])
+                button4.click(cut4, [text_inp], [text_opt])
+                button5.click(cut5, [text_inp], [text_opt])
+            gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))
+
+    app.queue(max_size=1022).launch(
+        server_name="0.0.0.0",
+        inbrowser=True,
+        share=is_share,
+        server_port=infer_ttswebui,
+        quiet=True,
     )
 
-    gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
-    with gr.Row():
-        text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
-        button1 = gr.Button(i18n("凑四句一切"), variant="primary")
-        button2 = gr.Button(i18n("凑50字一切"), variant="primary")
-        button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
-        button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
-        button5 = gr.Button(i18n("按标点符号切"), variant="primary")
-        text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
-        button1.click(cut1, [text_inp], [text_opt])
-        button2.click(cut2, [text_inp], [text_opt])
-        button3.click(cut3, [text_inp], [text_opt])
-        button4.click(cut4, [text_inp], [text_opt])
-        button5.click(cut5, [text_inp], [text_opt])
-    gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))
-
-app.queue(concurrency_count=511, max_size=1022).launch(
-    server_name="0.0.0.0",
-    inbrowser=True,
-    share=is_share,
-    server_port=infer_ttswebui,
-    quiet=True,
-)
+
+if __name__ == '__main__':
+    main()
diff --git a/example/archive_ruanmei_8.lab b/example/archive_ruanmei_8.lab
new file mode 100644
index 000000000..d7782ad02
--- /dev/null
+++ b/example/archive_ruanmei_8.lab
@@ -0,0 +1 @@
+我听不惯现代乐,听戏却极易入迷,琴弦拨动,时间便流往过去。
\ No newline at end of file
diff --git a/example/archive_ruanmei_8.wav b/example/archive_ruanmei_8.wav
new file mode 100644
index 000000000..d16052a71
Binary files /dev/null and b/example/archive_ruanmei_8.wav differ