Commit d9d7f11

Support voice cloning in the OpenAI-compatible API (#55)
1 parent 1a5f75f commit d9d7f11

9 files changed (+150, -36 lines)

docs/en/server/client.md

Lines changed: 38 additions & 5 deletions
@@ -199,17 +199,50 @@ def clone_voice_stream():
 
 ### Sample Code
 
+Call a built-in audio role:
+
 ```python
 from openai import OpenAI
 
 
 def openai_speech():
-    client = OpenAI(base_url=f"{BASE_URL}/v1", api_key="YOUR_KEY")
+    client = OpenAI(
+        base_url=f"{BASE_URL}/v1",
+        api_key="not-needed"  # If an API key is set, please provide it
+    )
+    with client.audio.speech.with_streaming_response.create(
+        model="spark",
+        voice="赞助商",  # Name of the built-in voice
+        input="Hello, I am the invincible little cutie."
+    ) as response:
+        response.stream_to_file("out.mp3")
+    print("Output file: out.mp3")
+```
+
+Or provide a reference audio to use the voice cloning feature:
+
+```python
+from openai import OpenAI
+import base64
+
+
+def openai_speech():
+    client = OpenAI(
+        base_url=f"{BASE_URL}/v1",
+        api_key="not-needed"  # If an API key is set, please provide it
+    )
+    with open("data/mega-roles/御姐/御姐配音.wav", "rb") as f:
+        audio_bytes = f.read()
+    # Convert the binary audio data into a base64-encoded string
+    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
+
     with client.audio.speech.with_streaming_response.create(
-        model="orpheus", voice="tara", input="Hello"
-    ) as r:
-        r.stream_to_file("out.mp3")
-    print("Output saved: out.mp3")
+        model="spark",
+        voice=audio_base64,  # Replace the 'voice' parameter with the audio's base64 to trigger voice cloning
+        input="Hello, I am the invincible little cutie."
+    ) as response:
+        response.stream_to_file("clone.mp3")
+    print("Cloned file: clone.mp3")
 ```
 
 ### Steps

docs/en/server/server.md

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ curl -X POST http://localhost:8000/clone_voice \
 - Uses `OpenAISpeechRequest` format:
 - `model`: Model ID or name
 - `input`: Text to synthesize
-- `voice`: Voice name or preset
+- `voice`: The name of the audio role you want to use, or a URL or base64 string of a reference audio.
 - Other parameters same as Clone/Speak
 
 #### 4.5 Retrieve Available Roles: `GET /audio_roles` or `GET /v1/audio_roles`
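Because the `voice` field of `OpenAISpeechRequest` now carries either a role name, a URL, or base64-encoded reference audio, the endpoint can also be exercised with plain HTTP. Below is a minimal sketch using `requests`; the route path `/v1/audio/speech`, the port, and the file name `reference.wav` are assumptions to adapt to your deployment, not values taken from this commit.

```python
# Minimal sketch: raw HTTP call to the OpenAI-compatible speech route with a
# base64-encoded reference audio in the `voice` field.
# Assumptions: the route is mounted at /v1/audio/speech on localhost:8000 and
# reference.wav is a local clip of your own.
import base64
import requests

with open("reference.wav", "rb") as f:
    voice_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "spark",
    "input": "Hello from the cloned voice.",
    "voice": voice_b64,          # a role name, a URL, or base64 audio
    "response_format": "mp3",
    "stream": False,
}

resp = requests.post("http://localhost:8000/v1/audio/speech", json=payload, timeout=120)
resp.raise_for_status()
with open("clone.mp3", "wb") as out:
    out.write(resp.content)
```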

docs/zh/server/client.md

Lines changed: 34 additions & 4 deletions
@@ -193,19 +193,49 @@ def clone_voice_stream():
 
 ### Sample Code
 
+Call a built-in audio role:
 ```python
 from openai import OpenAI
 
 
 def openai_speech():
-    client = OpenAI(base_url=f"{BASE_URL}/v1", api_key="YOUR_KEY")
+    client = OpenAI(
+        base_url=f"{BASE_URL}/v1",
+        api_key="not-needed"  # If an API key is set, please provide it
+    )
     with client.audio.speech.with_streaming_response.create(
-        model="orpheus", voice="tara", input="Hello"
-    ) as r:
-        r.stream_to_file("out.mp3")
+        model="spark",
+        voice="赞助商",
+        input="你好,我是无敌的小可爱。"
+    ) as response:
+        response.stream_to_file("out.mp3")
     print("输出文件:out.mp3")
 ```
+Or provide a reference audio to use the voice cloning feature:
 
+```python
+from openai import OpenAI
+import base64
+
+
+def openai_speech():
+    client = OpenAI(
+        base_url=f"{BASE_URL}/v1",
+        api_key="not-needed"  # If an API key is set, please provide it
+    )
+    with open("data/mega-roles/御姐/御姐配音.wav", "rb") as f:
+        audio_bytes = f.read()
+    # Convert the binary audio data into a base64 string
+    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
+
+    with client.audio.speech.with_streaming_response.create(
+        model="spark",
+        voice=audio_base64,  # Replace 'voice' with the audio's base64 encoding to trigger voice cloning
+        input="你好,我是无敌的小可爱。"
+    ) as response:
+        response.stream_to_file("clone.mp3")
+    print("克隆文件:clone.mp3")
+```
 ### Steps
 
 1. Initialize the OpenAI client and specify the base_url.

docs/zh/server/server.md

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ curl -X POST http://localhost:8000/clone_voice \
 - Same path and behavior as the endpoints above, using the `OpenAISpeechRequest` protocol:
 - `model`: Model ID or name
 - `input`: Text to synthesize
-- `voice`: Base or combined voice name
+- `voice`: The name of the audio role you want to use, or a URL or base64 string of a reference audio.
 - Other parameters same as Clone/Speak.
 
 #### 4.5 Retrieve Available Roles: `GET /audio_roles` or `GET /v1/audio_roles`
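Both doc pages note that `voice` may also be a URL, but none of the examples above show that form. Here is a minimal sketch using the OpenAI SDK; the URL `https://example.com/ref.wav`, the base URL, and the output file name are placeholders, not values from this commit.

```python
# Minimal sketch: pass the reference audio by URL instead of base64.
# The URL is a placeholder; any http(s) URL to a reference clip the server can
# fetch is treated the same way as a base64-encoded clip and triggers cloning.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # adjust to your deployment
    api_key="not-needed"                  # if an API key is set, provide it here
)

with client.audio.speech.with_streaming_response.create(
    model="spark",
    voice="https://example.com/ref.wav",  # URL of the reference audio
    input="Hello, this voice was cloned from a remote clip."
) as response:
    response.stream_to_file("clone_from_url.mp3")
```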

examples/client.py

Lines changed: 28 additions & 3 deletions
@@ -166,12 +166,37 @@ def openai_speech():
         api_key="not-needed"  # If an API key is set, please provide it
     )
     with client.audio.speech.with_streaming_response.create(
-        model="orpheus",
-        voice="tara",
-        input="Hey there guys. It's, <giggle> Tara here, and let me introduce you to Zac.. who seems to asleep. Zac, it's time to wakey-wakey!"
+        model="spark",
+        voice="赞助商",
+        input="你好,我是无敌的小可爱。"
     ) as response:
         response.stream_to_file("output.mp3")
 
+def openai_clone():
+    """
+    OpenAI clone mode; currently only Spark-TTS is supported.
+    Returns:
+
+    """
+    from openai import OpenAI
+
+    client = OpenAI(
+        base_url=f"{BASE_URL}/v1",
+        api_key="not-needed"  # If an API key is set, please provide it
+    )
+
+    # Pick an audio clip that is not among the built-in Spark-TTS roles
+    with open("data/mega-roles/御姐/御姐配音.wav", "rb") as f:
+        audio_bytes = f.read()
+    # Convert the binary audio data into a base64 string
+    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
+
+    with client.audio.speech.with_streaming_response.create(
+        model="spark",
+        voice=audio_base64,  # Replace 'voice' with the audio's base64 encoding to trigger voice cloning
+        input="你好,我是无敌的小可爱。"
+    ) as response:
+        response.stream_to_file("output.mp3")
 
 if __name__ == "__main__":
     clone_voice_stream()

flashtts/server/openai_router.py

Lines changed: 26 additions & 5 deletions
@@ -6,7 +6,7 @@
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 from .protocol import OpenAISpeechRequest, ModelCard, ModelList
 from .utils.audio_writer import StreamingAudioWriter
-from .utils.utils import generate_audio, generate_audio_stream
+from .utils.utils import generate_audio, generate_audio_stream, load_base64_or_url
 from ..engine import AutoEngine
 from ..logger import get_logger
 
@@ -91,7 +91,6 @@ async def create_speech(
                 "type": "invalid_request_error",
             },
         )
-    audio_writer = StreamingAudioWriter(request.response_format, sample_rate=engine.SAMPLE_RATE)
 
     # Set content type based on format
     content_type = {
@@ -104,7 +103,6 @@
     }.get(request.response_format, f"audio/{request.response_format}")
 
     api_inputs = dict(
-        name=request.voice,
         text=request.input,
         temperature=request.temperature,
         top_k=request.top_k,
@@ -117,10 +115,33 @@
     if engine.engine_name.lower() == 'spark':
         api_inputs['pitch'] = float_to_speed_label(request.pitch)
         api_inputs['speed'] = float_to_speed_label(request.speed)
+
+    if engine._SUPPORT_CLONE and request.voice not in engine.list_roles():
+        # If the provided voice is a URL or base64 string, voice cloning is triggered; MegaTTS3 is not supported yet
+        if engine.engine_name == 'mega':
+            err_msg = ("Openai router does not currently support the voice cloning function of mega tts, "
+                       "because the model requires an additional `latent_file`.")
+            logger.error(err_msg)
+            raise HTTPException(status_code=400, detail={"error": err_msg})
+        ref_audio = await load_base64_or_url(request.voice)
+        api_inputs['reference_audio'] = ref_audio
+
+        if request.stream:
+            tts_fn = engine.clone_voice_stream_async
+        else:
+            tts_fn = engine.clone_voice_async
+    else:
+        api_inputs['name'] = request.voice
+        if request.stream:
+            tts_fn = engine.speak_stream_async
+        else:
+            tts_fn = engine.speak_async
+
+    audio_writer = StreamingAudioWriter(request.response_format, sample_rate=engine.SAMPLE_RATE)
     if request.stream:
         return StreamingResponse(
             generate_audio_stream(
-                engine.speak_stream_async,
+                tts_fn,
                 api_inputs,
                 audio_writer,
                 client_request
@@ -140,7 +161,7 @@ async def create_speech(
         }
     try:
         # Generate complete audio using public interface
-        audio_data = await engine.speak_async(
+        audio_data = await tts_fn(
            **api_inputs
        )
    except Exception as e:
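The heart of this change is the dispatch above: when the engine supports cloning and `voice` does not match a registered role, the value is treated as reference audio; otherwise it is used as a role name. Reduced to a standalone sketch for readability (the `pick_tts_call` helper and its signature are illustrative, not part of the codebase):

```python
# A condensed sketch of the routing decision introduced above. `engine` stands in
# for the real AutoEngine instance; only the attributes the router actually
# touches are used here.
from typing import Any, Callable


def pick_tts_call(engine: Any, voice: str, stream: bool, api_inputs: dict) -> Callable:
    """Choose between speak and clone based on whether `voice` names a registered role."""
    if engine._SUPPORT_CLONE and voice not in engine.list_roles():
        if engine.engine_name == "mega":
            # MegaTTS cloning needs an extra `latent_file`, so the OpenAI route rejects it
            raise ValueError("voice cloning via the OpenAI route is not supported for mega")
        # In the real router, load_base64_or_url() first resolves the URL/base64 to bytes
        api_inputs["reference_audio"] = voice
        return engine.clone_voice_stream_async if stream else engine.clone_voice_async
    api_inputs["name"] = voice
    return engine.speak_stream_async if stream else engine.speak_async
```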

flashtts/server/protocol.py

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ class OpenAISpeechRequest(BaseModel):
     input: str = Field(..., description="The text to generate audio for")
     voice: str = Field(
         default=None,
-        description="The voice to use for generation. Can be a base voice or a combined voice name.",
+        description="The name of the audio role you want to use, or a URL or base64 string of a reference audio.",
     )
     pitch: float = Field(
         default=1.0,

flashtts/server/utils/utils.py

Lines changed: 20 additions & 15 deletions
@@ -23,23 +23,28 @@ async def get_audio_bytes_from_url(url: str) -> bytes:
     return response.content
 
 
-async def load_audio_bytes(audio_file, audio):
-    if audio_file is None:
-        # Decide how to load the audio based on the content of reference_audio
-        if audio.startswith("http://") or audio.startswith("https://"):
-            audio_bytes = await get_audio_bytes_from_url(audio)
-        else:
-            try:
-                audio_bytes = base64.b64decode(audio)
-            except Exception as e:
-                logger.warning("无效的 base64 音频数据: " + str(e))
-                raise HTTPException(status_code=400, detail="无效的 base64 音频数据: " + str(e))
-        # Wrap the byte data in BytesIO, then read it with soundfile as a numpy array
+async def load_base64_or_url(audio):
+    # Decide how to load the audio based on the content of reference_audio
+    if audio.startswith("http://") or audio.startswith("https://"):
+        audio_bytes = await get_audio_bytes_from_url(audio)
+    else:
         try:
-            bytes_io = io.BytesIO(audio_bytes)
+            audio_bytes = base64.b64decode(audio)
         except Exception as e:
-            logger.warning("读取参考音频失败: " + str(e))
-            raise HTTPException(status_code=400, detail="读取参考音频失败: " + str(e))
+            logger.warning("无效的 base64 音频数据: " + str(e))
+            raise HTTPException(status_code=400, detail="无效的 base64 音频数据: " + str(e))
+    # Wrap the byte data in BytesIO, then read it with soundfile as a numpy array
+    try:
+        bytes_io = io.BytesIO(audio_bytes)
+    except Exception as e:
+        logger.warning("读取参考音频失败: " + str(e))
+        raise HTTPException(status_code=400, detail="读取参考音频失败: " + str(e))
+    return bytes_io
+
+
+async def load_audio_bytes(audio_file, audio):
+    if audio_file is None:
+        bytes_io = await load_base64_or_url(audio)
     else:
         content = await audio_file.read()
         if not content:
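The refactor extracts the URL/base64 handling into `load_base64_or_url`, which the OpenAI router reuses. A small sketch of exercising the helper in isolation with a base64 payload; the file name `sample.wav` and the `asyncio.run` wrapper are assumptions about how you would try it locally:

```python
# Minimal sketch exercising the extracted helper with a base64 payload.
# Assumes flashtts (and its server dependencies) is installed and sample.wav exists.
import asyncio
import base64

from flashtts.server.utils.utils import load_base64_or_url


async def main() -> None:
    with open("sample.wav", "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode("utf-8")

    # For a base64 string the helper decodes it; for an http(s) URL it downloads it.
    bytes_io = await load_base64_or_url(audio_b64)
    print("loaded", len(bytes_io.getvalue()), "bytes of reference audio")


if __name__ == "__main__":
    asyncio.run(main())
```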

setup.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ def get_readme() -> str:
 
 setup(
     name='flashtts',
-    version='0.1.3',
+    version='0.1.4',
     description='A Fast TTS toolkit',
     long_description=get_readme(),
     long_description_content_type='text/markdown',

0 commit comments
