HuiResearch
diff --git a/‎README.MD
Lines changed: 5 additions & 0 deletions b/‎README.MD
Lines changed: 5 additions & 0 deletions
diff --git a/‎README_EN.MD
Lines changed: 5 additions & 0 deletions b/‎README_EN.MD
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/en/server/server.md
Lines changed: 2 additions & 0 deletions b/‎docs/en/server/server.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/zh/server/server.md
Lines changed: 2 additions & 0 deletions b/‎docs/zh/server/server.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/inference.py
Lines changed: 6 additions & 6 deletions b/‎examples/inference.py
Lines changed: 6 additions & 6 deletions
diff --git a/‎flashtts/commands/serve.py
Lines changed: 12 additions & 3 deletions b/‎flashtts/commands/serve.py
Lines changed: 12 additions & 3 deletions
diff --git a/‎flashtts/engine/spark_engine.py
Lines changed: 42 additions & 14 deletions b/‎flashtts/engine/spark_engine.py
Lines changed: 42 additions & 14 deletions
@@ -209,9 +209,14 @@ flashtts infer \
  --torch_dtype "bfloat16" \ 
  --max_length 32768 \
  --llm_gpu_memory_utilization 0.6 \
+ --fix_voice \  # 启动后将固定住spark-tts中内置音色（female和male）
  --host 0.0.0.0 \
  --port 8000
  ```
+web地址：`http://localhost:8000`
+
+接口文档地址：`http://localhost:8000/docs`
+
 详细部署说明，请参考：[server.md](docs/zh/server/server.md)
 
 ## ⚡ 推理速度
 
@@ -161,10 +161,15 @@ Server deployment:
  --torch_dtype "bfloat16" \ 
  --max_length 32768 \
  --llm_gpu_memory_utilization 0.6 \
+ --fix_voice \  # Keep the built-in spark-tts voices (female and male) fixed so they remain unchanged
  --host 0.0.0.0 \
  --port 8000
  ```
 
+Web address: `http://localhost:8000`
+
+API documentation: `http://localhost:8000/docs`
+
 For detailed deployment instructions, please refer to: [server.md](docs/en/server/server.md)
 
 ## ⚡ Inference Speed
 
@@ -18,6 +18,7 @@
    --torch_dtype "bfloat16" \ # Spark-TTS does not support bfloat16 on all devices; use float32 if needed
    --max_length 32768 \
    --llm_gpu_memory_utilization 0.6 \
+   --fix_voice \ # Keep the built-in spark-tts voices (female and male) fixed so they remain unchanged
    --host 0.0.0.0 \
    --port 8000
    ```
@@ -89,6 +90,7 @@
 | `--wait_timeout`                | float | Timeout (in seconds) for dynamic batching                                                                                                         | 0.01                    |
 | `--host`                        | str   | Host address to bind                                                                                                                              | `0.0.0.0`               |
 | `--port`                        | int   | Port number to listen on                                                                                                                          | 8000                    |
+| `--fix_voice`                   | bool  | Fixes the female and male timbres in the spark-tts model, ensuring they remain unchanged.                                                         | False                   |
 
 ---
 
 
@@ -18,6 +18,7 @@
     --torch_dtype "bfloat16" \ # 对于spark-tts模型，不支持bfloat16的设备，只能设置为float32.
     --max_length 32768 \
     --llm_gpu_memory_utilization 0.6 \
+    --fix_voice \ # 是否固定spark-tts音色（female和male）
     --host 0.0.0.0 \
     --port 8000
     ```
@@ -89,6 +90,7 @@
 | `--wait_timeout`                | float | 动态批处理请求超时秒数                                                                                    | 0.01                    |
 | `--host`                        | str   | 服务监听地址                                                                                         | `0.0.0.0`               |
 | `--port`                        | int   | 服务监听端口                                                                                         | 8000                    |
+| `--fix_voice`                   | bool  | 是否固定住spark-tts模型的内置音色                                                                          | False                   |
 
 ### 3. 接口使用流程
 
 
@@ -178,16 +178,16 @@ async def retain_acoustic_example(engine: AutoEngine):
         name="female",
         return_acoustic_tokens=True
     )
-    # 2. 真巧，这是我想要的音色，直接保存为txt
-    tokens.save("acoustic_tokens.txt")
+    # 2. 真巧，这是我想要的音色，直接保存为json
+    tokens.save("acoustic_tokens.json")
     # 同时保存第一次生成的音频，以便对比
     engine.write_audio(wav, "first.wav")
 
     # 3. 加载保存的音色，生成第二个音频
     wav = await engine.speak_async(
         text="国际局势中，某国领导人围绕地区冲突停火问题展开对话，双方同意停止攻击对方能源设施并推动谈判，但对全面停火提议的落实仍存分歧。",
         name="female",
-        acoustic_tokens=SparkAcousticTokens.load("acoustic_tokens.txt"),
+        acoustic_tokens=SparkAcousticTokens.load("acoustic_tokens.json"),
     )
     engine.write_audio(wav, "second.wav")
     # 4. 试听first.wav和second.wav，惊奇发现，这两个音频的音色是一致的
@@ -212,16 +212,16 @@ async def retain_acoustic_stream_example(engine: AutoEngine):
             audios.append(chunk)
     audio = np.concatenate(audios)
 
-    # 2. 真巧，这是我想要的音色，直接保存为txt
+    # 2. 真巧，这是我想要的音色，直接保存为json
     engine.write_audio(audio, "first.wav")
-    acoustic_tokens.save("acoustic_tokens.txt")
+    acoustic_tokens.save("acoustic_tokens.json")
 
     # 3. 加载保存的音色，生成第二个音频
     audios = []
     async for chunk in engine.speak_stream_async(
             text="今日是二零二五年三月十九日，国内外热点事件聚焦于国际局势、经济政策及社会民生领域。",
             name="female",
-            acoustic_tokens=SparkAcousticTokens.load("acoustic_tokens.txt")
+            acoustic_tokens=SparkAcousticTokens.load("acoustic_tokens.json")
     ):
         audios.append(chunk)
 
 
@@ -18,6 +18,7 @@
 from flashtts.server.base_router import base_router, SPEAKER_TMP_PATH
 from flashtts.server.openai_router import openai_router
 from flashtts.commands.utils import add_model_parser
+from flashtts.server.protocol import StateInfo
 
 logger = get_logger()
 
@@ -132,8 +133,11 @@ async def lifespan(app: FastAPI):
         await warmup_engine(engine)
         # 将 engine 保存到 app.state 中，方便路由中使用
         app.state.engine = engine
-        app.state.model_name = args.model_name or engine.engine_name
-        app.state.db_path = args.db_path
+        app.state.state_info = StateInfo(
+            model_name=args.model_name or engine.engine_name,
+            db_path=args.db_path,
+            fix_voice=args.fix_voice
+        )
         yield
 
         if os.path.exists(SPEAKER_TMP_PATH):
@@ -194,7 +198,12 @@ def register_subcommand(parser: ArgumentParser):
             "--api_key",
             type=str,
             default=None,
-            help="API key for request authentication",
+            help="API key for request authentication"
+        )
+        serve_parser.add_argument(
+            "--fix_voice",
+            action="store_true",
+            help="Fixes the female and male timbres in the spark-tts model, ensuring they remain unchanged."
         )
 
         serve_parser.add_argument(
 
@@ -2,6 +2,7 @@
 # Time      :2025/3/29 11:16
 # Author    :Hui Huang
 import asyncio
+import json
 import math
 import os.path
 import re
@@ -42,15 +43,18 @@
     "very_high": 4,
 }
 
-GENDER_MAP = {
+GENDER_MAP: dict[Literal["male", "female"], int] = {
     "female": 0,
     "male": 1,
 }
 
+ID2GENDER = {v: k for k, v in GENDER_MAP.items()}
+
 
 @dataclass
 class SparkAcousticTokens:
     prompt: str
+    gender: Literal["female", "male"]
     global_tokens: Optional[torch.Tensor] = None
 
     def __post_init__(self):
@@ -73,15 +77,21 @@ def _parse_prompt(self):
             )
             self.global_tokens = global_token_ids
 
+    def to_dict(self) -> dict[str, str]:
+        return {
+            "prompt": self.prompt,
+            "gender": self.gender
+        }
+
     def save(self, filepath: str):
         with open(filepath, 'w', encoding='utf8') as w:
-            w.write(self.prompt)
+            w.write(json.dumps(self.to_dict(), ensure_ascii=False, indent=2))
 
     @classmethod
     def load(cls, filepath: str):
         with open(filepath, 'r', encoding='utf8') as r:
-            prompt = r.read()
-        return cls(prompt=prompt)
+            data = json.load(r)
+        return cls(**data)
 
 
 def process_prompt(
@@ -619,6 +629,18 @@ async def _control_generate(
             acoustic_tokens: Optional[SparkAcousticTokens | str] = None,
             return_acoustic_tokens: bool = False,
             **kwargs):
+        gender: Literal["female", "male"] = gender if gender in ["female", "male"] else "female"
+
+        if acoustic_tokens is not None and isinstance(acoustic_tokens, str):
+            acoustic_tokens = SparkAcousticTokens.load(acoustic_tokens)
+
+        if acoustic_tokens is not None:
+            if acoustic_tokens.gender != gender:
+                logger.warning(
+                    f"The provided `acoustic_tokens` belong to the `{acoustic_tokens.gender}`, but the specified gender is {gender}. "
+                    f"The `acoustic_tokens` will therefore not be used.")
+                acoustic_tokens = None
+
         segments = self.preprocess_text(
             text,
             window_size=window_size,
@@ -654,14 +676,11 @@ async def generate_audio(
                 "completion": generated['completion']
             }
 
-        if acoustic_tokens is not None and isinstance(acoustic_tokens, str):
-            acoustic_tokens = SparkAcousticTokens(acoustic_tokens)
-
         audios = []
         if acoustic_tokens is None:
             # 如果没有传入音色，使用第一段生成音色token，将其与后面片段一起拼接，使用相同音色token引导输出semantic tokens。
             first_output = await generate_audio(segments[0], acoustic_token=None)
-            acoustic_tokens = SparkAcousticTokens(first_output['completion'])
+            acoustic_tokens = SparkAcousticTokens(first_output['completion'], gender=gender)
             audios.append(first_output['audio'])
             segments = segments[1:]
 
@@ -706,7 +725,7 @@ async def speak_async(
             logger.error(err_msg)
             raise ValueError(err_msg)
         self.set_seed(seed=self.seed)
-        acoustic_tokens = None
+        out_acoustic_tokens = None
         if name in ["female", "male"]:
             output = await self._control_generate(
                 text=text,
@@ -727,7 +746,7 @@ async def speak_async(
             )
             if return_acoustic_tokens and isinstance(output, tuple):
                 audio = output[0]
-                acoustic_tokens = output[1]
+                out_acoustic_tokens = output[1]
             else:
                 audio = output
         else:
@@ -756,8 +775,8 @@ async def speak_async(
 
         torch.cuda.empty_cache()
 
-        if acoustic_tokens is not None:
-            return audio, acoustic_tokens
+        if out_acoustic_tokens is not None:
+            return audio, out_acoustic_tokens
         return audio
 
     async def _control_stream_generate(
@@ -782,6 +801,8 @@ async def _control_stream_generate(
             return_acoustic_tokens: bool = False,
             **kwargs
     ):
+        gender: Literal["female", "male"] = gender if gender in ["female", "male"] else "female"
+
         if audio_chunk_duration < 0.5:
             err_msg = "audio_chunk_duration at least 0.5 seconds"
             logger.error(err_msg)
@@ -792,7 +813,14 @@ async def _control_stream_generate(
             raise ValueError(err_msg)
 
         if acoustic_tokens is not None and isinstance(acoustic_tokens, str):
-            acoustic_tokens = SparkAcousticTokens(acoustic_tokens)
+            acoustic_tokens = SparkAcousticTokens.load(acoustic_tokens)
+
+        if acoustic_tokens is not None:
+            if acoustic_tokens.gender != gender:
+                logger.warning(
+                    f"The provided `acoustic_tokens` belong to the `{acoustic_tokens.gender}`, but the specified gender is {gender}. "
+                    f"The `acoustic_tokens` will therefore not be used.")
+                acoustic_tokens = None
 
         audio_tokenizer_frame_rate = 50
         max_chunk_size = math.ceil(max_audio_chunk_duration * audio_tokenizer_frame_rate)
@@ -840,7 +868,7 @@ async def _control_stream_generate(
                         r"(<\|start_acoustic_token\|>.*?<\|end_global_token\|>)",
                         completion)
                     if len(acoustics) > 0:
-                        acoustic_tokens = SparkAcousticTokens(acoustics[0])
+                        acoustic_tokens = SparkAcousticTokens(acoustics[0], gender=gender)
                         completion = ""
                     else:
                         continue