Skip to content

Commit 75ea68e

Browse files
committed
Add ability to change voices on the fly with yaml config
1 parent 87afecd commit 75ea68e

File tree

3 files changed

+120
-28
lines changed

3 files changed

+120
-28
lines changed

README.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,37 @@ You can override other parameters via environment variables (see below).
9292
| `THTTS_VOCAB_FILE` | *(auto-selected by backend)* | F5 vocab file path |
9393
| `THTTS_SPEAK_SPEED` | | |
9494
| `THTTS_MAX_WAIT_MS` | | |
95-
| `THTTS_MIN_SENT_CHARS` | | |
95+
| `THTTS_MIN_SENT_CHARS` | | |
96+
| `THTTS_VOICES_YAML` | | Path to the voices list YAML file, for multiple-voice support (see [Voices List yaml File](#voices-list-yaml-file)) |
9697

98+
### Voices List yaml File
99+
100+
You can set `THTTS_VOICES_YAML` to the path of a YAML file like the following to support multiple voices at the same time:
101+
102+
```yaml
103+
- name: default
104+
attribution:
105+
name: VIZINTZOR/F5-TTS-THAI
106+
url: https://huggingface.co/VIZINTZOR/F5-TTS-THAI
107+
languages: ["th", "th-TH"]
108+
description: Default Original
109+
installed: true
110+
version: "1.0"
111+
ref_sound_path: /mnt/data/services/thtts/ref_sound/original__ฉันเดินทางไปเที่ยวที่จังหวัดเชียงใหม่ในช่วงฤดูหนาวเพื่อสัมผัสอากาศเย็นสบาย.wav
112+
ref_sound_sentence: ฉันเดินทางไปเที่ยวที่จังหวัดเชียงใหม่ในช่วงฤดูหนาวเพื่อสัมผัสอากาศเย็นสบาย
113+
114+
- name: meme
115+
attribution:
116+
name: VIZINTZOR/F5-TTS-THAI
117+
url: https://huggingface.co/VIZINTZOR/F5-TTS-THAI
118+
languages: ["th", "th-TH"]
119+
description: meme Female
120+
installed: true
121+
version: "1.0"
122+
ref_sound_path: /mnt/data/services/thtts/ref_sound/meme__ชั้นเดินทางไปเที่ยวที่จังหวัดเชียงใหม่ในช่วงฤดูหนาวเพื่อสัมผัสอากาศเย็นสบาย.mp3
123+
ref_sound_sentence: ชั้นเดินทางไปเที่ยวที่จังหวัดเชียงใหม่ในช่วงฤดูหนาวเพื่อสัมผัสอากาศเย็นสบาย
124+
125+
```
97126

98127
## 3. Docker Compose (NVIDIA GPU)
99128

entrypoint.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ set -Eeuo pipefail
1818
: "${THTTS_MAX_CONCURRENT:=1}"
1919
: "${THTTS_CKPT_FILE:=}" # optional override
2020
: "${THTTS_VOCAB_FILE:=}" # optional override
21+
: "${THTTS_VOICES_YAML:=}" # optional voices.yaml path
2122

2223
BACKEND="${THTTS_BACKEND:-VITS}"
2324
BACKEND_UPPER="$(echo "$BACKEND" | tr '[:lower:]' '[:upper:]')"
@@ -63,6 +64,9 @@ run_f5 () {
6364
if [[ -n "${THTTS_REF_TEXT}" ]]; then
6465
args+=( --ref-text "${THTTS_REF_TEXT}" )
6566
fi
67+
if [[ -n "${THTTS_VOICES_YAML}" ]]; then
68+
args+=( --voices-yaml "${THTTS_VOICES_YAML}" )
69+
fi
6670

6771
exec uv run python src/wyoming_thai_f5.py "${args[@]}"
6872
}

src/wyoming_thai_f5.py

Lines changed: 86 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,9 @@
3939
import os
4040
import re
4141
import unicodedata
42-
from typing import List
42+
from typing import Any, List
4343
from functools import partial
44+
from dataclasses import dataclass
4445
import numpy as np
4546

4647
from util.cleantext import process_thai_repeat, replace_numbers_with_thai
@@ -171,14 +172,46 @@ def _split_ready_vs_tail(text: str, *, final: bool = False) -> tuple[list[str],
171172
return [], s
172173

173174

175+
@dataclass(kw_only=True)
class TtsVoiceWithRef(TtsVoice):
    """A ``TtsVoice`` that also carries the reference clip for that voice.

    Both extra fields are required and, because of ``kw_only=True``, must be
    supplied by keyword. ``ThaiF5Engine.load_voice_yaml`` builds instances of
    this class from the voices YAML and hands the two fields to
    ``preprocess_ref_audio_text``.
    """

    # Path to the reference audio file for this voice.
    ref_sound_path: str
    # Sentence paired with the reference audio
    # (presumably the transcript of the clip -- confirm against the F5-TTS helper).
    ref_sound_sentence: str
180+
174181
# -----------------------
175182
# F5-TTS Thai Engine
176183
# -----------------------
184+
185+
177186
class ThaiF5Engine:
178187
"""
179188
Wraps F5-TTS (DiT + vocos) with Thai finetuned checkpoint.
180189
Produces 24 kHz mono float32 waveform via infer_process().
181190
"""
191+
DEFAULT_VOICE_LIST = [
192+
TtsVoice(
193+
name="thai-default",
194+
attribution=Attribution(
195+
name="VIZINTZOR/F5-TTS-THAI",
196+
url="https://huggingface.co/VIZINTZOR/F5-TTS-THAI",
197+
),
198+
languages=["th", "th-TH"],
199+
description="Thai female (F5-TTS finetune)",
200+
installed=True,
201+
version="1.0",
202+
),
203+
TtsVoice(
204+
name="default",
205+
attribution=Attribution(
206+
name="VIZINTZOR/F5-TTS-THAI",
207+
url="https://huggingface.co/VIZINTZOR/F5-TTS-THAI",
208+
),
209+
languages=["th", "th-TH"],
210+
description="Alias of thai-default",
211+
installed=True,
212+
version="1.0",
213+
),
214+
]
182215

183216
def __init__(
184217
self,
@@ -190,6 +223,7 @@ def __init__(
190223
device: str = "auto",
191224
speed: float = SPEAK_SPEED,
192225
nfe_steps: int = nfe_step,
226+
voices_yaml: str | None = None,
193227
):
194228
# Resolve device
195229
if device == "auto":
@@ -200,7 +234,9 @@ def __init__(
200234
# Hugging Face repo: VIZINTZOR/F5-TTS-THAI (model_1000000.pt, vocab.txt, sample/ref_audio.wav)
201235
self.ckpt_file = str(cached_path(ckpt_file or "hf://VIZINTZOR/F5-TTS-THAI/model_1000000.pt"))
202236
self.vocab_file = str(cached_path(vocab_file or "hf://VIZINTZOR/F5-TTS-THAI/vocab.txt"))
203-
237+
self.voices_yaml = voices_yaml
238+
self.processed_voices: dict[str, dict[str, Any]] = {}
239+
self.voices_list: list[TtsVoice] = self.load_voice_yaml() # pyright: ignore[reportAttributeAccessIssue]
204240
# Model base config from f5_tts package
205241
# model_cfg_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../resources/F5-TTS-THAI/F5TTS_Base_train.yaml')
206242
# model_cfg = OmegaConf.load(model_cfg_path).model.arch
@@ -246,6 +282,40 @@ def __init__(
246282
self.sr = 24000
247283
logging.info("Engine ready: device=%s sr=%d", self.device, self.sr)
248284

285+
def load_voice_yaml(self) -> list[TtsVoice]:
    """Build the list of advertised voices from ``self.voices_yaml``.

    Each top-level entry of the YAML file is parsed with
    ``TtsVoiceWithRef.from_dict`` and its reference clip is passed through
    ``preprocess_ref_audio_text``. The preprocessed pair is stored in
    ``self.processed_voices[voice.name]`` under the keys ``"ref_audio_p"``
    and ``"ref_text_p"``.

    A voice is only advertised once its reference data has been
    preprocessed successfully, so every returned voice has a matching
    ``processed_voices`` entry and can actually be selected.

    Returns:
        The voices that were loaded, or a copy of
        ``ThaiF5Engine.DEFAULT_VOICE_LIST`` when no path is configured,
        the file cannot be read or parsed, the top level is not a list,
        or no entry is valid. Problems are logged as warnings and never
        raised.
    """
    # Copy so callers can never mutate the shared class-level default list.
    defaults = list(ThaiF5Engine.DEFAULT_VOICE_LIST)
    if not self.voices_yaml:
        return defaults

    try:
        # Imported lazily: PyYAML is only needed when a voices file is configured.
        import yaml

        with open(self.voices_yaml, "r", encoding="utf-8") as f:
            raw = yaml.safe_load(f)
    except Exception as err:  # best-effort: missing PyYAML, I/O or parse errors
        logging.warning("Failed to read voices yaml at %s: %s", self.voices_yaml, err)
        return defaults
    if not isinstance(raw, list):
        logging.warning("Voices yaml must be a list at top-level, got %s", type(raw).__name__)
        return defaults

    validated: list[TtsVoice] = []
    for i, entry in enumerate(raw):
        try:
            if not isinstance(entry, dict):
                raise TypeError(f"expected a mapping, got {type(entry).__name__}")
            voice = TtsVoiceWithRef.from_dict(entry)
            ref_audio_p, ref_text_p = preprocess_ref_audio_text(
                voice.ref_sound_path, voice.ref_sound_sentence
            )
        except Exception as e:  # one bad entry must not discard the others
            logging.warning("Invalid voice entry #%d in %s: %s", i, self.voices_yaml, e)
            continue

        # Register only after preprocessing succeeded, keeping the advertised
        # list and processed_voices in sync.
        self.processed_voices[voice.name] = {
            "ref_audio_p": ref_audio_p,
            "ref_text_p": ref_text_p,
        }
        validated.append(voice)
        logging.debug("Loaded #%d: %s in %s", i, voice.name, self.voices_yaml)

    if not validated:
        logging.warning("No valid voices found in %s; using DEFAULT_VOICE_LIST", self.voices_yaml)
        return defaults

    return validated
318+
249319
@torch.inference_mode()
250320
def synth_blocking(self, text: str) -> np.ndarray:
251321
text = (text or "").strip()
@@ -327,30 +397,7 @@ async def handle_event(self, event: Event) -> bool:
327397
name="VIZINTZOR/F5-TTS-THAI",
328398
url="https://huggingface.co/VIZINTZOR/F5-TTS-THAI",
329399
),
330-
voices=[
331-
TtsVoice(
332-
name="thai-default",
333-
attribution=Attribution(
334-
name="VIZINTZOR/F5-TTS-THAI",
335-
url="https://huggingface.co/VIZINTZOR/F5-TTS-THAI",
336-
),
337-
languages=["th", "th-TH"],
338-
description="Thai female (F5-TTS finetune)",
339-
installed=True,
340-
version="1.0",
341-
),
342-
TtsVoice(
343-
name="default",
344-
attribution=Attribution(
345-
name="VIZINTZOR/F5-TTS-THAI",
346-
url="https://huggingface.co/VIZINTZOR/F5-TTS-THAI",
347-
),
348-
languages=["th", "th-TH"],
349-
description="Alias of thai-default",
350-
installed=True,
351-
version="1.0",
352-
),
353-
],
400+
voices=self.engine.voices_list,
354401
installed=True,
355402
description="Thai TTS via F5-TTS (DiT + vocos, 24 kHz)",
356403
version="1.0",
@@ -377,6 +424,17 @@ async def handle_event(self, event: Event) -> bool:
377424
self._reset_buffer()
378425
self._audio_started = False
379426
logging.info("Synthesize streaming START: %s", event)
427+
# Event(type='synthesize-start', data={'voice': {'name': 'default'}}, payload=None)
428+
try:
429+
voice_name = event.data.get('voice', {}).get('name', "")
430+
target_voice_dict = self.engine.processed_voices.get(voice_name, None)
431+
if target_voice_dict is not None:
432+
self.engine.ref_audio_p = target_voice_dict.get("ref_audio_p")
433+
self.engine.ref_text_p = target_voice_dict.get("ref_text_p")
434+
logging.info("Voice switched to: %s", voice_name)
435+
except Exception as err:
436+
logging.warning(f"Voice was not selected: {err}")
437+
380438
# Prime playback immediately so the player opens.
381439
await self._ensure_audio_started()
382440
import numpy as _np
@@ -570,7 +628,7 @@ async def main():
570628
ap.add_argument("--speed", type=float, default=SPEAK_SPEED, help="Speech speed multiplier.")
571629
ap.add_argument("--nfe-steps", type=int, default=nfe_step, help="Denoising steps.")
572630
ap.add_argument("--max-concurrent", type=int, default=1, help="Legacy params, do not change")
573-
631+
ap.add_argument("--voices-yaml", default=None, help="Path to voices.yaml defining available TTS voices/programs.")
574632
ap.add_argument("--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"])
575633
args = ap.parse_args()
576634

@@ -590,6 +648,7 @@ async def main():
590648
device=args.device,
591649
speed=args.speed,
592650
nfe_steps=args.nfe_steps,
651+
voices_yaml=args.voices_yaml,
593652
)
594653
sem = asyncio.Semaphore(args.max_concurrent) # TODO: more than 1 is broken
595654

0 commit comments

Comments
 (0)