3939import os
4040import re
4141import unicodedata
42- from typing import List
42+ from typing import Any , List
4343from functools import partial
44+ from dataclasses import dataclass
4445import numpy as np
4546
4647from util .cleantext import process_thai_repeat , replace_numbers_with_thai
@@ -171,14 +172,46 @@ def _split_ready_vs_tail(text: str, *, final: bool = False) -> tuple[list[str],
171172 return [], s
172173
173174
@dataclass(kw_only=True)
class TtsVoiceWithRef(TtsVoice):
    """Text-to-speech voice that also carries its reference sample.

    Extends ``TtsVoice`` with the two values that the voice loader hands to
    ``preprocess_ref_audio_text`` when it prepares a voice for synthesis.
    Both fields are required and, because of ``kw_only=True``, must be
    supplied by keyword.
    """
    # Path of the reference audio clip (first argument to preprocess_ref_audio_text).
    ref_sound_path: str
    # Text passed alongside that clip (second argument) -- presumably its
    # transcript; TODO confirm against the voices.yaml schema.
    ref_sound_sentence: str
180+
174181# -----------------------
175182# F5-TTS Thai Engine
176183# -----------------------
184+
185+
177186class ThaiF5Engine :
178187 """
179188 Wraps F5-TTS (DiT + vocos) with Thai finetuned checkpoint.
180189 Produces 24 kHz mono float32 waveform via infer_process().
181190 """
191+ DEFAULT_VOICE_LIST = [
192+ TtsVoice (
193+ name = "thai-default" ,
194+ attribution = Attribution (
195+ name = "VIZINTZOR/F5-TTS-THAI" ,
196+ url = "https://huggingface.co/VIZINTZOR/F5-TTS-THAI" ,
197+ ),
198+ languages = ["th" , "th-TH" ],
199+ description = "Thai female (F5-TTS finetune)" ,
200+ installed = True ,
201+ version = "1.0" ,
202+ ),
203+ TtsVoice (
204+ name = "default" ,
205+ attribution = Attribution (
206+ name = "VIZINTZOR/F5-TTS-THAI" ,
207+ url = "https://huggingface.co/VIZINTZOR/F5-TTS-THAI" ,
208+ ),
209+ languages = ["th" , "th-TH" ],
210+ description = "Alias of thai-default" ,
211+ installed = True ,
212+ version = "1.0" ,
213+ ),
214+ ]
182215
183216 def __init__ (
184217 self ,
@@ -190,6 +223,7 @@ def __init__(
190223 device : str = "auto" ,
191224 speed : float = SPEAK_SPEED ,
192225 nfe_steps : int = nfe_step ,
226+ voices_yaml : str | None = None ,
193227 ):
194228 # Resolve device
195229 if device == "auto" :
@@ -200,7 +234,9 @@ def __init__(
200234 # Hugging Face repo: VIZINTZOR/F5-TTS-THAI (model_1000000.pt, vocab.txt, sample/ref_audio.wav)
201235 self .ckpt_file = str (cached_path (ckpt_file or "hf://VIZINTZOR/F5-TTS-THAI/model_1000000.pt" ))
202236 self .vocab_file = str (cached_path (vocab_file or "hf://VIZINTZOR/F5-TTS-THAI/vocab.txt" ))
203-
237+ self .voices_yaml = voices_yaml
238+ self .processed_voices : dict [str , dict [str , Any ]] = {}
239+ self .voices_list : list [TtsVoice ] = self .load_voice_yaml () # pyright: ignore[reportAttributeAccessIssue]
204240 # Model base config from f5_tts package
205241 # model_cfg_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../resources/F5-TTS-THAI/F5TTS_Base_train.yaml')
206242 # model_cfg = OmegaConf.load(model_cfg_path).model.arch
@@ -246,6 +282,40 @@ def __init__(
246282 self .sr = 24000
247283 logging .info ("Engine ready: device=%s sr=%d" , self .device , self .sr )
248284
def load_voice_yaml(self) -> "list[TtsVoice]":
    """Build the list of voices this engine advertises.

    When ``self.voices_yaml`` is set, the file is parsed with
    ``yaml.safe_load`` and must contain a top-level list. Each entry is
    converted with ``TtsVoiceWithRef.from_dict`` and its reference clip is
    run through ``preprocess_ref_audio_text``; the results are cached in
    ``self.processed_voices`` under the voice name, with the keys
    ``"ref_audio_p"`` and ``"ref_text_p"``.

    A voice is only registered once its reference material has been
    preprocessed successfully, so every returned voice is guaranteed to
    have a matching ``processed_voices`` entry. Entries that fail either
    step are logged and skipped.

    Returns:
        The validated voices from the YAML file, or a copy of
        ``DEFAULT_VOICE_LIST`` when no file is configured, the file cannot
        be read or parsed, its top level is not a list, or no entry is
        valid.
    """
    # Hand out a copy: the defaults live on the class and are shared by all
    # engines, so callers editing their own list must not mutate them.
    fallback = list(self.DEFAULT_VOICE_LIST)
    if not self.voices_yaml:
        return fallback

    try:
        # Imported lazily so PyYAML is only required when a voices file is
        # actually configured; a missing package falls back to defaults.
        import yaml

        with open(self.voices_yaml, "r", encoding="utf-8") as f:
            raw = yaml.safe_load(f)
    except Exception as err:  # best-effort: any failure means "use defaults"
        logging.warning(f"Failed to read voices yaml at {self.voices_yaml} : {err} ")
        return fallback
    if not isinstance(raw, list):
        logging.warning(f"Voices yaml must be a list at top-level, got {type(raw).__name__} ")
        return fallback

    validated: list[TtsVoiceWithRef] = []
    for i, entry in enumerate(raw):
        try:
            voice = TtsVoiceWithRef.from_dict(entry)
            ref_audio_p, ref_text_p = preprocess_ref_audio_text(
                voice.ref_sound_path, voice.ref_sound_sentence
            )
        except Exception as e:
            logging.warning(f"Invalid voice entry #{i} in {self.voices_yaml} : {e} ")
            continue
        # Register only after preprocessing succeeded, so an advertised
        # voice can always be selected later.
        validated.append(voice)
        self.processed_voices[voice.name] = {
            "ref_audio_p": ref_audio_p,
            "ref_text_p": ref_text_p,
        }
        logging.debug(f"Loaded #{i} : {voice.name} in {self.voices_yaml} ")

    if not validated:
        logging.warning(f"No valid voices found in {self.voices_yaml} ; using DEFAULT_VOICE_LIST")
        return fallback

    return validated
318+
249319 @torch .inference_mode ()
250320 def synth_blocking (self , text : str ) -> np .ndarray :
251321 text = (text or "" ).strip ()
@@ -327,30 +397,7 @@ async def handle_event(self, event: Event) -> bool:
327397 name = "VIZINTZOR/F5-TTS-THAI" ,
328398 url = "https://huggingface.co/VIZINTZOR/F5-TTS-THAI" ,
329399 ),
330- voices = [
331- TtsVoice (
332- name = "thai-default" ,
333- attribution = Attribution (
334- name = "VIZINTZOR/F5-TTS-THAI" ,
335- url = "https://huggingface.co/VIZINTZOR/F5-TTS-THAI" ,
336- ),
337- languages = ["th" , "th-TH" ],
338- description = "Thai female (F5-TTS finetune)" ,
339- installed = True ,
340- version = "1.0" ,
341- ),
342- TtsVoice (
343- name = "default" ,
344- attribution = Attribution (
345- name = "VIZINTZOR/F5-TTS-THAI" ,
346- url = "https://huggingface.co/VIZINTZOR/F5-TTS-THAI" ,
347- ),
348- languages = ["th" , "th-TH" ],
349- description = "Alias of thai-default" ,
350- installed = True ,
351- version = "1.0" ,
352- ),
353- ],
400+ voices = self .engine .voices_list ,
354401 installed = True ,
355402 description = "Thai TTS via F5-TTS (DiT + vocos, 24 kHz)" ,
356403 version = "1.0" ,
@@ -377,6 +424,17 @@ async def handle_event(self, event: Event) -> bool:
377424 self ._reset_buffer ()
378425 self ._audio_started = False
379426 logging .info ("Synthesize streaming START: %s" , event )
427+ # Event(type='synthesize-start', data={'voice': {'name': 'default'}}, payload=None)
428+ try :
429+ voice_name = event .data .get ('voice' , {}).get ('name' , "" )
430+ target_voice_dict = self .engine .processed_voices .get (voice_name , None )
431+ if target_voice_dict is not None :
432+ self .engine .ref_audio_p = target_voice_dict .get ("ref_audio_p" )
433+ self .engine .ref_text_p = target_voice_dict .get ("ref_text_p" )
434+ logging .info ("Voice switched to: %s" , voice_name )
435+ except Exception as err :
436+ logging .warning (f"Voice was not selected: { err } " )
437+
380438 # Prime playback immediately so the player opens.
381439 await self ._ensure_audio_started ()
382440 import numpy as _np
@@ -570,7 +628,7 @@ async def main():
570628 ap .add_argument ("--speed" , type = float , default = SPEAK_SPEED , help = "Speech speed multiplier." )
571629 ap .add_argument ("--nfe-steps" , type = int , default = nfe_step , help = "Denoising steps." )
572630 ap .add_argument ("--max-concurrent" , type = int , default = 1 , help = "Legacy params, do not change" )
573-
631+ ap . add_argument ( "--voices-yaml" , default = None , help = "Path to voices.yaml defining available TTS voices/programs." )
574632 ap .add_argument ("--log-level" , default = "INFO" , choices = ["DEBUG" , "INFO" , "WARNING" , "ERROR" ])
575633 args = ap .parse_args ()
576634
@@ -590,6 +648,7 @@ async def main():
590648 device = args .device ,
591649 speed = args .speed ,
592650 nfe_steps = args .nfe_steps ,
651+ voices_yaml = args .voices_yaml ,
593652 )
594653 sem = asyncio .Semaphore (args .max_concurrent ) # TODO: more than 1 is broken
595654
0 commit comments