Skip to content

Commit d67570a

Browse files
Merge pull request #210 from fireblade2534/preserve-custom-phenomes
This fix allows for inputting custom pronunciations through text. For example: "This is a test of a [bla bla](/ðɪs ɪz ˈoʊnli ɐ tˈɛst/) system." It ensures that normalization does not affect custom pronunciations.
2 parents 2a54140 + 43576c4 commit d67570a

File tree

2 files changed

+23
-5
lines changed

2 files changed

+23
-5
lines changed

api/src/services/text_processing/normalizer.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,6 @@ def normalize_text(text: str,normalization_options: NormalizationOptions) -> str
270270
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
271271
text = text.replace("«", chr(8220)).replace("»", chr(8221))
272272
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
273-
text = text.replace("(", "«").replace(")", "»")
274273

275274
# Handle CJK punctuation and some non standard chars
276275
for a, b in zip("、。!,:;?–", ",.!,:;?-"):

api/src/services/text_processing/text_processor.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import re
44
import time
5-
from typing import AsyncGenerator, List, Tuple
5+
from typing import AsyncGenerator, Dict, List, Tuple
66

77
from loguru import logger
88

@@ -12,6 +12,9 @@
1212
from .vocabulary import tokenize
1313
from ...structures.schemas import NormalizationOptions
1414

15+
# Pre-compiled regex patterns for performance
16+
CUSTOM_PHONEMES = re.compile(r"(\[([^\]]|\n)*?\])(\(\/([^\/)]|\n)*?\/\))")
17+
1518
def process_text_chunk(
1619
text: str, language: str = "a", skip_phonemize: bool = False
1720
) -> List[int]:
@@ -85,12 +88,21 @@ def process_text(text: str, language: str = "a") -> List[int]:
8588
return process_text_chunk(text, language)
8689

8790

88-
def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
91+
def get_sentence_info(text: str, custom_phenomes_list: Dict[str, str]) -> List[Tuple[str, List[int], int]]:
8992
"""Process all sentences and return info."""
9093
sentences = re.split(r"([.!?;:])(?=\s|$)", text)
94+
phoneme_length, min_value = len(custom_phenomes_list), 0
95+
9196
results = []
9297
for i in range(0, len(sentences), 2):
9398
sentence = sentences[i].strip()
99+
for replaced in range(min_value, phoneme_length):
100+
current_id = f"</|custom_phonemes_{replaced}|/>"
101+
if current_id in sentence:
102+
sentence = sentence.replace(current_id, custom_phenomes_list.pop(current_id))
103+
min_value += 1
104+
105+
94106
punct = sentences[i + 1] if i + 1 < len(sentences) else ""
95107

96108
if not sentence:
@@ -102,6 +114,10 @@ def get_sentence_info(text: str) -> List[Tuple[str, List[int], int]]:
102114

103115
return results
104116

117+
def handle_custom_phonemes(s: re.Match[str], phenomes_list: Dict[str,str]) -> str:
118+
latest_id = f"</|custom_phonemes_{len(phenomes_list)}|/>"
119+
phenomes_list[latest_id] = s.group(0).strip()
120+
return latest_id
105121

106122
async def smart_split(
107123
text: str,
@@ -114,15 +130,18 @@ async def smart_split(
114130
chunk_count = 0
115131
logger.info(f"Starting smart split for {len(text)} chars")
116132

133+
custom_phoneme_list = {}
134+
117135
# Normalize text
118136
if settings.advanced_text_normalization and normalization_options.normalize:
119137
if lang_code in ["a","b","en-us","en-gb"]:
138+
text = CUSTOM_PHONEMES.sub(lambda s: handle_custom_phonemes(s, custom_phoneme_list), text)
120139
text=normalize_text(text,normalization_options)
121140
else:
122141
logger.info("Skipping text normalization as it is only supported for english")
123142

124143
# Process all sentences
125-
sentences = get_sentence_info(text)
144+
sentences = get_sentence_info(text, custom_phoneme_list)
126145

127146
current_chunk = []
128147
current_tokens = []
@@ -245,4 +264,4 @@ async def smart_split(
245264
total_time = time.time() - start_time
246265
logger.info(
247266
f"Split completed in {total_time * 1000:.2f}ms, produced {chunk_count} chunks"
248-
)
267+
)

0 commit comments

Comments
 (0)