Skip to content

Commit 06a478c

Browse files
authored
feat: make normalization dependent on spacing (#259)
1 parent a3c42a0 commit 06a478c

File tree

1 file changed

+9 / -1 lines changed

1 file changed

+9 / -1 lines changed

model2vec/tokenizer/normalizer.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,18 @@ def replace_normalizer(
1818
:param tokenizer: The tokenizer to change.
1919
:return: The tokenizer with a replaced normalizer.
2020
"""
21+
spaces_punctuation = tokenizer.encode("a, ,", add_special_tokens=False).tokens
22+
if len(spaces_punctuation) != 3:
23+
add_space = False
24+
else:
25+
_, first_comma, second_comma = spaces_punctuation
26+
add_space = first_comma == second_comma == ","
27+
2128
normalizer = tokenizer.normalizer
2229
new_normalizers = []
2330
for char in punctuation:
24-
new_normalizers.append(Replace(char, f" {char} "))
31+
replacement = f" {char} " if add_space else f"{char} "
32+
new_normalizers.append(Replace(char, replacement))
2533

2634
new_normalizers.append(Replace(Regex(r"\s+"), " "))
2735
new_normalizers.append(Strip(right=True))

0 commit comments

Comments
 (0)