From 63a4bb44b96665f8e3272566ff65270832369f8b Mon Sep 17 00:00:00 2001 From: stephantul Date: Mon, 2 Jun 2025 20:16:28 +0200 Subject: [PATCH] feat: make normalization dependent on spacing --- model2vec/tokenizer/normalizer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/model2vec/tokenizer/normalizer.py b/model2vec/tokenizer/normalizer.py index c54308d..8f9d947 100644 --- a/model2vec/tokenizer/normalizer.py +++ b/model2vec/tokenizer/normalizer.py @@ -18,10 +18,18 @@ def replace_normalizer( :param tokenizer: The tokenizer to change. :return: The tokenizer with a replaced normalizer. """ + spaces_punctuation = tokenizer.encode("a, ,", add_special_tokens=False).tokens + if len(spaces_punctuation) != 3: + add_space = False + else: + _, first_comma, second_comma = spaces_punctuation + add_space = first_comma == second_comma == "," + normalizer = tokenizer.normalizer new_normalizers = [] for char in punctuation: - new_normalizers.append(Replace(char, f" {char} ")) + replacement = f" {char} " if add_space else f"{char} " + new_normalizers.append(Replace(char, replacement)) new_normalizers.append(Replace(Regex(r"\s+"), " ")) new_normalizers.append(Strip(right=True))