From 9bec4956358756fc4c3d90545d2bf08271797308 Mon Sep 17 00:00:00 2001
From: stephantul
Date: Tue, 27 May 2025 11:08:31 +0200
Subject: [PATCH 1/3] wip

---
 model2vec/tokenizer/tokenizer.py | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/model2vec/tokenizer/tokenizer.py b/model2vec/tokenizer/tokenizer.py
index 7e7bcc5..1a30fc4 100644
--- a/model2vec/tokenizer/tokenizer.py
+++ b/model2vec/tokenizer/tokenizer.py
@@ -65,7 +65,7 @@ def replace_vocabulary(
     # Remove old added tokens from added tokens
     tokenizer_json["added_tokens"] = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}]
 
-    tokenizer_json = process_tokenizer(tokenizer_json, pre_tokenized_tokens, "[UNK]")
+    tokenizer_json = process_tokenizer(tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None)
 
     # Remap special tokens
     tokenizer_json["added_tokens"] = _remap_added_tokens(
@@ -305,12 +305,7 @@ def turn_tokens_into_ids(
     :raises ValueError: If the tokenizer returns an unexpected number of tokens for a single token
     """
     unk_id = None if unk_token is None else tokenizer.convert_tokens_to_ids(unk_token)
-
-    encoding = tokenizer.encode("a", add_special_tokens=True)
-
-    if len(encoding) != 3:
-        raise ValueError(f"Tokenizer returned {len(encoding)} tokens for a single token. This is not supported.")
-    bos, _, eos = encoding
+    prefix, suffix = find_eos_bos(tokenizer)
 
     token_ids: list[list[int]] = []
     for token in tokens:
@@ -321,13 +316,28 @@ def turn_tokens_into_ids(
             # Explicitly check and warn if `unk_id` appears, but don't crash.
             if unk_id is not None and token_id == unk_id and token.form != unk_token:
                 logger.warning(f"Token {token.form} was set to unk. This is wrong.")
-            token_ids.append([bos, token_id, eos])
+            token_ids.append([*prefix, token_id, *suffix])
         else:
            token_ids.append(tokenizer.encode(token.form))
 
     return token_ids
 
 
+def find_eos_bos(tokenizer: PreTrainedTokenizerFast) -> tuple[list[int], list[int]]:
+    """Finds the eos and bos tokens for a tokenizer."""
+    # Little bit complicated, because not all tokenizers have eos and bos tokens.
+    encoding = tokenizer.encode("a", add_special_tokens=True)
+    if len(encoding) != 3:
+        a_encoded = tokenizer.encode("a", add_special_tokens=False)
+        if len(a_encoded) != 1:
+            raise ValueError(f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'")
+        a_idx = encoding.index(a_encoded[0])
+        prefix, suffix = encoding[:a_idx], encoding[a_idx + 1:]
+    else:
+        prefix, suffix = encoding[:1], encoding[2:]
+    return prefix,suffix
+
+
 def _normalize_vocabulary_token(token: str, pre_tokenizer: PreTokenizer) -> str:
     """Normalize a token that is not in the initial token vocabulary."""
     # Add prefix space for byte tokenizers.
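Note (editor's sketch, not part of the patch series): the new find_eos_bos helper replaces the old hard-coded assumption that encoding a single character always yields exactly [bos, token, eos]. The snippet below illustrates the idea with a transformers fast tokenizer; the model name and the printed ids are examples only, not taken from the patch.

    from transformers import AutoTokenizer

    # Example model; any fast tokenizer would do.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Encoding "a" with special tokens exposes the surrounding special-token ids.
    encoding = tokenizer.encode("a", add_special_tokens=True)   # e.g. [101, 1037, 102]
    bare = tokenizer.encode("a", add_special_tokens=False)      # e.g. [1037]

    # Instead of assuming exactly three ids, locate the bare token and treat
    # everything before it as the prefix and everything after it as the suffix.
    a_idx = encoding.index(bare[0])
    prefix, suffix = encoding[:a_idx], encoding[a_idx + 1:]     # e.g. [101], [102]

This mirrors the patched logic: a tokenizer that adds more (or fewer) than two special tokens around a single character no longer raises, as long as "a" itself encodes to a single id.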
From fd7d473cf55eb89cb97fb202fe6762084bd6222d Mon Sep 17 00:00:00 2001
From: stephantul
Date: Tue, 27 May 2025 11:28:56 +0200
Subject: [PATCH 2/3] fix: tokenizer bug

---
 model2vec/tokenizer/tokenizer.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/model2vec/tokenizer/tokenizer.py b/model2vec/tokenizer/tokenizer.py
index 1a30fc4..88016c7 100644
--- a/model2vec/tokenizer/tokenizer.py
+++ b/model2vec/tokenizer/tokenizer.py
@@ -65,7 +65,9 @@ def replace_vocabulary(
     # Remove old added tokens from added tokens
     tokenizer_json["added_tokens"] = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}]
 
-    tokenizer_json = process_tokenizer(tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None)
+    tokenizer_json = process_tokenizer(
+        tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None
+    )
 
     # Remap special tokens
     tokenizer_json["added_tokens"] = _remap_added_tokens(
@@ -111,11 +113,11 @@ def clean_and_create_vocabulary(
     internal_vocab: dict[str, int] = tokenizer.get_vocab()
     internal_tokens: list[str] = [k for k, _ in sorted(internal_vocab.items(), key=lambda x: x[1])]
 
+    cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
     # Copy the backend tokenizer to avoid modifying the original.
     backend_tokenizer = backend_tokenizer.from_str(backend_tokenizer.to_str())
     backend_tokenizer = replace_normalizer(backend_tokenizer)
 
-    cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
     internal_tokens_set = {token.form for token in cleaned_vocabulary}
 
     normalizer: Normalizer | None = backend_tokenizer.normalizer
@@ -302,7 +304,6 @@ def turn_tokens_into_ids(
     :param tokenizer: The tokenizer to use for converting tokens to IDs
     :param unk_token: The string form of the unk token.
     :return: List of token IDs corresponding to the input tokens
-    :raises ValueError: If the tokenizer returns an unexpected number of tokens for a single token
     """
     unk_id = None if unk_token is None else tokenizer.convert_tokens_to_ids(unk_token)
     prefix, suffix = find_eos_bos(tokenizer)
@@ -330,12 +331,14 @@ def find_eos_bos(tokenizer: PreTrainedTokenizerFast) -> tuple[list[in
     if len(encoding) != 3:
         a_encoded = tokenizer.encode("a", add_special_tokens=False)
         if len(a_encoded) != 1:
-            raise ValueError(f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'")
+            raise ValueError(
+                f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'"
+            )
         a_idx = encoding.index(a_encoded[0])
-        prefix, suffix = encoding[:a_idx], encoding[a_idx + 1:]
+        prefix, suffix = encoding[:a_idx], encoding[a_idx + 1 :]
     else:
         prefix, suffix = encoding[:1], encoding[2:]
-    return prefix,suffix
+    return prefix, suffix
 
 
 def _normalize_vocabulary_token(token: str, pre_tokenizer: PreTokenizer) -> str:

From 32343b155f7cdde7a3316f59b5234af6456b6314 Mon Sep 17 00:00:00 2001
From: stephantul
Date: Tue, 27 May 2025 12:15:05 +0200
Subject: [PATCH 3/3] add correct scaling for byte

---
 model2vec/tokenizer/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model2vec/tokenizer/model.py b/model2vec/tokenizer/model.py
index 84c936a..12dd388 100644
--- a/model2vec/tokenizer/model.py
+++ b/model2vec/tokenizer/model.py
@@ -40,4 +40,4 @@ def _process_unigram(
 def _calculate_token_weight_for_unigram(token: str) -> float:
     """Calculate the token weight for Unigram."""
     # Always prefer longer tokens.
-    return len(token) + token.count("▁")
+    return len(token) + token.count("▁") + token.count("Ġ")
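Note (editor's sketch, not part of the patch series): patch 3 makes the Unigram token weight count the byte-level word-boundary marker "Ġ" in addition to the SentencePiece marker "▁", so tokens from byte-level BPE vocabularies are scaled the same way. A self-contained illustration, reproducing the patched helper; the example tokens are made up.

    def _calculate_token_weight_for_unigram(token: str) -> float:
        """Calculate the token weight for Unigram."""
        # Always prefer longer tokens; "▁" (SentencePiece) and "Ġ" (byte-level BPE)
        # both mark a word boundary and now earn the same bonus.
        return len(token) + token.count("▁") + token.count("Ġ")

    # Before the patch "Ġhello" scored only its length (6), one point below the
    # equivalent SentencePiece spelling "▁hello" (7); now both score 7.
    print(_calculate_token_weight_for_unigram("▁hello"))  # 7
    print(_calculate_token_weight_for_unigram("Ġhello"))  # 7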