fix: missing unk, fix bug #251

Merged
merged 3 commits on May 28, 2025
2 changes: 1 addition & 1 deletion model2vec/tokenizer/model.py
@@ -40,4 +40,4 @@ def _process_unigram(
 def _calculate_token_weight_for_unigram(token: str) -> float:
     """Calculate the token weight for Unigram."""
     # Always prefer longer tokens.
-    return len(token) + token.count("▁")
+    return len(token) + token.count("▁") + token.count("Ġ")
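For context on the one-line fix above: byte-level BPE vocabularies (GPT-2 style) mark word boundaries with "Ġ" rather than the metaspace "▁", so those tokens previously missed the boundary bonus. A minimal standalone sketch of the patched weighting (the example tokens are illustrative, not from the repo):

```python
def _calculate_token_weight_for_unigram(token: str) -> float:
    """Calculate the token weight for Unigram."""
    # Always prefer longer tokens; count both metaspace and byte-level boundary markers.
    return len(token) + token.count("▁") + token.count("Ġ")

print(_calculate_token_weight_for_unigram("▁hello"))  # 7: 6 characters plus 1 metaspace marker
print(_calculate_token_weight_for_unigram("Ġhello"))  # 7: was 6 before the fix, now boosted the same way
print(_calculate_token_weight_for_unigram("hello"))   # 5: no boundary marker
```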
33 changes: 23 additions & 10 deletions model2vec/tokenizer/tokenizer.py
@@ -65,7 +65,9 @@

     # Remove old added tokens from added tokens
     tokenizer_json["added_tokens"] = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}]
-    tokenizer_json = process_tokenizer(tokenizer_json, pre_tokenized_tokens, "[UNK]")
+    tokenizer_json = process_tokenizer(
+        tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None
+    )
 
     # Remap special tokens
     tokenizer_json["added_tokens"] = _remap_added_tokens(
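The change above only forwards "[UNK]" when it is actually present in the pre-tokenized vocabulary; otherwise None is passed so downstream code knows there is no unk token. A hedged sketch of that guard (the helper name below is hypothetical, not part of the codebase):

```python
def pick_unk_token(pre_tokenized_tokens: list[str]) -> str | None:
    """Return "[UNK]" only if the vocabulary contains it, else None (hypothetical helper)."""
    return "[UNK]" if "[UNK]" in pre_tokenized_tokens else None

assert pick_unk_token(["[UNK]", "[PAD]", "hello"]) == "[UNK]"
assert pick_unk_token(["hello", "world"]) is None  # no unk token: process_tokenizer is expected to handle None
```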
@@ -111,11 +113,11 @@
     internal_vocab: dict[str, int] = tokenizer.get_vocab()
     internal_tokens: list[str] = [k for k, _ in sorted(internal_vocab.items(), key=lambda x: x[1])]
 
-    cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
     # Copy the backend tokenizer to avoid modifying the original.
     backend_tokenizer = backend_tokenizer.from_str(backend_tokenizer.to_str())
     backend_tokenizer = replace_normalizer(backend_tokenizer)
 
+    cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
     internal_tokens_set = {token.form for token in cleaned_vocabulary}
 
     normalizer: Normalizer | None = backend_tokenizer.normalizer
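The reordering above runs `_process_internal_tokens` only after the backend tokenizer has been copied and its normalizer replaced, presumably so the cleaning sees the same normalizer the exported tokenizer will use. A minimal sketch of the copy-before-mutate step, assuming the backend tokenizer is a `tokenizers.Tokenizer` (as exposed by `PreTrainedTokenizerFast.backend_tokenizer`); the helper name is illustrative:

```python
from tokenizers import Tokenizer


def copy_backend_tokenizer(backend_tokenizer: Tokenizer) -> Tokenizer:
    """Round-trip through the JSON serialization to get an independent copy (illustrative helper)."""
    # Mutations on the copy, such as swapping the normalizer, leave the original untouched.
    return Tokenizer.from_str(backend_tokenizer.to_str())
```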
@@ -302,15 +304,9 @@
     :param tokenizer: The tokenizer to use for converting tokens to IDs
     :param unk_token: The string form of the unk token.
     :return: List of token IDs corresponding to the input tokens
-    :raises ValueError: If the tokenizer returns an unexpected number of tokens for a single token
     """
     unk_id = None if unk_token is None else tokenizer.convert_tokens_to_ids(unk_token)
 
-    encoding = tokenizer.encode("a", add_special_tokens=True)
-
-    if len(encoding) != 3:
-        raise ValueError(f"Tokenizer returned {len(encoding)} tokens for a single token. This is not supported.")
-    bos, _, eos = encoding
+    prefix, suffix = find_eos_bos(tokenizer)
 
     token_ids: list[list[int]] = []
     for token in tokens:
@@ -321,13 +317,30 @@
             # Explicitly check and warn if `unk_id` appears, but don't crash.
             if unk_id is not None and token_id == unk_id and token.form != unk_token:
                 logger.warning(f"Token {token.form} was set to unk. This is wrong.")
-            token_ids.append([bos, token_id, eos])
+            token_ids.append([*prefix, token_id, *suffix])
         else:
             token_ids.append(tokenizer.encode(token.form))
 
     return token_ids


+def find_eos_bos(tokenizer: PreTrainedTokenizerFast) -> tuple[list[int], list[int]]:
+    """Finds the eos and bos tokens for a tokenizer."""
+    # Little bit complicated, because not all tokenizers have eos and bos tokens.
+    encoding = tokenizer.encode("a", add_special_tokens=True)
+    if len(encoding) != 3:
+        a_encoded = tokenizer.encode("a", add_special_tokens=False)
+        if len(a_encoded) != 1:
+            raise ValueError(
+                f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'"
+            )
+        a_idx = encoding.index(a_encoded[0])
+        prefix, suffix = encoding[:a_idx], encoding[a_idx + 1 :]
+    else:
+        prefix, suffix = encoding[:1], encoding[2:]
+    return prefix, suffix


 def _normalize_vocabulary_token(token: str, pre_tokenizer: PreTokenizer) -> str:
     """Normalize a token that is not in the initial token vocabulary."""
     # Add prefix space for byte tokenizers.

Codecov note: the added lines 332-334 and 337-338 of find_eos_bos (the error branch and the prefix/suffix split) are not covered by tests.
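A standalone sketch of the splitting rule in the new `find_eos_bos` (the token IDs below are made up, no real tokenizer is loaded): encode "a" with special tokens, locate the bare encoding of "a" inside it, and treat everything before and after it as the prefix/suffix that `[*prefix, token_id, *suffix]` splices around each single-token ID.

```python
def split_prefix_suffix(encoding: list[int], a_encoded: list[int]) -> tuple[list[int], list[int]]:
    """Mirror the branch logic of find_eos_bos, for illustration only."""
    if len(encoding) == 3:
        return encoding[:1], encoding[2:]          # classic [BOS] a [EOS] layout
    a_idx = encoding.index(a_encoded[0])
    return encoding[:a_idx], encoding[a_idx + 1:]  # otherwise split around the "a" token


# BERT-like: [CLS]=101, "a"=1037, [SEP]=102 -> prefix [101], suffix [102]
assert split_prefix_suffix([101, 1037, 102], [1037]) == ([101], [102])
# T5-like: only an EOS appended ("a"=9, </s>=1) -> empty prefix, suffix [1]
assert split_prefix_suffix([9, 1], [9]) == ([], [1])
# GPT-2-like: no special tokens at all -> both empty, so the bare token ID is used
assert split_prefix_suffix([64], [64]) == ([], [])
```

Tokenizers whose bare encoding of "a" is more than one token still raise a ValueError in the real implementation; the sketch skips that branch.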