feat: track token provenance #222

Merged: 1 commit, Apr 25, 2025

2 changes: 1 addition & 1 deletion model2vec/distill/distillation.py
@@ -13,7 +13,7 @@

from model2vec.distill.inference import create_embeddings
from model2vec.distill.tokenizer import replace_vocabulary
-from model2vec.distill.utils import select_optimal_device
+from model2vec.distill.utils import Token, select_optimal_device
from model2vec.model import StaticModel
from model2vec.quantization import DType, quantize_embeddings

11 changes: 6 additions & 5 deletions model2vec/distill/inference.py
@@ -14,7 +14,7 @@
from transformers import PreTrainedModel, PreTrainedTokenizerFast
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions

-from model2vec.distill.utils import filter_vocabulary_by_regex
+from model2vec.distill.utils import Token, filter_vocabulary_by_regex

logger = logging.getLogger(__name__)

@@ -35,7 +35,7 @@ def create_embeddings(
    tokens: list[str],
    device: str,
    token_remove_regex: re.Pattern | None,
-) -> tuple[list[str], np.ndarray]:
+) -> tuple[list[Token], np.ndarray]:
    """
    Create output embeddings for a bunch of tokens using a pretrained model.

@@ -55,7 +55,7 @@
    out_weights: np.ndarray
    intermediate_weights: list[np.ndarray] = []

-    out_tokens = []
+    out_tokens: list[Token] = []
    tokenized: list[torch.Tensor] = []
    pad_token = tokenizer.special_tokens_map.get("pad_token")
    pad_token_id = tokenizer.convert_tokens_to_ids(pad_token)
@@ -89,7 +89,8 @@
    eos = torch.full([len(ids)], fill_value=eos_token_id)

    tokenized.extend(torch.stack([bos, ids, eos], dim=1))
-    out_tokens.extend(tokenizer.convert_ids_to_tokens(ids))
+    subword_tokens = [Token(x, True) for x in tokenizer.convert_ids_to_tokens(ids.tolist())]
+    out_tokens.extend(subword_tokens)

    tokenized.extend([tokenizer.encode_plus(token, return_tensors="pt")["input_ids"][0] for token in tokens])

@@ -119,7 +120,7 @@

    # Sort the output back to the original order
    intermediate_weights = [intermediate_weights[i] for i in np.argsort(sort_order)]
-    out_tokens.extend(tokens)
+    out_tokens.extend([Token(x, False) for x in tokens])
    out_weights = np.stack(intermediate_weights)

    return out_tokens, out_weights
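
Taken together, the inference changes mean every embedding row is now paired with a Token that records where the string came from: pieces enumerated from the tokenizer's own id range are tagged is_subword=True, while entries from the user-supplied tokens list are tagged is_subword=False. A minimal, self-contained sketch of that tagging (the token strings below are made up for illustration, not part of the diff):

from model2vec.distill.utils import Token

# Illustrative stand-ins for the two sources create_embeddings draws from.
vocab_pieces = ["hel", "##lo"]        # pieces from the tokenizer's own vocabulary
extra_words = ["hello", "world"]      # user-supplied full words

out_tokens: list[Token] = []
out_tokens.extend(Token(form, True) for form in vocab_pieces)   # mirrors the first extend in the diff
out_tokens.extend(Token(form, False) for form in extra_words)   # mirrors the second extend in the diff

assert [t.is_subword for t in out_tokens] == [True, True, False, False]
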
18 changes: 10 additions & 8 deletions model2vec/distill/tokenizer.py
@@ -6,6 +6,8 @@

from tokenizers import Tokenizer

+from model2vec.distill.utils import Token

logger = logging.getLogger(__name__)


@@ -17,7 +19,7 @@
}


-def _pre_tokenize_vocabulary(tokenizer: Tokenizer, tokens: list[str]) -> list[str]:
+def _pre_tokenize_vocabulary(tokenizer: Tokenizer, tokens: list[Token]) -> list[str]:
    """
    Apply pre-tokenization to vocabulary tokens if a pre-tokenizer is present.

@@ -33,14 +35,14 @@

    if tokenizer.pre_tokenizer is not None:
        for token in tokens:
-            if token in current_tokenizer_vocab:
-                pre_tokenized_tokens.append(token)
+            if token.is_subword:
+                pre_tokenized_tokens.append(token.form)
            else:
                # We know 100% sure that all pretokenized tokens will have length 1.
-                pretokenized_tokens, _ = zip(*tokenizer.pre_tokenizer.pre_tokenize_str(f" {token}"))
+                pretokenized_tokens, _ = zip(*tokenizer.pre_tokenizer.pre_tokenize_str(f" {token.form}"))
                pre_tokenized_tokens.append(pretokenized_tokens[-1])
    else:
-        pre_tokenized_tokens = tokens
+        pre_tokenized_tokens = [token.form for token in tokens]

Codecov / codecov/patch warning: added line model2vec/distill/tokenizer.py#L45 was not covered by tests.

    return pre_tokenized_tokens

@@ -106,7 +108,7 @@


def replace_vocabulary(
-    tokenizer: Tokenizer, new_vocabulary: list[str], unk_token: str | None, pad_token: str | None
+    tokenizer: Tokenizer, new_vocabulary: list[Token], unk_token: str | None, pad_token: str | None
) -> Tokenizer:
    """Replace the vocabulary of a tokenizer with a new one."""
    tokenizer_json: dict[str, Any] = json.loads(tokenizer.to_str())
@@ -139,8 +141,8 @@
        vocab = tokenizer_json["model"]["vocab"]
        unk_token = vocab[unk_id][0] if unk_id is not None else None
        current_probas = dict(tokenizer_json["model"]["vocab"])
-        lowest_proba = min(current_probas.values())
-        new_probas = {word: current_probas.get(word, lowest_proba) for word in pre_tokenized_tokens}
+        avg_proba = sum(current_probas.values()) / len(current_probas)
+        new_probas = {word: current_probas.get(word, avg_proba) for word in pre_tokenized_tokens}

Codecov / codecov/patch warning: added lines model2vec/distill/tokenizer.py#L144-L145 were not covered by tests.

tokenizer_json["model"]["vocab"] = sorted(new_probas.items(), key=lambda x: x[1], reverse=True)

tokens, _ = zip(*tokenizer_json["model"]["vocab"])
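
Besides routing on token.is_subword instead of membership in the current vocabulary, this file also changes the fallback score for words that are new to a Unigram vocabulary: they now receive the average existing log-probability rather than the minimum, so added words are no longer all sorted to the very end. A small standalone sketch of that computation, with made-up scores:

# Toy Unigram vocab: token -> log-probability (values are illustrative only).
current_probas = {"hel": -2.0, "##lo": -3.0, "world": -9.0}
pre_tokenized_tokens = ["hel", "hello"]   # "hello" is not yet in the vocab

avg_proba = sum(current_probas.values()) / len(current_probas)   # -14/3, about -4.67, versus min() == -9.0
new_probas = {word: current_probas.get(word, avg_proba) for word in pre_tokenized_tokens}

assert new_probas == {"hel": -2.0, "hello": avg_proba}   # known words keep their score
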
9 changes: 9 additions & 0 deletions model2vec/distill/utils.py
@@ -1,13 +1,22 @@
from __future__ import annotations

import re
+from dataclasses import dataclass
from logging import getLogger

import torch

logger = getLogger(__name__)


+@dataclass
+class Token:
+    """A class to represent a token."""
+
+    form: str
+    is_subword: bool


def select_optimal_device(device: str | None) -> str:
"""
Guess what your optimal device should be based on backend availability.
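
Token itself is a plain dataclass, so the provenance flag travels alongside the surface form with ordinary value semantics. A quick illustration:

from dataclasses import asdict

from model2vec.distill.utils import Token

token = Token(form="hello", is_subword=False)
print(token)          # Token(form='hello', is_subword=False)
print(asdict(token))  # {'form': 'hello', 'is_subword': False}
assert token == Token("hello", False)  # dataclasses compare field-by-field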