Skip to content

Commit 8067a80

Browse files
fix: fix tokenizers with both whitespace and metaspace (#96)
1 parent fc716d5 commit 8067a80

File tree

1 file changed

+27
-6
lines changed

1 file changed

+27
-6
lines changed

router/src/main.rs

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use text_embeddings_core::queue::Queue;
1919
use text_embeddings_core::tokenization::Tokenization;
2020
use text_embeddings_router::{ClassifierModel, EmbeddingModel, Info, ModelType};
2121
use tokenizers::decoders::metaspace::PrependScheme;
22+
use tokenizers::pre_tokenizers::sequence::Sequence;
2223
use tokenizers::{PreTokenizerWrapper, Tokenizer};
2324
use tracing_subscriber::layer::SubscriberExt;
2425
use tracing_subscriber::util::SubscriberInitExt;
@@ -268,14 +269,34 @@ async fn main() -> Result<()> {
268269
m.set_prepend_scheme(PrependScheme::First);
269270
tokenizer.with_pre_tokenizer(PreTokenizerWrapper::Metaspace(m));
270271
} else if let PreTokenizerWrapper::Sequence(s) = pre_tokenizer {
271-
// We are forced to clone since `Tokenizer` does not have a `get_mut` for `pre_tokenizer`
272-
let mut s = s.clone();
273-
for pre_tokenizer in s.get_pre_tokenizers_mut() {
274-
if let PreTokenizerWrapper::Metaspace(m) = pre_tokenizer {
275-
m.set_prepend_scheme(PrependScheme::First);
272+
let pre_tokenizers = s.get_pre_tokenizers();
273+
// Check if we have a Metaspace pre tokenizer in the sequence
274+
let has_metaspace = pre_tokenizers
275+
.iter()
276+
.find(|t| matches!(t, PreTokenizerWrapper::Metaspace(_)))
277+
.is_some();
278+
279+
if has_metaspace {
280+
let mut new_pre_tokenizers = Vec::with_capacity(s.get_pre_tokenizers().len());
281+
282+
for pre_tokenizer in pre_tokenizers {
283+
if let PreTokenizerWrapper::WhitespaceSplit(_) = pre_tokenizer {
284+
// Remove WhitespaceSplit
285+
// This will be done by the Metaspace pre tokenizer
286+
continue;
287+
}
288+
289+
let mut pre_tokenizer = pre_tokenizer.clone();
290+
291+
if let PreTokenizerWrapper::Metaspace(ref mut m) = pre_tokenizer {
292+
m.set_prepend_scheme(PrependScheme::First);
293+
}
294+
new_pre_tokenizers.push(pre_tokenizer);
276295
}
296+
tokenizer.with_pre_tokenizer(PreTokenizerWrapper::Sequence(Sequence::new(
297+
new_pre_tokenizers,
298+
)));
277299
}
278-
tokenizer.with_pre_tokenizer(PreTokenizerWrapper::Sequence(s));
279300
}
280301
}
281302

0 commit comments

Comments
 (0)