@@ -19,6 +19,7 @@ use text_embeddings_core::queue::Queue;
19
19
use text_embeddings_core:: tokenization:: Tokenization ;
20
20
use text_embeddings_router:: { ClassifierModel , EmbeddingModel , Info , ModelType } ;
21
21
use tokenizers:: decoders:: metaspace:: PrependScheme ;
22
+ use tokenizers:: pre_tokenizers:: sequence:: Sequence ;
22
23
use tokenizers:: { PreTokenizerWrapper , Tokenizer } ;
23
24
use tracing_subscriber:: layer:: SubscriberExt ;
24
25
use tracing_subscriber:: util:: SubscriberInitExt ;
@@ -268,14 +269,34 @@ async fn main() -> Result<()> {
268
269
m. set_prepend_scheme ( PrependScheme :: First ) ;
269
270
tokenizer. with_pre_tokenizer ( PreTokenizerWrapper :: Metaspace ( m) ) ;
270
271
} else if let PreTokenizerWrapper :: Sequence ( s) = pre_tokenizer {
271
- // We are forced to clone since `Tokenizer` does not have a `get_mut` for `pre_tokenizer`
272
- let mut s = s. clone ( ) ;
273
- for pre_tokenizer in s. get_pre_tokenizers_mut ( ) {
274
- if let PreTokenizerWrapper :: Metaspace ( m) = pre_tokenizer {
275
- m. set_prepend_scheme ( PrependScheme :: First ) ;
272
+ let pre_tokenizers = s. get_pre_tokenizers ( ) ;
273
+ // Check if we have a Metaspace pre tokenizer in the sequence
274
+ let has_metaspace = pre_tokenizers
275
+ . iter ( )
276
+ . find ( |t| matches ! ( t, PreTokenizerWrapper :: Metaspace ( _) ) )
277
+ . is_some ( ) ;
278
+
279
+ if has_metaspace {
280
+ let mut new_pre_tokenizers = Vec :: with_capacity ( s. get_pre_tokenizers ( ) . len ( ) ) ;
281
+
282
+ for pre_tokenizer in pre_tokenizers {
283
+ if let PreTokenizerWrapper :: WhitespaceSplit ( _) = pre_tokenizer {
284
+ // Remove WhitespaceSplit
285
+ // This will be done by the Metaspace pre tokenizer
286
+ continue ;
287
+ }
288
+
289
+ let mut pre_tokenizer = pre_tokenizer. clone ( ) ;
290
+
291
+ if let PreTokenizerWrapper :: Metaspace ( ref mut m) = pre_tokenizer {
292
+ m. set_prepend_scheme ( PrependScheme :: First ) ;
293
+ }
294
+ new_pre_tokenizers. push ( pre_tokenizer) ;
276
295
}
296
+ tokenizer. with_pre_tokenizer ( PreTokenizerWrapper :: Sequence ( Sequence :: new (
297
+ new_pre_tokenizers,
298
+ ) ) ) ;
277
299
}
278
- tokenizer. with_pre_tokenizer ( PreTokenizerWrapper :: Sequence ( s) ) ;
279
300
}
280
301
}
281
302
0 commit comments