@@ -1,22 +1,31 @@
 //! Creates `Vocabulary` manually or from a pretrained large language model.

 use bincode::{Decode, Encode};
+#[cfg(feature = "hugginface-hub")]
 use locator::{HFLocator, Locator};
+#[cfg(feature = "hugginface-hub")]
 use processor::TokenProcessor;
 use rustc_hash::FxHashMap as HashMap;
+#[cfg(feature = "hugginface-hub")]
 use tokenizers::normalizers::Sequence;
+#[cfg(feature = "hugginface-hub")]
 use tokenizers::{NormalizerWrapper, Tokenizer};

 use crate::prelude::*;
 use crate::{Error, Result};

+#[cfg(feature = "hugginface-hub")]
 mod locator;
+#[cfg(feature = "hugginface-hub")]
 mod processor;

 /// `Vocabulary` of a large language model.
 ///
 /// ## Examples
 ///
+#[cfg_attr(
+    feature = "hugginface-hub",
+    doc = r##"
 /// ### Create a vocabulary from a pretrained model.
 /// ```rust
 /// use outlines_core::prelude::*;
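This first hunk puts every Hub-dependent import and module behind `#[cfg(feature = "hugginface-hub")]` (the feature name's spelling is kept as-is from the crate). The attribute gates exactly the one item that follows it, so a build without the feature never compiles `locator`, `processor`, or the `tokenizers` imports. A minimal standalone sketch of the mechanism, assuming a Cargo.toml that declares the feature under `[features]`:

// Sketch only: a standalone crate that declares `hugginface-hub = []`
// under [features] in Cargo.toml (an assumption, mirroring this diff).

#[cfg(feature = "hugginface-hub")]
fn hub_only_helper() -> &'static str {
    "compiled with --features hugginface-hub"
}

fn main() {
    // Call sites need the same gate, or the build breaks when the
    // feature is off.
    #[cfg(feature = "hugginface-hub")]
    println!("{}", hub_only_helper());

    #[cfg(not(feature = "hugginface-hub"))]
    println!("built without the hugginface-hub feature");
}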
@@ -51,6 +60,8 @@ mod processor;
 /// vocabulary.remove("token");
 /// assert_eq!(vocabulary.token_ids("token"), None);
 /// ```
+"##
+)]
 #[derive(Clone, Debug, Default, PartialEq, Encode, Decode)]
 pub struct Vocabulary {
     eos_token_id: TokenId,
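The `cfg_attr(...)` opened in the previous hunk closes here: the pretrained-model doctest is injected into the rustdoc only when the feature is enabled, so a default `cargo test` never tries to compile an example that calls gated code. A generic sketch of the same pattern, on a hypothetical function:

/// Always-visible documentation for the item.
#[cfg_attr(
    feature = "hugginface-hub",
    doc = r##"
Feature-gated example, compiled and run as a doctest only when the
feature is enabled:

```rust
assert_eq!(2 + 2, 4);
```
"##
)]
pub fn documented() {}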
@@ -67,6 +78,7 @@ impl Vocabulary {
     }

     /// Creates the vocabulary of a pre-trained model from the Hugging Face Hub.
+    #[cfg(feature = "hugginface-hub")]
     pub fn from_pretrained(
         model: &str,
         parameters: Option<FromPretrainedParameters>,
@@ -76,6 +88,7 @@ impl Vocabulary {

     #[doc(hidden)]
     #[inline(always)]
+    #[cfg(feature = "hugginface-hub")]
     fn from_pretrained_with_locator<L: Locator>(
         model: &str,
         parameters: Option<FromPretrainedParameters>,
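Both constructors are gated: the public `from_pretrained` and the hidden `from_pretrained_with_locator`, which exists so tests can inject a custom `Locator` (see `NoneLocator` further down). A hedged usage sketch for the public one, assuming the feature is enabled, the prelude re-exports `Vocabulary`, the crate's `Error` implements `std::error::Error`, and the Hub is reachable:

use outlines_core::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Downloads tokenizer data from the Hugging Face Hub; the model
    // name is the one used by the gpt2 test below.
    let vocabulary = Vocabulary::from_pretrained("openai-community/gpt2", None)?;
    // `token_ids` returns the ids mapped to a token piece, if any.
    if let Some(ids) = vocabulary.token_ids("token") {
        println!("'token' maps to {} id(s)", ids.len());
    }
    Ok(())
}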
@@ -158,6 +171,7 @@ impl Vocabulary {
 }

 /// Filters out `Prepend` kind of tokenizer's normalizers.
+#[cfg(feature = "hugginface-hub")]
 fn filter_prepend_normalizers(tokenizer: &mut Tokenizer) {
     // Main concern is prepend normalizers, for example https://github.com/google/sentencepiece
     // In `sentencepiece` tokenizer, `▁` is used to denote spaces in the source text,
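`filter_prepend_normalizers` is gated too, since it only runs on tokenizers pulled from the Hub; SentencePiece-style tokenizers prepend `▁` to mark word boundaries, which the comment above flags as the main concern. As an illustration of the filtering idea only (not this function's actual body), a sketch over a flat list of normalizers, assuming a recent `tokenizers` version that has the `Prepend` normalizer:

use tokenizers::normalizers::{Prepend, Sequence};
use tokenizers::NormalizerWrapper;

// Keep every normalizer except `Prepend`, preserving order.
fn drop_prepend(normalizers: Vec<NormalizerWrapper>) -> Vec<NormalizerWrapper> {
    normalizers
        .into_iter()
        .filter(|n| !matches!(n, NormalizerWrapper::Prepend(_)))
        .collect()
}

fn main() {
    let kept = drop_prepend(vec![
        Prepend::new("▁".to_string()).into(),
        Sequence::new(vec![]).into(),
    ]);
    // Only the empty Sequence survives the filter.
    assert_eq!(kept.len(), 1);
}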
@@ -248,8 +262,6 @@ impl TryFrom<(TokenId, HashMap<String, Vec<TokenId>>)> for Vocabulary {

 #[cfg(test)]
 mod tests {
-    use rustc_hash::FxHashSet as HashSet;
-
     use super::*;

     #[test]
@@ -305,6 +317,7 @@ mod tests {
         assert!(vocabulary.tokens.is_empty());
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn supported_pretrained_models() {
         // Support is expected for these:
@@ -332,6 +345,7 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn pretrained_from_gpt2() {
         let model = "openai-community/gpt2";
@@ -363,8 +377,11 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn pretrained_from_llama() {
+        use rustc_hash::FxHashSet as HashSet;
+
         let model = "hf-internal-testing/llama-tokenizer";
         let tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
         let vocabulary = Vocabulary::from_pretrained(model, None).expect("Vocabulary failed");
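Note the import move completed by this hunk: `FxHashSet` was dropped from the shared test imports in the earlier hunk and now lives inside `pretrained_from_llama`, the only test that uses it, so builds without the feature carry no unused import.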
@@ -405,6 +422,7 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn token_processor_error() {
         let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
@@ -419,6 +437,7 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn tokenizer_error() {
         let model = "hf-internal-testing/some-non-existent-model";
@@ -430,7 +449,9 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     struct NoneLocator;
+    #[cfg(feature = "hugginface-hub")]
     impl Locator for NoneLocator {
         fn locate_eos_token_id(
             _model: &str,
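`NoneLocator` and its `Locator` impl get their own gates. It is the test double fed to `from_pretrained_with_locator`: a locator that never finds an EOS token id, which forces the error branch exercised by `unable_to_locate_eos_token_id_error` below. A simplified, hypothetical rendering of the idea (the real trait takes more parameters than shown here):

// Hypothetical, trimmed-down trait for illustration; like the diff's
// `locate_eos_token_id`, the function takes no `self` parameter.
trait Locator {
    fn locate_eos_token_id(model: &str) -> Option<u32>;
}

struct NoneLocator;

impl Locator for NoneLocator {
    // Deliberately finds nothing, so the caller's "unable to locate
    // eos token id" error path can be tested without a real model.
    fn locate_eos_token_id(_model: &str) -> Option<u32> {
        None
    }
}

fn main() {
    assert!(NoneLocator::locate_eos_token_id("any-model").is_none());
}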
@@ -441,6 +462,7 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn unable_to_locate_eos_token_id_error() {
         let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
@@ -456,6 +478,7 @@ mod tests {
     }

     #[test]
+    #[cfg(feature = "hugginface-hub")]
     fn prepend_normalizers_filtered_out() {
         use tokenizers::normalizers::{Prepend, Sequence};

@@ -488,6 +511,7 @@ mod tests {
     }

     #[test]
+    #[cfg(feature = "hugginface-hub")]
     fn other_normalizers_being_kept() {
         use tokenizers::normalizers::BertNormalizer;

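With every Hub-dependent item, doctest, and test behind the gate, a default `cargo build` or `cargo test` stays offline; `cargo test --features hugginface-hub` opts back into the full pretrained path.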