@@ -6,28 +6,28 @@ use fancy_regex::Regex;
 
 static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("");
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).unwrap()
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
 static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("");
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).unwrap()
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
 static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("");
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).unwrap()
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
 static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("");
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = [
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
@@ -37,7 +37,7 @@ static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
         "\\s+(?!\\S)",
         "\\s+",
     ].join("|");
-    Tokenizer::new(bpe, Some(&pat)).unwrap()
+    Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
 });
 
 pub use bpe::*;
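
For context, each of these statics embeds a MessagePack-serialized BPE dictionary produced at build time (hence the `OUT_DIR` path) and deserializes it lazily on first access; the commit's change from `expect("")` to `expect("valid bpe data")` makes any failure of that decode step diagnosable. Below is a minimal standalone sketch of the same pattern; the `Config` type and its `vocab_size` field are illustrative stand-ins, not part of the crate:

use std::sync::LazyLock;

use serde::{Deserialize, Serialize};

// Illustrative stand-in for the real serialized BytePairEncoding payload.
#[derive(Serialize, Deserialize)]
struct Config {
    vocab_size: u32,
}

// In the crate, the bytes come from `include_bytes!(concat!(env!("OUT_DIR"), ...))`;
// here we serialize a value on the spot to keep the sketch self-contained.
static CONFIG: LazyLock<Config> = LazyLock::new(|| {
    let bytes = rmp_serde::to_vec(&Config { vocab_size: 50_257 }).expect("serializable config");
    // A descriptive `expect` message, as in this commit, shows up in the panic
    // output if the embedded bytes are ever corrupted or out of date.
    rmp_serde::from_slice(&bytes).expect("valid config data")
});

fn main() {
    // First access runs the closure; later accesses reuse the decoded value.
    println!("vocab size: {}", CONFIG.vocab_size);
}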
@@ -50,8 +50,9 @@ pub struct Tokenizer {
 }
 
 impl Tokenizer {
+    #[allow(clippy::result_large_err)]
     pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
-        let pat = pat.map(|pat| fancy_regex::Regex::new(pat)).transpose()?;
+        let pat = pat.map(fancy_regex::Regex::new).transpose()?;
         Ok(Self { bpe, pat })
     }
 
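
The last hunk also replaces a redundant closure with a direct function reference: `Option::map` over `fancy_regex::Regex::new` yields an `Option<Result<Regex, _>>`, and `transpose()` flips it into a `Result<Option<Regex>, _>` so `?` can propagate a compile error. A minimal sketch of that pattern outside the crate, with a hypothetical `compile_optional` helper; the `#[allow(clippy::result_large_err)]` mirrors the commit and is presumably needed because `fancy_regex::Error` is large enough to trip that lint:

use fancy_regex::Regex;

// Same shape as `Tokenizer::new`: an optional pattern string becomes an
// optional compiled regex, with compile errors propagated to the caller.
#[allow(clippy::result_large_err)]
fn compile_optional(pat: Option<&str>) -> fancy_regex::Result<Option<Regex>> {
    // `map` yields Option<Result<Regex, Error>>; `transpose` turns it into
    // Result<Option<Regex>, Error> so `?` applies at the call site.
    pat.map(Regex::new).transpose()
}

fn main() -> fancy_regex::Result<()> {
    // fancy_regex supports the lookahead used in the tokenizer patterns above.
    let re = compile_optional(Some(r"\s+(?!\S)"))?;
    assert!(re.is_some());
    assert!(compile_optional(None)?.is_none());
    Ok(())
}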