File tree Expand file tree Collapse file tree 1 file changed +2
-2
lines changed Original file line number Diff line number Diff line change @@ -431,9 +431,9 @@ struct llm_tokenizer_bpe : llm_tokenizer {
431
431
// 1. Han characters
432
432
" [\\ p{Han}]+" ,
433
433
// 2. Words ending in lowercase (non-Han) with contractions
434
- " [^\\ r\\ n\\ p{L}\\ p{N}]?(?:(?! [\\ p{Han}])[ \\ p{ Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}])*(?:(?![ \\ p{Han}]) [\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}]) +(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD] )?" ,
434
+ " [^\\ r\\ n\\ p{L}\\ p{N}]?[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}&&[^ \\ p{Han}]]* [\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}&&[^ \\ p{Han}]] +(?i:'s|'t|'re|'ve|'m|'ll|'d )?" ,
435
435
// 3. Words starting with uppercase (non-Han) with contractions
436
- " [^\\ r\\ n\\ p{L}\\ p{N}]?(?:(?! [\\ p{Han}])[ \\ p{ Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}])+(?:(?![ \\ p{Han}]) [\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}]) *(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD] )?" ,
436
+ " [^\\ r\\ n\\ p{L}\\ p{N}]?[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}&&[^ \\ p{Han}]]+ [\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}&&[^ \\ p{Han}]] *(?i:'s|'t|'re|'ve|'m|'ll|'d )?" ,
437
437
// 4. Numbers (1-3 digits)
438
438
" \\ p{N}{1,3}" ,
439
439
// 5. Punctuation and symbols
You can’t perform that action at this time.
0 commit comments